mirror of
				https://github.com/stevenrobertson/cuburn.git
				synced 2025-11-03 18:00:55 -05:00 
			
		
		
		
	Flat (pre-packed int) palettes in deferred mode.
This commit is contained in:
		@ -254,10 +254,34 @@ void interp_{{tname}}({{tname}}* out, float *times, float *knots,
 | 
				
			|||||||
    {{endfor}}
 | 
					    {{endfor}}
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					__global__
 | 
				
			||||||
 | 
					void interp_palette_hsv_flat(mwc_st *rctxs,
 | 
				
			||||||
 | 
					        const float *times, const float4 *sources,
 | 
				
			||||||
 | 
					        float tstart, float tstep) {
 | 
				
			||||||
 | 
					    int gid = blockIdx.x * blockDim.x + threadIdx.x;
 | 
				
			||||||
 | 
					    mwc_st rctx = rctxs[gid];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    float time = tstart + blockIdx.x * tstep;
 | 
				
			||||||
 | 
					    float4 rgba = interp_color_hsv(times, sources, time);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // TODO: use YUV; pack Y at full precision, UV at quarter
 | 
				
			||||||
 | 
					    uint2 out;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    uint32_t r = min(255, (uint32_t) (rgba.x * 255.0f + 0.49f * mwc_next_11(rctx)));
 | 
				
			||||||
 | 
					    uint32_t g = min(255, (uint32_t) (rgba.y * 255.0f + 0.49f * mwc_next_11(rctx)));
 | 
				
			||||||
 | 
					    uint32_t b = min(255, (uint32_t) (rgba.z * 255.0f + 0.49f * mwc_next_11(rctx)));
 | 
				
			||||||
 | 
					    out.x = (1 << 22) | (r << 4);
 | 
				
			||||||
 | 
					    out.y = (g << 18) | b;
 | 
				
			||||||
 | 
					    surf2Dwrite(out, flatpal, 8 * threadIdx.x, blockIdx.x);
 | 
				
			||||||
 | 
					    rctxs[gid] = rctx;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
""")
 | 
					""")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    _decls = Template(r"""
 | 
					    _decls = Template(r"""
 | 
				
			||||||
 | 
					surface<void, cudaSurfaceType2D> flatpal;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
typedef struct {
 | 
					typedef struct {
 | 
				
			||||||
{{for name in packed}}
 | 
					{{for name in packed}}
 | 
				
			||||||
    float   {{'_'.join(name)}};
 | 
					    float   {{'_'.join(name)}};
 | 
				
			||||||
@ -314,18 +338,15 @@ void test_cr(const float *times, const float *knots, const float *t, float *r) {
 | 
				
			|||||||
    r[i] = catmull_rom(times, knots, t[i]);
 | 
					    r[i] = catmull_rom(times, knots, t[i]);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
__global__
 | 
					__device__
 | 
				
			||||||
void interp_palette_hsv(uchar4 *outs, const float *times, const float4 *sources,
 | 
					float4 interp_color_hsv(const float *times, const float4 *sources, float time) {
 | 
				
			||||||
        float tstart, float tstep) {
 | 
					 | 
				
			||||||
    float time = tstart + blockIdx.x * tstep;
 | 
					 | 
				
			||||||
    int idx = fmaxf(bitwise_binsearch(times, time) + 1, 1);
 | 
					    int idx = fmaxf(bitwise_binsearch(times, time) + 1, 1);
 | 
				
			||||||
 | 
					    float lf = (times[idx] - time) / (times[idx] - times[idx-1]);
 | 
				
			||||||
 | 
					    float rf = 1.0f - lf;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    float4 left  = sources[blockDim.x * (idx - 1) + threadIdx.x];
 | 
					    float4 left  = sources[blockDim.x * (idx - 1) + threadIdx.x];
 | 
				
			||||||
    float4 right = sources[blockDim.x * (idx)     + threadIdx.x];
 | 
					    float4 right = sources[blockDim.x * (idx)     + threadIdx.x];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    float lf = (times[idx] - time) / (times[idx] - times[idx-1]);
 | 
					 | 
				
			||||||
    float rf = 1.0f - lf;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    float3 lhsv = rgb2hsv(make_float3(left.x, left.y, left.z));
 | 
					    float3 lhsv = rgb2hsv(make_float3(left.x, left.y, left.z));
 | 
				
			||||||
    float3 rhsv = rgb2hsv(make_float3(right.x, right.y, right.z));
 | 
					    float3 rhsv = rgb2hsv(make_float3(right.x, right.y, right.z));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -346,13 +367,23 @@ void interp_palette_hsv(uchar4 *outs, const float *times, const float4 *sources,
 | 
				
			|||||||
        hsv.x += 6.0f;
 | 
					        hsv.x += 6.0f;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    float3 rgb = hsv2rgb(hsv);
 | 
					    float3 rgb = hsv2rgb(hsv);
 | 
				
			||||||
 | 
					    return make_float4(rgb.x, rgb.y, rgb.z, left.w * lf + right.w * rf);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					__global__
 | 
				
			||||||
 | 
					void interp_palette_hsv(uchar4 *outs,
 | 
				
			||||||
 | 
					        const float *times, const float4 *sources,
 | 
				
			||||||
 | 
					        float tstart, float tstep) {
 | 
				
			||||||
 | 
					    float time = tstart + blockIdx.x * tstep;
 | 
				
			||||||
 | 
					    float4 rgba = interp_color_hsv(times, sources, time);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    uchar4 out;
 | 
					    uchar4 out;
 | 
				
			||||||
    out.x = rgb.x * 255.0f;
 | 
					    out.x = rgba.x * 255.0f;
 | 
				
			||||||
    out.y = rgb.y * 255.0f;
 | 
					    out.y = rgba.y * 255.0f;
 | 
				
			||||||
    out.z = rgb.z * 255.0f;
 | 
					    out.z = rgba.z * 255.0f;
 | 
				
			||||||
    out.w = 255.0f * (left.z * lf + right.z * rf);
 | 
					    out.w = rgba.w * 255.0f;
 | 
				
			||||||
    outs[blockDim.x * blockIdx.x + threadIdx.x] = out;
 | 
					    outs[blockDim.x * blockIdx.x + threadIdx.x] = out;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
""")
 | 
					""")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
@ -385,22 +385,6 @@ void iter(
 | 
				
			|||||||
#define SHAB 12
 | 
					#define SHAB 12
 | 
				
			||||||
#define SHAW (1<<SHAB)
 | 
					#define SHAW (1<<SHAB)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// Read from the shm accumulators and write to the global ones.
 | 
					 | 
				
			||||||
__device__
 | 
					 | 
				
			||||||
void write_shmem_helper(
 | 
					 | 
				
			||||||
        float4 *acc,
 | 
					 | 
				
			||||||
        const int glo_idx,
 | 
					 | 
				
			||||||
        const uint32_t dr,
 | 
					 | 
				
			||||||
        const uint32_t gb
 | 
					 | 
				
			||||||
) {
 | 
					 | 
				
			||||||
    float4 pix = acc[glo_idx];
 | 
					 | 
				
			||||||
    pix.x += (dr & 0xffff) / 127.0f;
 | 
					 | 
				
			||||||
    pix.w += dr >> 16;
 | 
					 | 
				
			||||||
    pix.y += (gb & 0xffff) / 127.0f;
 | 
					 | 
				
			||||||
    pix.z += (gb >> 16) / 127.0f;
 | 
					 | 
				
			||||||
    acc[glo_idx] = pix;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
// Read the point log, accumulate in shared memory, and write the results.
 | 
					// Read the point log, accumulate in shared memory, and write the results.
 | 
				
			||||||
// This kernel is to be launched with one block for every 4,096 addresses to
 | 
					// This kernel is to be launched with one block for every 4,096 addresses to
 | 
				
			||||||
// be processed, and will handle those addresses.
 | 
					// be processed, and will handle those addresses.
 | 
				
			||||||
@ -440,8 +424,10 @@ write_shmem(
 | 
				
			|||||||
    s_acc_gb[tid+3*BS] = 0;
 | 
					    s_acc_gb[tid+3*BS] = 0;
 | 
				
			||||||
    __syncthreads();
 | 
					    __syncthreads();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // This predicate is used for the horrible monkey-patching magic.
 | 
					    // This predicate is used for the horrible monkey-patching magic. Second
 | 
				
			||||||
    asm volatile(".reg .pred p; setp.lt.u32 p, %0, 42;" :: "r"(s_acc_dr[0]));
 | 
					    // variable is just to shut the compiler up.
 | 
				
			||||||
 | 
					    asm volatile(".reg .pred p; setp.lt.u32 p, %0, 42;"
 | 
				
			||||||
 | 
					                 :: "r"(s_acc_dr[0]), "r"(s_acc_gb[0]));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // log_bounds[] holds inclusive prefix sums, so that log_bounds[0] is the
 | 
					    // log_bounds[] holds inclusive prefix sums, so that log_bounds[0] is the
 | 
				
			||||||
    // largest index with radix 0, and so on.
 | 
					    // largest index with radix 0, and so on.
 | 
				
			||||||
@ -453,7 +439,7 @@ write_shmem(
 | 
				
			|||||||
    else               idx_lo = 0;
 | 
					    else               idx_lo = 0;
 | 
				
			||||||
    int idx_hi = (log_bounds[lb_idx_hi] & ~(BS - 1)) + BS;
 | 
					    int idx_hi = (log_bounds[lb_idx_hi] & ~(BS - 1)) + BS;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    float rnrounds = 1.0f / (idx_hi - idx_lo);
 | 
					    float rnrounds = {{'%d.0f' % info.palette_height}} / (idx_hi - idx_lo);
 | 
				
			||||||
    float time = tid * rnrounds;
 | 
					    float time = tid * rnrounds;
 | 
				
			||||||
    float time_step = BS * rnrounds;
 | 
					    float time_step = BS * rnrounds;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -468,65 +454,57 @@ write_shmem(
 | 
				
			|||||||
        bfe_decl(glob_addr, entry, SHAB, 12);
 | 
					        bfe_decl(glob_addr, entry, SHAB, 12);
 | 
				
			||||||
        if (glob_addr != bid) continue;
 | 
					        if (glob_addr != bid) continue;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        // Shared memory address, pre-shifted
 | 
					 | 
				
			||||||
        int shr_addr;
 | 
					 | 
				
			||||||
        asm("bfi.b32 %0, %1, 0, 2, 12;" : "=r"(shr_addr) : "r"(entry));
 | 
					 | 
				
			||||||
        bfe_decl(color, entry, 24, 8);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        float colorf = color / 255.0f;
 | 
					 | 
				
			||||||
        float4 outcol = tex2D(palTex, colorf, time);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        // TODO: change texture sampler to return shorts and avoid this
 | 
					 | 
				
			||||||
        uint32_t r = 127.0f * outcol.x;
 | 
					 | 
				
			||||||
        uint32_t g = 127.0f * outcol.y;
 | 
					 | 
				
			||||||
        uint32_t b = 127.0f * outcol.z;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        uint32_t dr = r + 0x10000, gb = g + (b << 16);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        asm volatile ({{crep("""
 | 
					        asm volatile ({{crep("""
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
    .reg .pred q;
 | 
					    .reg .pred q;
 | 
				
			||||||
    .reg .u32 d, r, g, b, dr, gb, drw, gbw, off;
 | 
					    .reg .u32 shoff, color, time, d, r, g, b, hi, lo, his, los, hiw, low;
 | 
				
			||||||
    .reg .u64 ptr;
 | 
					    .reg .u64 ptr;
 | 
				
			||||||
    .reg .f32 rf, gf, bf, df;
 | 
					    .reg .f32 rf, gf, bf, df;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    bfi.b32         shoff,  %0,     0,  2,  12;
 | 
				
			||||||
 | 
					    bfe.u32         color,  %0,     24, 8;
 | 
				
			||||||
 | 
					    shl.b32         color,  color,  3;
 | 
				
			||||||
 | 
					    cvt.rni.u32.f32 time,   %1;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    suld.b.2d.v2.b32.clamp  {his, los},   [flatpal, {color, time}];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
acc_write_start:
 | 
					acc_write_start:
 | 
				
			||||||
    // This instruction will get replaced with a LDSLK that sets 'p'.
 | 
					    // This instruction will get replaced with a LDSLK that sets 'p'.
 | 
				
			||||||
    // The 0xffff is a signature to make sure we get the right instruction,
 | 
					    // The 0xffff is a signature to make sure we get the right instruction,
 | 
				
			||||||
    // and will get replaced with a 0-offset when patching.
 | 
					    // and will get replaced with a 0-offset when patching.
 | 
				
			||||||
    ld.shared.volatile.u32   dr,     [%2+0xffff];
 | 
					    ld.shared.volatile.u32  low,    [shoff+0xffff];
 | 
				
			||||||
@p  ld.shared.volatile.u32   gb,     [%2+0x4000];
 | 
					@p  ld.shared.volatile.u32  hiw,    [shoff+0x4000];
 | 
				
			||||||
@p  add.u32         %0,     dr,     %0;
 | 
					    add.cc.u32      lo,     los,    low;
 | 
				
			||||||
@p  add.u32         %1,     gb,     %1;
 | 
					    addc.u32        hi,     his,    hiw;
 | 
				
			||||||
    // TODO: clever use of slct could remove an instruction?
 | 
					    setp.lo.u32     q,      hi,     (1023 << 22);
 | 
				
			||||||
@p  setp.lo.u32     q,      %0,     32768000;
 | 
					    selp.b32        hiw,    hi,     0,      q;
 | 
				
			||||||
@p  selp.b32        drw,    %0,     0,      q;
 | 
					    selp.b32        low,    lo,     0,      q;
 | 
				
			||||||
@p  selp.b32        gbw,    %1,     0,      q;
 | 
					@p  st.shared.volatile.u32   [shoff+0x4000],    hiw;
 | 
				
			||||||
@p  st.shared.volatile.u32   [%2+0x4000],    gbw;
 | 
					 | 
				
			||||||
    // This instruction will get replaced with an STSUL
 | 
					    // This instruction will get replaced with an STSUL
 | 
				
			||||||
@p  st.shared.volatile.u32   [%2+0xffff],    drw;
 | 
					@p  st.shared.volatile.u32   [shoff+0xffff],    low;
 | 
				
			||||||
@!p bra             acc_write_start;
 | 
					@!p bra             acc_write_start;
 | 
				
			||||||
@q  bra             oflow_write_end;
 | 
					@q  bra             oflow_write_end;
 | 
				
			||||||
    shl.b32         %2,     %2,     2;
 | 
					    shl.b32         shoff,  shoff,  2;
 | 
				
			||||||
    cvt.u64.u32     ptr,    %2;
 | 
					    cvt.u64.u32     ptr,    shoff;
 | 
				
			||||||
    add.u64         ptr,    ptr,    %3;
 | 
					    add.u64         ptr,    ptr,    %2;
 | 
				
			||||||
    and.b32         r,      %0,     0xffff;
 | 
					    bfe.u32         r,      hi,     4,      18;
 | 
				
			||||||
    shr.b32         g,      %1,     16;
 | 
					    bfe.u32         g,      lo,     18,     14;
 | 
				
			||||||
    and.b32         b,      %1,     0xffff;
 | 
					    bfi.b32         g,      hi,     g,      14,     4;
 | 
				
			||||||
 | 
					    and.b32         b,      lo,     ((1<<18)-1);
 | 
				
			||||||
    cvt.rn.f32.u32  rf,     r;
 | 
					    cvt.rn.f32.u32  rf,     r;
 | 
				
			||||||
    cvt.rn.f32.u32  gf,     g;
 | 
					    cvt.rn.f32.u32  gf,     g;
 | 
				
			||||||
    cvt.rn.f32.u32  bf,     b;
 | 
					    cvt.rn.f32.u32  bf,     b;
 | 
				
			||||||
    mul.ftz.f32     rf,     rf,     0.007874015748031496;
 | 
					    mul.ftz.f32     rf,     rf,     (1.0/255.0);
 | 
				
			||||||
    mul.ftz.f32     gf,     gf,     0.007874015748031496;
 | 
					    mul.ftz.f32     gf,     gf,     (1.0/255.0);
 | 
				
			||||||
    mul.ftz.f32     bf,     bf,     0.007874015748031496;
 | 
					    mul.ftz.f32     bf,     bf,     (1.0/255.0);
 | 
				
			||||||
    red.add.f32     [ptr],   500.0;
 | 
					    red.add.f32     [ptr],  rf;
 | 
				
			||||||
    red.add.f32     [ptr+4], bf;
 | 
					    red.add.f32     [ptr+4], gf;
 | 
				
			||||||
    red.add.f32     [ptr+8], gf;
 | 
					    red.add.f32     [ptr+8], bf;
 | 
				
			||||||
    red.add.f32     [ptr+12], rf;
 | 
					    red.add.f32     [ptr+12], 1023.0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
oflow_write_end:
 | 
					oflow_write_end:
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
        """)}}  ::  "r"(dr), "r"(gb), "r"(shr_addr), "l"(glo_ptr));
 | 
					        """)}}  ::  "r"(entry), "f"(time), "l"(glo_ptr));
 | 
				
			||||||
        // TODO: go through the pain of manual address calculation for global ptr
 | 
					        // TODO: go through the pain of manual address calculation for global ptr
 | 
				
			||||||
        time += time_step;
 | 
					        time += time_step;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
@ -534,7 +512,25 @@ oflow_write_end:
 | 
				
			|||||||
    __syncthreads();
 | 
					    __syncthreads();
 | 
				
			||||||
    int idx = tid;
 | 
					    int idx = tid;
 | 
				
			||||||
    for (int i = 0; i < (SHAW / BS); i++) {
 | 
					    for (int i = 0; i < (SHAW / BS); i++) {
 | 
				
			||||||
        write_shmem_helper(acc, glo_base + idx, s_acc_dr[idx], s_acc_gb[idx]);
 | 
					        int d, r, g, b;
 | 
				
			||||||
 | 
					        float4 pix = acc[glo_base + idx];
 | 
				
			||||||
 | 
					        asm({{crep("""
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					    .reg .u32 hi, lo;
 | 
				
			||||||
 | 
					    ld.shared.u32   lo,     [%4];
 | 
				
			||||||
 | 
					    ld.shared.u32   hi,     [%4+0x4000];
 | 
				
			||||||
 | 
					    shr.u32         %0,     hi,     22;
 | 
				
			||||||
 | 
					    bfe.u32         %1,     hi,     4,      18;
 | 
				
			||||||
 | 
					    bfe.u32         %2,     lo,     18,     14;
 | 
				
			||||||
 | 
					    bfi.b32         %2,     hi,     %2,     14,     4;
 | 
				
			||||||
 | 
					    and.b32         %3,     lo,     ((1<<18)-1);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					        """)}} : "=r"(d), "=r"(r), "=r"(g), "=r"(b) : "r"(idx*4));
 | 
				
			||||||
 | 
					        pix.x += r / 255.0f;
 | 
				
			||||||
 | 
					        pix.y += g / 255.0f;
 | 
				
			||||||
 | 
					        pix.z += b / 255.0f;
 | 
				
			||||||
 | 
					        pix.w += d;
 | 
				
			||||||
 | 
					        acc[glo_base + idx] = pix;
 | 
				
			||||||
        idx += BS;
 | 
					        idx += BS;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
				
			|||||||
@ -108,8 +108,10 @@ class RenderInfo(object):
 | 
				
			|||||||
    # that palette-from-texture is enabled). For most genomes, this doesn't
 | 
					    # that palette-from-texture is enabled). For most genomes, this doesn't
 | 
				
			||||||
    # need to be very large at all. However, since only an easily-cached
 | 
					    # need to be very large at all. However, since only an easily-cached
 | 
				
			||||||
    # fraction of this will be accessed per SM, larger values shouldn't hurt
 | 
					    # fraction of this will be accessed per SM, larger values shouldn't hurt
 | 
				
			||||||
    # performance too much. Power-of-two, please.
 | 
					    # performance too much. When using deferred accumulation, increasing this
 | 
				
			||||||
    palette_height = 16
 | 
					    # value increases the number of uniquely-dithered samples, which is nice.
 | 
				
			||||||
 | 
					    # Power-of-two, please.
 | 
				
			||||||
 | 
					    palette_height = 64
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Maximum width of DE and other spatial filters, and thus in turn the
 | 
					    # Maximum width of DE and other spatial filters, and thus in turn the
 | 
				
			||||||
    # amount of padding applied. Note that, for now, this must not be changed!
 | 
					    # amount of padding applied. Note that, for now, this must not be changed!
 | 
				
			||||||
 | 
				
			|||||||
@ -113,7 +113,6 @@ class Renderer(object):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        reset_rb_fun = self.mod.get_function("reset_rb")
 | 
					        reset_rb_fun = self.mod.get_function("reset_rb")
 | 
				
			||||||
        packer_fun = self.mod.get_function("interp_iter_params")
 | 
					        packer_fun = self.mod.get_function("interp_iter_params")
 | 
				
			||||||
        palette_fun = self.mod.get_function("interp_palette_hsv")
 | 
					 | 
				
			||||||
        iter_fun = self.mod.get_function("iter")
 | 
					        iter_fun = self.mod.get_function("iter")
 | 
				
			||||||
        write_fun = self.mod.get_function("write_shmem")
 | 
					        write_fun = self.mod.get_function("write_shmem")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -134,7 +133,8 @@ class Renderer(object):
 | 
				
			|||||||
        event_b = None
 | 
					        event_b = None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        nbins = info.acc_height * info.acc_stride
 | 
					        nbins = info.acc_height * info.acc_stride
 | 
				
			||||||
        d_accum = cuda.mem_alloc(16 * nbins)
 | 
					        # Extra padding in accum helps with write_shmem overruns
 | 
				
			||||||
 | 
					        d_accum = cuda.mem_alloc(16 * nbins + (1<<16))
 | 
				
			||||||
        d_out = cuda.mem_alloc(16 * nbins)
 | 
					        d_out = cuda.mem_alloc(16 * nbins)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        acc_size = np.array([info.acc_width, info.acc_height, info.acc_stride])
 | 
					        acc_size = np.array([info.acc_width, info.acc_height, info.acc_stride])
 | 
				
			||||||
@ -168,7 +168,8 @@ class Renderer(object):
 | 
				
			|||||||
        reset_rb_fun(np.int32(rb_size), block=(1,1,1))
 | 
					        reset_rb_fun(np.int32(rb_size), block=(1,1,1))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        d_points = cuda.mem_alloc(nslots * 16)
 | 
					        d_points = cuda.mem_alloc(nslots * 16)
 | 
				
			||||||
        seeds = mwc.MWC.make_seeds(nslots)
 | 
					        # We may add extra seeds to simplify palette dithering.
 | 
				
			||||||
 | 
					        seeds = mwc.MWC.make_seeds(max(nslots, 256 * info.palette_height))
 | 
				
			||||||
        d_seeds = cuda.to_device(seeds)
 | 
					        d_seeds = cuda.to_device(seeds)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # We used to auto-calculate this to a multiple of the number of SMs on
 | 
					        # We used to auto-calculate this to a multiple of the number of SMs on
 | 
				
			||||||
@ -191,13 +192,34 @@ class Renderer(object):
 | 
				
			|||||||
        d_palint_times = cuda.to_device(palint_times)
 | 
					        d_palint_times = cuda.to_device(palint_times)
 | 
				
			||||||
        d_palint_vals = cuda.to_device(
 | 
					        d_palint_vals = cuda.to_device(
 | 
				
			||||||
                np.concatenate(map(info.db.palettes.get, pals[1::2])))
 | 
					                np.concatenate(map(info.db.palettes.get, pals[1::2])))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        if info.acc_mode == 'deferred':
 | 
				
			||||||
 | 
					            palette_fun = self.mod.get_function("interp_palette_hsv_flat")
 | 
				
			||||||
 | 
					            dsc = cuda.ArrayDescriptor3D()
 | 
				
			||||||
 | 
					            dsc.height = info.palette_height
 | 
				
			||||||
 | 
					            dsc.width = 256
 | 
				
			||||||
 | 
					            dsc.depth = 0
 | 
				
			||||||
 | 
					            dsc.format = cuda.array_format.SIGNED_INT32
 | 
				
			||||||
 | 
					            dsc.num_channels = 2
 | 
				
			||||||
 | 
					            dsc.flags = cuda.array3d_flags.SURFACE_LDST
 | 
				
			||||||
 | 
					            palarray = cuda.Array(dsc)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            tref = self.mod.get_surfref('flatpal')
 | 
				
			||||||
 | 
					            tref.set_array(palarray, 0)
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            palette_fun = self.mod.get_function("interp_palette_hsv")
 | 
				
			||||||
 | 
					            dsc = cuda.ArrayDescriptor()
 | 
				
			||||||
 | 
					            dsc.height = info.palette_height
 | 
				
			||||||
 | 
					            dsc.width = 256
 | 
				
			||||||
 | 
					            dsc.format = cuda.array_format.UNSIGNED_INT8
 | 
				
			||||||
 | 
					            dsc.num_channels = 4
 | 
				
			||||||
            d_palmem = cuda.mem_alloc(256 * info.palette_height * 4)
 | 
					            d_palmem = cuda.mem_alloc(256 * info.palette_height * 4)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        pal_array_info = cuda.ArrayDescriptor()
 | 
					            tref = self.mod.get_texref('palTex')
 | 
				
			||||||
        pal_array_info.height = info.palette_height
 | 
					            tref.set_address_2d(d_palmem, dsc, 1024)
 | 
				
			||||||
        pal_array_info.width = 256
 | 
					            tref.set_format(cuda.array_format.UNSIGNED_INT8, 4)
 | 
				
			||||||
        pal_array_info.array_format = cuda.array_format.UNSIGNED_INT8
 | 
					            tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
 | 
				
			||||||
        pal_array_info.num_channels = 4
 | 
					            tref.set_filter_mode(cuda.filter_mode.LINEAR)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        h_out_a = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4),
 | 
					        h_out_a = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4),
 | 
				
			||||||
                                        np.float32)
 | 
					                                        np.float32)
 | 
				
			||||||
@ -206,18 +228,17 @@ class Renderer(object):
 | 
				
			|||||||
        last_idx = None
 | 
					        last_idx = None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        for idx, start, stop in times:
 | 
					        for idx, start, stop in times:
 | 
				
			||||||
            width = np.float32((stop-start) / info.palette_height)
 | 
					            twidth = np.float32((stop-start) / info.palette_height)
 | 
				
			||||||
            palette_fun(d_palmem, d_palint_times, d_palint_vals,
 | 
					            if info.acc_mode == 'deferred':
 | 
				
			||||||
                        np.float32(start), width,
 | 
					                palette_fun(d_seeds, d_palint_times, d_palint_vals,
 | 
				
			||||||
 | 
					                            np.float32(start), twidth,
 | 
				
			||||||
 | 
					                            block=(256,1,1), grid=(info.palette_height,1),
 | 
				
			||||||
 | 
					                            stream=write_stream)
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                palette_fun(d_palmem, d_palint_times, d_palint_vals,
 | 
				
			||||||
 | 
					                            np.float32(start), twidth,
 | 
				
			||||||
                            block=(256,1,1), grid=(info.palette_height,1),
 | 
					                            block=(256,1,1), grid=(info.palette_height,1),
 | 
				
			||||||
                            stream=write_stream)
 | 
					                            stream=write_stream)
 | 
				
			||||||
 | 
					 | 
				
			||||||
            # TODO: do we need to do this each time in order to reset cache?
 | 
					 | 
				
			||||||
            tref = self.mod.get_texref('palTex')
 | 
					 | 
				
			||||||
            tref.set_address_2d(d_palmem, pal_array_info, 1024)
 | 
					 | 
				
			||||||
            tref.set_format(cuda.array_format.UNSIGNED_INT8, 4)
 | 
					 | 
				
			||||||
            tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
 | 
					 | 
				
			||||||
            tref.set_filter_mode(cuda.filter_mode.LINEAR)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
            width = np.float32((stop-start) / ntemporal_samples)
 | 
					            width = np.float32((stop-start) / ntemporal_samples)
 | 
				
			||||||
            packer_fun(d_infos, d_genome_times, d_genome_knots,
 | 
					            packer_fun(d_infos, d_genome_times, d_genome_knots,
 | 
				
			||||||
@ -251,12 +272,11 @@ class Renderer(object):
 | 
				
			|||||||
                    _sync_stream(iter_stream, write_stream)
 | 
					                    _sync_stream(iter_stream, write_stream)
 | 
				
			||||||
                    write_fun(d_accum, d_log_sorted, sorter.dglobal,
 | 
					                    write_fun(d_accum, d_log_sorted, sorter.dglobal,
 | 
				
			||||||
                              block=(1024, 1, 1), grid=(nwriteblocks, 1),
 | 
					                              block=(1024, 1, 1), grid=(nwriteblocks, 1),
 | 
				
			||||||
                              texrefs=[tref], stream=write_stream)
 | 
					                              stream=write_stream)
 | 
				
			||||||
            else:
 | 
					            else:
 | 
				
			||||||
                iter_fun(np.uint64(d_accum), d_seeds, d_points, d_infos,
 | 
					                iter_fun(np.uint64(d_accum), d_seeds, d_points, d_infos,
 | 
				
			||||||
                         block=(32, self._iter.NTHREADS/32, 1),
 | 
					                         block=(32, self._iter.NTHREADS/32, 1),
 | 
				
			||||||
                         grid=(ntemporal_samples, nrounds),
 | 
					                         grid=(ntemporal_samples, nrounds), stream=iter_stream)
 | 
				
			||||||
                         texrefs=[tref], stream=iter_stream)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
            util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
 | 
					            util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
 | 
				
			||||||
            _sync_stream(filt_stream, write_stream)
 | 
					            _sync_stream(filt_stream, write_stream)
 | 
				
			||||||
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user