mirror of
				https://github.com/stevenrobertson/cuburn.git
				synced 2025-11-04 02:10:45 -05:00 
			
		
		
		
	Deferred writeback.
This commit is contained in:
		@ -122,7 +122,6 @@ class IterCode(HunkOCode):
 | 
				
			|||||||
        bodies = [self._xfbody(i,x) for i,x in sorted(info.genome.xforms.items())]
 | 
					        bodies = [self._xfbody(i,x) for i,x in sorted(info.genome.xforms.items())]
 | 
				
			||||||
        bodies.append(iterbody)
 | 
					        bodies.append(iterbody)
 | 
				
			||||||
        self.defs = '\n'.join(bodies)
 | 
					        self.defs = '\n'.join(bodies)
 | 
				
			||||||
        self.decls += self.pix_helpers.substitute(info=info)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    decls = """
 | 
					    decls = """
 | 
				
			||||||
// Note: for normalized lookups, uchar4 actually returns floats
 | 
					// Note: for normalized lookups, uchar4 actually returns floats
 | 
				
			||||||
@ -132,78 +131,6 @@ __device__ int rb_head, rb_tail, rb_size;
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
"""
 | 
					"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    pix_helpers = Template("""
 | 
					 | 
				
			||||||
__device__
 | 
					 | 
				
			||||||
void read_pix(float4 &pix, float &den) {
 | 
					 | 
				
			||||||
    den = pix.w;
 | 
					 | 
				
			||||||
    {{if info.pal_has_alpha}}
 | 
					 | 
				
			||||||
    read_half(pix.z, pix.w, pix.z, den);
 | 
					 | 
				
			||||||
    {{endif}}
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
__device__
 | 
					 | 
				
			||||||
void write_pix(float4 &pix, float den) {
 | 
					 | 
				
			||||||
    {{if info.pal_has_alpha}}
 | 
					 | 
				
			||||||
    write_half(pix.z, pix.z, pix.w, den);
 | 
					 | 
				
			||||||
    {{endif}}
 | 
					 | 
				
			||||||
    pix.w = den;
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
__device__
 | 
					 | 
				
			||||||
void update_pix(uint64_t ptr, uint32_t i, float4 c) {
 | 
					 | 
				
			||||||
    {{if info.pal_has_alpha}}
 | 
					 | 
				
			||||||
    asm volatile ({{crep('''
 | 
					 | 
				
			||||||
    {
 | 
					 | 
				
			||||||
        .reg .u16       sz, sw;
 | 
					 | 
				
			||||||
        .reg .u64       base, off;
 | 
					 | 
				
			||||||
        .reg .f32       x, y, z, w, den, rc, tz, tw;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        // TODO: this limits the accumulation buffer to <4GB
 | 
					 | 
				
			||||||
        shl.b32         %0,     %0,     4;
 | 
					 | 
				
			||||||
        cvt.u64.u32     off,    %0;
 | 
					 | 
				
			||||||
        add.u64         base,   %1,     off;
 | 
					 | 
				
			||||||
        ld.cg.v4.f32    {x, y, z, den},         [base];
 | 
					 | 
				
			||||||
        add.f32         x,      x,      %2;
 | 
					 | 
				
			||||||
        add.f32         y,      y,      %3;
 | 
					 | 
				
			||||||
        mov.b32         {sz, sw},       z;
 | 
					 | 
				
			||||||
        cvt.rn.f32.u16  tz,     sz;
 | 
					 | 
				
			||||||
        cvt.rn.f32.u16  tw,     sw;
 | 
					 | 
				
			||||||
        mul.f32         tz,     tz,     den;
 | 
					 | 
				
			||||||
        mul.f32         tw,     tz,     den;
 | 
					 | 
				
			||||||
        fma.f32         tz,     %4,     65535.0,    tz;
 | 
					 | 
				
			||||||
        fma.f32         tw,     %5,     65535.0,    tw;
 | 
					 | 
				
			||||||
        add.f32         den,    1.0;
 | 
					 | 
				
			||||||
        rcp.approx.f32  rc,     den;
 | 
					 | 
				
			||||||
        mul.f32         tz,     tz,     rc;
 | 
					 | 
				
			||||||
        mul.f32         tw,     tw,     rc;
 | 
					 | 
				
			||||||
        cvt.rni.u16.f32 sz,     tz;
 | 
					 | 
				
			||||||
        cvt.rni.u16.f32 sw,     tw;
 | 
					 | 
				
			||||||
        mov.b32         z,      {sz, sw};
 | 
					 | 
				
			||||||
        st.cs.v4.f32    [base], {x, y, z, den};
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
    ''')}} : "+r"(i) : "l"(ptr), "f"(c.x), "f"(c.y), "f"(c.z), "f"(c.w));
 | 
					 | 
				
			||||||
    {{else}}
 | 
					 | 
				
			||||||
    asm volatile ({{crep('''
 | 
					 | 
				
			||||||
    {
 | 
					 | 
				
			||||||
        .reg .u64       base, off;
 | 
					 | 
				
			||||||
        .reg .f32       x, y, z, den;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        // TODO: this limits the accumulation buffer to <4GB
 | 
					 | 
				
			||||||
        shl.b32         %0,     %0,     4;
 | 
					 | 
				
			||||||
        cvt.u64.u32     off,    %0;
 | 
					 | 
				
			||||||
        add.u64         base,   %1,     off;
 | 
					 | 
				
			||||||
        ld.cg.v4.f32    {x, y, z, den},         [base];
 | 
					 | 
				
			||||||
        add.f32         x,      x,      %2;
 | 
					 | 
				
			||||||
        add.f32         y,      y,      %3;
 | 
					 | 
				
			||||||
        add.f32         z,      z,      %4;
 | 
					 | 
				
			||||||
        add.f32         den,    den,    1.0;
 | 
					 | 
				
			||||||
        st.cs.v4.f32    [base], {x, y, z, den};
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
    ''')}} : "+r"(i) : "l"(ptr), "f"(c.x), "f"(c.y), "f"(c.z));
 | 
					 | 
				
			||||||
    {{endif}}
 | 
					 | 
				
			||||||
}
 | 
					 | 
				
			||||||
""")
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def _xfbody(self, xfid, xform):
 | 
					    def _xfbody(self, xfid, xform):
 | 
				
			||||||
        px = self.pcp.xforms[xfid]
 | 
					        px = self.pcp.xforms[xfid]
 | 
				
			||||||
        tmpl = Template(r"""
 | 
					        tmpl = Template(r"""
 | 
				
			||||||
@ -249,19 +176,23 @@ __global__ void reset_rb(int size) {
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
__global__
 | 
					__global__
 | 
				
			||||||
void iter(uint64_t accbuf_ptr, mwc_st *msts, float4 *points,
 | 
					void iter(
 | 
				
			||||||
          const iter_params *all_params, int nsamps_to_generate) {
 | 
					        uint64_t out_ptr,
 | 
				
			||||||
 | 
					        mwc_st *msts,
 | 
				
			||||||
 | 
					        float4 *points,
 | 
				
			||||||
 | 
					        const iter_params *all_params,
 | 
				
			||||||
 | 
					        int nsamps_to_generate
 | 
				
			||||||
 | 
					) {
 | 
				
			||||||
    const iter_params *global_params = &(all_params[blockIdx.x]);
 | 
					    const iter_params *global_params = &(all_params[blockIdx.x]);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    __shared__ int nsamps;
 | 
					{{if info.acc_mode != 'deferred'}}
 | 
				
			||||||
    nsamps = nsamps_to_generate;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    __shared__ float time_frac;
 | 
					    __shared__ float time_frac;
 | 
				
			||||||
    time_frac = blockIdx.x / (float) gridDim.x;
 | 
					    time_frac = blockIdx.x / (float) gridDim.x;
 | 
				
			||||||
 | 
					{{endif}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // load params to shared memory cooperatively
 | 
					    // load params to shared memory cooperatively
 | 
				
			||||||
    for (int i = threadIdx.y * blockDim.x + threadIdx.x;
 | 
					    for (int i = threadIdx.y * blockDim.x + threadIdx.x;
 | 
				
			||||||
         i * 4 < sizeof(iter_params); i += blockDim.x * blockDim.y)
 | 
					         i < (sizeof(iter_params) / 4); i += blockDim.x * blockDim.y)
 | 
				
			||||||
        reinterpret_cast<float*>(¶ms)[i] =
 | 
					        reinterpret_cast<float*>(¶ms)[i] =
 | 
				
			||||||
            reinterpret_cast<const float*>(global_params)[i];
 | 
					            reinterpret_cast<const float*>(global_params)[i];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -272,9 +203,10 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, float4 *points,
 | 
				
			|||||||
    __syncthreads();
 | 
					    __syncthreads();
 | 
				
			||||||
    int this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;
 | 
					    int this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;
 | 
				
			||||||
    mwc_st rctx = msts[this_rb_idx];
 | 
					    mwc_st rctx = msts[this_rb_idx];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // TODO: 4th channel unused. Kill or use for something helpful
 | 
				
			||||||
    float4 old_point = points[this_rb_idx];
 | 
					    float4 old_point = points[this_rb_idx];
 | 
				
			||||||
    float x = old_point.x, y = old_point.y,
 | 
					    float x = old_point.x, y = old_point.y, color = old_point.z;
 | 
				
			||||||
          color = old_point.z, fuse_rounds = old_point.w;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
{{if info.chaos_used}}
 | 
					{{if info.chaos_used}}
 | 
				
			||||||
    int last_xf_used = 0;
 | 
					    int last_xf_used = 0;
 | 
				
			||||||
@ -290,18 +222,18 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, float4 *points,
 | 
				
			|||||||
    __syncthreads();
 | 
					    __syncthreads();
 | 
				
			||||||
{{endif}}
 | 
					{{endif}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    bool fuse = false;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    while (1) {
 | 
					 | 
				
			||||||
    // This condition checks for large numbers, Infs, and NaNs.
 | 
					    // This condition checks for large numbers, Infs, and NaNs.
 | 
				
			||||||
        if (!(-(fabsf(x) + fabsf(y) > -1.0e6f))) {
 | 
					    if (!(-(fabsf(x) + fabsf(y)) > -1.0e6f)) {
 | 
				
			||||||
        x = mwc_next_11(rctx);
 | 
					        x = mwc_next_11(rctx);
 | 
				
			||||||
        y = mwc_next_11(rctx);
 | 
					        y = mwc_next_11(rctx);
 | 
				
			||||||
        color = mwc_next_01(rctx);
 | 
					        color = mwc_next_01(rctx);
 | 
				
			||||||
            fuse_rounds = {{info.fuse / 32}};
 | 
					        fuse = true;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        // 32 rounds is somewhat arbitrary, but it has a pleasing 32-ness
 | 
					    // TODO: link up with FUSE, etc
 | 
				
			||||||
        for (int i = 0; i < 32; i++) {
 | 
					    for (int round = 0; round < 256; round++) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
{{if info.chaos_used}}
 | 
					{{if info.chaos_used}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -343,7 +275,7 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, float4 *points,
 | 
				
			|||||||
        int sw = (threadIdx.y * 32 + threadIdx.x * 33) & {{NTHREADS-1}};
 | 
					        int sw = (threadIdx.y * 32 + threadIdx.x * 33) & {{NTHREADS-1}};
 | 
				
			||||||
        int sr = threadIdx.y * 32 + threadIdx.x;
 | 
					        int sr = threadIdx.y * 32 + threadIdx.x;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            swap[sw] = fuse_rounds;
 | 
					        swap[sw] = fuse ? 1.0f : 0.0f;
 | 
				
			||||||
        swap[sw+{{NTHREADS}}] = x;
 | 
					        swap[sw+{{NTHREADS}}] = x;
 | 
				
			||||||
        swap[sw+{{2*NTHREADS}}] = y;
 | 
					        swap[sw+{{2*NTHREADS}}] = y;
 | 
				
			||||||
        swap[sw+{{3*NTHREADS}}] = color;
 | 
					        swap[sw+{{3*NTHREADS}}] = color;
 | 
				
			||||||
@ -353,14 +285,25 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, float4 *points,
 | 
				
			|||||||
        if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
 | 
					        if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
 | 
				
			||||||
            cosel[threadIdx.x] = mwc_next_01(rctx);
 | 
					            cosel[threadIdx.x] = mwc_next_01(rctx);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            fuse_rounds = swap[sr];
 | 
					        fuse = swap[sr];
 | 
				
			||||||
        x = swap[sr+{{NTHREADS}}];
 | 
					        x = swap[sr+{{NTHREADS}}];
 | 
				
			||||||
        y = swap[sr+{{2*NTHREADS}}];
 | 
					        y = swap[sr+{{2*NTHREADS}}];
 | 
				
			||||||
        color = swap[sr+{{3*NTHREADS}}];
 | 
					        color = swap[sr+{{3*NTHREADS}}];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
{{endif}}
 | 
					{{endif}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            if (fuse_rounds > 0.0f) continue;
 | 
					{{if info.acc_mode == 'deferred'}}
 | 
				
			||||||
 | 
					        int tid = threadIdx.y * 32 + threadIdx.x;
 | 
				
			||||||
 | 
					        int offset = 4 * (256 * (256 * blockIdx.x + round) + tid);
 | 
				
			||||||
 | 
					        int *log = reinterpret_cast<int*>(out_ptr + offset);
 | 
				
			||||||
 | 
					{{endif}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        if (fuse) {
 | 
				
			||||||
 | 
					{{if info.acc_mode == 'deferred'}}
 | 
				
			||||||
 | 
					            *log = 0xffffffff;
 | 
				
			||||||
 | 
					{{endif}}
 | 
				
			||||||
 | 
					            continue;
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
{{if 'final' in cp.xforms}}
 | 
					{{if 'final' in cp.xforms}}
 | 
				
			||||||
        float fx = x, fy = y, fcolor = color;
 | 
					        float fx = x, fy = y, fcolor = color;
 | 
				
			||||||
@ -381,25 +324,37 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, float4 *points,
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        uint32_t ix = trunca(cx), iy = trunca(cy);
 | 
					        uint32_t ix = trunca(cx), iy = trunca(cy);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            if (ix >= {{info.acc_width}} || iy >= {{info.acc_height}})
 | 
					        if (ix >= {{info.acc_width}} || iy >= {{info.acc_height}}) {
 | 
				
			||||||
 | 
					{{if info.acc_mode == 'deferred'}}
 | 
				
			||||||
 | 
					            *log = 0xffffffff;
 | 
				
			||||||
 | 
					{{endif}}
 | 
				
			||||||
            continue;
 | 
					            continue;
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        uint32_t i = iy * {{info.acc_stride}} + ix;
 | 
					        uint32_t i = iy * {{info.acc_stride}} + ix;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					{{if info.acc_mode == 'atomic'}}
 | 
				
			||||||
        float4 outcol = tex2D(palTex, cc, time_frac);
 | 
					        float4 outcol = tex2D(palTex, cc, time_frac);
 | 
				
			||||||
            update_pix(accbuf_ptr, i, outcol);
 | 
					        float *accbuf_f = reinterpret_cast<float*>(out_ptr + (16*i));
 | 
				
			||||||
        }
 | 
					        atomicAdd(accbuf_f,   outcol.x);
 | 
				
			||||||
 | 
					        atomicAdd(accbuf_f+1, outcol.y);
 | 
				
			||||||
        int num_okay = __popc(__ballot(fuse_rounds == 0.0f));
 | 
					        atomicAdd(accbuf_f+2, outcol.z);
 | 
				
			||||||
        // Some xforms give so many badvals that a thread is almost guaranteed
 | 
					        atomicAdd(accbuf_f+3, 1.0f);
 | 
				
			||||||
        // to hit another badval before the fuse is over, causing the card to
 | 
					{{elif info.acc_mode == 'global'}}
 | 
				
			||||||
        // spin forever. To avoid this, we count a fuse round as 1/4 of a
 | 
					        float4 outcol = tex2D(palTex, cc, time_frac);
 | 
				
			||||||
        // sample below.
 | 
					        float4 *accbuf = reinterpret_cast<float4*>(out_ptr + (16*i));
 | 
				
			||||||
        if (threadIdx.x == 0) atomicSub(&nsamps, 256 + num_okay * 24);
 | 
					        float4 pix = *accbuf;
 | 
				
			||||||
        fuse_rounds = fmaxf(0.0f, fuse_rounds - 1.0f);
 | 
					        pix.x += outcol.x;
 | 
				
			||||||
 | 
					        pix.y += outcol.y;
 | 
				
			||||||
        __syncthreads();
 | 
					        pix.z += outcol.z;
 | 
				
			||||||
        if (nsamps <= 0) break;
 | 
					        pix.w += 1.0f;
 | 
				
			||||||
 | 
					        *accbuf = pix;
 | 
				
			||||||
 | 
					{{elif info.acc_mode == 'deferred'}}
 | 
				
			||||||
 | 
					        // 'color' gets the top 9 bits. TODO: add dithering via precalc.
 | 
				
			||||||
 | 
					        uint32_t icolor = cc * 512.0f;
 | 
				
			||||||
 | 
					        asm("bfi.b32    %0, %1, %0, 23, 9;" : "+r"(i) : "r"(icolor));
 | 
				
			||||||
 | 
					        *log = i;
 | 
				
			||||||
 | 
					{{endif}}
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if (threadIdx.x == 0 && threadIdx.y == 0)
 | 
					    if (threadIdx.x == 0 && threadIdx.y == 0)
 | 
				
			||||||
@ -407,10 +362,140 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, float4 *points,
 | 
				
			|||||||
    __syncthreads();
 | 
					    __syncthreads();
 | 
				
			||||||
    this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;
 | 
					    this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    points[this_rb_idx] = make_float4(x, y, color, fuse_rounds);
 | 
					    points[this_rb_idx] = make_float4(x, y, color, 0.0f);
 | 
				
			||||||
    msts[this_rb_idx] = rctx;
 | 
					    msts[this_rb_idx] = rctx;
 | 
				
			||||||
    return;
 | 
					    return;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Block size, shared accumulation bits, shared accumulation width.
 | 
				
			||||||
 | 
					#define BS 1024
 | 
				
			||||||
 | 
					#define SHAB 12
 | 
				
			||||||
 | 
					#define SHAW (1<<SHAB)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// These two accumulators, used in write_shmem, hold {density, red} and
 | 
				
			||||||
 | 
					// {green, blue} values as packed u16 pairs. The fixed size represents 4,096
 | 
				
			||||||
 | 
					// pixels in the accumulator.
 | 
				
			||||||
 | 
					__shared__ uint32_t s_acc_dr[SHAW];
 | 
				
			||||||
 | 
					__shared__ uint32_t s_acc_gb[SHAW];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// Add one packed pair of u16 sums to the float4 accumulator entry at
					// acc[glo_idx]. `dr` carries red in its low half and the sample count
					// (density) in its high half; `gb` carries green in its low half and blue in
					// its high half. Colour sums are stored scaled by 255, so they are divided
					// back down here; the density is added unscaled.
					__device__
					void write_shmem_values(
					        float4 *acc,
					        const int glo_idx,
					        const uint32_t dr,
					        const uint32_t gb
					) {
					    float4 pix = acc[glo_idx];
					    pix.x += (dr & 0xffff) / 255.0f;
					    pix.w += dr >> 16;
					    pix.y += (gb & 0xffff) / 255.0f;
					    pix.z += (gb >> 16) / 255.0f;
					    acc[glo_idx] = pix;
					}

					// Read from the shm accumulators and write to the global ones.
					// Flushes the current contents of s_acc_dr[idx] / s_acc_gb[idx] into
					// acc[glo_base + idx]. The shared values are not cleared.
					__device__
					void write_shmem_helper(
					        float4 *acc,
					        const int glo_base,
					        const int idx
					) {
					    write_shmem_values(acc, glo_base + idx, s_acc_dr[idx], s_acc_gb[idx]);
					}

					// Read the point log, accumulate in shared memory, and write the results.
					// This kernel is to be launched with one block for every 4,096 addresses to
					// be processed, and will handle those addresses. Threads are indexed by
					// threadIdx.x only and every loop strides by BS, so the launch is expected to
					// use BS threads per block.
					//
					// Each log entry is a packed 32-bit word: bits [0, SHAB) are the address
					// within this block's shared accumulator, the next 11 bits select the block
					// (compared against blockIdx.x), and the top 9 bits are the colour index.
					//
					// log_bounds is an array mapping radix values to the first index in the log
					// with that radix position. For performance reasons in other parts of the
					// code, the radix may actually include bits within the lower SHAB part of the
					// address, or it might not cover the first few bits after the SHAB part;
					// log_bounds_shift covers that. The number of address bits above SHAB is
					// currently hard-coded (11) rather than passed in.

					__global__ void
					__launch_bounds__(BS, 1)
					write_shmem(
					        float4 *acc,
					        const uint32_t *log,
					        const uint32_t *log_bounds,
					        const int log_bounds_shift
					) {
					    const int tid = threadIdx.x;
					    const int bid = blockIdx.x;

					    // Clear both shared accumulators. Striding by BS up to SHAW keeps this
					    // correct for any SHAW that is a multiple of BS.
					    // TODO: compare generated code with unrolled for-loop
					    for (int j = tid; j < SHAW; j += BS) {
					        s_acc_dr[j] = 0;
					        s_acc_gb[j] = 0;
					    }
					    __syncthreads();

					    // Locate the log_bounds slots bracketing this block's entries.
					    // TODO: share across threads - discernable performance impact?
					    int lb_idx_lo, lb_idx_hi;
					    if (log_bounds_shift > 0) {
					        lb_idx_hi = ((bid + 1) << log_bounds_shift) - 1;
					        lb_idx_lo = (bid << log_bounds_shift) - 1;
					    } else {
					        lb_idx_hi = bid >> (-log_bounds_shift);
					        lb_idx_lo = lb_idx_hi - 1;
					    }

					    // Round the log range outwards to multiples of BS. Entries picked up by
					    // the rounding that belong to other blocks are filtered out below.
					    int idx_lo, idx_hi;
					    if (lb_idx_lo < 0) idx_lo = 0;
					    else idx_lo = log_bounds[lb_idx_lo] & ~(BS-1);
					    idx_hi = (log_bounds[lb_idx_hi] & ~(BS - 1)) + BS;

					    // 'time' is the normalized position of log slot i within
					    // [idx_lo, idx_hi); it is used as the second palette texture coordinate.
					    float rnrounds = 1.0f / (idx_hi - idx_lo);
					    float time = tid * rnrounds;
					    float time_step = BS * rnrounds;

					    int glo_base = bid << SHAB;

					    for (int i = idx_lo + tid; i < idx_hi; i += BS) {
					        const uint32_t entry = log[i];

					        // Advance the time cursor for every slot visited, including those
					        // skipped below, so that it keeps tracking (i - idx_lo) * rnrounds.
					        const float sample_time = time;
					        time += time_step;

					        // TODO: constant '11' is really just 32 - 9 - SHAB, where 9 is the
					        // number of bits assigned to color. This ignores opacity.
					        bfe_decl(glob_addr, entry, SHAB, 11);
					        if (glob_addr != bid) continue;

					        bfe_decl(shr_addr, entry, 0, SHAB);
					        bfe_decl(color, entry, 23, 9);

					        float colorf = color / 512.0f;
					        float4 outcol = tex2D(palTex, colorf, sample_time);

					        // TODO: change texture sampler to return shorts and avoid this
					        uint32_t r = 255.0f * outcol.x;
					        uint32_t g = 255.0f * outcol.y;
					        uint32_t b = 255.0f * outcol.z;

					        // atomicAdd returns the value held *before* this sample was added.
					        uint32_t dr = atomicAdd(s_acc_dr + shr_addr, r + 0x10000);
					        uint32_t gb = atomicAdd(s_acc_gb + shr_addr, g + (b << 16));
					        uint32_t d = dr >> 16;

					        // Neat trick: if overflow is about to happen, move the last known
					        // values out of the shared accumulator and into the global one.
					        // The same dr/gb words that are subtracted from shared memory are
					        // the ones added to global memory, so nothing is lost or counted
					        // twice by the final flush. Even if the ints end up wrapping around
					        // once before the subtraction can occur, the results after the
					        // subtraction will be correct.
					        if (d == 250) {
					            atomicSub(s_acc_dr + shr_addr, dr);
					            atomicSub(s_acc_gb + shr_addr, gb);
					            write_shmem_values(acc, glo_base + shr_addr, dr, gb);
					        }
					    }

					    // All samples are in; flush whatever remains in shared memory.
					    __syncthreads();
					    for (int idx = tid; idx < SHAW; idx += BS) {
					        write_shmem_helper(acc, glo_base, idx);
					    }
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
''')
 | 
					''')
 | 
				
			||||||
        return tmpl.substitute(
 | 
					        return tmpl.substitute(
 | 
				
			||||||
                info = self.info,
 | 
					                info = self.info,
 | 
				
			||||||
 | 
				
			|||||||
@ -71,6 +71,13 @@ float3 hsv2rgb(float3 hsv);
 | 
				
			|||||||
#define  M_SQRT2      1.41421353816986f
 | 
					#define  M_SQRT2      1.41421353816986f
 | 
				
			||||||
#define  M_SQRT1_2    0.70710676908493f
 | 
					#define  M_SQRT1_2    0.70710676908493f
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// bfe(d, s, o, w): emit a PTX bfe.u32 instruction with destination d and
					// source operands s, o, w (all passed in 32-bit registers).
					#define bfe(d, s, o, w) asm("bfe.u32 %0, %1, %2, %3;" : "=r"(d) : "r"(s), "r"(o), "r"(w))

					// bfe_decl(d, s, o, w): declare `int d` in the enclosing scope, then fill it
					// with bfe(). Expands to two statements, so it cannot be used as the sole
					// body of an unbraced if/for.
					#define bfe_decl(d, s, o, w) int d; bfe(d, s, o, w)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// TODO: use launch parameter preconfig to eliminate unnecessary parts
 | 
					// TODO: use launch parameter preconfig to eliminate unnecessary parts
 | 
				
			||||||
__device__
 | 
					__device__
 | 
				
			||||||
uint32_t gtid() {
 | 
					uint32_t gtid() {
 | 
				
			||||||
 | 
				
			|||||||
@ -99,8 +99,10 @@ class RenderInfo(object):
 | 
				
			|||||||
    genomes. The values of this class are fixed before compilation begins.
 | 
					    genomes. The values of this class are fixed before compilation begins.
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    # Number of iterations to iterate without write after generating a new
 | 
					    # Number of iterations to iterate without write after generating a new
 | 
				
			||||||
    # point, including the number of bad
 | 
					    # point. This number is currently fixed pretty deeply in the set of magic
 | 
				
			||||||
    fuse = 192
 | 
					    # constants which govern buffer sizes; changing the value here won't
 | 
				
			||||||
 | 
					    # actually change the code on the device to do something different.
 | 
				
			||||||
 | 
					    fuse = 256
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Height of the texture palette which gets uploaded to the GPU (assuming
 | 
					    # Height of the texture palette which gets uploaded to the GPU (assuming
 | 
				
			||||||
    # that palette-from-texture is enabled). For most genomes, this doesn't
 | 
					    # that palette-from-texture is enabled). For most genomes, this doesn't
 | 
				
			||||||
@ -120,11 +122,19 @@ class RenderInfo(object):
 | 
				
			|||||||
    # which I'm not opposed to)
 | 
					    # which I'm not opposed to)
 | 
				
			||||||
    alpha_output_channel = False
 | 
					    alpha_output_channel = False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # TODO: fix these
 | 
					    # There are three settings for this somewhat ersatz parameter. 'global'
 | 
				
			||||||
 | 
					    # uses unsynchronized global writes to accumulate sample points, 'atomic'
 | 
				
			||||||
 | 
					    # uses atomic global writes, and 'deferred' stores color and position in a
 | 
				
			||||||
 | 
					    # sample log, sorts the log by position, and uses shared memory to
 | 
				
			||||||
 | 
					    # perform the accumulation. Deferred has the accuracy of 'atomic' and
 | 
				
			||||||
 | 
					    # the speed of 'global' (it's actually faster!), but packs color and
 | 
				
			||||||
 | 
					    # position into a single 32-bit int for now, which limits resolution to
 | 
				
			||||||
 | 
					    # 1080p when xform opacity is respected, so the other two modes will hang
 | 
				
			||||||
 | 
					    # around until that can be extended to be memory-limited again.
 | 
				
			||||||
 | 
					    acc_mode = 'deferred'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # TODO: fix this
 | 
				
			||||||
    chaos_used = False
 | 
					    chaos_used = False
 | 
				
			||||||
    final_xform_index = 3
 | 
					 | 
				
			||||||
    pal_has_alpha = False
 | 
					 | 
				
			||||||
    density = 2000
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __init__(self, db, **kwargs):
 | 
					    def __init__(self, db, **kwargs):
 | 
				
			||||||
        self.db = db
 | 
					        self.db = db
 | 
				
			||||||
@ -134,6 +144,7 @@ class RenderInfo(object):
 | 
				
			|||||||
        self.acc_width = self.width + 2 * self.gutter
 | 
					        self.acc_width = self.width + 2 * self.gutter
 | 
				
			||||||
        self.acc_height = self.height + 2 * self.gutter
 | 
					        self.acc_height = self.height + 2 * self.gutter
 | 
				
			||||||
        self.acc_stride = 32 * int(np.ceil(self.acc_width / 32.))
 | 
					        self.acc_stride = 32 * int(np.ceil(self.acc_width / 32.))
 | 
				
			||||||
 | 
					        self.density = self.quality
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # Deref genome
 | 
					        # Deref genome
 | 
				
			||||||
        self.genome = self.db.genomes[self.genome]
 | 
					        self.genome = self.db.genomes[self.genome]
 | 
				
			||||||
 | 
				
			|||||||
@ -20,10 +20,13 @@ import pycuda.tools
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
import cuburn.genome
 | 
					import cuburn.genome
 | 
				
			||||||
from cuburn import affine
 | 
					from cuburn import affine
 | 
				
			||||||
from cuburn.code import util, mwc, iter, filtering
 | 
					from cuburn.code import util, mwc, iter, filtering, sort
 | 
				
			||||||
 | 
					
 | 
				
			||||||
RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time')
 | 
					RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _sync_stream(dst, src):
					    """Make stream `dst` wait for the work queued on `src` so far.

					    An event created with timing disabled is recorded on `src`, and `dst`
					    is told to wait for that event.
					    """
					    marker = cuda.Event(cuda.event_flags.DISABLE_TIMING)
					    dst.wait_for_event(marker.record(src))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class Renderer(object):
 | 
					class Renderer(object):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Control structure for rendering a series of frames.
 | 
					    Control structure for rendering a series of frames.
 | 
				
			||||||
@ -107,16 +110,47 @@ class Renderer(object):
 | 
				
			|||||||
        packer_fun = self.mod.get_function("interp_iter_params")
 | 
					        packer_fun = self.mod.get_function("interp_iter_params")
 | 
				
			||||||
        palette_fun = self.mod.get_function("interp_palette_hsv")
 | 
					        palette_fun = self.mod.get_function("interp_palette_hsv")
 | 
				
			||||||
        iter_fun = self.mod.get_function("iter")
 | 
					        iter_fun = self.mod.get_function("iter")
 | 
				
			||||||
 | 
					        write_fun = self.mod.get_function("write_shmem")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        info = self.info
 | 
					        info = self.info
 | 
				
			||||||
        stream = cuda.Stream()
 | 
					
 | 
				
			||||||
        event_a = cuda.Event().record(stream)
 | 
					        # The synchronization model is messy. See helpers/task_model.svg.
 | 
				
			||||||
 | 
					        iter_stream = cuda.Stream()
 | 
				
			||||||
 | 
					        filt_stream = cuda.Stream()
 | 
				
			||||||
 | 
					        if info.acc_mode == 'deferred':
 | 
				
			||||||
 | 
					            write_stream = cuda.Stream()
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            write_stream = iter_stream
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # These events fire when the corresponding buffer is available for
 | 
				
			||||||
 | 
					        # reading on the host (i.e. the copy is done). On the first pass, 'a'
 | 
				
			||||||
 | 
					        # will be ignored, and subsequently moved to 'b'.
 | 
				
			||||||
 | 
					        event_a = cuda.Event().record(filt_stream)
 | 
				
			||||||
        event_b = None
 | 
					        event_b = None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        nbins = info.acc_height * info.acc_stride
 | 
					        nbins = info.acc_height * info.acc_stride
 | 
				
			||||||
        d_accum = cuda.mem_alloc(16 * nbins)
 | 
					        d_accum = cuda.mem_alloc(16 * nbins)
 | 
				
			||||||
        d_out = cuda.mem_alloc(16 * nbins)
 | 
					        d_out = cuda.mem_alloc(16 * nbins)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        if info.acc_mode == 'deferred':
 | 
				
			||||||
 | 
					            # Having a fixed, power-of-two log size makes things much easier
 | 
				
			||||||
 | 
					            log_size = 64 << 20
 | 
				
			||||||
 | 
					            d_log = cuda.mem_alloc(log_size * 4)
 | 
				
			||||||
 | 
					            d_log_sorted = cuda.mem_alloc(log_size * 4)
 | 
				
			||||||
 | 
					            sorter = sort.Sorter(log_size)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            # Shared accumulators take care of the lowest 12 bits, but due to
 | 
				
			||||||
 | 
					            # a quirk of the sort implementation, asking the sort to handle
 | 
				
			||||||
 | 
					            # fewer bits than it is compiled for will make it considerably
 | 
				
			||||||
 | 
					            # slower (and it can't be compiled for <7b), so we actually dig in
 | 
				
			||||||
 | 
					            # to the accumulator's SHAB window for those cases.
 | 
				
			||||||
 | 
					            SHAB = np.int32(12)
 | 
				
			||||||
 | 
					            address_bits = np.int32(np.ceil(np.log2(nbins+1)))
 | 
				
			||||||
 | 
					            start_bit = address_bits - sorter.radix_bits
 | 
				
			||||||
 | 
					            log_shift = np.int32(SHAB - start_bit)
 | 
				
			||||||
 | 
					            nwriteblocks = int(np.ceil(nbins / (1<<SHAB)))
 | 
				
			||||||
 | 
					            print start_bit, log_shift, nwriteblocks
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # Calculate 'nslots', the number of simultaneous running threads that
 | 
					        # Calculate 'nslots', the number of simultaneous running threads that
 | 
				
			||||||
        # can be active on the GPU during iteration (and thus the number of
 | 
					        # can be active on the GPU during iteration (and thus the number of
 | 
				
			||||||
        # slots for loading and storing RNG and point context that will be
 | 
					        # slots for loading and storing RNG and point context that will be
 | 
				
			||||||
@ -131,7 +165,6 @@ class Renderer(object):
 | 
				
			|||||||
        nsms = cuda.Context.get_device().multiprocessor_count
 | 
					        nsms = cuda.Context.get_device().multiprocessor_count
 | 
				
			||||||
        rb_size = occupancy.warps_per_mp * nsms / (iter_threads_per_block / 32)
 | 
					        rb_size = occupancy.warps_per_mp * nsms / (iter_threads_per_block / 32)
 | 
				
			||||||
        nslots = iter_threads_per_block * rb_size
 | 
					        nslots = iter_threads_per_block * rb_size
 | 
				
			||||||
        ntemporal_samples = int(np.ceil(1000. / rb_size) * rb_size)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # Reset the ringbuffer info for the slots
 | 
					        # Reset the ringbuffer info for the slots
 | 
				
			||||||
        reset_rb_fun(np.int32(rb_size), block=(1,1,1))
 | 
					        reset_rb_fun(np.int32(rb_size), block=(1,1,1))
 | 
				
			||||||
@ -140,6 +173,11 @@ class Renderer(object):
 | 
				
			|||||||
        seeds = mwc.MWC.make_seeds(nslots)
 | 
					        seeds = mwc.MWC.make_seeds(nslots)
 | 
				
			||||||
        d_seeds = cuda.to_device(seeds)
 | 
					        d_seeds = cuda.to_device(seeds)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # We used to auto-calculate this to a multiple of the number of SMs on
 | 
				
			||||||
 | 
					        # the device, but since we now use shorter launches and, to a certain
 | 
				
			||||||
 | 
					        # extent, allow simultaneous occupancy, that's not as important. The
 | 
				
			||||||
 | 
					        # 1024 is a magic constant, though: FUSE
 | 
				
			||||||
 | 
					        ntemporal_samples = 1024
 | 
				
			||||||
        genome_times, genome_knots = self._iter.packer.pack()
 | 
					        genome_times, genome_knots = self._iter.packer.pack()
 | 
				
			||||||
        d_genome_times = cuda.to_device(genome_times)
 | 
					        d_genome_times = cuda.to_device(genome_times)
 | 
				
			||||||
        d_genome_knots = cuda.to_device(genome_knots)
 | 
					        d_genome_knots = cuda.to_device(genome_knots)
 | 
				
			||||||
@ -174,7 +212,7 @@ class Renderer(object):
 | 
				
			|||||||
            palette_fun(d_palmem, d_palint_times, d_palint_vals,
 | 
					            palette_fun(d_palmem, d_palint_times, d_palint_vals,
 | 
				
			||||||
                        np.float32(start), width,
 | 
					                        np.float32(start), width,
 | 
				
			||||||
                        block=(256,1,1), grid=(info.palette_height,1),
 | 
					                        block=(256,1,1), grid=(info.palette_height,1),
 | 
				
			||||||
                        stream=stream)
 | 
					                        stream=write_stream)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            # TODO: do we need to do this each time in order to reset cache?
 | 
					            # TODO: do we need to do this each time in order to reset cache?
 | 
				
			||||||
            tref = self.mod.get_texref('palTex')
 | 
					            tref = self.mod.get_texref('palTex')
 | 
				
			||||||
@ -188,11 +226,11 @@ class Renderer(object):
 | 
				
			|||||||
                       np.float32(start), width, d_seeds,
 | 
					                       np.float32(start), width, d_seeds,
 | 
				
			||||||
                       np.int32(ntemporal_samples), block=(256,1,1),
 | 
					                       np.int32(ntemporal_samples), block=(256,1,1),
 | 
				
			||||||
                       grid=(int(np.ceil(ntemporal_samples/256.)),1),
 | 
					                       grid=(int(np.ceil(ntemporal_samples/256.)),1),
 | 
				
			||||||
                       stream=stream)
 | 
					                       stream=iter_stream)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            # TODO: if we only do this once per anim, does quality improve?
 | 
					            # Reset points so that they will be FUSEd
 | 
				
			||||||
            util.BaseCode.fill_dptr(self.mod, d_points, 4 * nslots,
 | 
					            util.BaseCode.fill_dptr(self.mod, d_points, 4 * nslots,
 | 
				
			||||||
                                    stream, np.float32(np.nan))
 | 
					                                    iter_stream, np.float32(np.nan))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            # Get interpolated control points for debugging
 | 
					            # Get interpolated control points for debugging
 | 
				
			||||||
            #stream.synchronize()
 | 
					            #stream.synchronize()
 | 
				
			||||||
@ -201,20 +239,34 @@ class Renderer(object):
 | 
				
			|||||||
            #for i, n in zip(d_temp[5], self._iter.packer.packed):
 | 
					            #for i, n in zip(d_temp[5], self._iter.packer.packed):
 | 
				
			||||||
                #print '%60s %g' % ('_'.join(n), i)
 | 
					                #print '%60s %g' % ('_'.join(n), i)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, stream)
 | 
					            util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, write_stream)
 | 
				
			||||||
            nsamps = info.density * info.width * info.height / ntemporal_samples
 | 
					            nrounds = ( (info.density * info.width * info.height)
 | 
				
			||||||
            iter_fun(np.uint64(d_accum), d_seeds, d_points,
 | 
					                      / (ntemporal_samples * 256 * 256) ) + 1
 | 
				
			||||||
                     d_infos, np.int32(nsamps),
 | 
					            if info.acc_mode == 'deferred':
 | 
				
			||||||
 | 
					                for i in range(nrounds):
 | 
				
			||||||
 | 
					                    iter_fun(np.uint64(d_log), d_seeds, d_points, d_infos,
 | 
				
			||||||
                             block=(32, self._iter.NTHREADS/32, 1),
 | 
					                             block=(32, self._iter.NTHREADS/32, 1),
 | 
				
			||||||
                             grid=(ntemporal_samples, 1),
 | 
					                             grid=(ntemporal_samples, 1),
 | 
				
			||||||
                     texrefs=[tref], stream=stream)
 | 
					                             texrefs=[tref], stream=iter_stream)
 | 
				
			||||||
 | 
					                    _sync_stream(write_stream, iter_stream)
 | 
				
			||||||
 | 
					                    sorter.sort(d_log_sorted, d_log, log_size, start_bit, True,
 | 
				
			||||||
 | 
					                                stream=write_stream)
 | 
				
			||||||
 | 
					                    _sync_stream(iter_stream, write_stream)
 | 
				
			||||||
 | 
					                    write_fun(d_accum, d_log_sorted, sorter.dglobal, log_shift,
 | 
				
			||||||
 | 
					                              block=(1024, 1, 1), grid=(nwriteblocks, 1),
 | 
				
			||||||
 | 
					                              stream=write_stream)
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                iter_fun(np.uint64(d_accum), d_seeds, d_points, d_infos,
 | 
				
			||||||
 | 
					                         block=(32, self._iter.NTHREADS/32, 1),
 | 
				
			||||||
 | 
					                         grid=(ntemporal_samples, nrounds),
 | 
				
			||||||
 | 
					                         texrefs=[tref], stream=iter_stream)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            stream.synchronize()
 | 
					            util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
 | 
				
			||||||
 | 
					            _sync_stream(filt_stream, write_stream)
 | 
				
			||||||
            util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, stream)
 | 
					            filt.de(d_out, d_accum, info, start, stop, filt_stream)
 | 
				
			||||||
            filt.de(d_out, d_accum, info, start, stop, stream)
 | 
					            _sync_stream(write_stream, filt_stream)
 | 
				
			||||||
            filt.colorclip(d_out, info, start, stop, stream)
 | 
					            filt.colorclip(d_out, info, start, stop, filt_stream)
 | 
				
			||||||
            cuda.memcpy_dtoh_async(h_out_a, d_out, stream)
 | 
					            cuda.memcpy_dtoh_async(h_out_a, d_out, filt_stream)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            if event_b:
 | 
					            if event_b:
 | 
				
			||||||
                while not event_a.query():
 | 
					                while not event_a.query():
 | 
				
			||||||
@ -222,11 +274,10 @@ class Renderer(object):
 | 
				
			|||||||
                gpu_time = event_a.time_since(event_b)
 | 
					                gpu_time = event_a.time_since(event_b)
 | 
				
			||||||
                yield RenderedImage(self._trim(h_out_b), last_idx, gpu_time)
 | 
					                yield RenderedImage(self._trim(h_out_b), last_idx, gpu_time)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            event_a, event_b = cuda.Event().record(stream), event_a
 | 
					            event_a, event_b = cuda.Event().record(filt_stream), event_a
 | 
				
			||||||
            h_out_a, h_out_b = h_out_b, h_out_a
 | 
					            h_out_a, h_out_b = h_out_b, h_out_a
 | 
				
			||||||
            last_idx = idx
 | 
					            last_idx = idx
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 | 
				
			||||||
        while not event_a.query():
 | 
					        while not event_a.query():
 | 
				
			||||||
            timemod.sleep(0.001)
 | 
					            timemod.sleep(0.001)
 | 
				
			||||||
        gpu_time = event_a.time_since(event_b)
 | 
					        gpu_time = event_a.time_since(event_b)
 | 
				
			||||||
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user