From f3a79b200c09154d7ded0fb41b86439773bf579a Mon Sep 17 00:00:00 2001
From: Steven Robertson
Date: Thu, 27 Oct 2011 12:59:58 -0400
Subject: [PATCH] New badvals mechanism.

---
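Notes on the mechanism, kept below the '---' so git am drops them:

The old scheme gave each point a consec_bad counter, skipped writeback
while it was negative, and re-fused after max_oob consecutive
out-of-bounds samples. The new scheme keeps each thread's point alive
across kernel launches in the 'points' buffer, indexed by devtid(), a
device-unique slot number built from the %smid, %warpid, and %laneid
PTX counters; the host sizes that buffer (and the seeds) at
max_threads_per_multiprocessor * (multiprocessor_count + 1) slots and
fills it with NaNs, so every slot trips the badval check and re-fuses
on first use. Fuse time is now counted in whole 32-iteration rounds
through fuse_rounds ({{info.fuse / 32}} = 4 rounds for fuse = 128).

The badval predicate folds the range, Inf, and NaN checks into a single
comparison. Here's a host-side sketch of the same logic in plain C (the
bad() helper is for illustration only, not part of the patch):

    #include <math.h>
    #include <stdio.h>

    /* True when a point has escaped or gone non-finite. For finite
     * values, -(|x|+|y|) > -1e6 holds iff |x|+|y| < 1e6; a NaN makes
     * the comparison false, so the outer '!' flags NaNs as bad too. */
    static int bad(float x, float y) {
        return !(-(fabsf(x) + fabsf(y)) > -1.0e6f);
    }

    int main(void) {
        printf("%d\n", bad(0.5f, -0.25f));   /* 0: in range  */
        printf("%d\n", bad(2.0e6f, 0.0f));   /* 1: escaped   */
        printf("%d\n", bad(INFINITY, 0.0f)); /* 1: infinite  */
        printf("%d\n", bad(nanf(""), 0.0f)); /* 1: NaN       */
        return 0;
    }
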
 cuburn/code/iter.py | 204 ++++++++++++++++++++++----------------------
 cuburn/code/util.py |  26 +++++-
 cuburn/genome.py    |   8 +-
 cuburn/render.py    |  18 +++-
 4 files changed, 144 insertions(+), 112 deletions(-)

diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py
index 8123e5b..f5f055f 100644
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@@ -242,10 +242,10 @@ void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
     def _iterbody(self):
         tmpl = Template(r'''
 __global__
-void iter(uint64_t accbuf_ptr, mwc_st *msts, iter_params *all_params,
-          int nsamps_to_generate) {
-    mwc_st rctx = msts[gtid()];
-    iter_params *global_params = &(all_params[blockIdx.x]);
+void iter(uint64_t accbuf_ptr, mwc_st *msts, float4 *points,
+          const iter_params *all_params, int nsamps_to_generate) {
+    mwc_st rctx = msts[devtid()];
+    const iter_params *global_params = &(all_params[blockIdx.x]);
 
     __shared__ int nsamps;
     nsamps = nsamps_to_generate;
@@ -257,7 +257,7 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, iter_params *all_params,
     for (int i = threadIdx.y * blockDim.x + threadIdx.x;
          i * 4 < sizeof(iter_params); i += blockDim.x * blockDim.y)
         reinterpret_cast<float*>(&params)[i] =
-            reinterpret_cast<float*>(global_params)[i];
+            reinterpret_cast<const float*>(global_params)[i];
 
 {{if info.chaos_used}}
     int last_xf_used = 0;
@@ -273,120 +273,118 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, iter_params *all_params,
 {{endif}}
 
     __syncthreads();
-    int consec_bad = -{{info.fuse}};
-
-    float x, y, color;
-    x = mwc_next_11(rctx);
-    y = mwc_next_11(rctx);
-    color = mwc_next_01(rctx);
+    float4 old_point = points[devtid()];
+    float x = old_point.x, y = old_point.y,
+          color = old_point.z, fuse_rounds = old_point.w;
 
     while (1) {
+        // This condition checks for large numbers, Infs, and NaNs.
+        if (!(-(fabsf(x) + fabsf(y)) > -1.0e6f)) {
+            x = mwc_next_11(rctx);
+            y = mwc_next_11(rctx);
+            color = mwc_next_01(rctx);
+            fuse_rounds = {{info.fuse / 32}};
+        }
+
+        // 32 rounds is somewhat arbitrary, but it has a pleasing 32-ness
+        for (int i = 0; i < 32; i++) {
 {{if info.chaos_used}}
-        {{precalc_chaos(pcp, std_xforms)}}
+            {{precalc_chaos(pcp, std_xforms)}}
 
-        // For now, we don't attempt to use the swap buffer when chaos is used
-        float xfsel = mwc_next_01(rctx);
+            // For now, we don't attempt to use the swap buffer when chaos is used
+            float xfsel = mwc_next_01(rctx);
 
-        {{for prior_xform_idx, prior_xform_name in enumerate(std_xforms)}}
-        if (last_xf_used == {{prior_xform_idx}}) {
-            {{for xform_idx, xform_name in enumerate(std_xforms[:-1])}}
-            if (xfsel <= {{pcp['chaos_'+prior_xform_name+'_'+xform_name]}}) {
-                apply_xf_{{xform_name}}(x, y, color, rctx);
-                last_xf_used = {{xform_idx}};
+            {{for prior_xform_idx, prior_xform_name in enumerate(std_xforms)}}
+            if (last_xf_used == {{prior_xform_idx}}) {
+                {{for xform_idx, xform_name in enumerate(std_xforms[:-1])}}
+                if (xfsel <= {{pcp['chaos_'+prior_xform_name+'_'+xform_name]}}) {
+                    apply_xf_{{xform_name}}(x, y, color, rctx);
+                    last_xf_used = {{xform_idx}};
+                } else
+                {{endfor}}
+                {
+                    apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);
+                    last_xf_used = {{len(std_xforms)-1}};
+                }
             } else
             {{endfor}}
            {
+                printf("Something went *very* wrong.\n");
+                asm("trap;");
+            }
+
+{{else}}
+            {{precalc_densities(pcp, std_xforms)}}
+            float xfsel = cosel[threadIdx.y];
+
+            {{for xform_name in std_xforms[:-1]}}
+            if (xfsel <= {{pcp['den_'+xform_name]}}) {
+                apply_xf_{{xform_name}}(x, y, color, rctx);
+            } else
+            {{endfor}}
                apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);
-                last_xf_used = {{len(std_xforms)-1}};
-            }
-        } else
-        {{endfor}}
-        {
-            printf("Something went *very* wrong.\n");
-            asm("trap;");
+
+            int sw = (threadIdx.y * 32 + threadIdx.x * 33) & {{NTHREADS-1}};
+            int sr = threadIdx.y * 32 + threadIdx.x;
+
+            swap[sw] = fuse_rounds;
+            swap[sw+{{NTHREADS}}] = x;
+            swap[sw+{{2*NTHREADS}}] = y;
+            swap[sw+{{3*NTHREADS}}] = color;
+            __syncthreads();
+
+            // We select the next xforms here, since we've just synced.
+            if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
+                cosel[threadIdx.x] = mwc_next_01(rctx);
+
+            fuse_rounds = swap[sr];
+            x = swap[sr+{{NTHREADS}}];
+            y = swap[sr+{{2*NTHREADS}}];
+            color = swap[sr+{{3*NTHREADS}}];
+
+{{endif}}
+
+            if (fuse_rounds > 0.0f) continue;
+
+{{if 'final' in cp.xforms}}
+            float fx = x, fy = y, fcolor = color;
+            apply_xf_final(fx, fy, fcolor, rctx);
+{{endif}}
+
+            float cx, cy, cc;
+
+            {{precalc_camera(info, pcp.camera)}}
+
+{{if 'final' in cp.xforms}}
+            {{apply_affine('fx', 'fy', 'cx', 'cy', pcp.camera)}}
+            cc = fcolor;
+{{else}}
+            {{apply_affine('x', 'y', 'cx', 'cy', pcp.camera)}}
+            cc = color;
+{{endif}}
+
+            uint32_t ix = trunca(cx), iy = trunca(cy);
+
+            if (ix >= {{info.acc_width}} || iy >= {{info.acc_height}})
+                continue;
+
+            uint32_t i = iy * {{info.acc_stride}} + ix;
+
+            float4 outcol = tex2D(palTex, cc, time_frac);
+            update_pix(accbuf_ptr, i, outcol);
        }
-{{else}}
-        {{precalc_densities(pcp, std_xforms)}}
-        float xfsel = cosel[threadIdx.y];
+        int num_okay = __popc(__ballot(fuse_rounds == 0.0f));
+        if (threadIdx.x == 0) atomicSub(&nsamps, num_okay * 32);
+        fuse_rounds = fmaxf(0.0f, fuse_rounds - 1.0f);
 
-        {{for xform_name in std_xforms[:-1]}}
-        if (xfsel <= {{pcp['den_'+xform_name]}}) {
-            apply_xf_{{xform_name}}(x, y, color, rctx);
-        } else
-        {{endfor}}
-        apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);
-
-        // Swap thread states here so that writeback skipping logic doesn't die
-        int sw = (threadIdx.y * 32 + threadIdx.x * 33) & {{NTHREADS-1}};
-        int sr = threadIdx.y * 32 + threadIdx.x;
-
-        swap[sw] = consec_bad;
-        swap[sw+{{NTHREADS}}] = x;
-        swap[sw+{{2*NTHREADS}}] = y;
-        swap[sw+{{3*NTHREADS}}] = color;
        __syncthreads();
-        // This is in the middle of the function so that only one sync is
-        // required per loop.
-        if (nsamps < 0) break;
-
-        // Similarly, we select the next xforms here.
-        if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
-            cosel[threadIdx.x] = mwc_next_01(rctx);
-
-        consec_bad = swap[sr];
-        x = swap[sr+{{NTHREADS}}];
-        y = swap[sr+{{2*NTHREADS}}];
-        color = swap[sr+{{3*NTHREADS}}];
-
-{{endif}}
-
-        if (consec_bad < 0) {
-            consec_bad++;
-            continue;
-        }
-
-        int remain = __popc(__ballot(1));
-        if (threadIdx.x == 0) atomicSub(&nsamps, remain);
-
-{{if 'final' in cp.xforms}}
-        float fx = x, fy = y, fcolor = color;
-        apply_xf_final(fx, fy, fcolor, rctx);
-{{endif}}
-
-        float cx, cy, cc;
-
-        {{precalc_camera(info, pcp.camera)}}
-
-{{if 'final' in cp.xforms}}
-        {{apply_affine('fx', 'fy', 'cx', 'cy', pcp.camera)}}
-        cc = fcolor;
-{{else}}
-        {{apply_affine('x', 'y', 'cx', 'cy', pcp.camera)}}
-        cc = color;
-{{endif}}
-
-        uint32_t ix = trunca(cx), iy = trunca(cy);
-
-        if (ix >= {{info.acc_width}} || iy >= {{info.acc_height}} ) {
-            consec_bad++;
-            if (consec_bad > {{info.max_oob}}) {
-                x = mwc_next_11(rctx);
-                y = mwc_next_11(rctx);
-                color = mwc_next_01(rctx);
-                consec_bad = -{{info.fuse}};
-            }
-            continue;
-        }
-
-        uint32_t i = iy * {{info.acc_stride}} + ix;
-
-        float4 outcol = tex2D(palTex, cc, time_frac);
-        update_pix(accbuf_ptr, i, outcol);
+        if (nsamps <= 0) break;
    }
 
-    msts[gtid()] = rctx;
+    points[devtid()] = make_float4(x, y, color, fuse_rounds);
+    msts[devtid()] = rctx;
 }
 ''')
         return tmpl.substitute(
diff --git a/cuburn/code/util.py b/cuburn/code/util.py
index 5f16437..4368154 100644
--- a/cuburn/code/util.py
+++ b/cuburn/code/util.py
@@ -42,7 +42,7 @@
 float3 rgb2hsv(float3 rgb);
 float3 hsv2rgb(float3 hsv);
 """
-    defs = r"""
+    defs = Template(r"""
 #undef M_E
 #undef M_LOG2E
 #undef M_LOG10E
@@ -80,6 +80,28 @@ uint32_t gtid() {
             (blockIdx.x + (gridDim.x * blockIdx.y))));
 }
+
+/* Returns the ID of this thread on the device. Note that this counter is
+ * volatile according to the PTX ISA. It should be used for loading and saving
+ * state that must be unique across running threads, not for accessing things
+ * in a known order. */
+__device__
+int devtid() {
+    int result;
+    asm({{crep('''
+    {
+        .reg .u32 tmp1, tmp2;
+        mov.u32 %0, %smid;
+        mov.u32 tmp1, %nsmid;
+        mov.u32 tmp2, %warpid;
+        mad.lo.u32 %0, %0, tmp1, tmp2;
+        mov.u32 tmp1, %nwarpid;
+        mov.u32 tmp2, %laneid;
+        mad.lo.u32 %0, %0, tmp1, tmp2;
+    }''')}} : "=r"(result) );
+    return result;
+}

 __device__
 uint32_t trunca(float f) {
     // truncate as used in address calculations. note the use of a signed
@@ -182,7 +204,7 @@ float3 hsv2rgb(float3 hsv) {
     else                  { out.x = val; out.y = min; out.z = mid; }
     return out;
 }
-"""
+""").substitute()
 
     @staticmethod
     def fill_dptr(mod, dptr, size, stream=None, value=np.uint32(0)):
diff --git a/cuburn/genome.py b/cuburn/genome.py
index 58b872c..8a0d520 100644
--- a/cuburn/genome.py
+++ b/cuburn/genome.py
@@ -82,11 +82,9 @@ class RenderInfo(object):
     Determine features and constants required to render a particular set of
     genomes. The values of this class are fixed before compilation begins.
     """
-    # Constant parameters which control handling of out-of-frame samples:
-    # Number of iterations to iterate without write after new point
-    fuse = 10
-    # Maximum consecutive out-of-bounds points before picking new point
-    max_oob = 10
+    # Number of iterations to iterate without write after generating a new
+    # point, including the number of bad values
+    fuse = 128
 
     # Height of the texture pallete which gets uploaded to the GPU (assuming
     # that palette-from-texture is enabled). For most genomes, this doesn't
diff --git a/cuburn/render.py b/cuburn/render.py
index 3d27412..2fda621 100644
--- a/cuburn/render.py
+++ b/cuburn/render.py
@@ -14,6 +14,7 @@
 from fr0stlib import pyflam3
 from fr0stlib.pyflam3._flam3 import *
 from fr0stlib.pyflam3.constants import *
+import pycuda.autoinit
 import pycuda.compiler
 import pycuda.driver as cuda
 import pycuda.tools
@@ -153,7 +154,15 @@ class Renderer(object):
                 np.concatenate(map(info.db.palettes.get, pals[1::2])))
         d_palmem = cuda.mem_alloc(256 * info.palette_height * 4)
 
-        seeds = mwc.MWC.make_seeds(self._iter.NTHREADS * cps_per_block)
+        # The '+1' avoids more situations where the 'smid' value is larger
+        # than the number of enabled SMs on a chip, which is warned against
+        # in the docs but not seen in the wild. Things could get nastier on
+        # subsequent silicon, but I doubt they'd ever kill more than 1 SM.
+        nslots = pycuda.autoinit.device.max_threads_per_multiprocessor * \
+                 (pycuda.autoinit.device.multiprocessor_count + 1)
+
+        d_points = cuda.mem_alloc(nslots * 16)
+        seeds = mwc.MWC.make_seeds(nslots)
         d_seeds = cuda.to_device(seeds)
 
         h_out = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4),
@@ -200,6 +209,10 @@ class Renderer(object):
                       block=(256,1,1), grid=(cps_per_block/256,1),
                       stream=iter_stream)
 
+        # TODO: if we only do this once per anim, does quality improve?
+        util.BaseCode.fill_dptr(self.mod, d_points, 4 * nslots,
+                                iter_stream, np.float32(np.nan))
+
         # Get interpolated control points for debugging
         #iter_stream.synchronize()
         #d_temp = cuda.from_device(d_infos,
@@ -208,7 +221,8 @@
             #print '%60s %g' % ('_'.join(n), i)
 
         nsamps = info.density * info.width * info.height / cps_per_block
-        iter_fun(np.uint64(d_accum), d_seeds, d_infos, np.int32(nsamps),
+        iter_fun(np.uint64(d_accum), d_seeds, d_points,
+                 d_infos, np.int32(nsamps),
                  block=(32, self._iter.NTHREADS/32, 1),
                  grid=(cps_per_block, 1), texrefs=[tref], stream=iter_stream)