From f3a79b200c09154d7ded0fb41b86439773bf579a Mon Sep 17 00:00:00 2001
From: Steven Robertson
Date: Thu, 27 Oct 2011 12:59:58 -0400
Subject: [PATCH] New badvals mechanism.

---
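Notes on the mechanism, kept below the '---' so git am drops them:

The old scheme gave each point a consec_bad counter, skipped writeback
while it was negative, and re-fused after max_oob consecutive
out-of-bounds samples. The new scheme keeps each thread's point alive
across kernel launches in the 'points' buffer, indexed by devtid(), a
device-unique slot number built from the %smid, %warpid, and %laneid
PTX counters; the host sizes that buffer (and the seeds) at
max_threads_per_multiprocessor * (multiprocessor_count + 1) slots and
fills it with NaNs, so every slot trips the badval check and re-fuses
on first use. Fuse time is now counted in whole 32-iteration rounds
through fuse_rounds ({{info.fuse / 32}} = 4 rounds for fuse = 128).

The badval predicate folds the range, Inf, and NaN checks into a single
comparison. Here's a host-side sketch of the same logic in plain C (the
bad() helper is for illustration only, not part of the patch):

    #include <math.h>
    #include <stdio.h>

    /* True when a point has escaped or gone non-finite. For finite
     * values, -(|x|+|y|) > -1e6 holds iff |x|+|y| < 1e6; a NaN makes
     * the comparison false, so the outer '!' flags NaNs as bad too. */
    static int bad(float x, float y) {
        return !(-(fabsf(x) + fabsf(y)) > -1.0e6f);
    }

    int main(void) {
        printf("%d\n", bad(0.5f, -0.25f));   /* 0: in range  */
        printf("%d\n", bad(2.0e6f, 0.0f));   /* 1: escaped   */
        printf("%d\n", bad(INFINITY, 0.0f)); /* 1: infinite  */
        printf("%d\n", bad(nanf(""), 0.0f)); /* 1: NaN       */
        return 0;
    }
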
 cuburn/code/iter.py | 204 ++++++++++++++++++++++----------------------
 cuburn/code/util.py |  26 +++++-
 cuburn/genome.py    |   8 +-
 cuburn/render.py    |  18 +++-
 4 files changed, 144 insertions(+), 112 deletions(-)

diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py
index 8123e5b..f5f055f 100644
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@@ -242,10 +242,10 @@ void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
     def _iterbody(self):
         tmpl = Template(r'''
 __global__
-void iter(uint64_t accbuf_ptr, mwc_st *msts, iter_params *all_params,
-          int nsamps_to_generate) {
-    mwc_st rctx = msts[gtid()];
-    iter_params *global_params = &(all_params[blockIdx.x]);
+void iter(uint64_t accbuf_ptr, mwc_st *msts, float4 *points,
+          const iter_params *all_params, int nsamps_to_generate) {
+    mwc_st rctx = msts[devtid()];
+    const iter_params *global_params = &(all_params[blockIdx.x]);
 
     __shared__ int nsamps;
     nsamps = nsamps_to_generate;
@@ -257,7 +257,7 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, iter_params *all_params,
     for (int i = threadIdx.y * blockDim.x + threadIdx.x;
          i * 4 < sizeof(iter_params); i += blockDim.x * blockDim.y)
         reinterpret_cast<float*>(&params)[i] =
-            reinterpret_cast<float*>(global_params)[i];
+            reinterpret_cast<const float*>(global_params)[i];
 
 {{if info.chaos_used}}
     int last_xf_used = 0;
@@ -273,120 +273,118 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, iter_params *all_params,
 {{endif}}
 
     __syncthreads();
-    int consec_bad = -{{info.fuse}};
-
-    float x, y, color;
-    x = mwc_next_11(rctx);
-    y = mwc_next_11(rctx);
-    color = mwc_next_01(rctx);
+    float4 old_point = points[devtid()];
+    float x = old_point.x, y = old_point.y,
+          color = old_point.z, fuse_rounds = old_point.w;
 
     while (1) {
+        // This condition checks for large numbers, Infs, and NaNs.
+        if (!(-(fabsf(x) + fabsf(y)) > -1.0e6f)) {
+            x = mwc_next_11(rctx);
+            y = mwc_next_11(rctx);
+            color = mwc_next_01(rctx);
+            fuse_rounds = {{info.fuse / 32}};
+        }
+
+        // 32 rounds is somewhat arbitrary, but it has a pleasing 32-ness
+        for (int i = 0; i < 32; i++) {
 {{if info.chaos_used}}
-        {{precalc_chaos(pcp, std_xforms)}}
+            {{precalc_chaos(pcp, std_xforms)}}
 
-        // For now, we don't attempt to use the swap buffer when chaos is used
-        float xfsel = mwc_next_01(rctx);
+            // For now, we don't attempt to use the swap buffer when chaos is used
+            float xfsel = mwc_next_01(rctx);
 
-        {{for prior_xform_idx, prior_xform_name in enumerate(std_xforms)}}
-        if (last_xf_used == {{prior_xform_idx}}) {
-            {{for xform_idx, xform_name in enumerate(std_xforms[:-1])}}
-            if (xfsel <= {{pcp['chaos_'+prior_xform_name+'_'+xform_name]}}) {
-                apply_xf_{{xform_name}}(x, y, color, rctx);
-                last_xf_used = {{xform_idx}};
+            {{for prior_xform_idx, prior_xform_name in enumerate(std_xforms)}}
+            if (last_xf_used == {{prior_xform_idx}}) {
+                {{for xform_idx, xform_name in enumerate(std_xforms[:-1])}}
+                if (xfsel <= {{pcp['chaos_'+prior_xform_name+'_'+xform_name]}}) {
+                    apply_xf_{{xform_name}}(x, y, color, rctx);
+                    last_xf_used = {{xform_idx}};
+                } else
+                {{endfor}}
+                {
+                    apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);
+                    last_xf_used = {{len(std_xforms)-1}};
+                }
             } else
             {{endfor}}
            {
+                printf("Something went *very* wrong.\n");
+                asm("trap;");
+            }
+
+{{else}}
+            {{precalc_densities(pcp, std_xforms)}}
+            float xfsel = cosel[threadIdx.y];
+
+            {{for xform_name in std_xforms[:-1]}}
+            if (xfsel <= {{pcp['den_'+xform_name]}}) {
+                apply_xf_{{xform_name}}(x, y, color, rctx);
+            } else
+            {{endfor}}
                apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);
-                last_xf_used = {{len(std_xforms)-1}};
-            }
-        } else
-        {{endfor}}
-        {
-            printf("Something went *very* wrong.\n");
-            asm("trap;");
+
+            int sw = (threadIdx.y * 32 + threadIdx.x * 33) & {{NTHREADS-1}};
+            int sr = threadIdx.y * 32 + threadIdx.x;
+
+            swap[sw] = fuse_rounds;
+            swap[sw+{{NTHREADS}}] = x;
+            swap[sw+{{2*NTHREADS}}] = y;
+            swap[sw+{{3*NTHREADS}}] = color;
+            __syncthreads();
+
+            // We select the next xforms here, since we've just synced.
+            if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
+                cosel[threadIdx.x] = mwc_next_01(rctx);
+
+            fuse_rounds = swap[sr];
+            x = swap[sr+{{NTHREADS}}];
+            y = swap[sr+{{2*NTHREADS}}];
+            color = swap[sr+{{3*NTHREADS}}];
+
+{{endif}}
+
+            if (fuse_rounds > 0.0f) continue;
+
+{{if 'final' in cp.xforms}}
+            float fx = x, fy = y, fcolor = color;
+            apply_xf_final(fx, fy, fcolor, rctx);
+{{endif}}
+
+            float cx, cy, cc;
+
+            {{precalc_camera(info, pcp.camera)}}
+
+{{if 'final' in cp.xforms}}
+            {{apply_affine('fx', 'fy', 'cx', 'cy', pcp.camera)}}
+            cc = fcolor;
+{{else}}
+            {{apply_affine('x', 'y', 'cx', 'cy', pcp.camera)}}
+            cc = color;
+{{endif}}
+
+            uint32_t ix = trunca(cx), iy = trunca(cy);
+
+            if (ix >= {{info.acc_width}} || iy >= {{info.acc_height}})
+                continue;
+
+            uint32_t i = iy * {{info.acc_stride}} + ix;
+
+            float4 outcol = tex2D(palTex, cc, time_frac);
+            update_pix(accbuf_ptr, i, outcol);
        }
-{{else}}
-        {{precalc_densities(pcp, std_xforms)}}
-        float xfsel = cosel[threadIdx.y];
+        int num_okay = __popc(__ballot(fuse_rounds == 0.0f));
+        if (threadIdx.x == 0) atomicSub(&nsamps, num_okay * 32);
+        fuse_rounds = fmaxf(0.0f, fuse_rounds - 1.0f);
 
-        {{for xform_name in std_xforms[:-1]}}
-        if (xfsel <= {{pcp['den_'+xform_name]}}) {
-            apply_xf_{{xform_name}}(x, y, color, rctx);
-        } else
-        {{endfor}}
-        apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);
-
-        // Swap thread states here so that writeback skipping logic doesn't die
-        int sw = (threadIdx.y * 32 + threadIdx.x * 33) & {{NTHREADS-1}};
-        int sr = threadIdx.y * 32 + threadIdx.x;
-
-        swap[sw] = consec_bad;
-        swap[sw+{{NTHREADS}}] = x;
-        swap[sw+{{2*NTHREADS}}] = y;
-        swap[sw+{{3*NTHREADS}}] = color;
        __syncthreads();
-        // This is in the middle of the function so that only one sync is
-        // required per loop.
-        if (nsamps < 0) break;
-
-        // Similarly, we select the next xforms here.
-        if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
-            cosel[threadIdx.x] = mwc_next_01(rctx);
-
-        consec_bad = swap[sr];
-        x = swap[sr+{{NTHREADS}}];
-        y = swap[sr+{{2*NTHREADS}}];
-        color = swap[sr+{{3*NTHREADS}}];
-
-{{endif}}
-
-        if (consec_bad < 0) {
-            consec_bad++;
-            continue;
-        }
-
-        int remain = __popc(__ballot(1));
-        if (threadIdx.x == 0) atomicSub(&nsamps, remain);
-
-{{if 'final' in cp.xforms}}
-        float fx = x, fy = y, fcolor = color;
-        apply_xf_final(fx, fy, fcolor, rctx);
-{{endif}}
-
-        float cx, cy, cc;
-
-        {{precalc_camera(info, pcp.camera)}}
-
-{{if 'final' in cp.xforms}}
-        {{apply_affine('fx', 'fy', 'cx', 'cy', pcp.camera)}}
-        cc = fcolor;
-{{else}}
-        {{apply_affine('x', 'y', 'cx', 'cy', pcp.camera)}}
-        cc = color;
-{{endif}}
-
-        uint32_t ix = trunca(cx), iy = trunca(cy);
-
-        if (ix >= {{info.acc_width}} || iy >= {{info.acc_height}} ) {
-            consec_bad++;
-            if (consec_bad > {{info.max_oob}}) {
-                x = mwc_next_11(rctx);
-                y = mwc_next_11(rctx);
-                color = mwc_next_01(rctx);
-                consec_bad = -{{info.fuse}};
-            }
-            continue;
-        }
-
-        uint32_t i = iy * {{info.acc_stride}} + ix;
-
-        float4 outcol = tex2D(palTex, cc, time_frac);
-        update_pix(accbuf_ptr, i, outcol);
+        if (nsamps <= 0) break;
    }
 
-    msts[gtid()] = rctx;
+    points[devtid()] = make_float4(x, y, color, fuse_rounds);
+    msts[devtid()] = rctx;
 }
 ''')
         return tmpl.substitute(
diff --git a/cuburn/code/util.py b/cuburn/code/util.py
index 5f16437..4368154 100644
--- a/cuburn/code/util.py
+++ b/cuburn/code/util.py
@@ -42,7 +42,7 @@
 float3 rgb2hsv(float3 rgb);
 float3 hsv2rgb(float3 hsv);
 """
-    defs = r"""
+    defs = Template(r"""
 #undef M_E
 #undef M_LOG2E
 #undef M_LOG10E
@@ -80,6 +80,28 @@ uint32_t gtid() {
             (blockIdx.x + (gridDim.x * blockIdx.y))));
 }
+
+/* Returns the ID of this thread on the device. Note that this counter is
+ * volatile according to the PTX ISA. It should be used for loading and saving
+ * state that must be unique across running threads, not for accessing things
+ * in a known order. */
+__device__
+int devtid() {
+    int result;
+    asm({{crep('''
+    {
+        .reg .u32 tmp1, tmp2;
+        mov.u32 %0, %smid;
+        mov.u32 tmp1, %nsmid;
+        mov.u32 tmp2, %warpid;
+        mad.lo.u32 %0, %0, tmp1, tmp2;
+        mov.u32 tmp1, %nwarpid;
+        mov.u32 tmp2, %laneid;
+        mad.lo.u32 %0, %0, tmp1, tmp2;
+    }''')}} : "=r"(result) );
+    return result;
+}

 __device__
 uint32_t trunca(float f) {
     // truncate as used in address calculations. note the use of a signed
@@ -182,7 +204,7 @@ float3 hsv2rgb(float3 hsv) {
     else                  { out.x = val; out.y = min; out.z = mid; }
     return out;
 }
-"""
+""").substitute()
 
     @staticmethod
     def fill_dptr(mod, dptr, size, stream=None, value=np.uint32(0)):
diff --git a/cuburn/genome.py b/cuburn/genome.py
index 58b872c..8a0d520 100644
--- a/cuburn/genome.py
+++ b/cuburn/genome.py
@@ -82,11 +82,9 @@ class RenderInfo(object):
     Determine features and constants required to render a particular set of
     genomes. The values of this class are fixed before compilation begins.
     """
-    # Constant parameters which control handling of out-of-frame samples:
-    # Number of iterations to iterate without write after new point
-    fuse = 10
-    # Maximum consecutive out-of-bounds points before picking new point
-    max_oob = 10
+    # Number of iterations to iterate without write after generating a new
+    # point, including the number of bad values
+    fuse = 128
 
     # Height of the texture pallete which gets uploaded to the GPU (assuming
     # that palette-from-texture is enabled). For most genomes, this doesn't
diff --git a/cuburn/render.py b/cuburn/render.py
index 3d27412..2fda621 100644
--- a/cuburn/render.py
+++ b/cuburn/render.py
@@ -14,6 +14,7 @@
 from fr0stlib import pyflam3
 from fr0stlib.pyflam3._flam3 import *
 from fr0stlib.pyflam3.constants import *
+import pycuda.autoinit
 import pycuda.compiler
 import pycuda.driver as cuda
 import pycuda.tools
@@ -153,7 +154,15 @@ class Renderer(object):
                 np.concatenate(map(info.db.palettes.get, pals[1::2])))
         d_palmem = cuda.mem_alloc(256 * info.palette_height * 4)
 
-        seeds = mwc.MWC.make_seeds(self._iter.NTHREADS * cps_per_block)
+        # The '+1' avoids more situations where the 'smid' value is larger
+        # than the number of enabled SMs on a chip, which is warned against
+        # in the docs but not seen in the wild. Things could get nastier on
+        # subsequent silicon, but I doubt they'd ever kill more than 1 SM.
+        nslots = pycuda.autoinit.device.max_threads_per_multiprocessor * \
+                 (pycuda.autoinit.device.multiprocessor_count + 1)
+
+        d_points = cuda.mem_alloc(nslots * 16)
+        seeds = mwc.MWC.make_seeds(nslots)
         d_seeds = cuda.to_device(seeds)
 
         h_out = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4),
@@ -200,6 +209,10 @@ class Renderer(object):
                       block=(256,1,1), grid=(cps_per_block/256,1),
                       stream=iter_stream)
 
+        # TODO: if we only do this once per anim, does quality improve?
+        util.BaseCode.fill_dptr(self.mod, d_points, 4 * nslots,
+                                iter_stream, np.float32(np.nan))
+
         # Get interpolated control points for debugging
         #iter_stream.synchronize()
         #d_temp = cuda.from_device(d_infos,
@@ -208,7 +221,8 @@
             #print '%60s %g' % ('_'.join(n), i)
 
         nsamps = info.density * info.width * info.height / cps_per_block
-        iter_fun(np.uint64(d_accum), d_seeds, d_infos, np.int32(nsamps),
+        iter_fun(np.uint64(d_accum), d_seeds, d_points,
+                 d_infos, np.int32(nsamps),
                  block=(32, self._iter.NTHREADS/32, 1),
                  grid=(cps_per_block, 1), texrefs=[tref], stream=iter_stream)