New badvals mechanism.

2025-06-30 21:37:16 -04:00 · 2011-10-27 12:59:58 -04:00
parent cac9b691a8
commit f3a79b200c
4 changed files with 144 additions and 112 deletions
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@ -242,10 +242,10 @@ void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
    def _iterbody(self):
        tmpl = Template(r'''
 __global__
-void iter(uint64_t accbuf_ptr, mwc_st *msts, iter_params *all_params,
+void iter(uint64_t accbuf_ptr, mwc_st *msts, float4 *points,
-          int nsamps_to_generate) {
+          const iter_params *all_params, int nsamps_to_generate) {
-    mwc_st rctx = msts[gtid()];
+    mwc_st rctx = msts[devtid()];
-    iter_params *global_params = &(all_params[blockIdx.x]);
+    const iter_params *global_params = &(all_params[blockIdx.x]);
    __shared__ int nsamps;
    nsamps = nsamps_to_generate;
@ -257,7 +257,7 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, iter_params *all_params,
    for (int i = threadIdx.y * blockDim.x + threadIdx.x;
         i * 4 < sizeof(iter_params); i += blockDim.x * blockDim.y)
        reinterpret_cast<float*>(&params)[i] =
-            reinterpret_cast<float*>(global_params)[i];
+            reinterpret_cast<const float*>(global_params)[i];
 {{if info.chaos_used}}
    int last_xf_used = 0;
@ -273,120 +273,118 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, iter_params *all_params,
 {{endif}}
    __syncthreads();
-    int consec_bad = -{{info.fuse}};
+    float4 old_point = points[devtid()];
-
+    float x = old_point.x, y = old_point.y,
-    float x, y, color;
+          color = old_point.z, fuse_rounds = old_point.w;
    x = mwc_next_11(rctx);
    y = mwc_next_11(rctx);
    color = mwc_next_01(rctx);
    while (1) {
        // This condition checks for large numbers, Infs, and NaNs.
        if (!(-(fabsf(x) + fabsf(y) > -1.0e6f))) {
            x = mwc_next_11(rctx);
            y = mwc_next_11(rctx);
            color = mwc_next_01(rctx);
            fuse_rounds = {{info.fuse / 32}};
        }
        // 32 rounds is somewhat arbitrary, but it has a pleasing 32-ness
        for (int i = 0; i < 32; i++) {
 {{if info.chaos_used}}
-        {{precalc_chaos(pcp, std_xforms)}}
+            {{precalc_chaos(pcp, std_xforms)}}
-        // For now, we don't attempt to use the swap buffer when chaos is used
+            // For now, we don't attempt to use the swap buffer when chaos is used
-        float xfsel = mwc_next_01(rctx);
+            float xfsel = mwc_next_01(rctx);
-        {{for prior_xform_idx, prior_xform_name in enumerate(std_xforms)}}
+            {{for prior_xform_idx, prior_xform_name in enumerate(std_xforms)}}
-        if (last_xf_used == {{prior_xform_idx}}) {
+            if (last_xf_used == {{prior_xform_idx}}) {
-            {{for xform_idx, xform_name in enumerate(std_xforms[:-1])}}
+                {{for xform_idx, xform_name in enumerate(std_xforms[:-1])}}
-            if (xfsel <= {{pcp['chaos_'+prior_xform_name+'_'+xform_name]}}) {
+                if (xfsel <= {{pcp['chaos_'+prior_xform_name+'_'+xform_name]}}) {
-                apply_xf_{{xform_name}}(x, y, color, rctx);
+                    apply_xf_{{xform_name}}(x, y, color, rctx);
-                last_xf_used = {{xform_idx}};
+                    last_xf_used = {{xform_idx}};
                } else
                {{endfor}}
                {
                    apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);
                    last_xf_used = {{len(std_xforms)-1}};
                }
            } else
            {{endfor}}
            {
                printf("Something went *very* wrong.\n");
                asm("trap;");
            }
 {{else}}
            {{precalc_densities(pcp, std_xforms)}}
            float xfsel = cosel[threadIdx.y];
            {{for xform_name in std_xforms[:-1]}}
            if (xfsel <= {{pcp['den_'+xform_name]}}) {
                apply_xf_{{xform_name}}(x, y, color, rctx);
            } else
            {{endfor}}
                apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);
-                last_xf_used = {{len(std_xforms)-1}};
+
-            }
+            int sw = (threadIdx.y * 32 + threadIdx.x * 33) & {{NTHREADS-1}};
-        } else
+            int sr = threadIdx.y * 32 + threadIdx.x;
-        {{endfor}}
+
-        {
+            swap[sw] = fuse_rounds;
-            printf("Something went *very* wrong.\n");
+            swap[sw+{{NTHREADS}}] = x;
-            asm("trap;");
+            swap[sw+{{2*NTHREADS}}] = y;
            swap[sw+{{3*NTHREADS}}] = color;
            __syncthreads();
            // We select the next xforms here, since we've just synced.
            if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
                cosel[threadIdx.x] = mwc_next_01(rctx);
            fuse_rounds = swap[sr];
            x = swap[sr+{{NTHREADS}}];
            y = swap[sr+{{2*NTHREADS}}];
            color = swap[sr+{{3*NTHREADS}}];
 {{endif}}
            if (fuse_rounds > 0.0f) continue;
 {{if 'final' in cp.xforms}}
            float fx = x, fy = y, fcolor = color;
            apply_xf_final(fx, fy, fcolor, rctx);
 {{endif}}
            float cx, cy, cc;
            {{precalc_camera(info, pcp.camera)}}
 {{if 'final' in cp.xforms}}
            {{apply_affine('fx', 'fy', 'cx', 'cy', pcp.camera)}}
            cc = fcolor;
 {{else}}
            {{apply_affine('x', 'y', 'cx', 'cy', pcp.camera)}}
            cc = color;
 {{endif}}
            uint32_t ix = trunca(cx), iy = trunca(cy);
            if (ix >= {{info.acc_width}} || iy >= {{info.acc_height}})
                continue;
            uint32_t i = iy * {{info.acc_stride}} + ix;
            float4 outcol = tex2D(palTex, cc, time_frac);
            update_pix(accbuf_ptr, i, outcol);
        }
-{{else}}
+        int num_okay = __popc(__ballot(fuse_rounds == 0.0f));
-        {{precalc_densities(pcp, std_xforms)}}
+        if (threadIdx.x == 0) atomicSub(&nsamps, num_okay * 32);
-        float xfsel = cosel[threadIdx.y];
+        fuse_rounds = fmaxf(0.0f, fuse_rounds - 1.0f);
        {{for xform_name in std_xforms[:-1]}}
        if (xfsel <= {{pcp['den_'+xform_name]}}) {
            apply_xf_{{xform_name}}(x, y, color, rctx);
        } else
        {{endfor}}
            apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);
        // Swap thread states here so that writeback skipping logic doesn't die
        int sw = (threadIdx.y * 32 + threadIdx.x * 33) & {{NTHREADS-1}};
        int sr = threadIdx.y * 32 + threadIdx.x;
        swap[sw] = consec_bad;
        swap[sw+{{NTHREADS}}] = x;
        swap[sw+{{2*NTHREADS}}] = y;
        swap[sw+{{3*NTHREADS}}] = color;
        __syncthreads();
-        // This is in the middle of the function so that only one sync is
+        if (nsamps <= 0) break;
        // required per loop.
        if (nsamps < 0) break;
        // Similarly, we select the next xforms here.
        if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
            cosel[threadIdx.x] = mwc_next_01(rctx);
        consec_bad = swap[sr];
        x = swap[sr+{{NTHREADS}}];
        y = swap[sr+{{2*NTHREADS}}];
        color = swap[sr+{{3*NTHREADS}}];
 {{endif}}
        if (consec_bad < 0) {
            consec_bad++;
            continue;
        }
        int remain = __popc(__ballot(1));
        if (threadIdx.x == 0) atomicSub(&nsamps, remain);
 {{if 'final' in cp.xforms}}
        float fx = x, fy = y, fcolor = color;
        apply_xf_final(fx, fy, fcolor, rctx);
 {{endif}}
        float cx, cy, cc;
        {{precalc_camera(info, pcp.camera)}}
 {{if 'final' in cp.xforms}}
        {{apply_affine('fx', 'fy', 'cx', 'cy', pcp.camera)}}
        cc = fcolor;
 {{else}}
        {{apply_affine('x', 'y', 'cx', 'cy', pcp.camera)}}
        cc = color;
 {{endif}}
        uint32_t ix = trunca(cx), iy = trunca(cy);
        if (ix >= {{info.acc_width}} || iy >= {{info.acc_height}} ) {
            consec_bad++;
            if (consec_bad > {{info.max_oob}}) {
                x = mwc_next_11(rctx);
                y = mwc_next_11(rctx);
                color = mwc_next_01(rctx);
                consec_bad = -{{info.fuse}};
            }
            continue;
        }
        uint32_t i = iy * {{info.acc_stride}} + ix;
        float4 outcol = tex2D(palTex, cc, time_frac);
        update_pix(accbuf_ptr, i, outcol);
    }
-    msts[gtid()] = rctx;
+    points[devtid()] = make_float4(x, y, color, fuse_rounds);
    msts[devtid()] = rctx;
 }
 ''')
        return tmpl.substitute(
--- a/cuburn/code/util.py
+++ b/cuburn/code/util.py
@ -42,7 +42,7 @@ float3 rgb2hsv(float3 rgb);
 float3 hsv2rgb(float3 hsv);
 """
-    defs = r"""
+    defs = Template(r"""
 #undef M_E
 #undef M_LOG2E
 #undef M_LOG10E
@ -80,6 +80,28 @@ uint32_t gtid() {
                    (blockIdx.x + (gridDim.x * blockIdx.y))));
 }
 /* Returns the ID of this thread on the device, computed from the PTX special
 * registers as
 *
 *     (smid * nwarpid + warpid) * 32 + laneid
 *
 * The SM index is scaled by the number of warp slots (%nwarpid), and the
 * resulting warp slot index is scaled by the warp width (32 lanes), so two
 * threads that differ in any of (smid, warpid, laneid) get different IDs.
 * Scaling the SM index by %nsmid instead would hand the same ID to threads
 * on different SMs whenever warpid >= nsmid, making them share state slots.
 *
 * The result is less than nsmid * nwarpid * 32 provided each register stays
 * within the range the PTX ISA documents for it.
 *
 * Note that this counter is volatile according to the PTX ISA. It should be
 * used for loading and saving state that must be unique across running
 * threads, not for accessing things in a known order. */
 __device__
 int devtid() {
    int result;
    asm({{crep('''
    {
        .reg .u32   tmp1, tmp2;
        mov.u32     %0,     %smid;
        mov.u32     tmp1,   %nwarpid;
        mov.u32     tmp2,   %warpid;
        mad.lo.u32  %0,     %0,     tmp1,   tmp2;
        mov.u32     tmp2,   %laneid;
        mad.lo.u32  %0,     %0,     32,     tmp2;
    }''')}} : "=r"(result) );
    return result;
 }
 __device__
 uint32_t trunca(float f) {
    // truncate as used in address calculations. note the use of a signed
@ -182,7 +204,7 @@ float3 hsv2rgb(float3 hsv) {
    else                    { out.x = val; out.y = min; out.z = mid; }
    return out;
 }
-"""
+""").substitute()
    @staticmethod
    def fill_dptr(mod, dptr, size, stream=None, value=np.uint32(0)):
--- a/cuburn/genome.py
+++ b/cuburn/genome.py
@ -82,11 +82,9 @@ class RenderInfo(object):
    Determine features and constants required to render a particular set of
    genomes. The values of this class are fixed before compilation begins.
    """
-    # Constant parameters which control handling of out-of-frame samples:
+    # Number of iterations to iterate without write after generating a new
-    # Number of iterations to iterate without write after new point
+    # point, including the number of bad
-    fuse = 10
+    fuse = 128
    # Maximum consecutive out-of-bounds points before picking new point
    max_oob = 10
    # Height of the texture palette which gets uploaded to the GPU (assuming
    # that palette-from-texture is enabled). For most genomes, this doesn't
--- a/cuburn/render.py
+++ b/cuburn/render.py
@ -14,6 +14,7 @@ from fr0stlib import pyflam3
 from fr0stlib.pyflam3._flam3 import *
 from fr0stlib.pyflam3.constants import *
 import pycuda.autoinit
 import pycuda.compiler
 import pycuda.driver as cuda
 import pycuda.tools
@ -153,7 +154,15 @@ class Renderer(object):
                np.concatenate(map(info.db.palettes.get, pals[1::2])))
        d_palmem = cuda.mem_alloc(256 * info.palette_height * 4)
-        seeds = mwc.MWC.make_seeds(self._iter.NTHREADS * cps_per_block)
+        # The '+1' avoids more situations where the 'smid' value is larger
        # than the number of enabled SMs on a chip, which is warned against in
        # the docs but not seen in the wild. Things could get nastier on
        # subsequent silicon, but I doubt they'd ever kill more than 1 SM
        nslots = pycuda.autoinit.device.max_threads_per_multiprocessor * \
                (pycuda.autoinit.device.multiprocessor_count + 1)
        d_points = cuda.mem_alloc(nslots * 16)
        seeds = mwc.MWC.make_seeds(nslots)
        d_seeds = cuda.to_device(seeds)
        h_out = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4),
@ -200,6 +209,10 @@ class Renderer(object):
                       block=(256,1,1), grid=(cps_per_block/256,1),
                       stream=iter_stream)
            # TODO: if we only do this once per anim, does quality improve?
            util.BaseCode.fill_dptr(self.mod, d_points, 4 * nslots,
                                    iter_stream, np.float32(np.nan))
            # Get interpolated control points for debugging
            #iter_stream.synchronize()
            #d_temp = cuda.from_device(d_infos,
@ -208,7 +221,8 @@ class Renderer(object):
                #print '%60s %g' % ('_'.join(n), i)
            nsamps = info.density * info.width * info.height / cps_per_block
-            iter_fun(np.uint64(d_accum), d_seeds, d_infos, np.int32(nsamps),
+            iter_fun(np.uint64(d_accum), d_seeds, d_points,
                     d_infos, np.int32(nsamps),
                     block=(32, self._iter.NTHREADS/32, 1),
                     grid=(cps_per_block, 1),
                     texrefs=[tref], stream=iter_stream)