New badvals mechanism.

2025-12-22 04:48:31 -05:00 · 2011-10-27 12:59:58 -04:00
parent cac9b691a8
commit f3a79b200c
4 changed files with 144 additions and 112 deletions
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@ -242,10 +242,10 @@ void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
    def _iterbody(self):
        tmpl = Template(r'''
 __global__
-void iter(uint64_t accbuf_ptr, mwc_st *msts, iter_params *all_params,
+void iter(uint64_t accbuf_ptr, mwc_st *msts, float4 *points,
-          int nsamps_to_generate) {
+          const iter_params *all_params, int nsamps_to_generate) {
-    mwc_st rctx = msts[gtid()];
+    mwc_st rctx = msts[devtid()];
-    iter_params *global_params = &(all_params[blockIdx.x]);
+    const iter_params *global_params = &(all_params[blockIdx.x]);
    __shared__ int nsamps;
    nsamps = nsamps_to_generate;
@ -257,7 +257,7 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, iter_params *all_params,
    for (int i = threadIdx.y * blockDim.x + threadIdx.x;
         i * 4 < sizeof(iter_params); i += blockDim.x * blockDim.y)
        reinterpret_cast<float*>(&params)[i] =
-            reinterpret_cast<float*>(global_params)[i];
+            reinterpret_cast<const float*>(global_params)[i];
 {{if info.chaos_used}}
    int last_xf_used = 0;
@ -273,14 +273,21 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, iter_params *all_params,
 {{endif}}
    __syncthreads();
-    int consec_bad = -{{info.fuse}};
+    float4 old_point = points[devtid()];
    float x = old_point.x, y = old_point.y,
          color = old_point.z, fuse_rounds = old_point.w;
-    float x, y, color;
+    while (1) {
        // This condition checks for large numbers, Infs, and NaNs.
        if (!(-(fabsf(x) + fabsf(y) > -1.0e6f))) {
            x = mwc_next_11(rctx);
            y = mwc_next_11(rctx);
            color = mwc_next_01(rctx);
            fuse_rounds = {{info.fuse / 32}};
        }
-    while (1) {
+        // 32 rounds is somewhat arbitrary, but it has a pleasing 32-ness
        for (int i = 0; i < 32; i++) {
 {{if info.chaos_used}}
@ -319,37 +326,27 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, iter_params *all_params,
            {{endfor}}
                apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);
        // Swap thread states here so that writeback skipping logic doesn't die
            int sw = (threadIdx.y * 32 + threadIdx.x * 33) & {{NTHREADS-1}};
            int sr = threadIdx.y * 32 + threadIdx.x;
-        swap[sw] = consec_bad;
+            swap[sw] = fuse_rounds;
            swap[sw+{{NTHREADS}}] = x;
            swap[sw+{{2*NTHREADS}}] = y;
            swap[sw+{{3*NTHREADS}}] = color;
            __syncthreads();
        // This is in the middle of the function so that only one sync is
        // required per loop.
        if (nsamps < 0) break;
-        // Similarly, we select the next xforms here.
+            // We select the next xforms here, since we've just synced.
            if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
                cosel[threadIdx.x] = mwc_next_01(rctx);
-        consec_bad = swap[sr];
+            fuse_rounds = swap[sr];
            x = swap[sr+{{NTHREADS}}];
            y = swap[sr+{{2*NTHREADS}}];
            color = swap[sr+{{3*NTHREADS}}];
 {{endif}}
-        if (consec_bad < 0) {
+            if (fuse_rounds > 0.0f) continue;
            consec_bad++;
            continue;
        }
        int remain = __popc(__ballot(1));
        if (threadIdx.x == 0) atomicSub(&nsamps, remain);
 {{if 'final' in cp.xforms}}
            float fx = x, fy = y, fcolor = color;
@ -370,23 +367,24 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, iter_params *all_params,
            uint32_t ix = trunca(cx), iy = trunca(cy);
-        if (ix >= {{info.acc_width}} || iy >= {{info.acc_height}} ) {
+            if (ix >= {{info.acc_width}} || iy >= {{info.acc_height}})
            consec_bad++;
            if (consec_bad > {{info.max_oob}}) {
                x = mwc_next_11(rctx);
                y = mwc_next_11(rctx);
                color = mwc_next_01(rctx);
                consec_bad = -{{info.fuse}};
            }
                continue;
        }
            uint32_t i = iy * {{info.acc_stride}} + ix;
            float4 outcol = tex2D(palTex, cc, time_frac);
            update_pix(accbuf_ptr, i, outcol);
        }
-    msts[gtid()] = rctx;
+
        int num_okay = __popc(__ballot(fuse_rounds == 0.0f));
        if (threadIdx.x == 0) atomicSub(&nsamps, num_okay * 32);
        fuse_rounds = fmaxf(0.0f, fuse_rounds - 1.0f);
        __syncthreads();
        if (nsamps <= 0) break;
    }
    points[devtid()] = make_float4(x, y, color, fuse_rounds);
    msts[devtid()] = rctx;
 }
 ''')
        return tmpl.substitute(
--- a/cuburn/code/util.py
+++ b/cuburn/code/util.py
@ -42,7 +42,7 @@ float3 rgb2hsv(float3 rgb);
 float3 hsv2rgb(float3 hsv);
 """
-    defs = r"""
+    defs = Template(r"""
 #undef M_E
 #undef M_LOG2E
 #undef M_LOG10E
@ -80,6 +80,28 @@ uint32_t gtid() {
                    (blockIdx.x + (gridDim.x * blockIdx.y))));
 }
 /* Returns the ID of this thread on the device. Note that this counter is
 * volatile according to the PTX ISA. It should be used for loading and saving
 * state that must be unique across running threads, not for accessing things
 * in a known order. */
 __device__
 int devtid() {
    int result;
    asm({{crep('''
    {
        .reg .u32   tmp1, tmp2;
        mov.u32     %0,     %smid;
        mov.u32     tmp1,   %nsmid;
        mov.u32     tmp2,   %warpid;
        mad.lo.u32  %0,     %0,     tmp1,   tmp2;
        mov.u32     tmp1,   %nwarpid;
        mov.u32     tmp2,   %laneid;
        mad.lo.u32  %0,     %0,     tmp1,   tmp2;
    }''')}} : "=r"(result) );
    return result;
 }
 __device__
 uint32_t trunca(float f) {
    // truncate as used in address calculations. note the use of a signed
@ -182,7 +204,7 @@ float3 hsv2rgb(float3 hsv) {
    else                    { out.x = val; out.y = min; out.z = mid; }
    return out;
 }
-"""
+""").substitute()
    @staticmethod
    def fill_dptr(mod, dptr, size, stream=None, value=np.uint32(0)):
--- a/cuburn/genome.py
+++ b/cuburn/genome.py
@ -82,11 +82,9 @@ class RenderInfo(object):
    Determine features and constants required to render a particular set of
    genomes. The values of this class are fixed before compilation begins.
    """
-    # Constant parameters which control handling of out-of-frame samples:
+    # Number of iterations to iterate without write after generating a new
-    # Number of iterations to iterate without write after new point
+    # point, including the number of bad
-    fuse = 10
+    fuse = 128
    # Maximum consecutive out-of-bounds points before picking new point
    max_oob = 10
    # Height of the texture pallete which gets uploaded to the GPU (assuming
    # that palette-from-texture is enabled). For most genomes, this doesn't
--- a/cuburn/render.py
+++ b/cuburn/render.py
@ -14,6 +14,7 @@ from fr0stlib import pyflam3
 from fr0stlib.pyflam3._flam3 import *
 from fr0stlib.pyflam3.constants import *
 import pycuda.autoinit
 import pycuda.compiler
 import pycuda.driver as cuda
 import pycuda.tools
@ -153,7 +154,15 @@ class Renderer(object):
                np.concatenate(map(info.db.palettes.get, pals[1::2])))
        d_palmem = cuda.mem_alloc(256 * info.palette_height * 4)
-        seeds = mwc.MWC.make_seeds(self._iter.NTHREADS * cps_per_block)
+        # The '+1' avoids more situations where the 'smid' value is larger
        # than the number of enabled SMs on a chip, which is warned against in
        # the docs but not seen in the wild. Things could get nastier on
        # subsequent silicon, but I doubt they'd ever kill more than 1 SM
        nslots = pycuda.autoinit.device.max_threads_per_multiprocessor * \
                (pycuda.autoinit.device.multiprocessor_count + 1)
        d_points = cuda.mem_alloc(nslots * 16)
        seeds = mwc.MWC.make_seeds(nslots)
        d_seeds = cuda.to_device(seeds)
        h_out = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4),
@ -200,6 +209,10 @@ class Renderer(object):
                       block=(256,1,1), grid=(cps_per_block/256,1),
                       stream=iter_stream)
            # TODO: if we only do this once per anim, does quality improve?
            util.BaseCode.fill_dptr(self.mod, d_points, 4 * nslots,
                                    iter_stream, np.float32(np.nan))
            # Get interpolated control points for debugging
            #iter_stream.synchronize()
            #d_temp = cuda.from_device(d_infos,
@ -208,7 +221,8 @@ class Renderer(object):
                #print '%60s %g' % ('_'.join(n), i)
            nsamps = info.density * info.width * info.height / cps_per_block
-            iter_fun(np.uint64(d_accum), d_seeds, d_infos, np.int32(nsamps),
+            iter_fun(np.uint64(d_accum), d_seeds, d_points,
                     d_infos, np.int32(nsamps),
                     block=(32, self._iter.NTHREADS/32, 1),
                     grid=(cps_per_block, 1),
                     texrefs=[tref], stream=iter_stream)