Deferred writeback.

2025-07-12 03:05:14 -04:00 · 2011-11-11 17:37:27 -05:00
parent 05e1d08681
commit eb43b151dc
4 changed files with 360 additions and 206 deletions
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@ -122,7 +122,6 @@ class IterCode(HunkOCode):
        bodies = [self._xfbody(i,x) for i,x in sorted(info.genome.xforms.items())]
        bodies.append(iterbody)
        self.defs = '\n'.join(bodies)
-        self.decls += self.pix_helpers.substitute(info=info)

    decls = """
 // Note: for normalized lookups, uchar4 actually returns floats
@ -132,78 +131,6 @@ __device__ int rb_head, rb_tail, rb_size;

 """

-    pix_helpers = Template("""
-__device__
-void read_pix(float4 &pix, float &den) {
-    den = pix.w;
-    {{if info.pal_has_alpha}}
-    read_half(pix.z, pix.w, pix.z, den);
-    {{endif}}
-}
-
-__device__
-void write_pix(float4 &pix, float den) {
-    {{if info.pal_has_alpha}}
-    write_half(pix.z, pix.z, pix.w, den);
-    {{endif}}
-    pix.w = den;
-}
-
-__device__
-void update_pix(uint64_t ptr, uint32_t i, float4 c) {
-    {{if info.pal_has_alpha}}
-    asm volatile ({{crep('''
-    {
-        .reg .u16       sz, sw;
-        .reg .u64       base, off;
-        .reg .f32       x, y, z, w, den, rc, tz, tw;
-
-        // TODO: this limits the accumulation buffer to <4GB
-        shl.b32         %0,     %0,     4;
-        cvt.u64.u32     off,    %0;
-        add.u64         base,   %1,     off;
-        ld.cg.v4.f32    {x, y, z, den},         [base];
-        add.f32         x,      x,      %2;
-        add.f32         y,      y,      %3;
-        mov.b32         {sz, sw},       z;
-        cvt.rn.f32.u16  tz,     sz;
-        cvt.rn.f32.u16  tw,     sw;
-        mul.f32         tz,     tz,     den;
-        mul.f32         tw,     tz,     den;
-        fma.f32         tz,     %4,     65535.0,    tz;
-        fma.f32         tw,     %5,     65535.0,    tw;
-        add.f32         den,    1.0;
-        rcp.approx.f32  rc,     den;
-        mul.f32         tz,     tz,     rc;
-        mul.f32         tw,     tw,     rc;
-        cvt.rni.u16.f32 sz,     tz;
-        cvt.rni.u16.f32 sw,     tw;
-        mov.b32         z,      {sz, sw};
-        st.cs.v4.f32    [base], {x, y, z, den};
-    }
-    ''')}} : "+r"(i) : "l"(ptr), "f"(c.x), "f"(c.y), "f"(c.z), "f"(c.w));
-    {{else}}
-    asm volatile ({{crep('''
-    {
-        .reg .u64       base, off;
-        .reg .f32       x, y, z, den;
-
-        // TODO: this limits the accumulation buffer to <4GB
-        shl.b32         %0,     %0,     4;
-        cvt.u64.u32     off,    %0;
-        add.u64         base,   %1,     off;
-        ld.cg.v4.f32    {x, y, z, den},         [base];
-        add.f32         x,      x,      %2;
-        add.f32         y,      y,      %3;
-        add.f32         z,      z,      %4;
-        add.f32         den,    den,    1.0;
-        st.cs.v4.f32    [base], {x, y, z, den};
-    }
-    ''')}} : "+r"(i) : "l"(ptr), "f"(c.x), "f"(c.y), "f"(c.z));
-    {{endif}}
-}
-""")
-
    def _xfbody(self, xfid, xform):
        px = self.pcp.xforms[xfid]
        tmpl = Template(r"""
@ -249,19 +176,23 @@ __global__ void reset_rb(int size) {
 }

 __global__
-void iter(uint64_t accbuf_ptr, mwc_st *msts, float4 *points,
-          const iter_params *all_params, int nsamps_to_generate) {
+void iter(
+        uint64_t out_ptr,
+        mwc_st *msts,
+        float4 *points,
+        const iter_params *all_params,
+        int nsamps_to_generate
+) {
    const iter_params *global_params = &(all_params[blockIdx.x]);

-    __shared__ int nsamps;
-    nsamps = nsamps_to_generate;
-
+{{if info.acc_mode != 'deferred'}}
    __shared__ float time_frac;
    time_frac = blockIdx.x / (float) gridDim.x;
+{{endif}}

    // load params to shared memory cooperatively
    for (int i = threadIdx.y * blockDim.x + threadIdx.x;
-         i * 4 < sizeof(iter_params); i += blockDim.x * blockDim.y)
+         i < (sizeof(iter_params) / 4); i += blockDim.x * blockDim.y)
        reinterpret_cast<float*>(&params)[i] =
            reinterpret_cast<const float*>(global_params)[i];

@ -272,9 +203,10 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, float4 *points,
    __syncthreads();
    int this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;
    mwc_st rctx = msts[this_rb_idx];
+
+    // TODO: 4th channel unused. Kill or use for something helpful
    float4 old_point = points[this_rb_idx];
-    float x = old_point.x, y = old_point.y,
-          color = old_point.z, fuse_rounds = old_point.w;
+    float x = old_point.x, y = old_point.y, color = old_point.z;

 {{if info.chaos_used}}
    int last_xf_used = 0;
@ -290,116 +222,139 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, float4 *points,
    __syncthreads();
 {{endif}}

+    bool fuse = false;

-    while (1) {
-        // This condition checks for large numbers, Infs, and NaNs.
-        if (!(-(fabsf(x) + fabsf(y) > -1.0e6f))) {
-            x = mwc_next_11(rctx);
-            y = mwc_next_11(rctx);
-            color = mwc_next_01(rctx);
-            fuse_rounds = {{info.fuse / 32}};
-        }
+    // This condition checks for large numbers, Infs, and NaNs.
+    if (!(-(fabsf(x) + fabsf(y)) > -1.0e6f)) {
+        x = mwc_next_11(rctx);
+        y = mwc_next_11(rctx);
+        color = mwc_next_01(rctx);
+        fuse = true;
+    }

-        // 32 rounds is somewhat arbitrary, but it has a pleasing 32-ness
-        for (int i = 0; i < 32; i++) {
+    // TODO: link up with FUSE, etc
+    for (int round = 0; round < 256; round++) {

 {{if info.chaos_used}}

-            {{precalc_chaos(pcp, std_xforms)}}
+        {{precalc_chaos(pcp, std_xforms)}}

-            // For now, we don't attempt to use the swap buffer when chaos is used
-            float xfsel = mwc_next_01(rctx);
+        // For now, we don't attempt to use the swap buffer when chaos is used
+        float xfsel = mwc_next_01(rctx);

-            {{for prior_xform_idx, prior_xform_name in enumerate(std_xforms)}}
-            if (last_xf_used == {{prior_xform_idx}}) {
-                {{for xform_idx, xform_name in enumerate(std_xforms[:-1])}}
-                if (xfsel <= {{pcp['chaos_'+prior_xform_name+'_'+xform_name]}}) {
-                    apply_xf_{{xform_name}}(x, y, color, rctx);
-                    last_xf_used = {{xform_idx}};
-                } else
-                {{endfor}}
-                {
-                    apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);
-                    last_xf_used = {{len(std_xforms)-1}};
-                }
+        {{for prior_xform_idx, prior_xform_name in enumerate(std_xforms)}}
+        if (last_xf_used == {{prior_xform_idx}}) {
+            {{for xform_idx, xform_name in enumerate(std_xforms[:-1])}}
+            if (xfsel <= {{pcp['chaos_'+prior_xform_name+'_'+xform_name]}}) {
+                apply_xf_{{xform_name}}(x, y, color, rctx);
+                last_xf_used = {{xform_idx}};
            } else
            {{endfor}}
            {
-                printf("Something went *very* wrong.\n");
-                asm("trap;");
-            }
-
-{{else}}
-            {{precalc_densities(pcp, std_xforms)}}
-            float xfsel = cosel[threadIdx.y];
-
-            {{for xform_name in std_xforms[:-1]}}
-            if (xfsel <= {{pcp['den_'+xform_name]}}) {
-                apply_xf_{{xform_name}}(x, y, color, rctx);
-            } else
-            {{endfor}}
                apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);
-
-            int sw = (threadIdx.y * 32 + threadIdx.x * 33) & {{NTHREADS-1}};
-            int sr = threadIdx.y * 32 + threadIdx.x;
-
-            swap[sw] = fuse_rounds;
-            swap[sw+{{NTHREADS}}] = x;
-            swap[sw+{{2*NTHREADS}}] = y;
-            swap[sw+{{3*NTHREADS}}] = color;
-            __syncthreads();
-
-            // We select the next xforms here, since we've just synced.
-            if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
-                cosel[threadIdx.x] = mwc_next_01(rctx);
-
-            fuse_rounds = swap[sr];
-            x = swap[sr+{{NTHREADS}}];
-            y = swap[sr+{{2*NTHREADS}}];
-            color = swap[sr+{{3*NTHREADS}}];
-
-{{endif}}
-
-            if (fuse_rounds > 0.0f) continue;
-
-{{if 'final' in cp.xforms}}
-            float fx = x, fy = y, fcolor = color;
-            apply_xf_final(fx, fy, fcolor, rctx);
-{{endif}}
-
-            float cx, cy, cc;
-
-            {{precalc_camera(info, pcp.camera)}}
-
-{{if 'final' in cp.xforms}}
-            {{apply_affine('fx', 'fy', 'cx', 'cy', pcp.camera)}}
-            cc = fcolor;
-{{else}}
-            {{apply_affine('x', 'y', 'cx', 'cy', pcp.camera)}}
-            cc = color;
-{{endif}}
-
-            uint32_t ix = trunca(cx), iy = trunca(cy);
-
-            if (ix >= {{info.acc_width}} || iy >= {{info.acc_height}})
-                continue;
-
-            uint32_t i = iy * {{info.acc_stride}} + ix;
-
-            float4 outcol = tex2D(palTex, cc, time_frac);
-            update_pix(accbuf_ptr, i, outcol);
+                last_xf_used = {{len(std_xforms)-1}};
+            }
+        } else
+        {{endfor}}
+        {
+            printf("Something went *very* wrong.\n");
+            asm("trap;");
        }

-        int num_okay = __popc(__ballot(fuse_rounds == 0.0f));
-        // Some xforms give so many badvals that a thread is almost guaranteed
-        // to hit another badval before the fuse is over, causing the card to
-        // spin forever. To avoid this, we count a fuse round as 1/4 of a
-        // sample below.
-        if (threadIdx.x == 0) atomicSub(&nsamps, 256 + num_okay * 24);
-        fuse_rounds = fmaxf(0.0f, fuse_rounds - 1.0f);
+{{else}}
+        {{precalc_densities(pcp, std_xforms)}}
+        float xfsel = cosel[threadIdx.y];

+        {{for xform_name in std_xforms[:-1]}}
+        if (xfsel <= {{pcp['den_'+xform_name]}}) {
+            apply_xf_{{xform_name}}(x, y, color, rctx);
+        } else
+        {{endfor}}
+            apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);
+
+        int sw = (threadIdx.y * 32 + threadIdx.x * 33) & {{NTHREADS-1}};
+        int sr = threadIdx.y * 32 + threadIdx.x;
+
+        swap[sw] = fuse ? 1.0f : 0.0f;
+        swap[sw+{{NTHREADS}}] = x;
+        swap[sw+{{2*NTHREADS}}] = y;
+        swap[sw+{{3*NTHREADS}}] = color;
        __syncthreads();
-        if (nsamps <= 0) break;
+
+        // We select the next xforms here, since we've just synced.
+        if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
+            cosel[threadIdx.x] = mwc_next_01(rctx);
+
+        fuse = swap[sr];
+        x = swap[sr+{{NTHREADS}}];
+        y = swap[sr+{{2*NTHREADS}}];
+        color = swap[sr+{{3*NTHREADS}}];
+
+{{endif}}
+
+{{if info.acc_mode == 'deferred'}}
+        int tid = threadIdx.y * 32 + threadIdx.x;
+        int offset = 4 * (256 * (256 * blockIdx.x + round) + tid);
+        int *log = reinterpret_cast<int*>(out_ptr + offset);
+{{endif}}
+
+        if (fuse) {
+{{if info.acc_mode == 'deferred'}}
+            *log = 0xffffffff;
+{{endif}}
+            continue;
+        }
+
+{{if 'final' in cp.xforms}}
+        float fx = x, fy = y, fcolor = color;
+        apply_xf_final(fx, fy, fcolor, rctx);
+{{endif}}
+
+        float cx, cy, cc;
+
+        {{precalc_camera(info, pcp.camera)}}
+
+{{if 'final' in cp.xforms}}
+        {{apply_affine('fx', 'fy', 'cx', 'cy', pcp.camera)}}
+        cc = fcolor;
+{{else}}
+        {{apply_affine('x', 'y', 'cx', 'cy', pcp.camera)}}
+        cc = color;
+{{endif}}
+
+        uint32_t ix = trunca(cx), iy = trunca(cy);
+
+        if (ix >= {{info.acc_width}} || iy >= {{info.acc_height}}) {
+{{if info.acc_mode == 'deferred'}}
+            *log = 0xffffffff;
+{{endif}}
+            continue;
+        }
+
+        uint32_t i = iy * {{info.acc_stride}} + ix;
+
+{{if info.acc_mode == 'atomic'}}
+        float4 outcol = tex2D(palTex, cc, time_frac);
+        float *accbuf_f = reinterpret_cast<float*>(out_ptr + (16*i));
+        atomicAdd(accbuf_f,   outcol.x);
+        atomicAdd(accbuf_f+1, outcol.y);
+        atomicAdd(accbuf_f+2, outcol.z);
+        atomicAdd(accbuf_f+3, 1.0f);
+{{elif info.acc_mode == 'global'}}
+        float4 outcol = tex2D(palTex, cc, time_frac);
+        float4 *accbuf = reinterpret_cast<float4*>(out_ptr + (16*i));
+        float4 pix = *accbuf;
+        pix.x += outcol.x;
+        pix.y += outcol.y;
+        pix.z += outcol.z;
+        pix.w += 1.0f;
+        *accbuf = pix;
+{{elif info.acc_mode == 'deferred'}}
+        // 'color' gets the top 9 bits. TODO: add dithering via precalc.
+        uint32_t icolor = cc * 512.0f;
+        asm("bfi.b32    %0, %1, %0, 23, 9;" : "+r"(i) : "r"(icolor));
+        *log = i;
+{{endif}}
    }

    if (threadIdx.x == 0 && threadIdx.y == 0)
@ -407,10 +362,140 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, float4 *points,
    __syncthreads();
    this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;

-    points[this_rb_idx] = make_float4(x, y, color, fuse_rounds);
+    points[this_rb_idx] = make_float4(x, y, color, 0.0f);
    msts[this_rb_idx] = rctx;
    return;
 }
+
+// Block size, shared accumulation bits, shared accumulation width.
+#define BS 1024
+#define SHAB 12
+#define SHAW (1<<SHAB)
+
+// These two accumulators, used in write_shmem, hold {density, red} and
+// {green, blue} values as packed u16 pairs. The fixed size represents 4,096
+// pixels in the accumulator.
+__shared__ uint32_t s_acc_dr[SHAW];
+__shared__ uint32_t s_acc_gb[SHAW];
+
+// Read from the shm accumulators and write to the global ones.
+__device__
+void write_shmem_helper(
+        float4 *acc,
+        const int glo_base,
+        const int idx
+) {
+    float4 pix = acc[glo_base+idx];
+    uint32_t dr = s_acc_dr[idx];
+    pix.x += (dr & 0xffff) / 255.0f;
+    pix.w += dr >> 16;
+    uint32_t gb = s_acc_gb[idx];
+    pix.y += (gb & 0xffff) / 255.0f;
+    pix.z += (gb >> 16) / 255.0f;
+    acc[glo_base+idx] = pix;
+}
+
+// Read the point log, accumulate in shared memory, and write the results.
+// This kernel is to be launched with one block for every 4,096 addresses to
+// be processed, and will handle those addresses.
+//
+// log_bounds is an array mapping radix values to the first index in the log
+// with that radix position. For performance reasons in other parts of the
+// code, the radix may actually include bits within the lower SHAB part of the
+// address, or it might not cover the first few bits after the SHAB part;
+// log_bounds_shift covers that. glob_addr_bits specifies the number of bits
+// above SHAB which are address bits.
+
+__global__ void
+__launch_bounds__(BS, 1)
+write_shmem(
+        float4 *acc,
+        const uint32_t *log,
+        const uint32_t *log_bounds,
+        const int log_bounds_shift
+) {
+    const int tid = threadIdx.x;
+    const int bid = blockIdx.x;
+
+    // TODO: doesn't respect SHAW/BS
+    // TODO: compare generated code with unrolled for-loop
+    s_acc_dr[tid] = 0;
+    s_acc_gb[tid] = 0;
+    s_acc_dr[tid+BS] = 0;
+    s_acc_gb[tid+BS] = 0;
+    s_acc_dr[tid+2*BS] = 0;
+    s_acc_gb[tid+2*BS] = 0;
+    s_acc_dr[tid+3*BS] = 0;
+    s_acc_gb[tid+3*BS] = 0;
+    __syncthreads();
+
+    // TODO: share across threads - discernable performance impact?
+    int lb_idx_lo, lb_idx_hi;
+    if (log_bounds_shift > 0) {
+        lb_idx_hi = ((bid + 1) << log_bounds_shift) - 1;
+        lb_idx_lo = (bid << log_bounds_shift) - 1;
+    } else {
+        lb_idx_hi = bid >> (-log_bounds_shift);
+        lb_idx_lo = lb_idx_hi - 1;
+    }
+
+    int idx_lo, idx_hi;
+    if (lb_idx_lo < 0) idx_lo = 0;
+    else idx_lo = log_bounds[lb_idx_lo] & ~(BS-1);
+    idx_hi = (log_bounds[lb_idx_hi] & ~(BS - 1)) + BS;
+
+    float rnrounds = 1.0f / (idx_hi - idx_lo);
+    float time = tid * rnrounds;
+    float time_step = BS * rnrounds;
+
+    int glo_base = bid << SHAB;
+
+    for (int i = idx_lo + tid; i < idx_hi; i += BS) {
+        int entry = log[i];
+
+
+        // TODO: constant '11' is really just 32 - 9 - SHAB, where 9 is the
+        // number of bits assigned to color. This ignores opacity.
+        bfe_decl(glob_addr, entry, SHAB, 11);
+        if (glob_addr != bid) continue;
+
+        bfe_decl(shr_addr, entry, 0, SHAB);
+        bfe_decl(color, entry, 23, 9);
+
+        float colorf = color / 512.0f;
+        float4 outcol = tex2D(palTex, colorf, time);
+
+        // TODO: change texture sampler to return shorts and avoid this
+        uint32_t r = 255.0f * outcol.x;
+        uint32_t g = 255.0f * outcol.y;
+        uint32_t b = 255.0f * outcol.z;
+
+        uint32_t dr = atomicAdd(s_acc_dr + shr_addr, r + 0x10000);
+        uint32_t gb = atomicAdd(s_acc_gb + shr_addr, g + (b << 16));
+        uint32_t d = dr >> 16;
+
+        // Neat trick: if overflow is about to happen, write the accumulator,
+        // and subtract the last known values from the accumulator again.
+        // Even if the ints end up wrapping around once before the subtraction
+        // can occur, the results after the subtraction will be correct.
+        // (Wrapping twice will mess up the intermediate write, but is pretty
+        // unlikely.)
+        if (d == 250) {
+            atomicSub(s_acc_dr + shr_addr, dr);
+            atomicSub(s_acc_gb + shr_addr, gb);
+            write_shmem_helper(acc, glo_base, shr_addr);
+        }
+        time += time_step;
+    }
+
+    __syncthreads();
+    int idx = tid;
+    for (int i = 0; i < (SHAW / BS); i++) {
+        write_shmem_helper(acc, glo_base, idx);
+        idx += BS;
+    }
+}
+
 ''')
        return tmpl.substitute(
                info = self.info,
--- a/cuburn/code/util.py
+++ b/cuburn/code/util.py
@ -71,6 +71,13 @@ float3 hsv2rgb(float3 hsv);
 #define  M_SQRT2      1.41421353816986f
 #define  M_SQRT1_2    0.70710676908493f

+#define bfe(d, s, o, w) \
+        asm("bfe.u32 %0, %1, %2, %3;" : "=r"(d) : "r"(s), "r"(o), "r"(w))
+
+#define bfe_decl(d, s, o, w) \
+        int d; \
+        bfe(d, s, o, w)
+
 // TODO: use launch parameter preconfig to eliminate unnecessary parts
 __device__
 uint32_t gtid() {