Initial draft of hotspot deferral.
Build an array of one-bit flags, one per pixel (addressed as u32 data). Once at least 64 points have accumulated for a pixel, set its flag; thereafter write only 1/16 of that pixel's points, and multiply the points that do get written by 16. The theory is that after 64 points the color is pretty much locked in, so this lets us crank SPP up to get excellent coverage in dark areas, while the bright areas matter less since they're already fully resolved. Still needs a lot of tuning to reach peak performance, and the trigger threshold may need to be scaled along with the render size. It is also unlikely to scale as well to higher resolutions, because we rely on the L2 cache to make this fast.
parent 6b2b72a3fe
commit bdcaca1f97
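In outline, the scheme looks like the following CUDA sketch. accumulate() and density() are hypothetical stand-ins for the real packed-atomic accumulator machinery; the actual kernel implements all of this in the inline-PTX overflow path shown in the diff below.

    // Hypothetical helpers standing in for cuburn's accumulator machinery:
    __device__ void  accumulate(float4 *accum, uint32_t pix, float4 val, float w);
    __device__ float density(float4 *accum, uint32_t pix);

    __device__ void write_point(uint32_t *hotspots, float4 *accum,
                                uint32_t pix, float4 val, float rnd)
    {
        uint32_t word = pix >> 5;            // 32 one-bit flags per u32
        uint32_t mask = 1u << (pix & 0x1f);
        bool hot = hotspots[word] & mask;

        // Hot pixels: keep only 1/16 of points, but weight them 16x, so
        // the expected contribution is unchanged.
        if (hot && rnd > 0.0625f)
            return;
        accumulate(accum, pix, val, hot ? 16.0f : 1.0f);

        // Once enough points have landed (64 in this draft), mark the
        // pixel hot so later points are deferred.
        if (!hot && density(accum, pix) >= 64.0f)
            atomicOr(&hotspots[word], mask);
    }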
@@ -157,7 +157,7 @@ def iter_xf_body(cp, xfid, px):
 iter_body_code = r'''
 __global__ void
 iter(uint64_t out_ptr, uint64_t atom_ptr,
-     ringbuf *rb, mwc_st *msts, float4 *points,
+     ringbuf *rb, mwc_st *msts, float4 *points, uint32_t *hotspots,
      const iter_params *all_params)
 {
     // load params to shared memory cooperatively
@@ -325,10 +325,27 @@ iter(uint64_t out_ptr, uint64_t atom_ptr,
 // threshold, it zeros that cell in the integer buffer, converts the
 // former contents to floats, and adds them to the float4 buffer.

-.reg .pred p;
-.reg .u32 off, color, hi, lo, d, y, u, v;
-.reg .f32 colorf, yf, uf, vf, df;
-.reg .u64 ptr, val;
+.reg .pred p, h;
+.reg .u32 off, hotmap, color, hi, lo, d, y, u, v;
+.reg .f32 colorf, yf, uf, vf, df, mult;
+.reg .u64 hptr, ptr, val;

+// Calculate the address of the hotspot indicator
+shr.b32 off, %3, 3;
+and.b32 off, off, 0xfffffffc;
+cvt.u64.u32 hptr, off;
+add.u64 hptr, hptr, %7;
+
+// Load the hotspot map, and use it to set the hotflag predicate.
+ld.cg.global.b32 hotmap, [hptr];
+and.b32 off, %3, 0x1f;
+shl.b32 off, 1, off;
+and.b32 hotmap, hotmap, off;
+setp.gt.u32 h, hotmap, 0;
+
+// If the hotflag is set, skip this whole section 15/16 of the time.
+setp.gt.and.f32 p, %8, 0.0625, h;
+@p bra oflow_end;
+
 // TODO: coord dithering better, or pre-supersampled palette?
 fma.rn.ftz.f32 colorf, %0, 255.0, %1;
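For reference, the address arithmetic in the block above is equivalent to the following C, where i stands for the pixel index passed as %3:

    uint32_t byte_off = (i >> 3) & 0xfffffffc;             // == (i >> 5) * 4
    uint32_t word = *(uint32_t *) ((char *) hotspots + byte_off);
    uint32_t mask = 1u << (i & 0x1f);                      // bit within the word
    bool hot = (word & mask) != 0;                         // the 'h' predicate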
@@ -358,6 +375,11 @@ iter(uint64_t out_ptr, uint64_t atom_ptr,
 setp.lo.u32 p, hi, (256 << 23);
 @p bra oflow_end;

+// Set the hotflag.
+and.b32 off, %3, 0x1f;
+shl.b32 off, 1, off;
+red.global.or.b32 [hptr], off;
+
 // Atomically swap the integer cell with 0 and read its current value
 atom.global.exch.b64 val, [ptr], 0;
 mov.b64 {lo, hi}, val;
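The flag is set with red.global.or.b32, the reduction form of the atomic OR: unlike atom, it returns no old value, which is all that's needed here. In CUDA C this is roughly (i again standing in for %3):

    atomicOr(hotspots + (i >> 5), 1u << (i & 0x1f));   // fire-and-forget OR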
@@ -379,9 +401,14 @@ iter(uint64_t out_ptr, uint64_t atom_ptr,
 cvt.rn.f32.u32 uf, u;
 cvt.rn.f32.u32 vf, v;
 cvt.rn.f32.u32 df, d;
-mul.rn.ftz.f32 yf, yf, (1.0/255.0);
-mul.rn.ftz.f32 uf, uf, (1.0/255.0);
-mul.rn.ftz.f32 vf, vf, (1.0/255.0);
+
+// If we already hotspotted, each point is worth 16 times as much.
+selp.f32 mult, 16.0, 1.0, h;
+mul.rn.ftz.f32 df, df, mult;
+mul.rn.ftz.f32 mult, mult, (1.0/255.0);
+mul.rn.ftz.f32 yf, yf, mult;
+mul.rn.ftz.f32 uf, uf, mult;
+mul.rn.ftz.f32 vf, vf, mult;
 shl.b32 off, %3, 4;
 cvt.u64.u32 ptr, off;
 add.u64 ptr, ptr, %6;
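A plain-C restatement of the rescaling above, with hot standing in for the h predicate: the density channel takes the hotspot weight directly, and the usual 1/255 color normalization is then folded into the same multiplier register to save instructions.

    float mult = hot ? 16.0f : 1.0f;   // 16x weight once the pixel is hot
    df *= mult;                        // density gets the weight only
    mult *= 1.0f / 255.0f;             // fold in the color normalization
    yf *= mult;
    uf *= mult;
    vf *= mult;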
@@ -393,7 +420,7 @@ oflow_end:
 }
 """)}} :: "f"(cc), "f"(color_dither), "r"(time), "r"(i),
     "l"(atom_ptr), "f"(cosel[threadIdx.y + {{NWARPS}}]),
-    "l"(out_ptr));
+    "l"(out_ptr), "l"(hotspots), "f"(mwc_next_01(rctx)));
 }

 this_rb_idx = rb_incr(rb->tail, blockDim.x * threadIdx.y + threadIdx.x);
@@ -440,7 +467,8 @@ __global__ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
 {
 .reg .u32 off, hi, lo, d, y, u, v;
 .reg .u64 val, ptr;
-.reg .f32 yf, uf, vf, df, yg, ug, vg, dg;
+.reg .f32 yf, uf, vf, df, yg, ug, vg, dg, mult;
+.reg .pred p;

 // TODO: use explicit movs to handle this
 shl.b32 off, %0, 3;
@@ -451,6 +479,8 @@ __global__ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
 cvt.u64.u32 ptr, off;
 add.u64 ptr, ptr, %2;
 ld.global.v4.f32 {yg,ug,vg,dg}, [ptr];
+setp.gt.f32 p, dg, 0.0;
+selp.f32 mult, 16.0, 1.0, p;
 shr.u32 d, hi, 22;
 bfe.u32 y, hi, 4, 18;
 bfe.u32 u, lo, 18, 14;
@@ -460,9 +490,11 @@ __global__ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
 cvt.rn.f32.u32 uf, u;
 cvt.rn.f32.u32 vf, v;
 cvt.rn.f32.u32 df, d;
-fma.rn.ftz.f32 yg, yf, (1.0/255.0), yg;
-fma.rn.ftz.f32 ug, uf, (1.0/255.0), ug;
-fma.rn.ftz.f32 vg, vf, (1.0/255.0), vg;
+mul.rn.ftz.f32 df, df, mult;
+mul.rn.ftz.f32 mult, mult, (1.0/255.0);
+fma.rn.ftz.f32 yg, yf, mult, yg;
+fma.rn.ftz.f32 ug, uf, mult, ug;
+fma.rn.ftz.f32 vg, vf, mult, vg;

 add.rn.ftz.f32 dg, df, dg;
 st.global.v4.f32 [ptr], {yg,ug,vg,dg};
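A note on the flush_atom changes above: this kernel takes no hotspot map, so it appears to infer hot status from the float buffer itself. A nonzero accumulated density (dg > 0) means the cell has overflowed and flushed at least once, which is also when the hotflag gets set, so the leftover integer contents are treated as deferred points:

    // dg > 0 stands in for the hotflag during the final flush.
    float mult = (dg > 0.0f) ? 16.0f : 1.0f;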
@@ -101,14 +101,14 @@ class Framebuffers(object):
         self.d_points = cuda.mem_alloc(self._len_d_points)

     def _clear(self):
-        self.nbins = self.d_front = self.d_back = self.d_side = None
+        self.nbins = self.d_front = self.d_back = self.d_side = self.d_uchar = None

     def free(self, stream=None):
         if stream is not None:
             stream.synchronize()
         else:
             cuda.Context.synchronize()
-        for p in (self.d_front, self.d_back, self.d_side):
+        for p in (self.d_front, self.d_back, self.d_side, self.d_uchar):
             if p is not None:
                 p.free()
         self._clear()
@@ -128,6 +128,7 @@ class Framebuffers(object):
             self.d_front = cuda.mem_alloc(16 * nbins)
             self.d_back = cuda.mem_alloc(16 * nbins)
             self.d_side = cuda.mem_alloc(16 * nbins)
+            self.d_uchar = cuda.mem_alloc(nbins)
             self.nbins = nbins
         except cuda.MemoryError, e:
             # If a frame that's too large sneaks by the task distributor, we
@@ -308,8 +309,9 @@ class RenderManager(ClsMod):
         fill = lambda b, s, v=i32(0): util.fill_dptr(
                 self.mod, b, s, stream=self.stream_a, value=v)
         fill(self.fb.d_front, 4 * nbins)
-        fill(self.fb.d_side, 2 * nbins)
+        fill(self.fb.d_side, 4 * nbins)
         fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan))
+        fill(self.fb.d_uchar, nbins / 4)

         nts = self.info_a.ntemporal_samples
         nsamps = (gprof.spp(tc) * dim.w * dim.h)
@@ -320,7 +322,8 @@ class RenderManager(ClsMod):
         launch('iter', rdr.mod, self.stream_a, (32, 8, 1), (nts, n),
                self.fb.d_front, self.fb.d_side,
                self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
-               self.info_a.d_params)
+               self.fb.d_uchar, self.info_a.d_params)
+
         # Split the launch into multiple rounds, possibly (slightly) reducing
         # work overlap but avoiding stalls when working on a device with an
         # active X session. TODO: characterize performance impact, autodetect