From f58289af53e89b62cc1ac60b7e87284d3e06b2e2 Mon Sep 17 00:00:00 2001
From: Steven Robertson
Date: Tue, 9 May 2017 21:16:43 -0700
Subject: [PATCH] Hotspot writeback. 10x performance increase.

Create a map assigning two bits to every output bin. During the atomic
flush, compute a threshold for discarding writes altogether that would
keep us under 2% error - discard 1 of every 2 writes if we've already
accumulated 64 writes (hotspot value 1), 7 of 8 if we're above 256
(hotspot value 2), or 31 of 32 at 2048 (hotspot value 3). Pack this
value into a read-only buffer that can often be cached at L2, and for
particularly concentrated flames (which historically choke cuburn), L1.

During writeback, discard writes at the appropriate rate. During the
flush of the integer accumulator to the float accumulator, scale the
integer values by the discard rate.

This works because for most flames, there's not a lot of interesting
stuff in the middle regimes; either stuff is very well defined, in which
case we pretty much know exactly what the color is going to be
(remember, the max 2% relative error gets log-scaled as well), or it's
loosely defined, so we should keep it at full accuracy.

Of course, a 10x boost is best-case-ish - it assumes a long, high-res
render. I realized, though, that I really didn't care about low-quality
renders and should go for broke optimizing for my use case, which is
ridiculously high-res HDR output. (On pathological flames, on the other
hand, 10x is conservative; this easily gives us 100x.)
---
 cuburn/code/iter.py | 225 ++++++++++++++++++++++++++++----------------
 cuburn/render.py    |  78 +++++++++------
 2 files changed, 193 insertions(+), 110 deletions(-)

diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py
index 1df942d..b2ec107 100644
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@@ -157,8 +157,9 @@ def iter_xf_body(cp, xfid, px):
 iter_body_code = r'''
 __global__
 void iter(uint64_t out_ptr, uint64_t atom_ptr,
-          ringbuf *rb, mwc_st *msts, float4 *points, uint32_t *hotspots,
-          const iter_params *all_params)
+          ringbuf *rb, mwc_st *msts, float4 *points,
+          const __restrict__ uint32_t *hotspots,
+          const __restrict__ iter_params *all_params)
 {
     // load params to shared memory cooperatively
     const iter_params *global_params = &(all_params[blockIdx.x]);
@@ -177,12 +178,9 @@ iter(uint64_t out_ptr, uint64_t atom_ptr,
         {{cp.camera.yo}} += ditherwidth * mwc_next_11(rctx);
     }
 
-
-    // TODO: spare the register, reuse at call site?
    int time = blockIdx.x >> 4;
    float color_dither = 0.49f * mwc_next_11(rctx);
 
-    // TODO: 4th channel unused.
Kill or use for something helpful float4 old_point = points[this_rb_idx]; float x = old_point.x, y = old_point.y, color = old_point.z; @@ -315,8 +313,19 @@ iter(uint64_t out_ptr, uint64_t atom_ptr, continue; } - uint32_t i = iy * acc_size.astride + ix; + uint32_t hotspot_i = ( + ((iy >> 4) * acc_size.astride) + + (ix & 0xfffffff0) + (iy & 0xf)); + uint32_t hotspot_rshift = (ix & 0xf) << 1; + uint32_t hotspot_value = (hotspots[hotspot_i] >> hotspot_rshift) & 0x3; + float hotspot_mult = 1.0f; + if (hotspot_value > 0) { + hotspot_mult = ((1 << (hotspot_value << 1)) >> 1); + float prob = __frcp_rn(hotspot_mult); + if (mwc_next_01(rctx) > prob) continue; + } + uint32_t i = iy * acc_size.astride + ix; asm volatile ({{crep(""" { // To prevent overflow, we need to flush each pixel before the density @@ -327,25 +336,8 @@ iter(uint64_t out_ptr, uint64_t atom_ptr, .reg .pred p, h; .reg .u32 off, hotmap, color, hi, lo, d, y, u, v; - .reg .f32 colorf, yf, uf, vf, df, mult; - .reg .u64 hptr, ptr, val; - - // Calculate the address of the hotspot indicator - shr.b32 off, %3, 3; - and.b32 off, off, 0xfffffffc; - cvt.u64.u32 hptr, off; - add.u64 hptr, hptr, %7; - - // Load the hotspot map, and use it to set the hotflag predicate. - ld.cg.global.b32 hotmap, [hptr]; - and.b32 off, %3, 0x1f; - shl.b32 off, 1, off; - and.b32 hotmap, hotmap, off; - setp.gt.u32 h, hotmap, 0; - - // If the hotflag is set, skip this whole section 15/16 of the time. - setp.gt.and.f32 p, %8, 0.0625, h; -@p bra oflow_end; + .reg .f32 colorf, yf, uf, vf, df, mult, prob; + .reg .u64 ptr, val; // TODO: coord dithering better, or pre-supersampled palette? fma.rn.ftz.f32 colorf, %0, 255.0, %1; @@ -375,11 +367,6 @@ iter(uint64_t out_ptr, uint64_t atom_ptr, setp.lo.u32 p, hi, (256 << 23); @p bra oflow_end; - // Set the hotflag. - and.b32 off, %3, 0x1f; - shl.b32 off, 1, off; - red.global.or.b32 [hptr], off; - // Atomically swap the integer cell with 0 and read its current value atom.global.exch.b64 val, [ptr], 0; mov.b64 {lo, hi}, val; @@ -402,10 +389,8 @@ iter(uint64_t out_ptr, uint64_t atom_ptr, cvt.rn.f32.u32 vf, v; cvt.rn.f32.u32 df, d; - // If we already hotspotted, each point is worth 64 times as much. - selp.f32 mult, 16.0, 1.0, h; - mul.rn.ftz.f32 df, df, mult; - mul.rn.ftz.f32 mult, mult, (1.0/255.0); + mul.rn.ftz.f32 df, df, %7; + mul.rn.ftz.f32 mult, %7, (1.0/255.0); mul.rn.ftz.f32 yf, yf, mult; mul.rn.ftz.f32 uf, uf, mult; mul.rn.ftz.f32 vf, vf, mult; @@ -420,7 +405,7 @@ oflow_end: } """)}} :: "f"(cc), "f"(color_dither), "r"(time), "r"(i), "l"(atom_ptr), "f"(cosel[threadIdx.y + {{NWARPS}}]), - "l"(out_ptr), "l"(hotspots), "f"(mwc_next_01(rctx))); + "l"(out_ptr), "f"(hotspot_mult)); } this_rb_idx = rb_incr(rb->tail, blockDim.x * threadIdx.y + threadIdx.x); @@ -428,6 +413,132 @@ oflow_end: msts[this_rb_idx] = rctx; return; } + +__global__ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, + __restrict__ uint32_t *hotspot_ptr, int nbins) { + const int yi = (blockIdx.y * blockDim.y) + threadIdx.y; + const int xi = (blockIdx.x * blockDim.x) + threadIdx.x; + const int gi = yi * acc_size.astride + xi; + + const int hoti = ( + ((yi >> 4) * acc_size.astride) + + (xi & 0xfffffff0) + (yi & 0xf)); + asm volatile ({{crep(""" +{ + .reg .u32 off, hi, lo, d, y, u, v, hotflag, tidx; + .reg .u64 val, ptr; + .reg .f32 yf, uf, vf, df, yg, ug, vg, dg, mult; + .reg .pred p, q; + + // Load the hotflag and compute the scale factor. 
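For reference, the addressing and rate selection used in iter() above, and consumed again by flush_atom below, reduce to the following minimal Python sketch. The helper names are illustrative and not part of the patch; in cuburn this logic lives in the CUDA source and PTX. Each 32-bit word of the hotspot map covers 16 horizontally adjacent bins within a 16-row tile, holding a 2-bit code per bin; a nonzero code both thins the writes and sets the weight the flush applies to compensate.

    import random

    def hotspot_index(ix, iy, astride):
        # Word index and bit offset of the 2-bit hotspot field for bin
        # (ix, iy), mirroring hotspot_i / hotspot_rshift in iter().
        word = ((iy >> 4) * astride) + (ix & ~0xf) + (iy & 0xf)
        shift = (ix & 0xf) << 1          # 16 two-bit fields per 32-bit word
        return word, shift

    def writeback_decision(hotspots, ix, iy, astride, rng=random.random):
        # Returns (keep, mult): whether this sample is written at all, and
        # the factor later applied to surviving writes during flush_atom.
        word, shift = hotspot_index(ix, iy, astride)
        value = (hotspots[word] >> shift) & 0x3
        if value == 0:
            return True, 1.0             # cold bin: keep every write
        mult = (1 << (value << 1)) >> 1  # code 1, 2, 3 -> mult 2, 8, 32
        return rng() < 1.0 / mult, float(mult)

Because surviving writes are later scaled by the same factor, the expected accumulated density is unchanged; the thinning only adds variance in bins that are already dense, which is what the thresholds are chosen to keep small.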
+ shl.b32 off, %1, 2; + cvt.u64.u32 ptr, off; + add.u64 ptr, ptr, %4; + ld.global.b32 hotflag, [ptr]; + + mov.b32 tidx, %tid.x; + shl.b32 tidx, tidx, 1; + shr.b32 hotflag, hotflag, tidx; + and.b32 hotflag, hotflag, 0x3; + + shl.b32 hotflag, hotflag, 1; + shl.b32 hotflag, 1, hotflag; + shr.b32 hotflag, hotflag, 1; + cvt.rn.f32.u32 mult, hotflag; + setp.eq.f32 p, 0.0, mult; +@p mov.f32 mult, 1.0; + + shl.b32 off, %0, 3; + cvt.u64.u32 ptr, off; + add.u64 ptr, ptr, %2; + ld.cs.global.v2.u32 {lo, hi}, [ptr]; + st.cs.global.b64 [ptr], 0; + + shl.b32 off, %0, 4; + cvt.u64.u32 ptr, off; + add.u64 ptr, ptr, %3; + ld.cs.global.v4.f32 {yg,ug,vg,dg}, [ptr]; + shr.u32 d, hi, 22; + bfe.u32 y, hi, 4, 18; + bfe.u32 u, lo, 18, 14; + bfi.b32 u, hi, u, 14, 4; + and.b32 v, lo, ((1<<18)-1); + cvt.rn.f32.u32 yf, y; + cvt.rn.f32.u32 uf, u; + cvt.rn.f32.u32 vf, v; + cvt.rn.f32.u32 df, d; + fma.rn.ftz.f32 dg, df, mult, dg; + mul.rn.ftz.f32 mult, mult, (1.0/255.0); + fma.rn.ftz.f32 yg, yf, mult, yg; + fma.rn.ftz.f32 ug, uf, mult, ug; + fma.rn.ftz.f32 vg, vf, mult, vg; + + st.cs.global.v4.f32 [ptr], {yg,ug,vg,dg}; + + .reg .u32 bal1, bal2, bal3, tidy, mask, tbit, tbit2, tbit3; + .reg .u32 outlo, outhi; + + // Vote on whether this thread's point passes the (for now, predetermined) + // thresholds for each mask step + setp.gt.f32 p, dg, 128.0; + vote.ballot.b32 bal1, p; + setp.gt.f32 p, dg, 512.0; + vote.ballot.b32 bal2, p; + setp.gt.f32 p, dg, 2048.0; + vote.ballot.b32 bal3, p; + + // Set 'q' with true if we're a low bit + mov.b32 tidx, %laneid; + and.b32 mask, tidx, 1; + setp.eq.u32 q, mask, 0; + + // Set 'mask' with the bit that we're considering in the ballots: + // 1 << (laneid / 2) + shr.b32 mask, tidx, 1; + shl.b32 mask, 1, mask; + + // If this is a low bit, it's set if either the hotflag should be 1 or 3 + // (reduces to 1 & (2 = 3), since 2 will never be unset if 3 is set) + // If this is a high bit, it's set if it should be 2 or 3 + // Set lo bit on all threads, then overwrite it on just hi ones + // Vote to get the value we should write + and.b32 tbit, mask, bal1; + setp.gt.u32 p, tbit, 0; + and.b32 tbit, mask, bal2; + and.b32 tbit2, mask, bal3; + setp.and.eq.u32 p, tbit, tbit2, p; +@q setp.gt.u32 p, tbit, 0; + + vote.ballot.b32 outlo, p; + + // Repeat this process for the high half of the warp + shl.b32 mask, mask, 16; + and.b32 tbit, mask, bal1; + setp.gt.u32 p, tbit, 0; + and.b32 tbit, mask, bal2; + and.b32 tbit2, mask, bal3; + setp.and.eq.u32 p, tbit, tbit2, p; +@q setp.gt.u32 p, tbit, 0; + + vote.ballot.b32 outhi, p; + + // Set 'p' with whether we're the low or high bit of this warp + mov.b32 tidy, %tid.y; + and.b32 tidy, tidy, 1; + setp.gt.u32 p, tidy, 0; +@p mov.b32 outlo, outhi; + + shl.b32 off, %1, 2; + cvt.u64.u32 ptr, off; + add.u64 ptr, ptr, %4; + mov.b32 tidx, %tid.x; + // Set 'p' with whether we're thread 0 of tid.x + setp.eq.u32 p, tidx, 0; +@p st.cs.global.b32 [ptr], outlo; +} +""")}} :: "r"(gi), "r"(hoti), "l"(atom_ptr), "l"(out_ptr), "l"(hotspot_ptr), +"r"(xi), "r"(yi)); +} ''' def iter_body(cp): @@ -458,47 +569,3 @@ def mkiterlib(gnm): decls=iter_decls + interp.palintlib.decls, defs='\n'.join(bodies)) return packer, lib - -flushatomlib = devlib(defs=Template(r''' -__global__ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) { - int i = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - if (i >= nbins) return; - asm volatile ({{crep(""" -{ - .reg .u32 off, hi, lo, d, y, u, v; - .reg .u64 val, ptr; - .reg .f32 yf, uf, vf, df, yg, ug, vg, dg, mult; - .reg .pred 
p; - - // TODO: use explicit movs to handle this - shl.b32 off, %0, 3; - cvt.u64.u32 ptr, off; - add.u64 ptr, ptr, %1; - ld.global.v2.u32 {lo, hi}, [ptr]; - shl.b32 off, %0, 4; - cvt.u64.u32 ptr, off; - add.u64 ptr, ptr, %2; - ld.global.v4.f32 {yg,ug,vg,dg}, [ptr]; - setp.gt.f32 p, dg, 0.0; - selp.f32 mult, 16.0, 1.0, p; - shr.u32 d, hi, 22; - bfe.u32 y, hi, 4, 18; - bfe.u32 u, lo, 18, 14; - bfi.b32 u, hi, u, 14, 4; - and.b32 v, lo, ((1<<18)-1); - cvt.rn.f32.u32 yf, y; - cvt.rn.f32.u32 uf, u; - cvt.rn.f32.u32 vf, v; - cvt.rn.f32.u32 df, d; - mul.rn.ftz.f32 df, df, mult; - mul.rn.ftz.f32 mult, mult, (1.0/255.0); - fma.rn.ftz.f32 yg, yf, mult, yg; - fma.rn.ftz.f32 ug, uf, mult, ug; - fma.rn.ftz.f32 vg, vf, mult, vg; - - add.rn.ftz.f32 dg, df, dg; - st.global.v4.f32 [ptr], {yg,ug,vg,dg}; -} - """)}} :: "r"(i), "l"(atom_ptr), "l"(out_ptr)); -} -''', 'flush_atom').substitute()) diff --git a/cuburn/render.py b/cuburn/render.py index 62fb776..eba2ad2 100644 --- a/cuburn/render.py +++ b/cuburn/render.py @@ -43,8 +43,7 @@ class Framebuffers(object): ``d_front`` and ``d_back`` are separate buffers, each large enough to hold four float32 components per pixel (including any gutter pixels added for - alignment or padding). ``d_left`` is another buffer large enough to hold - two float32 components per pixel. + alignment or padding). Every user of this set of buffers may use and overwrite the buffers in any way, as long as the output for the next stage winds up in the front @@ -53,6 +52,10 @@ class Framebuffers(object): exists for the side buffer, you're free to do the same by taking local copies of the references and exchanging them yourself. + ``d_left`` and ``d_right`` and ``d_uleft`` and ``d_uright`` are similar, + but without strict dependencies. Each stage is free to stomp these buffers, + but must be done with them by the next stage. + There's one spot in the stream interleaving where the behavior is different: the ``Output.convert`` call must store its output to the back buffer, which will remain untouched until the dtoh copy of the converted @@ -78,10 +81,10 @@ class Framebuffers(object): """ Given a width and height, return a valid set of dimensions which include at least enough gutter to exceed the minimum, and where - (acc_width % 32) == 0 and (acc_height % 8) == 0. + (acc_width % 32) == 0 and (acc_height % 16) == 0. 
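The accumulation height is rounded to a multiple of 16 because the hotspot map is laid out in 16x16 tiles, one 32-bit word per 16 horizontally adjacent bins, and flush_atom is launched with 16x16 blocks over that grid. A rough host-side Python sketch of what the warp ballots in flush_atom compute when they rebuild the map follows; the function names are illustrative, and the thresholds are the ones compared against in the PTX above.

    def hotspot_code(density):
        # 2-bit code derived from the flushed float density of one bin.
        if density > 2048.0: return 3    # subsequently keep 1 write in 32
        if density > 512.0:  return 2    # keep 1 in 8
        if density > 128.0:  return 1    # keep 1 in 2
        return 0                         # keep everything

    def pack_hotspot_word(densities):
        # Pack the codes of 16 horizontally adjacent bins into one uint32,
        # lowest bin in the low bits, as the vote.ballot sequence does.
        word = 0
        for lane, d in enumerate(densities[:16]):
            word |= hotspot_code(d) << (lane * 2)
        return word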
""" awidth = width + 2 * cls.gutter - aheight = 8 * int(np.ceil((height + 2 * cls.gutter) / 8.)) + aheight = 16 * int(np.ceil((height + 2 * cls.gutter) / 16.)) astride = 32 * int(np.ceil(awidth / 32.)) return Dimensions(width, height, awidth, aheight, astride) @@ -102,14 +105,15 @@ class Framebuffers(object): def _clear(self): self.nbins = self.d_front = self.d_back = None - self.d_left = self.d_right = self.d_uchar = None + self.d_left = self.d_right = self.d_uleft = self.d_uright = None def free(self, stream=None): if stream is not None: stream.synchronize() else: cuda.Context.synchronize() - for p in (self.d_front, self.d_back, self.d_left, self.d_right, self.d_uchar): + for p in (self.d_front, self.d_back, self.d_left, self.d_right, + self.d_uleft, self.d_uright): if p is not None: p.free() self._clear() @@ -126,11 +130,12 @@ class Framebuffers(object): if self.nbins >= nbins: return if self.nbins is not None: self.free() try: - self.d_front = cuda.mem_alloc(16 * nbins) - self.d_back = cuda.mem_alloc(16 * nbins) - self.d_left = cuda.mem_alloc(16 * nbins) - self.d_right = cuda.mem_alloc(16 * nbins) - self.d_uchar = cuda.mem_alloc(2 * nbins) + self.d_front = cuda.mem_alloc(16 * nbins) + self.d_back = cuda.mem_alloc(16 * nbins) + self.d_left = cuda.mem_alloc(16 * nbins) + self.d_right = cuda.mem_alloc(16 * nbins) + self.d_uleft = cuda.mem_alloc(2 * nbins) + self.d_uright = cuda.mem_alloc(2 * nbins) self.nbins = nbins except cuda.MemoryError, e: # If a frame that's too large sneaks by the task distributor, we @@ -160,8 +165,9 @@ class Framebuffers(object): self.d_front, self.d_back = self.d_back, self.d_front def flip_side(self): - """Flip the left and right buffers.""" + """Flip the left and right buffers (float and uchar).""" self.d_left, self.d_right = self.d_right, self.d_left + self.d_uleft, self.d_uright = self.d_uright, self.d_uleft class DevSrc(object): """ @@ -245,7 +251,7 @@ class Renderer(object): self.out = output.get_output_for_profile(gprof) class RenderManager(ClsMod): - lib = devlib(deps=[interp.palintlib, filldptrlib, iter.flushatomlib]) + lib = devlib(deps=[interp.palintlib, filldptrlib]) def __init__(self): super(RenderManager, self).__init__() @@ -316,32 +322,42 @@ class RenderManager(ClsMod): self.mod, b, s, stream=self.stream_a, value=v) fill(self.fb.d_front, 4 * nbins) fill(self.fb.d_left, 4 * nbins) + fill(self.fb.d_right, 4 * nbins) fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan)) - fill(self.fb.d_uchar, nbins / 4) + fill(self.fb.d_uleft, nbins / 2) + fill(self.fb.d_uright, nbins / 2) nts = self.info_a.ntemporal_samples nsamps = (gprof.spp(tc) * dim.w * dim.h) nrounds = int(nsamps / (nts * 256. * 256)) + 1 - def launch_iter(n): - if n == 0: return - launch('iter', rdr.mod, self.stream_a, (32, 8, 1), (nts, n), - self.fb.d_front, self.fb.d_left, - self.fb.d_rb, self.fb.d_seeds, self.fb.d_points, - self.fb.d_uchar, self.info_a.d_params) + # Split the launch into multiple rounds, to prevent a system on older + # GPUs from locking up and to give us a chance to flush some stuff. + hidden_stream = cuda.Stream() + iter_stream_left, iter_stream_right = self.stream_a, hidden_stream + BLOCK_SIZE = 4 - # Split the launch into multiple rounds, possibly (slightly) reducing - # work overlap but avoiding stalls when working on a device with an - # active X session. 
TODO: characterize performance impact, autodetect - BLOCK_SIZE = 16 - for i in range(BLOCK_SIZE-1, nrounds, BLOCK_SIZE): - launch_iter(BLOCK_SIZE) - launch_iter(nrounds%BLOCK_SIZE) + while nrounds: + n = min(nrounds, BLOCK_SIZE) + launch('iter', rdr.mod, iter_stream_left, (32, 8, 1), (nts, n), + self.fb.d_front, self.fb.d_left, + self.fb.d_rb, self.fb.d_seeds, self.fb.d_points, + self.fb.d_uleft, self.info_a.d_params) - nblocks = int(np.ceil(np.sqrt(dim.ah*dim.astride/256.))) - launch('flush_atom', self.mod, self.stream_a, - 256, (nblocks, nblocks), - u64(self.fb.d_front), u64(self.fb.d_left), i32(nbins)) + # Make sure the other stream is done flushing before we start + iter_stream_left.wait_for_event(cuda.Event().record(iter_stream_right)) + + launch('flush_atom', rdr.mod, iter_stream_left, + (16, 16, 1), (dim.astride / 16, dim.ah / 16), + u64(self.fb.d_front), u64(self.fb.d_left), + u64(self.fb.d_uleft), i32(nbins)) + + self.fb.flip_side() + iter_stream_left, iter_stream_right = iter_stream_right, iter_stream_left + nrounds -= n + + # Always wait on all events in the hidden stream before continuing on A + self.stream_a.wait_for_event(cuda.Event().record(hidden_stream)) def queue_frame(self, rdr, gnm, gprof, tc, copy=True): """
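The host-side loop above interleaves iteration and flushing by ping-ponging between the two accumulator pairs and two streams. A plain-Python illustration of the resulting schedule, sketching only the ordering rather than the CUDA calls, with buffer names loosely following the patch:

    def interleave(nrounds, block_size=4):
        # Trace of the ping-pong loop in RenderManager: each batch of iter
        # launches writes one accumulator pair while the previous batch's
        # flush_atom (and hotspot-map rebuild) runs on the other pair.
        side, trace = 'left', []
        while nrounds:
            n = min(nrounds, block_size)
            trace.append('iter x%d -> d_%s / d_u%s' % (n, side, side))
            trace.append('wait for the other stream to finish its flush')
            trace.append('flush_atom(d_%s), rebuilding d_u%s' % (side, side))
            side = 'right' if side == 'left' else 'left'   # flip_side()
            nrounds -= n
        trace.append('stream A waits on the hidden stream before later stages')
        return trace

    # interleave(10) -> batches of 4, 4 and 2 rounds, alternating sides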