Hotspot writeback. 10x performance increase.

Create a map assigning two bits to every output bin. During the atomic
flush, compute a per-bin threshold for discarding writes altogether that
keeps us under 2% error: discard 1 of every 2 writes once a bin has
accumulated 64 writes (hotspot value 1), 7 of every 8 above 256 (hotspot
value 2), or 31 of every 32 above 2048 (hotspot value 3). Pack this value
into a read-only buffer that can often be cached at L2 - and, for
particularly concentrated flames (which historically choke cuburn), at
L1. During writeback, discard writes at the appropriate rate. During the
flush of the integer accumulator to the float one, scale the integer
accumulators by the discard rate to compensate.
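
A minimal sketch of the scheme in throwaway Python (names made up here;
in cuburn the real logic lives in the CUDA iter and flush kernels):

    def hotspot_value(count):
        # 2-bit hotspot value for a bin, from its accumulated write count
        if count >= 2048: return 3   # keep 1 write in 32
        if count >= 256:  return 2   # keep 1 write in 8
        if count >= 64:   return 1   # keep 1 write in 2
        return 0                     # keep everything

    def keep_write(h, rngbits):
        # writeback: keep a write iff the low (2h - 1) random bits are zero,
        # i.e. with probability 2**(1 - 2h) when h > 0
        mask = (1 << (2 * h - 1)) - 1 if h else 0
        return (rngbits & mask) == 0

    def flush_scale(h):
        # integer->float flush: each surviving write stands in for 2, 8, or
        # 32 writes, cancelling the discard rate in expectation
        return 1 << (2 * h - 1) if h else 1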

This works because for most flames, there's not a lot of interesting
stuff in the middle regimes: either a bin is very well defined, in which
case we pretty much know exactly what its color is going to be (remember,
the max 2% relative error gets log-scaled as well), or it's loosely
defined, in which case it stays under the thresholds and keeps full
accuracy.
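
A toy simulation (nothing cuburn-specific) of why compensated discarding
is sound: keeping each write with probability p and weighting survivors
by 1/p is unbiased, and the relative noise shrinks as the bin gets hotter.

    import numpy as np
    np.random.seed(42)
    k, p = 100000, 1. / 8        # writes landing in a hot bin; keep rate
    kept = np.random.rand(k) < p
    estimate = kept.sum() / p    # the scale applied at flush time
    # relative error ~ sqrt((1 - p) / (p * k)): well under 2% for hot bins
    print(abs(estimate - k) / k)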

Of course, a 10x boost is best-case-ish - it assumes a long, high-res
render. I realized, though, that I really don't care about low-quality
output, and should go for broke optimizing for my actual use case:
ridiculously high-res HDR renders. (On pathological flames, on the other
hand, 10x is conservative; this easily gives us 100x.)
commit f58289af53
parent 0bcde947b5
Author: Steven Robertson
Date:   2017-05-09 21:16:43 -07:00

2 changed files with 193 additions and 110 deletions


@@ -43,8 +43,7 @@ class Framebuffers(object):
     ``d_front`` and ``d_back`` are separate buffers, each large enough to hold
     four float32 components per pixel (including any gutter pixels added for
-    alignment or padding). ``d_left`` is another buffer large enough to hold
-    two float32 components per pixel.
+    alignment or padding).

     Every user of this set of buffers may use and overwrite the buffers in
     any way, as long as the output for the next stage winds up in the front
@@ -53,6 +52,10 @@ class Framebuffers(object):
     exists for the side buffer, you're free to do the same by taking local
     copies of the references and exchanging them yourself.

+    ``d_left``, ``d_right``, ``d_uleft`` and ``d_uright`` are similar, but
+    without strict dependencies: each stage is free to stomp these buffers,
+    but must be finished with them by the time the next stage runs.
+
     There's one spot in the stream interleaving where the behavior is
     different: the ``Output.convert`` call must store its output to the back
     buffer, which will remain untouched until the dtoh copy of the converted
@@ -78,10 +81,10 @@ class Framebuffers(object):
         """
         Given a width and height, return a valid set of dimensions which
         include at least enough gutter to exceed the minimum, and where
-        (acc_width % 32) == 0 and (acc_height % 8) == 0.
+        (acc_width % 32) == 0 and (acc_height % 16) == 0.
         """
         awidth = width + 2 * cls.gutter
-        aheight = 8 * int(np.ceil((height + 2 * cls.gutter) / 8.))
+        aheight = 16 * int(np.ceil((height + 2 * cls.gutter) / 16.))
         astride = 32 * int(np.ceil(awidth / 32.))
         return Dimensions(width, height, awidth, aheight, astride)
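
For concreteness, the rounding applied to a 1080p frame, using a stand-in
gutter of 16 pixels (the actual value comes from Framebuffers.gutter):

    import numpy as np
    gutter = 16                  # stand-in; see Framebuffers.gutter
    width, height = 1920, 1080
    awidth  = width + 2 * gutter                              # 1952
    aheight = 16 * int(np.ceil((height + 2 * gutter) / 16.))  # 1112 -> 1120
    astride = 32 * int(np.ceil(awidth / 32.))                 # 1952, % 32 == 0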
@@ -102,14 +105,15 @@ class Framebuffers(object):
     def _clear(self):
         self.nbins = self.d_front = self.d_back = None
-        self.d_left = self.d_right = self.d_uchar = None
+        self.d_left = self.d_right = self.d_uleft = self.d_uright = None

     def free(self, stream=None):
         if stream is not None:
             stream.synchronize()
         else:
             cuda.Context.synchronize()
-        for p in (self.d_front, self.d_back, self.d_left, self.d_right, self.d_uchar):
+        for p in (self.d_front, self.d_back, self.d_left, self.d_right,
+                  self.d_uleft, self.d_uright):
             if p is not None:
                 p.free()
         self._clear()
@@ -126,11 +130,12 @@ class Framebuffers(object):
         if self.nbins >= nbins: return
         if self.nbins is not None: self.free()
         try:
-            self.d_front = cuda.mem_alloc(16 * nbins)
-            self.d_back = cuda.mem_alloc(16 * nbins)
-            self.d_left = cuda.mem_alloc(16 * nbins)
-            self.d_right = cuda.mem_alloc(16 * nbins)
-            self.d_uchar = cuda.mem_alloc(2 * nbins)
+            self.d_front  = cuda.mem_alloc(16 * nbins)
+            self.d_back   = cuda.mem_alloc(16 * nbins)
+            self.d_left   = cuda.mem_alloc(16 * nbins)
+            self.d_right  = cuda.mem_alloc(16 * nbins)
+            self.d_uleft  = cuda.mem_alloc(2 * nbins)
+            self.d_uright = cuda.mem_alloc(2 * nbins)
             self.nbins = nbins
         except cuda.MemoryError, e:
             # If a frame that's too large sneaks by the task distributor, we
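
Back-of-envelope for the allocations above: the four float buffers cost
16 bytes per bin and the two uchar side buffers 2 bytes each, or 68 bytes
per bin for this set alone (d_points, d_rb and d_seeds come on top). With
the padded 1080p dimensions sketched earlier:

    nbins = 1952 * 1120               # astride * aheight from the example
    print(68 * nbins / (1024. ** 2))  # ~141.8 MiB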
@@ -160,8 +165,9 @@ class Framebuffers(object):
         self.d_front, self.d_back = self.d_back, self.d_front

     def flip_side(self):
-        """Flip the left and right buffers."""
+        """Flip the left and right buffers (float and uchar)."""
         self.d_left, self.d_right = self.d_right, self.d_left
+        self.d_uleft, self.d_uright = self.d_uright, self.d_uleft

 class DevSrc(object):
     """
@@ -245,7 +251,7 @@ class Renderer(object):
         self.out = output.get_output_for_profile(gprof)

 class RenderManager(ClsMod):
-    lib = devlib(deps=[interp.palintlib, filldptrlib, iter.flushatomlib])
+    lib = devlib(deps=[interp.palintlib, filldptrlib])

     def __init__(self):
         super(RenderManager, self).__init__()
@@ -316,32 +322,42 @@ class RenderManager(ClsMod):
                 self.mod, b, s, stream=self.stream_a, value=v)
         fill(self.fb.d_front, 4 * nbins)
         fill(self.fb.d_left, 4 * nbins)
         fill(self.fb.d_right, 4 * nbins)
         fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan))
-        fill(self.fb.d_uchar, nbins / 4)
+        fill(self.fb.d_uleft, nbins / 2)
+        fill(self.fb.d_uright, nbins / 2)

         nts = self.info_a.ntemporal_samples
         nsamps = (gprof.spp(tc) * dim.w * dim.h)
         nrounds = int(nsamps / (nts * 256. * 256)) + 1

-        def launch_iter(n):
-            if n == 0: return
-            launch('iter', rdr.mod, self.stream_a, (32, 8, 1), (nts, n),
-                   self.fb.d_front, self.fb.d_left,
-                   self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
-                   self.fb.d_uchar, self.info_a.d_params)
-
-        # Split the launch into multiple rounds, to prevent a system on older
-        # GPUs from locking up and to give us a chance to flush some stuff.
-        BLOCK_SIZE = 4
-        for i in range(BLOCK_SIZE-1, nrounds, BLOCK_SIZE):
-            launch_iter(BLOCK_SIZE)
-        launch_iter(nrounds%BLOCK_SIZE)
-
-        nblocks = int(np.ceil(np.sqrt(dim.ah*dim.astride/256.)))
-        launch('flush_atom', self.mod, self.stream_a,
-               256, (nblocks, nblocks),
-               u64(self.fb.d_front), u64(self.fb.d_left), i32(nbins))
-        self.fb.flip_side()
+        hidden_stream = cuda.Stream()
+        iter_stream_left, iter_stream_right = self.stream_a, hidden_stream
+
+        # Split the launch into multiple rounds, possibly (slightly) reducing
+        # work overlap but avoiding stalls when working on a device with an
+        # active X session. TODO: characterize performance impact, autodetect
+        BLOCK_SIZE = 16
+        while nrounds:
+            n = min(nrounds, BLOCK_SIZE)
+            launch('iter', rdr.mod, iter_stream_left, (32, 8, 1), (nts, n),
+                   self.fb.d_front, self.fb.d_left,
+                   self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
+                   self.fb.d_uleft, self.info_a.d_params)
+
+            # Make sure the other stream is done flushing before we start
+            iter_stream_left.wait_for_event(cuda.Event().record(iter_stream_right))
+            launch('flush_atom', rdr.mod, iter_stream_left,
+                   (16, 16, 1), (dim.astride / 16, dim.ah / 16),
+                   u64(self.fb.d_front), u64(self.fb.d_left),
+                   u64(self.fb.d_uleft), i32(nbins))
+
+            self.fb.flip_side()
+            iter_stream_left, iter_stream_right = iter_stream_right, iter_stream_left
+            nrounds -= n
+
+        # Always wait on all events in the hidden stream before continuing on A
+        self.stream_a.wait_for_event(cuda.Event().record(hidden_stream))

     def queue_frame(self, rdr, gnm, gprof, tc, copy=True):
         """