Initial draft of hotspot deferral.

Build an array of one-bit flags, one per pixel (addressed as u32 data). Once a pixel has accumulated at least 64 points, set its flag; thereafter write only 1/16 of the points that land on it (and multiply the points that do get written by 16 to compensate). The theory is that after 64 points the color is pretty much locked in; this lets us crank SPP up to get excellent coverage in dark areas, while the bright areas don't matter as much since they're already fully resolved. This still needs a lot of tuning to get peak performance, and the trigger threshold may need to be scaled along with the render size. It will also likely not scale as well to higher resolutions, because we rely on the L2 cache to make this fast.
2025-07-05 15:55:14 -04:00 · 2017-04-24 16:33:39 -07:00
parent 6b2b72a3fe
commit bdcaca1f97
2 changed files with 54 additions and 19 deletions
--- a/cuburn/render.py
+++ b/cuburn/render.py
@ -101,14 +101,14 @@ class Framebuffers(object):
        self.d_points = cuda.mem_alloc(self._len_d_points)

    def _clear(self):
-        self.nbins = self.d_front = self.d_back = self.d_side = None
+        self.nbins = self.d_front = self.d_back = self.d_side = self.d_uchar = None

    def free(self, stream=None):
        if stream is not None:
            stream.synchronize()
        else:
            cuda.Context.synchronize()
-        for p in (self.d_front, self.d_back, self.d_side):
+        for p in (self.d_front, self.d_back, self.d_side, self.d_uchar):
            if p is not None:
                p.free()
        self._clear()
@ -128,6 +128,7 @@ class Framebuffers(object):
            self.d_front = cuda.mem_alloc(16 * nbins)
            self.d_back  = cuda.mem_alloc(16 * nbins)
            self.d_side  = cuda.mem_alloc(16 * nbins)
+            self.d_uchar = cuda.mem_alloc(nbins)
            self.nbins = nbins
        except cuda.MemoryError, e:
            # If a frame that's too large sneaks by the task distributor, we
@ -308,8 +309,9 @@ class RenderManager(ClsMod):
        fill = lambda b, s, v=i32(0): util.fill_dptr(
                self.mod, b, s, stream=self.stream_a, value=v)
        fill(self.fb.d_front,  4 * nbins)
-        fill(self.fb.d_side,   2 * nbins)
+        fill(self.fb.d_side,   4 * nbins)
        fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan))
+        fill(self.fb.d_uchar,  nbins / 4)

        nts = self.info_a.ntemporal_samples
        nsamps = (gprof.spp(tc) * dim.w * dim.h)
@ -318,9 +320,10 @@ class RenderManager(ClsMod):
        def launch_iter(n):
            if n == 0: return
            launch('iter', rdr.mod, self.stream_a, (32, 8, 1), (nts, n),
-                    self.fb.d_front, self.fb.d_side,
-                    self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
-                    self.info_a.d_params)
+                   self.fb.d_front, self.fb.d_side,
+                   self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
+                   self.fb.d_uchar, self.info_a.d_params)
+
        # Split the launch into multiple rounds, possibly (slightly) reducing
        # work overlap but avoiding stalls when working on a device with an
        # active X session. TODO: characterize performance impact, autodetect