From 1398706886afb2dadc8b3711f01191298bdae4f5 Mon Sep 17 00:00:00 2001
From: Steven Robertson <steven@strobe.cc>
Date: Fri, 20 Jan 2012 09:35:12 -0500
Subject: [PATCH] Remove SS from DE, and improve performance.

---
 cuburn/code/filtering.py | 118 +++++++++++++++++++++++----------------
 cuburn/code/iter.py      |   7 +--
 cuburn/render.py         |  25 ++++-----
 3 files changed, 83 insertions(+), 67 deletions(-)

diff --git a/cuburn/code/filtering.py b/cuburn/code/filtering.py
index 32ab149..8f8ad63 100644
--- a/cuburn/code/filtering.py
+++ b/cuburn/code/filtering.py
@@ -147,78 +147,98 @@ void blur_density_v(float4 *pixbuf, const float *scratch,
     *ddst = min(*ddst, den);
 }
 
-#define W 15        // Filter width (regardless of standard deviation chosen)
-#define W2 7        // Half of filter width, rounded down
-#define FW 46       // Width of local result storage (NW+W2+W2)
+#define W 21        // Filter width (regardless of standard deviation chosen)
+#define W2 10       // Half of filter width, rounded down
+#define FW 53       // Width of local result storage (NW+W2+W2)
 #define FW2 (FW*FW)
 
 __global__
 void density_est(float4 *outbuf, const float4 *pixbuf,
                  float scale_coeff, float est_curve, float edge_clamp,
-                 float k1, float k2, int height, int stride, int ss) {
+                 float k1, float k2, int height, int stride) {
     __shared__ float de_r[FW2], de_g[FW2], de_b[FW2], de_a[FW2];
 
+    // The max supported radius is really 7, but the extra units simplify the
+    // logic in the bottlenecked section below.
+    __shared__ int minradius[32];
+
     for (int i = threadIdx.x + 32*threadIdx.y; i < FW2; i += 1024)
         de_r[i] = de_g[i] = de_b[i] = de_a[i] = 0.0f;
     __syncthreads();
 
     for (int imrow = threadIdx.y; imrow < height; imrow += 32) {
-        for (int ssx = 0; ssx < ss; ssx++) {
-            float ssxo = (0.5f + ssx) / ss;
-            for (int ssy = 0; ssy < ss; ssy++) {
-                float ssyo = (0.5f + ssy) / ss;
+        // Prepare the shared voting buffer. Syncing afterward is
+        // almost unnecessary but we do it to be safe
+        if (threadIdx.y == 0)
+            minradius[threadIdx.x] = 0.0f;
+        __syncthreads();
 
-                int col = blockIdx.x * 32 + threadIdx.x;
-                int in_idx = ss * (stride * (imrow * ss + ssy) + col) + ssx;
-                float4 in = pixbuf[in_idx];
-                // TODO: compute maximum filter radius needed across all warps
-                // to limit rounds in advance
-                float den = in.w;
+        int col = blockIdx.x * 32 + threadIdx.x;
+        int in_idx = stride * imrow + col;
+        float4 in = pixbuf[in_idx];
+        float den = in.w;
 
-                float ls = k1 * logf(1.0f + in.w * k2) / in.w;
+        float ls = k1 * logf(1.0f + in.w * k2) / in.w;
 
-                // The synchronization model used now prevents us from
-                // cutting early in the case of a zero point, so we just carry
-                // it along with us here
-                if (den <= 0) ls = 0.0f;
+        // The synchronization model used now prevents us from
+        // cutting early in the case of a zero point, so we just carry
+        // it along with us here
+        if (den <= 0) ls = 0.0f;
 
-                in.x *= ls;
-                in.y *= ls;
-                in.z *= ls;
-                in.w *= ls;
+        in.x *= ls;
+        in.y *= ls;
+        in.z *= ls;
+        in.w *= ls;
 
-                // Base index of destination for writes
-                int si = (threadIdx.y + W2) * FW + threadIdx.x + W2;
+        // Base index of destination for writes
+        int si = (threadIdx.y + W2) * FW + threadIdx.x + W2;
 
-                // Calculate scaling coefficient for the Gaussian kernel. This
-                // does not match with a normal Gaussian; it just fits with
-                // flam3's implementation.
-                float scale = powf(den * ss, est_curve) * scale_coeff;
-                scale = min(scale, 2.0f);
+        // Calculate scaling coefficient for the Gaussian kernel. This
+        // does not match with a normal Gaussian; it just fits with
+        // flam3's implementation.
+        float scale = powf(den, est_curve) * scale_coeff;
 
-                for (int jj = -W2; jj < W2; jj++) {
-                    float jjf = (jj - ssxo) * scale;
-                    float jdiff = erff(jjf + scale) - erff(jjf);
+        // Force a minimum blur radius. This works out to be a
+        // standard deviation of about 0.35px. Also force a maximum,
+        // which limits spherical error to about 2 quanta at 10 bit
+        // precision.
+        scale = max(0.30f, min(scale, 2.0f));
 
-                    for (int ii = -W2; ii < W2; ii++) {
-                        float iif = (ii - ssyo) * scale;
-                        float coeff = 0.25f * jdiff * (erff(iif + scale) - erff(iif));
+        // Determine a minimum radius for this image section.
+        int radius = (int) min(ceilf(2.12132f / scale), 10.0f);
+        if (den <= 0) radius = 0;
+        minradius[radius] = radius;
 
-                        int idx = si + FW * ii + jj;
-                        de_r[idx] += in.x * coeff;
-                        de_g[idx] += in.y * coeff;
-                        de_b[idx] += in.z * coeff;
-                        de_a[idx] += in.w * coeff;
-                        __syncthreads();
-                    }
-                }
+        // Go bottlenecked to compute the maximum radius in this block
+        __syncthreads();
+        if (threadIdx.y == 0) {
+            int blt = __ballot(minradius[threadIdx.x]);
+            minradius[0] = 31 - __clz(blt);
+        }
+        __syncthreads();
+
+        radius = minradius[0];
+
+        for (int jj = -radius; jj <= radius; jj++) {
+            float jjf = (jj - 0.5f) * scale;
+            float jdiff = erff(jjf + scale) - erff(jjf);
+
+            for (int ii = -radius; ii <= radius; ii++) {
+                float iif = (ii - 0.5f) * scale;
+                float coeff = 0.25f * jdiff * (erff(iif + scale) - erff(iif));
+
+                int idx = si + FW * ii + jj;
+                de_r[idx] += in.x * coeff;
+                de_g[idx] += in.y * coeff;
+                de_b[idx] += in.z * coeff;
+                de_a[idx] += in.w * coeff;
+                __syncthreads();
             }
         }
-
         __syncthreads();
         // TODO: could coalesce this, but what a pain
         for (int i = threadIdx.x; i < FW; i += 32) {
-            int out_idx = stride * imrow + blockIdx.x * 32 + i + W2;
+            int out_idx = stride * imrow + blockIdx.x * 32 + i;
             int si = threadIdx.y * FW + i;
             float *out = reinterpret_cast<float*>(&outbuf[out_idx]);
             atomicAdd(out,   de_r[si]);
@@ -288,11 +308,11 @@ class Filtering(object):
             edge_clamp = f32(1.2)
             fun = self.mod.get_function("density_est")
             fun(ddst, dsrc, scale_coeff, est_curve, edge_clamp, k1, k2,
-                i32(dim.ah / dim.ss), i32(dim.astride / dim.ss), i32(dim.ss),
-                block=(32, 32, 1), grid=(dim.aw/(32*dim.ss), 1), stream=stream)
+                i32(dim.ah), i32(dim.astride),
+                block=(32, 32, 1), grid=(dim.aw/32, 1), stream=stream)
 
     def colorclip(self, dbuf, gnm, dim, tc, blend, stream=None):
-        nbins = dim.ah * dim.astride / dim.ss ** 2
+        nbins = dim.ah * dim.astride
 
         # TODO: implement integration over cubic splines?
         gam = f32(1 / gnm.color.gamma(tc))
diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py
index bd74874..5ba9d68 100644
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@@ -68,7 +68,7 @@ def precalc_camera(pcam):
         float rot = {{pre_cam.rotation}} * M_PI / 180.0f;
         float rotsin = sin(rot), rotcos = cos(rot);
         float cenx = {{pre_cam.center.x}}, ceny = {{pre_cam.center.y}};
-        float scale = {{pre_cam.scale}} * acc_size.width * acc_size.ss;
+        float scale = {{pre_cam.scale}} * acc_size.width;
 
         {{pre_cam._set('xx')}} = scale * rotcos;
         {{pre_cam._set('xy')}} = scale * -rotsin;
@@ -126,7 +126,6 @@ typedef struct {
     uint32_t awidth;
     uint32_t aheight;
     uint32_t astride;
-    uint32_t ss;
 } acc_size_t;
 __constant__ acc_size_t acc_size;
 
@@ -204,7 +203,7 @@ void iter(
     mwc_st rctx = msts[this_rb_idx];
 
     {{precalc_camera(pcp.camera)}}
-    if (acc_size.ss == 1 && threadIdx.y == 5 && threadIdx.x == 4) {
+    if (threadIdx.y == 5 && threadIdx.x == 4) {
         float ditherwidth = {{pcp.camera.dither_width}} * 0.33f;
         float u0 = mwc_next_01(rctx);
         float r = ditherwidth * sqrtf(-2.0f * log2f(u0) / M_LOG2E);
@@ -346,7 +345,7 @@ void iter(
 
         uint32_t ix = trunca(cx), iy = trunca(cy);
 
-        if (ix >= acc_size.awidth || iy >= acc_size.aheight) {
+        if (ix >= acc_size.astride || iy >= acc_size.aheight) {
 {{if info.acc_mode == 'deferred'}}
             *log = 0xffffffff;
 {{endif}}
diff --git a/cuburn/render.py b/cuburn/render.py
index 0b42f02..e0ed295 100644
--- a/cuburn/render.py
+++ b/cuburn/render.py
@@ -20,7 +20,7 @@ from cuburn import affine
 from cuburn.code import util, mwc, iter, interp, filtering, sort
 
 RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time')
-Dimensions = namedtuple('Dimensions', 'w h aw ah astride ss')
+Dimensions = namedtuple('Dimensions', 'w h aw ah astride')
 
 def _sync_stream(dst, src):
     dst.wait_for_event(cuda.Event(cuda.event_flags.DISABLE_TIMING).record(src))
@@ -54,7 +54,7 @@ class Renderer(object):
     # Maximum width of DE and other spatial filters, and thus in turn the
     # amount of padding applied. Note that, for now, this must not be changed!
     # The filtering code makes deep assumptions about this value.
-    gutter = 15
+    gutter = 10
 
     # Accumulation mode. Leave it at 'atomic' for now.
     acc_mode = 'atomic'
@@ -138,7 +138,7 @@ class Renderer(object):
         next(r)
         return ifilter(None, imap(r.send, chain(times, [None])))
 
-    def render_gen(self, genome, width, height, ss=1, blend=True):
+    def render_gen(self, genome, width, height, blend=True):
         """
         Render frames. This method is wrapped by the ``render()`` method; see
         its docstring for warnings and details.
@@ -182,25 +182,24 @@ class Renderer(object):
         event_a = cuda.Event().record(filt_stream)
         event_b = None
 
-        owidth = width + 2 * self.gutter
-        oheight = height + 2 * self.gutter
-        ostride = 32 * int(np.ceil(owidth / 32.))
-        awidth, aheight, astride = owidth * ss, oheight * ss, ostride * ss
-        dim = Dimensions(width, height, awidth, aheight, astride, ss)
+        awidth = width + 2 * self.gutter
+        aheight = 32 * int(np.ceil((height + 2 * self.gutter) / 32.))
+        astride = 32 * int(np.ceil(awidth / 32.))
+        dim = Dimensions(width, height, awidth, aheight, astride)
         d_acc_size = self.mod.get_global('acc_size')[0]
         cuda.memcpy_htod_async(d_acc_size, u32(list(dim)), write_stream)
 
-        nbins = awidth * aheight
+        nbins = astride * aheight
         # Extra padding in accum helps with write_shmem overruns
         d_accum = cuda.mem_alloc(16 * nbins + (1<<16))
-        d_out = cuda.mem_alloc(16 * oheight * ostride)
+        d_out = cuda.mem_alloc(16 * aheight * astride)
         if self.acc_mode == 'atomic':
             d_atom = cuda.mem_alloc(8 * nbins)
             flush_fun = self.mod.get_function("flush_atom")
 
         obuf_copy = argset(cuda.Memcpy2D(),
             src_y=self.gutter, src_x_in_bytes=16*self.gutter,
-            src_pitch=16*ostride, dst_pitch=16*width,
+            src_pitch=16*astride, dst_pitch=16*width,
             width_in_bytes=16*width, height=height)
         obuf_copy.set_src_device(d_out)
         h_out_a = cuda.pagelocked_empty((height, width, 4), f32)
@@ -344,10 +343,8 @@ class Renderer(object):
                               block=(512, 1, 1), grid=(nblocks, nblocks),
                               stream=iter_stream)
 
+            util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
             _sync_stream(filt_stream, write_stream)
-            filt.blur_density(d_accum, d_out, dim, stream=filt_stream)
-            util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins / ss ** 2,
-                                    filt_stream)
             filt.de(d_out, d_accum, genome, dim, tc, stream=filt_stream)
             _sync_stream(write_stream, filt_stream)
             filt.colorclip(d_out, genome, dim, tc, blend, stream=filt_stream)