From 3d94c256a957e6d529fff4ee1ce79395e752bed9 Mon Sep 17 00:00:00 2001
From: Steven Robertson <steven@strobe.cc>
Date: Thu, 5 May 2011 23:35:54 -0400
Subject: [PATCH] Another non-working checkin

---
 cuburn/code/filter.py | 260 ++++++++++++++++++++++++++----------------
 cuburn/code/iter.py   |  14 +--
 2 files changed, 169 insertions(+), 105 deletions(-)

diff --git a/cuburn/code/filter.py b/cuburn/code/filter.py
index f9802df..f4f102b 100644
--- a/cuburn/code/filter.py
+++ b/cuburn/code/filter.py
@@ -4,32 +4,22 @@ from cuburn.code.util import *
 class ColorClip(HunkOCode):
     defs = """
 __global__
-void logfilt(float4 *pixbuf, float k1, float k2,
-             float gamma, float vibrancy, float highpow) {
+void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow) {
     // TODO: test if over an edge of the framebuffer
     int i = blockDim.x * blockIdx.x + threadIdx.x;
     float4 pix = pixbuf[i];
 
     if (pix.w <= 0) return;
 
-    float ls = k1 * logf(1.0 + pix.w * k2) / pix.w;
-    pix.x *= ls;
-    pix.y *= ls;
-    pix.z *= ls;
-    pix.w *= ls;
-
     float4 opix = pix;
 
     // TODO: linearized bottom range
     float alpha = powf(pix.w, gamma);
-    ls = vibrancy * alpha / pix.w;
+    float ls = vibrancy * alpha / pix.w;
 
     float maxc = fmaxf(pix.x, fmaxf(pix.y, pix.z));
     float newls = 1 / maxc;
 
-    // TODO: detect if highlight power is globally disabled and drop
-    // this branch
-
     if (maxc * ls > 1 && highpow >= 0) {
         // TODO: does CUDA autopromote the int here to a float before GPU?
         float lsratio = powf(newls / ls, highpow);
@@ -65,15 +55,18 @@ void logfilt(float4 *pixbuf, float k1, float k2,
 
 class DensityEst(HunkOCode):
     """
-    NOTE: for now, this *must* be invoked with a block size of (32,16,1), and
-    a grid size of (W/32) for vertical filtering or (H/32) for horizontal.
-
-    It will probably fail for images that are not multiples of 32.
+    NOTE: for now, this *must* be invoked with a block size of (32,32,1), and
+    a grid size of (W/32,1). At least 7 pixel gutters are required, and the
+    stride and height probably need to be multiples of 32.
     """
 
+    # Note, changing this does not yet have any effect, it's just informational
+    MAX_WIDTH=15
+
     def __init__(self, features, cp):
         self.features, self.cp = features, cp
 
+    headers = "#include<math_constants.h>\n"
     @property
     def defs(self):
         return self.defs_tmpl.substitute(features=self.features, cp=self.cp)
@@ -81,109 +74,184 @@ class DensityEst(HunkOCode):
     defs_tmpl = Template("""
 #define W 15        // Filter width (regardless of standard deviation chosen)
 #define W2 7        // Half of filter width, rounded down
-#define NW 16       // Number of warps in each set of points
-#define FW 30       // Width of local result storage per-lane (NW+W2+W2)
-#define BX 32       // The size of a block's X dimension (== 1 warp)
+#define FW 46       // Width of local result storage (NW+W2+W2)
+#define FW2 (FW*FW)
+
+__shared__ float de_r[FW2], de_g[FW2], de_b[FW2], de_a[FW2];
+
+__device__ void de_add(int ibase, int ii, int jj, float4 scaled) {
+    int idx = ibase + FW * ii + jj;
+    atomicAdd(de_r+idx, scaled.x);
+    atomicAdd(de_g+idx, scaled.y);
+    atomicAdd(de_b+idx, scaled.z);
+    atomicAdd(de_a+idx, scaled.w);
+}
 
 __global__
-void density_est(float4 *pixbuf, float *denbuf, int vertical) {
-    __shared__ float r[BX*FW], g[BX*FW], b[BX*FW], a[BX*FW];
+void logscale(float4 *pixbuf, float4 *outbuf, float k1, float k2) {
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    float4 pix = pixbuf[i];
 
-    int ibase;    // The index of the first element within this lane's strip
-    int imax;     // The maximum offset from the first element in the strip
-    int istride;  // Number of indices until next point to filter
+    float ls = fmaxf(0, k1 * logf(1.0 + pix.w * k2) / pix.w);
+    pix.x *= ls;
+    pix.y *= ls;
+    pix.z *= ls;
+    pix.w *= ls;
 
-    if (vertical) {
-        ibase = threadIdx.x + blockIdx.x * BX;
-        imax = {{features.acc_height}};
-        istride = {{features.acc_stride}};
-    } else {
-        ibase = (blockIdx.x * BX + threadIdx.x) * {{features.acc_stride}};
-        imax = {{features.acc_width}};
-        istride = 1;
-    }
+    outbuf[i] = pix;
+}
 
-    for (int i = threadIdx.x + BX*threadIdx.y; i < BX*FW; i += NW * BX)
-        r[i] = g[i] = b[i] = a[i] = 0.0f;
+__global__
+void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf,
+                 float k1, float k2) {
+    for (int i = threadIdx.x + 32*threadIdx.y; i < FW2; i += 32)
+        de_r[i] = de_g[i] = de_b[i] = de_a[i] = 0.0f;
+    __syncthreads();
+
+    for (int imrow = threadIdx.y + W2; imrow < {{features.acc_height}}; imrow += 32)
+    {
+        int idx = {{features.acc_stride}} * imrow +
+                + blockIdx.x * 32 + threadIdx.x + W2;
 
-    for (int i = threadIdx.y; i < imax; i += NW) {
-        int idx = ibase+i*istride;
         float4 in = pixbuf[idx];
         float den = denbuf[idx];
 
-        int j = (threadIdx.y + W2) * 32 + threadIdx.x;
+        if (in.w > 0 && den > 0) {
+            float ls = k1 * 12 * logf(1.0 + in.w * k2) / in.w;
+            in.x *= ls;
+            in.y *= ls;
+            in.z *= ls;
+            in.w *= ls;
 
-        float sd = {{0.35 * cp.estimator}} / powf(den+1.0f, {{cp.estimator_curve}});
-        {{if cp.estimator_minimum > 1}}
-        sd = fmaxf(sd, {{cp.estimator_minimum}});
-        {{endif}}
-        sd *= sd;
+            // Calculate standard deviation of Gaussian kernel.
+            // flam3_gaussian_filter() uses an implicit standard deviation of 0.5,
+            // but the DE filters scale filter distance by the default spatial
+            // support factor of 1.5, so the effective base SD is (0.5/1.5)=1/3.
+            float sd = {{cp.estimator / 3.}};
 
-        // TODO: log scaling here? matches flam3, but, ick
-        // TODO: investigate harm caused by varying standard deviation in a
-        // separable environment
-        float coeff = rsqrtf(2.0f*M_PI*sd*sd);
-        atomicAdd(r+j, in.x * coeff);
-        atomicAdd(g+j, in.y * coeff);
-        atomicAdd(b+j, in.z * coeff);
-        atomicAdd(a+j, in.w * coeff);
-        sd = -0.5/sd;
+            // The base SD is then scaled in inverse proportion to the density of
+            // the point being scaled.
+            sd *= powf(den+1.0f, {{-cp.estimator_curve}});
 
-        // #pragma unroll
-        for (int k = 1; k <= W2; k++) {
-            float scale = exp(sd*k*k)*coeff;
-            idx = j+k*32;
-            atomicAdd(r+idx, in.x * scale);
-            atomicAdd(g+idx, in.y * scale);
-            atomicAdd(b+idx, in.z * scale);
-            atomicAdd(a+idx, in.w * scale);
-            idx = j-k*32;
-            atomicAdd(r+idx, in.x * scale);
-            atomicAdd(g+idx, in.y * scale);
-            atomicAdd(b+idx, in.z * scale);
-            atomicAdd(a+idx, in.w * scale);
+            // Clamp the final standard deviation. Things will go badly if the
+            // minimum is undershot.
+            sd = fmaxf(sd, {{max(cp.estimator_minimum / 3., 0.3)}} );
+
+            // This five-term polynomial approximates the sum of the filters with
+            // the clamping logic used here. See helpers/filt_err.py.
+            float filtsum;
+            filtsum = -0.20885075f  * sd +  0.90557721f;
+            filtsum = filtsum       * sd +  5.28363054f;
+            filtsum = filtsum       * sd + -0.11733533f;
+            filtsum = filtsum       * sd +  0.35670333f;
+            float filtscale = 1 / filtsum;
+
+            // The reciprocal SD scaling coeffecient in the Gaussian exponent.
+            // exp(-x^2/(2*sd^2)) = exp2f(x^2*rsd)
+            float rsd = -0.5f * CUDART_L2E_F / (sd * sd);
+
+            int si = (threadIdx.y + W2) * FW + threadIdx.x + W2;
+            for (int jj = 0; jj <= W2; jj++) {
+                float jj2f = jj;
+                jj2f *= jj2f;
+
+                float iif = 0;
+                for (int ii = 0; ii <= jj; ii++) {
+                    iif += 1;
+
+                    float coeff = exp2f((jj2f + iif * iif) * rsd) * filtscale;
+                    if (coeff < 0.0001f) break;
+
+                    float4 scaled;
+                    scaled.x = in.x * coeff;
+                    scaled.y = in.y * coeff;
+                    scaled.z = in.z * coeff;
+                    scaled.w = in.w * coeff;
+
+                    de_add(si,  ii,  jj, scaled);
+                    if (jj == 0) continue;
+                    de_add(si,  ii, -jj, scaled);
+                    if (ii != 0) {
+                        de_add(si, -ii,  jj, scaled);
+                        de_add(si, -ii, -jj, scaled);
+                        if (ii == jj) continue;
+                    }
+                    de_add(si,  jj,  ii, scaled);
+                    de_add(si, -jj,  ii, scaled);
+
+                    if (ii == 0) continue;
+                    de_add(si, -jj, -ii, scaled);
+                    de_add(si,  jj, -ii, scaled);
+
+                    // TODO: validate that the above avoids bank conflicts
+                }
+            }
         }
 
         __syncthreads();
-        float4 out;
-        j = threadIdx.y * BX + threadIdx.x;
-        out.x = r[j];
-        out.y = g[j];
-        out.z = b[j];
-        out.w = a[j];
-        idx = ibase+(i-W2)*istride;
-        if (idx > 0)
-            pixbuf[idx] = out;
-        __syncthreads();
+        // TODO: could coalesce this, but what a pain
+        for (int i = threadIdx.x; i < FW; i += 32) {
+            idx = {{features.acc_stride}} * imrow + blockIdx.x * 32 + i + W2;
+            int si = threadIdx.y * FW + i;
+            float *out = reinterpret_cast<float*>(&outbuf[idx]);
+            atomicAdd(out,   de_r[si]);
+            atomicAdd(out+1, de_g[si]);
+            atomicAdd(out+2, de_b[si]);
+            atomicAdd(out+3, de_a[si]);
+        }
 
+        if (threadIdx.y == 5000) {
+            for (int i = threadIdx.x; i < FW; i += 32) {
+                idx = {{features.acc_stride}} * (imrow + 32)
+                    + blockIdx.x * 32 + i + W2;
+                int si = 32 * FW + i;
+                float *out = reinterpret_cast<float*>(&outbuf[idx]);
+                atomicAdd(out,   0.2 + de_r[si]);
+                atomicAdd(out+1, de_g[si]);
+                atomicAdd(out+2, de_b[si]);
+                atomicAdd(out+3, de_a[si]);
+            }
+        }
+
+        __syncthreads();
         // TODO: shift instead of copying
-        idx = threadIdx.x + BX * threadIdx.y;
-        if (threadIdx.y < NW-2) {
-            r[idx] = r[idx + BX*NW];
-            g[idx] = g[idx + BX*NW];
-            b[idx] = b[idx + BX*NW];
-            a[idx] = a[idx + BX*NW];
+        int tid = threadIdx.y * 32 + threadIdx.x;
+        for (int i = tid; i < FW*(W2+W2); i += 512) {
+            de_r[i] = de_r[i+FW*32];
+            de_g[i] = de_g[i+FW*32];
+            de_b[i] = de_b[i+FW*32];
+            de_a[i] = de_a[i+FW*32];
         }
-        __syncthreads();
 
-        r[idx + BX*(NW-2)] = 0.0f;
-        g[idx + BX*(NW-2)] = 0.0f;
-        b[idx + BX*(NW-2)] = 0.0f;
-        a[idx + BX*(NW-2)] = 0.0f;
+        __syncthreads();
+        for (int i = tid + FW*(W2+W2); i < FW2; i += 512) {
+            de_r[i] = 0.0f;
+            de_g[i] = 0.0f;
+            de_b[i] = 0.0f;
+            de_a[i] = 0.0f;
+        }
         __syncthreads();
     }
 }
 
 """)
 
-    def invoke(self, mod, abufd, dbufd):
-        fun = mod.get_function("density_est")
-        t = fun(abufd, dbufd, np.int32(0),
-                block=(32, 16, 1), grid=(self.features.acc_height/32,1),
-                time_kernel=True)
-        print "Horizontal density estimation: %g" % t
+    def invoke(self, mod, abufd, obufd, dbufd):
+        # TODO: add no-est version
+        # TODO: come up with a general way to average these parameters
+        k1 = self.cp.brightness * 268 / 256
+        area = self.features.width * self.features.height / self.cp.ppu ** 2
+        k2 = 1 / (area * self.cp.adj_density)
+
+        if self.cp.estimator == 0:
+            fun = mod.get_function("logscale")
+            t = fun(abufd, obufd, np.float32(k1), np.float32(k2),
+                    block=(self.features.acc_width, 1, 1),
+                    grid=(self.features.acc_height, 1), time_kernel=True)
+        else:
+            fun = mod.get_function("density_est")
+            t = fun(abufd, obufd, dbufd, np.float32(k1), np.float32(k2),
+                    block=(32, 32, 1), grid=(self.features.acc_stride/32 - 1, 1),
+                    time_kernel=True)
+            print "Density estimation: %g" % t
 
-        t = fun(abufd, dbufd, np.int32(1),
-                block=(32, 16, 1), grid=(self.features.acc_width/32,1),
-                time_kernel=True)
-        print "Vertical density estimation: %g" % t
diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py
index e5158fd..c8e980c 100644
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@@ -220,19 +220,15 @@ def render(features, cps):
 
     npix = features.width * features.height
 
-    k1 = cp.brightness * 268 / 256
-    area = features.width * features.height / cp.ppu ** 2
-    k2 = 1 / (area * cp.adj_density)
+    obufd = cuda.to_device(abuf)
+    de.invoke(mod, abufd, obufd, dbufd)
 
-    de.invoke(mod, abufd, dbufd)
-
-    fun = mod.get_function("logfilt")
-    t = fun(abufd, f(k1), f(k2),
-        f(1 / cp.gamma), f(cp.vibrancy), f(cp.highlight_power),
+    fun = mod.get_function("colorclip")
+    t = fun(obufd, f(1 / cp.gamma), f(cp.vibrancy), f(cp.highlight_power),
         block=(256,1,1), grid=(npix/256,1), time_kernel=True)
     print "Completed color filtering in %g seconds" % t
 
-    abuf = cuda.from_device_like(abufd, abuf)
+    abuf = cuda.from_device_like(obufd, abuf)
     return abuf, dbuf