From 12655b8611b4f6038b22952e627d9482f020d13e Mon Sep 17 00:00:00 2001
From: Steven Robertson
Date: Sat, 10 Dec 2011 16:24:49 -0500
Subject: [PATCH] Make DE better

---
 cuburn/code/filtering.py | 97 +++++++++++++++++-----------------------
 cuburn/genome.py         |  2 +-
 helpers/filt_err.py      | 81 +++++++++++++++------------------
 3 files changed, 79 insertions(+), 101 deletions(-)

diff --git a/cuburn/code/filtering.py b/cuburn/code/filtering.py
index 243b26f..fdc1f86 100644
--- a/cuburn/code/filtering.py
+++ b/cuburn/code/filtering.py
@@ -85,9 +85,9 @@ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
 }
 
-#define W 21 // Filter width (regardless of standard deviation chosen)
-#define W2 10 // Half of filter width, rounded down
-#define FW 52 // Width of local result storage (NW+W2+W2)
+#define W 15 // Filter width (regardless of standard deviation chosen)
+#define W2 7 // Half of filter width, rounded down
+#define FW 46 // Width of local result storage (NW+W2+W2)
 #define FW2 (FW*FW)
 
 __shared__ float de_r[FW2], de_g[FW2], de_b[FW2], de_a[FW2];
@@ -116,12 +116,12 @@ void logscale(float4 *pixbuf, float4 *outbuf, float k1, float k2) {
 
 // See helpers/filt_err.py for source of these values.
-#define MIN_SD 0.23299530f
-#define MAX_SD 4.33333333f
+#define MAX_SCALE -0.12f
+#define MIN_SCALE -9.2103404f
 
 __global__
 void density_est(float4 *pixbuf, float4 *outbuf,
-                 float est_sd, float neg_est_curve, float est_min,
+                 float scale_coeff, float est_curve, float edge_clamp,
                  float k1, float k2, int height, int stride) {
     for (int i = threadIdx.x + 32*threadIdx.y; i < FW2; i += 32)
         de_r[i] = de_g[i] = de_b[i] = de_a[i] = 0.0f;
@@ -164,57 +164,46 @@ void density_est(float4 *pixbuf, float4 *outbuf,
             // Base index of destination for writes
             int si = (threadIdx.y + W2) * FW + threadIdx.x + W2;
 
-            // Calculate standard deviation of Gaussian kernel. The base SD is
-            // then scaled in inverse proportion to the density of the point
-            // being scaled.
-            float sd = est_sd * powf(den+1.0f, neg_est_curve);
-            // And for the gradient...
-            float diag_sd = est_sd * powf(diag_mag+1.0f, neg_est_curve);
+            // Calculate scaling coefficient for the Gaussian kernel. This
+            // does not match a normal Gaussian; it just fits with
+            // flam3's implementation.
+            float scale = powf(den, est_curve) * scale_coeff;
 
-            // If the gradient SD is smaller than the minimum SD, we're probably
-            // on a strong edge; blur with a standard deviation around 1px.
-            if (diag_sd < MIN_SD && diag_sd < sd) {
-                sd = 0.3333333f;
+            // If the gradient scale is smaller than the minimum scale, we're
+            // probably on a strong edge; blur slightly.
+            if (diag_mag > den * 2.0f) {
+                scale = max(-9.0f, scale);
                 // Uncomment to see which pixels are being clamped
                 // de_g[si] = 1.0f;
             }
 
-            // Clamp the final standard deviation.
-            sd = fminf(MAX_SD, fmaxf(sd, est_min));
-
             // Below a certain threshold, only one coeffecient would be
             // retained anyway; we hop right to it.
-            if (sd <= MIN_SD) {
+            if (scale <= MIN_SCALE) {
                 de_add(si, 0, 0, in);
             } else {
-                // These polynomials approximates the sum of the filters
-                // with the clamping logic used here. See helpers/filt_err.py.
+                // These polynomials approximate the reciprocal of the sum of
+                // all retained filter coefficients. See helpers/filt_err.py.
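+                // The fit is split at scale == -1.1f: each branch below
+                // evaluates its polynomial in 'scale' by Horner's rule, and
+                // the result multiplies every expf() term so the deposited
+                // energy stays normalized.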
                 float filtsum;
-                if (sd < 0.75f) {
-                    filtsum = -352.25061035f;
-                    filtsum = filtsum * sd + 1117.09680176f;
-                    filtsum = filtsum * sd + -1372.48864746f;
-                    filtsum = filtsum * sd + 779.15478516f;
-                    filtsum = filtsum * sd + -164.04229736f;
-                    filtsum = filtsum * sd + -12.04892635f;
-                    filtsum = filtsum * sd + 9.04126644f;
-                    filtsum = filtsum * sd + 0.10304667f;
+                if (scale < -1.1f) {
+                    filtsum = 5.20066078e-06f;
+                    filtsum = filtsum * scale + 2.14025771e-04f;
+                    filtsum = filtsum * scale + 3.62761668e-03f;
+                    filtsum = filtsum * scale + 3.21970172e-02f;
+                    filtsum = filtsum * scale + 1.54297248e-01f;
+                    filtsum = filtsum * scale + 3.42210710e-01f;
+                    filtsum = filtsum * scale + 3.06015890e-02f;
+                    filtsum = filtsum * scale + 1.33724615e-01f;
                 } else {
-                    filtsum = 0.01162011f;
-                    filtsum = filtsum * sd + -0.21552004f;
-                    filtsum = filtsum * sd + 1.66545594f;
-                    filtsum = filtsum * sd + -7.00809765f;
-                    filtsum = filtsum * sd + 17.55487633f;
-                    filtsum = filtsum * sd + -26.80626106f;
-                    filtsum = filtsum * sd + 30.61903954f;
-                    filtsum = filtsum * sd + -12.00870514f;
-                    filtsum = filtsum * sd + 2.46708894f;
+                    filtsum = -1.23516649e-01f;
+                    filtsum = filtsum * scale + -5.14862895e-01f;
+                    filtsum = filtsum * scale + -8.61198902e-01f;
+                    filtsum = filtsum * scale + -7.41916001e-01f;
+                    filtsum = filtsum * scale + -3.51667106e-01f;
+                    filtsum = filtsum * scale + -9.07439440e-02f;
+                    filtsum = filtsum * scale + -3.30008656e-01f;
+                    filtsum = filtsum * scale + -4.78249392e-04f;
                 }
-                float filtscale = 1.0f / filtsum;
-
-                // The reciprocal SD scaling coeffecient in the Gaussian
-                // exponent: exp(-x^2/(2*sd^2)) = exp2f(x^2*rsd)
-                float rsd = -0.5f * CUDART_L2E_F / (sd * sd);
 
                 for (int jj = 0; jj <= W2; jj++) {
                     float jj2f = jj;
@@ -222,9 +211,8 @@ void density_est(float4 *pixbuf, float4 *outbuf,
                     float iif = 0;
 
                     for (int ii = 0; ii <= jj; ii++) {
-
-                        float coeff = exp2f((jj2f + iif * iif) * rsd)
-                                      * filtscale;
+                        float coeff = expf((jj2f + iif * iif) * scale)
+                                      * filtsum;
                         if (coeff < 0.0001f) break;
                         iif += 1;
@@ -317,15 +305,12 @@ class Filtering(object):
             t = fun(dsrc, ddst, k1, k2, block=(512, 1, 1),
                     grid=(nbins/512, 1), stream=stream)
         else:
-            # flam3_gaussian_filter() uses an implicit standard deviation of
-            # 0.5, but the DE filters scale filter distance by the default
-            # spatial support factor of 1.5, so the effective base SD is
-            # (0.5/1.5)=1/3.
-            est_sd = np.float32(cp.de.radius(t) / 3.)
-            neg_est_curve = np.float32(-cp.de.curve(t))
-            est_min = np.float32(cp.de.minimum(t) / 3.)
+            scale_coeff = np.float32(-(1 + cp.de.radius(t)) ** -2.0)
+            est_curve = np.float32(2 * cp.de.curve(t))
+            # TODO: experiment with this
+            edge_clamp = np.float32(2.0)
             fun = self.mod.get_function("density_est")
-            fun(dsrc, ddst, est_sd, neg_est_curve, est_min, k1, k2,
+            fun(dsrc, ddst, scale_coeff, est_curve, edge_clamp, k1, k2,
                 np.int32(info.acc_height), np.int32(info.acc_stride),
                 block=(32, 32, 1), grid=(info.acc_width/32, 1),
                 stream=stream)
diff --git a/cuburn/genome.py b/cuburn/genome.py
index c8dc60c..8386bf7 100644
--- a/cuburn/genome.py
+++ b/cuburn/genome.py
@@ -116,7 +116,7 @@ class RenderInfo(object):
     # Maximum width of DE and other spatial filters, and thus in turn the
     # amount of padding applied. Note that, for now, this must not be changed!
     # The filtering code makes deep assumptions about this value.
-    gutter = 22
+    gutter = 15
 
     # TODO: for now, we always throw away the alpha channel before writing.
     # All code is in place to not do this, we just need to find a way to expose
diff --git a/helpers/filt_err.py b/helpers/filt_err.py
index d9b9a7d..40bad58 100644
--- a/helpers/filt_err.py
+++ b/helpers/filt_err.py
@@ -1,7 +1,7 @@
 import numpy as np
 
 # The maximum number of coeffecients that will ever be retained on the device
-FWIDTH = 21
+FWIDTH = 15
 
 # The number of points on either side of the center in one dimension
 F2 = int(FWIDTH/2)
@@ -12,92 +12,85 @@ COEFF_EPS = 0.0001
 dists2d = np.fromfunction(lambda i, j: np.hypot(i-F2, j-F2), (FWIDTH, FWIDTH))
 dists = dists2d.flatten()
 
-# A flam3 estimator radius corresponds to a Gaussian filter with a standard
-# deviation of 1/3 the radius. We choose 13 as an arbitrary upper bound for the
-# max filter radius. The filter should reject larger radii.
-MAX_SD = 13 / 3.
-# The minimum estimator radius can be set as low as 0, but below a certain
-# radius only one coeffecient is retained. Since things get unstable near 0,
-# we explicitly set a minimum threshold below which no coeffecients are
-# retained.
-MIN_SD = np.sqrt(-1 / (2 * np.log(COEFF_EPS)))
+# This translates to a cap on DE filter radius of 50. Even this fits very
+# comfortably within the chosen COEFF_EPS.
+MAX_SCALE = -3/25.
 
-# Using two predicated three-term approximations is much more accurate than
-# using a very large number of terms, due to nonlinear behavior at low SD.
-# Everything above this SD uses one approximation; below, another.
-SPLIT_SD = 0.75
+# At or below this value, we'd be directly clamping to one bin anyway
+MIN_SCALE = np.log(0.0001)
 
-# The lower endpoints are undershot by this proportion to reduce error
-UNDERSHOOT = 0.98
+# Everything above this scale uses one approximation; below, another.
+SPLIT_SCALE = -1.1
 
-sds_hi = np.linspace(SPLIT_SD * UNDERSHOOT, MAX_SD, num=1000)
-sds_lo = np.linspace(MIN_SD * UNDERSHOOT, SPLIT_SD, num=1000)
+# The upper endpoints are overshot by this proportion to reduce error
+OVERSHOOT = 1.01
 
-print 'At MIN_SD = %g, these are the coeffs:' % MIN_SD
-print np.exp(dists2d**2 / (-2 * MIN_SD ** 2))
+# No longer 'scale'-related, but we call it that anyway
+scales_hi = np.linspace(SPLIT_SCALE, MAX_SCALE * OVERSHOOT, num=1000)
+scales_lo = np.linspace(MIN_SCALE, SPLIT_SCALE * OVERSHOOT, num=1000)
 
-def eval_sds(sds, name, nterms):
+def eval_scales(scales, name, nterms):
     # Calculate the filter sums at each coordinate
     sums = []
-    for sd in sds:
-        coeffs = np.exp(dists**2 / (-2 * sd ** 2))
+    for scale in scales:
+        coeffs = np.exp(dists**2 * scale)
         # Note that this sum is the sum of all coordinates, though it should
         # actually be the result of the polynomial approximation. We could do
         # a feedback loop to improve accuracy, but I don't think the difference
         # is worth worrying about.
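+        # Keep only the coefficients that contribute more than COEFF_EPS of
+        # the total, and record the reciprocal of their retained sum; this is
+        # the quantity the device-side polynomial has to reproduce.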
         sum = np.sum(coeffs)
-        sums.append(np.sum(filter(lambda v: v / sum > COEFF_EPS, coeffs)))
+        sums.append(1./np.sum(filter(lambda v: v / sum > COEFF_EPS, coeffs)))
 
     print 'Evaluating %s:' % name
-    poly, resid, rank, sing, rcond = np.polyfit(sds, sums, nterms, full=True)
+    poly, resid, rank, sing, rcond = np.polyfit(scales, sums, nterms, full=True)
     print 'Fit for %s:' % name, poly, resid, rank, sing, rcond
     return sums, poly
 
 import matplotlib.pyplot as plt
 
-sums_hi, poly_hi = eval_sds(sds_hi, 'hi', 8)
-sums_lo, poly_lo = eval_sds(sds_lo, 'lo', 7)
+sums_hi, poly_hi = eval_scales(scales_hi, 'hi', 7)
+sums_lo, poly_lo = eval_scales(scales_lo, 'lo', 7)
 
-num_undershoots = len(filter(lambda v: v < SPLIT_SD, sds_hi))
-sds_hi = sds_hi[num_undershoots:]
-sums_hi = sums_hi[num_undershoots:]
+num_overshoots = len(filter(lambda v: v > MAX_SCALE, scales_hi))
+scales_hi = scales_hi[num_overshoots:]
+sums_hi = sums_hi[num_overshoots:]
 
-num_undershoots = len(filter(lambda v: v < MIN_SD, sds_lo))
-sds_lo = sds_lo[num_undershoots:]
-sums_lo = sums_lo[num_undershoots:]
+num_overshoots = len(filter(lambda v: v > SPLIT_SCALE, scales_lo))
+scales_lo = scales_lo[num_overshoots:]
+sums_lo = sums_lo[num_overshoots:]
 
 polyf_hi = np.float32(poly_hi)
-vals_hi = np.polyval(polyf_hi, sds_hi)
+vals_hi = np.polyval(polyf_hi, scales_hi)
 polyf_lo = np.float32(poly_lo)
-vals_lo = np.polyval(polyf_lo, sds_lo)
+vals_lo = np.polyval(polyf_lo, scales_lo)
 
 def print_filt(filts):
-    print ' filtsum = %4.8ff;' % filts[0]
+    print ' filtsum = %4.8ef;' % filts[0]
     for f in filts[1:]:
-        print ' filtsum = filtsum * sd + % 16.8ff;' % f
+        print ' filtsum = filtsum * scale + % 16.8ef;' % f
 
 print '\n\nFor your convenience:'
-print '#define MIN_SD %.8f' % MIN_SD
-print '#define MAX_SD %.8f' % MAX_SD
-print 'if (sd < %g) {' % SPLIT_SD
+print '#define MIN_SCALE %.8gf' % MIN_SCALE
+print '#define MAX_SCALE %.8gf' % MAX_SCALE
+print 'if (scale < %gf) {' % SPLIT_SCALE
 print_filt(polyf_lo)
 print '} else {'
 print_filt(polyf_hi)
 print '}'
 
-sds = np.concatenate([sds_lo, sds_hi])
+scales = np.concatenate([scales_lo, scales_hi])
 sums = np.concatenate([sums_lo, sums_hi])
 vals = np.concatenate([vals_lo, vals_hi])
 
 fig = plt.figure()
 ax = fig.add_subplot(1,1,1)
-ax.plot(sds, sums)
-ax.plot(sds, vals)
+ax.plot(scales, sums)
+ax.plot(scales, vals)
 ax.set_xlabel('stdev')
 ax.set_ylabel('filter sum')
 ax = ax.twinx()
-ax.plot(sds, [abs((s-v)/v) for s, v in zip(sums, vals)])
+ax.plot(scales, [abs((s-v)/v) for s, v in zip(sums, vals)])
 ax.set_ylabel('rel err')
 plt.show()
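
As a quick sanity check on the new parameterization, here is a small standalone Python sketch. It is not part of the patch above: 'kernel_stats' is a hypothetical helper, and the radius and curve values are arbitrary examples standing in for cp.de.radius(t) and cp.de.curve(t). It mirrors the arithmetic the patch introduces, mapping a DE radius/curve pair and a bin density to the exponent 'scale', then counting how many of the 15x15 filter coefficients survive the COEFF_EPS cutoff (using the helper script's retention criterion).

import numpy as np

FWIDTH = 15                    # filter width, as set by this patch
F2 = int(FWIDTH / 2)
COEFF_EPS = 0.0001             # retention threshold used on the device
MIN_SCALE = np.log(COEFF_EPS)  # at or below this, only the centre bin is kept

dists2d = np.fromfunction(lambda i, j: np.hypot(i - F2, j - F2),
                          (FWIDTH, FWIDTH))

def kernel_stats(density, radius=9.0, curve=0.6):
    # Hypothetical helper; 'radius' and 'curve' stand in for cp.de.radius(t)
    # and cp.de.curve(t), with arbitrary example defaults.
    scale_coeff = -(1 + radius) ** -2.0
    est_curve = 2 * curve
    scale = density ** est_curve * scale_coeff
    if scale <= MIN_SCALE:
        return scale, 1
    coeffs = np.exp(dists2d ** 2 * scale)
    retained = coeffs / coeffs.sum() > COEFF_EPS
    return scale, int(retained.sum())

# Denser bins give a more negative scale and therefore a narrower kernel.
for den in (1, 10, 100, 1000):
    scale, n = kernel_stats(den)
    print('den=%5d  scale=%9.4f  retained coeffs=%3d' % (den, scale, n))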