From 12655b8611b4f6038b22952e627d9482f020d13e Mon Sep 17 00:00:00 2001
From: Steven Robertson
Date: Sat, 10 Dec 2011 16:24:49 -0500
Subject: [PATCH] Make DE better

---
 cuburn/code/filtering.py | 97 +++++++++++++++++-----------------------
 cuburn/genome.py         |  2 +-
 helpers/filt_err.py      | 81 +++++++++++++++------------------
 3 files changed, 79 insertions(+), 101 deletions(-)

diff --git a/cuburn/code/filtering.py b/cuburn/code/filtering.py
index 243b26f..fdc1f86 100644
--- a/cuburn/code/filtering.py
+++ b/cuburn/code/filtering.py
@@ -85,9 +85,9 @@ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
 }
 
-#define W 21 // Filter width (regardless of standard deviation chosen)
-#define W2 10 // Half of filter width, rounded down
-#define FW 52 // Width of local result storage (NW+W2+W2)
+#define W 15 // Filter width (regardless of standard deviation chosen)
+#define W2 7 // Half of filter width, rounded down
+#define FW 46 // Width of local result storage (NW+W2+W2)
 #define FW2 (FW*FW)
 
 __shared__ float de_r[FW2], de_g[FW2], de_b[FW2], de_a[FW2];
@@ -116,12 +116,12 @@ void logscale(float4 *pixbuf, float4 *outbuf, float k1, float k2) {
 
 // See helpers/filt_err.py for source of these values.
-#define MIN_SD 0.23299530f
-#define MAX_SD 4.33333333f
+#define MAX_SCALE -0.12f
+#define MIN_SCALE -9.2103404f
 
 __global__
 void density_est(float4 *pixbuf, float4 *outbuf,
-                 float est_sd, float neg_est_curve, float est_min,
+                 float scale_coeff, float est_curve, float edge_clamp,
                  float k1, float k2, int height, int stride) {
     for (int i = threadIdx.x + 32*threadIdx.y; i < FW2; i += 32)
         de_r[i] = de_g[i] = de_b[i] = de_a[i] = 0.0f;
@@ -164,57 +164,46 @@ void density_est(float4 *pixbuf, float4 *outbuf,
             // Base index of destination for writes
             int si = (threadIdx.y + W2) * FW + threadIdx.x + W2;
 
-            // Calculate standard deviation of Gaussian kernel. The base SD is
-            // then scaled in inverse proportion to the density of the point
-            // being scaled.
-            float sd = est_sd * powf(den+1.0f, neg_est_curve);
-            // And for the gradient...
-            float diag_sd = est_sd * powf(diag_mag+1.0f, neg_est_curve);
+            // Calculate scaling coefficient for the Gaussian kernel. This
+            // does not match a normal Gaussian; it just fits with
+            // flam3's implementation.
+            float scale = powf(den, est_curve) * scale_coeff;
 
-            // If the gradient SD is smaller than the minimum SD, we're probably
-            // on a strong edge; blur with a standard deviation around 1px.
-            if (diag_sd < MIN_SD && diag_sd < sd) {
-                sd = 0.3333333f;
+            // If the gradient scale is smaller than the minimum scale, we're
+            // probably on a strong edge; blur slightly.
+            if (diag_mag > den * 2.0f) {
+                scale = max(-9.0f, scale);
                 // Uncomment to see which pixels are being clamped
                 // de_g[si] = 1.0f;
             }
 
-            // Clamp the final standard deviation.
-            sd = fminf(MAX_SD, fmaxf(sd, est_min));
-
             // Below a certain threshold, only one coeffecient would be
             // retained anyway; we hop right to it.
-            if (sd <= MIN_SD) {
+            if (scale <= MIN_SCALE) {
                 de_add(si, 0, 0, in);
             } else {
-                // These polynomials approximates the sum of the filters
-                // with the clamping logic used here. See helpers/filt_err.py.
+                // These polynomials approximate the reciprocal of the sum of
+                // all retained filter coefficients. See helpers/filt_err.py.
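+                // The fit is split at scale == -1.1f: each branch below
+                // evaluates its polynomial in 'scale' by Horner's rule, and
+                // the result multiplies every expf() term so the deposited
+                // energy stays normalized.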
                 float filtsum;
-                if (sd < 0.75f) {
-                    filtsum = -352.25061035f;
-                    filtsum = filtsum * sd + 1117.09680176f;
-                    filtsum = filtsum * sd + -1372.48864746f;
-                    filtsum = filtsum * sd + 779.15478516f;
-                    filtsum = filtsum * sd + -164.04229736f;
-                    filtsum = filtsum * sd + -12.04892635f;
-                    filtsum = filtsum * sd + 9.04126644f;
-                    filtsum = filtsum * sd + 0.10304667f;
+                if (scale < -1.1f) {
+                    filtsum = 5.20066078e-06f;
+                    filtsum = filtsum * scale + 2.14025771e-04f;
+                    filtsum = filtsum * scale + 3.62761668e-03f;
+                    filtsum = filtsum * scale + 3.21970172e-02f;
+                    filtsum = filtsum * scale + 1.54297248e-01f;
+                    filtsum = filtsum * scale + 3.42210710e-01f;
+                    filtsum = filtsum * scale + 3.06015890e-02f;
+                    filtsum = filtsum * scale + 1.33724615e-01f;
                 } else {
-                    filtsum = 0.01162011f;
-                    filtsum = filtsum * sd + -0.21552004f;
-                    filtsum = filtsum * sd + 1.66545594f;
-                    filtsum = filtsum * sd + -7.00809765f;
-                    filtsum = filtsum * sd + 17.55487633f;
-                    filtsum = filtsum * sd + -26.80626106f;
-                    filtsum = filtsum * sd + 30.61903954f;
-                    filtsum = filtsum * sd + -12.00870514f;
-                    filtsum = filtsum * sd + 2.46708894f;
+                    filtsum = -1.23516649e-01f;
+                    filtsum = filtsum * scale + -5.14862895e-01f;
+                    filtsum = filtsum * scale + -8.61198902e-01f;
+                    filtsum = filtsum * scale + -7.41916001e-01f;
+                    filtsum = filtsum * scale + -3.51667106e-01f;
+                    filtsum = filtsum * scale + -9.07439440e-02f;
+                    filtsum = filtsum * scale + -3.30008656e-01f;
+                    filtsum = filtsum * scale + -4.78249392e-04f;
                 }
-                float filtscale = 1.0f / filtsum;
-
-                // The reciprocal SD scaling coeffecient in the Gaussian
-                // exponent: exp(-x^2/(2*sd^2)) = exp2f(x^2*rsd)
-                float rsd = -0.5f * CUDART_L2E_F / (sd * sd);
 
                 for (int jj = 0; jj <= W2; jj++) {
                     float jj2f = jj;
@@ -222,9 +211,8 @@ void density_est(float4 *pixbuf, float4 *outbuf,
                     float iif = 0;
 
                     for (int ii = 0; ii <= jj; ii++) {
-
-                        float coeff = exp2f((jj2f + iif * iif) * rsd)
-                                      * filtscale;
+                        float coeff = expf((jj2f + iif * iif) * scale)
+                                      * filtsum;
                         if (coeff < 0.0001f) break;
                         iif += 1;
@@ -317,15 +305,12 @@ class Filtering(object):
             t = fun(dsrc, ddst, k1, k2, block=(512, 1, 1),
                     grid=(nbins/512, 1), stream=stream)
         else:
-            # flam3_gaussian_filter() uses an implicit standard deviation of
-            # 0.5, but the DE filters scale filter distance by the default
-            # spatial support factor of 1.5, so the effective base SD is
-            # (0.5/1.5)=1/3.
-            est_sd = np.float32(cp.de.radius(t) / 3.)
-            neg_est_curve = np.float32(-cp.de.curve(t))
-            est_min = np.float32(cp.de.minimum(t) / 3.)
+            scale_coeff = np.float32(-(1 + cp.de.radius(t)) ** -2.0)
+            est_curve = np.float32(2 * cp.de.curve(t))
+            # TODO: experiment with this
+            edge_clamp = np.float32(2.0)
             fun = self.mod.get_function("density_est")
-            fun(dsrc, ddst, est_sd, neg_est_curve, est_min, k1, k2,
+            fun(dsrc, ddst, scale_coeff, est_curve, edge_clamp, k1, k2,
                 np.int32(info.acc_height), np.int32(info.acc_stride),
                 block=(32, 32, 1), grid=(info.acc_width/32, 1),
                 stream=stream)
diff --git a/cuburn/genome.py b/cuburn/genome.py
index c8dc60c..8386bf7 100644
--- a/cuburn/genome.py
+++ b/cuburn/genome.py
@@ -116,7 +116,7 @@ class RenderInfo(object):
     # Maximum width of DE and other spatial filters, and thus in turn the
     # amount of padding applied. Note that, for now, this must not be changed!
     # The filtering code makes deep assumptions about this value.
-    gutter = 22
+    gutter = 15
 
     # TODO: for now, we always throw away the alpha channel before writing.
     # All code is in place to not do this, we just need to find a way to expose
diff --git a/helpers/filt_err.py b/helpers/filt_err.py
index d9b9a7d..40bad58 100644
--- a/helpers/filt_err.py
+++ b/helpers/filt_err.py
@@ -1,7 +1,7 @@
 import numpy as np
 
 # The maximum number of coeffecients that will ever be retained on the device
-FWIDTH = 21
+FWIDTH = 15
 
 # The number of points on either side of the center in one dimension
 F2 = int(FWIDTH/2)
@@ -12,92 +12,85 @@ COEFF_EPS = 0.0001
 dists2d = np.fromfunction(lambda i, j: np.hypot(i-F2, j-F2), (FWIDTH, FWIDTH))
 dists = dists2d.flatten()
 
-# A flam3 estimator radius corresponds to a Gaussian filter with a standard
-# deviation of 1/3 the radius. We choose 13 as an arbitrary upper bound for the
-# max filter radius. The filter should reject larger radii.
-MAX_SD = 13 / 3.
-# The minimum estimator radius can be set as low as 0, but below a certain
-# radius only one coeffecient is retained. Since things get unstable near 0,
-# we explicitly set a minimum threshold below which no coeffecients are
-# retained.
-MIN_SD = np.sqrt(-1 / (2 * np.log(COEFF_EPS)))
+# This translates to a cap on DE filter radius of 50. Even this fits very
+# comfortably within the chosen COEFF_EPS.
+MAX_SCALE = -3/25.
 
-# Using two predicated three-term approximations is much more accurate than
-# using a very large number of terms, due to nonlinear behavior at low SD.
-# Everything above this SD uses one approximation; below, another.
-SPLIT_SD = 0.75
+# At or below this value, we'd be directly clamping to one bin anyway
+MIN_SCALE = np.log(0.0001)
 
-# The lower endpoints are undershot by this proportion to reduce error
-UNDERSHOOT = 0.98
+# Everything above this scale uses one approximation; below, another.
+SPLIT_SCALE = -1.1
 
-sds_hi = np.linspace(SPLIT_SD * UNDERSHOOT, MAX_SD, num=1000)
-sds_lo = np.linspace(MIN_SD * UNDERSHOOT, SPLIT_SD, num=1000)
+# The upper endpoints are overshot by this proportion to reduce error
+OVERSHOOT = 1.01
 
-print 'At MIN_SD = %g, these are the coeffs:' % MIN_SD
-print np.exp(dists2d**2 / (-2 * MIN_SD ** 2))
+# No longer 'scale'-related, but we call it that anyway
+scales_hi = np.linspace(SPLIT_SCALE, MAX_SCALE * OVERSHOOT, num=1000)
+scales_lo = np.linspace(MIN_SCALE, SPLIT_SCALE * OVERSHOOT, num=1000)
 
-def eval_sds(sds, name, nterms):
+def eval_scales(scales, name, nterms):
     # Calculate the filter sums at each coordinate
     sums = []
-    for sd in sds:
-        coeffs = np.exp(dists**2 / (-2 * sd ** 2))
+    for scale in scales:
+        coeffs = np.exp(dists**2 * scale)
         # Note that this sum is the sum of all coordinates, though it should
         # actually be the result of the polynomial approximation. We could do
         # a feedback loop to improve accuracy, but I don't think the difference
         # is worth worrying about.
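+        # Keep only the coefficients that contribute more than COEFF_EPS of
+        # the total, and record the reciprocal of their retained sum; this is
+        # the quantity the device-side polynomial has to reproduce.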
         sum = np.sum(coeffs)
-        sums.append(np.sum(filter(lambda v: v / sum > COEFF_EPS, coeffs)))
+        sums.append(1./np.sum(filter(lambda v: v / sum > COEFF_EPS, coeffs)))
 
     print 'Evaluating %s:' % name
-    poly, resid, rank, sing, rcond = np.polyfit(sds, sums, nterms, full=True)
+    poly, resid, rank, sing, rcond = np.polyfit(scales, sums, nterms, full=True)
     print 'Fit for %s:' % name, poly, resid, rank, sing, rcond
     return sums, poly
 
 import matplotlib.pyplot as plt
 
-sums_hi, poly_hi = eval_sds(sds_hi, 'hi', 8)
-sums_lo, poly_lo = eval_sds(sds_lo, 'lo', 7)
+sums_hi, poly_hi = eval_scales(scales_hi, 'hi', 7)
+sums_lo, poly_lo = eval_scales(scales_lo, 'lo', 7)
 
-num_undershoots = len(filter(lambda v: v < SPLIT_SD, sds_hi))
-sds_hi = sds_hi[num_undershoots:]
-sums_hi = sums_hi[num_undershoots:]
+num_overshoots = len(filter(lambda v: v > MAX_SCALE, scales_hi))
+scales_hi = scales_hi[num_overshoots:]
+sums_hi = sums_hi[num_overshoots:]
 
-num_undershoots = len(filter(lambda v: v < MIN_SD, sds_lo))
-sds_lo = sds_lo[num_undershoots:]
-sums_lo = sums_lo[num_undershoots:]
+num_overshoots = len(filter(lambda v: v > SPLIT_SCALE, scales_lo))
+scales_lo = scales_lo[num_overshoots:]
+sums_lo = sums_lo[num_overshoots:]
 
 polyf_hi = np.float32(poly_hi)
-vals_hi = np.polyval(polyf_hi, sds_hi)
+vals_hi = np.polyval(polyf_hi, scales_hi)
 polyf_lo = np.float32(poly_lo)
-vals_lo = np.polyval(polyf_lo, sds_lo)
+vals_lo = np.polyval(polyf_lo, scales_lo)
 
 def print_filt(filts):
-    print ' filtsum = %4.8ff;' % filts[0]
+    print ' filtsum = %4.8ef;' % filts[0]
     for f in filts[1:]:
-        print ' filtsum = filtsum * sd + % 16.8ff;' % f
+        print ' filtsum = filtsum * scale + % 16.8ef;' % f
 
 print '\n\nFor your convenience:'
-print '#define MIN_SD %.8f' % MIN_SD
-print '#define MAX_SD %.8f' % MAX_SD
-print 'if (sd < %g) {' % SPLIT_SD
+print '#define MIN_SCALE %.8gf' % MIN_SCALE
+print '#define MAX_SCALE %.8gf' % MAX_SCALE
+print 'if (scale < %gf) {' % SPLIT_SCALE
 print_filt(polyf_lo)
 print '} else {'
 print_filt(polyf_hi)
 print '}'
 
-sds = np.concatenate([sds_lo, sds_hi])
+scales = np.concatenate([scales_lo, scales_hi])
 sums = np.concatenate([sums_lo, sums_hi])
 vals = np.concatenate([vals_lo, vals_hi])
 
 fig = plt.figure()
 ax = fig.add_subplot(1,1,1)
-ax.plot(sds, sums)
-ax.plot(sds, vals)
+ax.plot(scales, sums)
+ax.plot(scales, vals)
 ax.set_xlabel('stdev')
 ax.set_ylabel('filter sum')
 ax = ax.twinx()
-ax.plot(sds, [abs((s-v)/v) for s, v in zip(sums, vals)])
+ax.plot(scales, [abs((s-v)/v) for s, v in zip(sums, vals)])
 ax.set_ylabel('rel err')
 plt.show()
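
As a quick sanity check on the new parameterization, here is a small standalone Python sketch. It is not part of the patch above: 'kernel_stats' is a hypothetical helper, and the radius and curve values are arbitrary examples standing in for cp.de.radius(t) and cp.de.curve(t). It mirrors the arithmetic the patch introduces, mapping a DE radius/curve pair and a bin density to the exponent 'scale', then counting how many of the 15x15 filter coefficients survive the COEFF_EPS cutoff (using the helper script's retention criterion).

import numpy as np

FWIDTH = 15                    # filter width, as set by this patch
F2 = int(FWIDTH / 2)
COEFF_EPS = 0.0001             # retention threshold used on the device
MIN_SCALE = np.log(COEFF_EPS)  # at or below this, only the centre bin is kept

dists2d = np.fromfunction(lambda i, j: np.hypot(i - F2, j - F2),
                          (FWIDTH, FWIDTH))

def kernel_stats(density, radius=9.0, curve=0.6):
    # Hypothetical helper; 'radius' and 'curve' stand in for cp.de.radius(t)
    # and cp.de.curve(t), with arbitrary example defaults.
    scale_coeff = -(1 + radius) ** -2.0
    est_curve = 2 * curve
    scale = density ** est_curve * scale_coeff
    if scale <= MIN_SCALE:
        return scale, 1
    coeffs = np.exp(dists2d ** 2 * scale)
    retained = coeffs / coeffs.sum() > COEFF_EPS
    return scale, int(retained.sum())

# Denser bins give a more negative scale and therefore a narrower kernel.
for den in (1, 10, 100, 1000):
    scale, n = kernel_stats(den)
    print('den=%5d  scale=%9.4f  retained coeffs=%3d' % (den, scale, n))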