mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 11:40:04 -05:00
Make DE better
This commit is contained in:
parent
c59829ad86
commit
12655b8611
@ -85,9 +85,9 @@ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#define W 21 // Filter width (regardless of standard deviation chosen)
|
#define W 15 // Filter width (regardless of standard deviation chosen)
|
||||||
#define W2 10 // Half of filter width, rounded down
|
#define W2 7 // Half of filter width, rounded down
|
||||||
#define FW 52 // Width of local result storage (NW+W2+W2)
|
#define FW 46 // Width of local result storage (NW+W2+W2)
|
||||||
#define FW2 (FW*FW)
|
#define FW2 (FW*FW)
|
||||||
|
|
||||||
__shared__ float de_r[FW2], de_g[FW2], de_b[FW2], de_a[FW2];
|
__shared__ float de_r[FW2], de_g[FW2], de_b[FW2], de_a[FW2];
|
||||||
@ -116,12 +116,12 @@ void logscale(float4 *pixbuf, float4 *outbuf, float k1, float k2) {
|
|||||||
|
|
||||||
|
|
||||||
// See helpers/filt_err.py for source of these values.
|
// See helpers/filt_err.py for source of these values.
|
||||||
#define MIN_SD 0.23299530f
|
#define MAX_SCALE -0.12f
|
||||||
#define MAX_SD 4.33333333f
|
#define MIN_SCALE -9.2103404f
|
||||||
|
|
||||||
__global__
|
__global__
|
||||||
void density_est(float4 *pixbuf, float4 *outbuf,
|
void density_est(float4 *pixbuf, float4 *outbuf,
|
||||||
float est_sd, float neg_est_curve, float est_min,
|
float scale_coeff, float est_curve, float edge_clamp,
|
||||||
float k1, float k2, int height, int stride) {
|
float k1, float k2, int height, int stride) {
|
||||||
for (int i = threadIdx.x + 32*threadIdx.y; i < FW2; i += 32)
|
for (int i = threadIdx.x + 32*threadIdx.y; i < FW2; i += 32)
|
||||||
de_r[i] = de_g[i] = de_b[i] = de_a[i] = 0.0f;
|
de_r[i] = de_g[i] = de_b[i] = de_a[i] = 0.0f;
|
||||||
@ -164,57 +164,46 @@ void density_est(float4 *pixbuf, float4 *outbuf,
|
|||||||
// Base index of destination for writes
|
// Base index of destination for writes
|
||||||
int si = (threadIdx.y + W2) * FW + threadIdx.x + W2;
|
int si = (threadIdx.y + W2) * FW + threadIdx.x + W2;
|
||||||
|
|
||||||
// Calculate standard deviation of Gaussian kernel. The base SD is
|
// Calculate scaling coefficient for the Gaussian kernel. This
|
||||||
// then scaled in inverse proportion to the density of the point
|
// does not match with a normal Gaussian; it just fits with
|
||||||
// being scaled.
|
// flam3's implementation.
|
||||||
float sd = est_sd * powf(den+1.0f, neg_est_curve);
|
float scale = powf(den, est_curve) * scale_coeff;
|
||||||
// And for the gradient...
|
|
||||||
float diag_sd = est_sd * powf(diag_mag+1.0f, neg_est_curve);
|
|
||||||
|
|
||||||
// If the gradient SD is smaller than the minimum SD, we're probably
|
// If the gradient scale is smaller than the minimum scale, we're
|
||||||
// on a strong edge; blur with a standard deviation around 1px.
|
// probably on a strong edge; blur slightly.
|
||||||
if (diag_sd < MIN_SD && diag_sd < sd) {
|
if (diag_mag > den * 2.0f) {
|
||||||
sd = 0.3333333f;
|
scale = max(-9.0f, scale);
|
||||||
// Uncomment to see which pixels are being clamped
|
// Uncomment to see which pixels are being clamped
|
||||||
// de_g[si] = 1.0f;
|
// de_g[si] = 1.0f;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Clamp the final standard deviation.
|
|
||||||
sd = fminf(MAX_SD, fmaxf(sd, est_min));
|
|
||||||
|
|
||||||
// Below a certain threshold, only one coefficient would be
|
// Below a certain threshold, only one coefficient would be
|
||||||
// retained anyway; we hop right to it.
|
// retained anyway; we hop right to it.
|
||||||
if (sd <= MIN_SD) {
|
if (scale <= MIN_SCALE) {
|
||||||
de_add(si, 0, 0, in);
|
de_add(si, 0, 0, in);
|
||||||
} else {
|
} else {
|
||||||
// These polynomials approximate the sum of the filters
|
// These polynomials approximate the reciprocal of the sum of
|
||||||
// with the clamping logic used here. See helpers/filt_err.py.
|
// all retained filter coefficients. See helpers/filt_err.py.
|
||||||
float filtsum;
|
float filtsum;
|
||||||
if (sd < 0.75f) {
|
if (scale < -1.1f) {
|
||||||
filtsum = -352.25061035f;
|
filtsum = 5.20066078e-06f;
|
||||||
filtsum = filtsum * sd + 1117.09680176f;
|
filtsum = filtsum * scale + 2.14025771e-04f;
|
||||||
filtsum = filtsum * sd + -1372.48864746f;
|
filtsum = filtsum * scale + 3.62761668e-03f;
|
||||||
filtsum = filtsum * sd + 779.15478516f;
|
filtsum = filtsum * scale + 3.21970172e-02f;
|
||||||
filtsum = filtsum * sd + -164.04229736f;
|
filtsum = filtsum * scale + 1.54297248e-01f;
|
||||||
filtsum = filtsum * sd + -12.04892635f;
|
filtsum = filtsum * scale + 3.42210710e-01f;
|
||||||
filtsum = filtsum * sd + 9.04126644f;
|
filtsum = filtsum * scale + 3.06015890e-02f;
|
||||||
filtsum = filtsum * sd + 0.10304667f;
|
filtsum = filtsum * scale + 1.33724615e-01f;
|
||||||
} else {
|
} else {
|
||||||
filtsum = 0.01162011f;
|
filtsum = -1.23516649e-01f;
|
||||||
filtsum = filtsum * sd + -0.21552004f;
|
filtsum = filtsum * scale + -5.14862895e-01f;
|
||||||
filtsum = filtsum * sd + 1.66545594f;
|
filtsum = filtsum * scale + -8.61198902e-01f;
|
||||||
filtsum = filtsum * sd + -7.00809765f;
|
filtsum = filtsum * scale + -7.41916001e-01f;
|
||||||
filtsum = filtsum * sd + 17.55487633f;
|
filtsum = filtsum * scale + -3.51667106e-01f;
|
||||||
filtsum = filtsum * sd + -26.80626106f;
|
filtsum = filtsum * scale + -9.07439440e-02f;
|
||||||
filtsum = filtsum * sd + 30.61903954f;
|
filtsum = filtsum * scale + -3.30008656e-01f;
|
||||||
filtsum = filtsum * sd + -12.00870514f;
|
filtsum = filtsum * scale + -4.78249392e-04f;
|
||||||
filtsum = filtsum * sd + 2.46708894f;
|
|
||||||
}
|
}
|
||||||
float filtscale = 1.0f / filtsum;
|
|
||||||
|
|
||||||
// The reciprocal SD scaling coefficient in the Gaussian
|
|
||||||
// exponent: exp(-x^2/(2*sd^2)) = exp2f(x^2*rsd)
|
|
||||||
float rsd = -0.5f * CUDART_L2E_F / (sd * sd);
|
|
||||||
|
|
||||||
for (int jj = 0; jj <= W2; jj++) {
|
for (int jj = 0; jj <= W2; jj++) {
|
||||||
float jj2f = jj;
|
float jj2f = jj;
|
||||||
@ -222,9 +211,8 @@ void density_est(float4 *pixbuf, float4 *outbuf,
|
|||||||
|
|
||||||
float iif = 0;
|
float iif = 0;
|
||||||
for (int ii = 0; ii <= jj; ii++) {
|
for (int ii = 0; ii <= jj; ii++) {
|
||||||
|
float coeff = expf((jj2f + iif * iif) * scale)
|
||||||
float coeff = exp2f((jj2f + iif * iif) * rsd)
|
* filtsum;
|
||||||
* filtscale;
|
|
||||||
if (coeff < 0.0001f) break;
|
if (coeff < 0.0001f) break;
|
||||||
iif += 1;
|
iif += 1;
|
||||||
|
|
||||||
@ -317,15 +305,12 @@ class Filtering(object):
|
|||||||
t = fun(dsrc, ddst, k1, k2,
|
t = fun(dsrc, ddst, k1, k2,
|
||||||
block=(512, 1, 1), grid=(nbins/512, 1), stream=stream)
|
block=(512, 1, 1), grid=(nbins/512, 1), stream=stream)
|
||||||
else:
|
else:
|
||||||
# flam3_gaussian_filter() uses an implicit standard deviation of
|
scale_coeff = np.float32(-(1 + cp.de.radius(t)) ** -2.0)
|
||||||
# 0.5, but the DE filters scale filter distance by the default
|
est_curve = np.float32(2 * cp.de.curve(t))
|
||||||
# spatial support factor of 1.5, so the effective base SD is
|
# TODO: experiment with this
|
||||||
# (0.5/1.5)=1/3.
|
edge_clamp = np.float32(2.0)
|
||||||
est_sd = np.float32(cp.de.radius(t) / 3.)
|
|
||||||
neg_est_curve = np.float32(-cp.de.curve(t))
|
|
||||||
est_min = np.float32(cp.de.minimum(t) / 3.)
|
|
||||||
fun = self.mod.get_function("density_est")
|
fun = self.mod.get_function("density_est")
|
||||||
fun(dsrc, ddst, est_sd, neg_est_curve, est_min, k1, k2,
|
fun(dsrc, ddst, scale_coeff, est_curve, edge_clamp, k1, k2,
|
||||||
np.int32(info.acc_height), np.int32(info.acc_stride),
|
np.int32(info.acc_height), np.int32(info.acc_stride),
|
||||||
block=(32, 32, 1), grid=(info.acc_width/32, 1), stream=stream)
|
block=(32, 32, 1), grid=(info.acc_width/32, 1), stream=stream)
|
||||||
|
|
||||||
|
@ -116,7 +116,7 @@ class RenderInfo(object):
|
|||||||
# Maximum width of DE and other spatial filters, and thus in turn the
|
# Maximum width of DE and other spatial filters, and thus in turn the
|
||||||
# amount of padding applied. Note that, for now, this must not be changed!
|
# amount of padding applied. Note that, for now, this must not be changed!
|
||||||
# The filtering code makes deep assumptions about this value.
|
# The filtering code makes deep assumptions about this value.
|
||||||
gutter = 22
|
gutter = 15
|
||||||
|
|
||||||
# TODO: for now, we always throw away the alpha channel before writing.
|
# TODO: for now, we always throw away the alpha channel before writing.
|
||||||
# All code is in place to not do this, we just need to find a way to expose
|
# All code is in place to not do this, we just need to find a way to expose
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
# The maximum number of coefficients that will ever be retained on the device
|
# The maximum number of coefficients that will ever be retained on the device
|
||||||
FWIDTH = 21
|
FWIDTH = 15
|
||||||
|
|
||||||
# The number of points on either side of the center in one dimension
|
# The number of points on either side of the center in one dimension
|
||||||
F2 = int(FWIDTH/2)
|
F2 = int(FWIDTH/2)
|
||||||
@ -12,92 +12,85 @@ COEFF_EPS = 0.0001
|
|||||||
dists2d = np.fromfunction(lambda i, j: np.hypot(i-F2, j-F2), (FWIDTH, FWIDTH))
|
dists2d = np.fromfunction(lambda i, j: np.hypot(i-F2, j-F2), (FWIDTH, FWIDTH))
|
||||||
dists = dists2d.flatten()
|
dists = dists2d.flatten()
|
||||||
|
|
||||||
# A flam3 estimator radius corresponds to a Gaussian filter with a standard
|
|
||||||
# deviation of 1/3 the radius. We choose 13 as an arbitrary upper bound for the
|
|
||||||
# max filter radius. The filter should reject larger radii.
|
|
||||||
MAX_SD = 13 / 3.
|
|
||||||
|
|
||||||
# The minimum estimator radius can be set as low as 0, but below a certain
|
# This translates to a cap on DE filter radius of 50. Even this fits very
|
||||||
# radius only one coefficient is retained. Since things get unstable near 0,
|
# comfortably within the chosen COEFF_EPS.
|
||||||
# we explicitly set a minimum threshold below which no coefficients are
|
MAX_SCALE = -3/25.
|
||||||
# retained.
|
|
||||||
MIN_SD = np.sqrt(-1 / (2 * np.log(COEFF_EPS)))
|
|
||||||
|
|
||||||
# Using two predicated three-term approximations is much more accurate than
|
# When the scale is at or below this value, every off-center coefficient falls under COEFF_EPS, so we clamp directly to one bin
|
||||||
# using a very large number of terms, due to nonlinear behavior at low SD.
|
MIN_SCALE = np.log(0.0001)
|
||||||
# Everything above this SD uses one approximation; below, another.
|
|
||||||
SPLIT_SD = 0.75
|
|
||||||
|
|
||||||
# The lower endpoints are undershot by this proportion to reduce error
|
# Everything above this scale uses one approximation; below, another.
|
||||||
UNDERSHOOT = 0.98
|
SPLIT_SCALE = -1.1
|
||||||
|
|
||||||
sds_hi = np.linspace(SPLIT_SD * UNDERSHOOT, MAX_SD, num=1000)
|
# The upper endpoints are overshot by this proportion to reduce error
|
||||||
sds_lo = np.linspace(MIN_SD * UNDERSHOOT, SPLIT_SD, num=1000)
|
OVERSHOOT = 1.01
|
||||||
|
|
||||||
print 'At MIN_SD = %g, these are the coeffs:' % MIN_SD
|
# No longer 'scale'-related, but we call it that anyway
|
||||||
print np.exp(dists2d**2 / (-2 * MIN_SD ** 2))
|
scales_hi = np.linspace(SPLIT_SCALE, MAX_SCALE * OVERSHOOT, num=1000)
|
||||||
|
scales_lo = np.linspace(MIN_SCALE, SPLIT_SCALE * OVERSHOOT, num=1000)
|
||||||
|
|
||||||
def eval_sds(sds, name, nterms):
|
def eval_scales(scales, name, nterms):
|
||||||
# Calculate the filter sums at each coordinate
|
# Calculate the filter sums at each coordinate
|
||||||
sums = []
|
sums = []
|
||||||
for sd in sds:
|
for scale in scales:
|
||||||
coeffs = np.exp(dists**2 / (-2 * sd ** 2))
|
coeffs = np.exp(dists**2 * scale)
|
||||||
# Note that this sum is the sum of all coordinates, though it should
|
# Note that this sum is the sum of all coordinates, though it should
|
||||||
# actually be the result of the polynomial approximation. We could do
|
# actually be the result of the polynomial approximation. We could do
|
||||||
# a feedback loop to improve accuracy, but I don't think the difference
|
# a feedback loop to improve accuracy, but I don't think the difference
|
||||||
# is worth worrying about.
|
# is worth worrying about.
|
||||||
sum = np.sum(coeffs)
|
sum = np.sum(coeffs)
|
||||||
sums.append(np.sum(filter(lambda v: v / sum > COEFF_EPS, coeffs)))
|
sums.append(1./np.sum(filter(lambda v: v / sum > COEFF_EPS, coeffs)))
|
||||||
print 'Evaluating %s:' % name
|
print 'Evaluating %s:' % name
|
||||||
poly, resid, rank, sing, rcond = np.polyfit(sds, sums, nterms, full=True)
|
poly, resid, rank, sing, rcond = np.polyfit(scales, sums, nterms, full=True)
|
||||||
print 'Fit for %s:' % name, poly, resid, rank, sing, rcond
|
print 'Fit for %s:' % name, poly, resid, rank, sing, rcond
|
||||||
return sums, poly
|
return sums, poly
|
||||||
|
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
sums_hi, poly_hi = eval_sds(sds_hi, 'hi', 8)
|
sums_hi, poly_hi = eval_scales(scales_hi, 'hi', 7)
|
||||||
sums_lo, poly_lo = eval_sds(sds_lo, 'lo', 7)
|
sums_lo, poly_lo = eval_scales(scales_lo, 'lo', 7)
|
||||||
|
|
||||||
num_undershoots = len(filter(lambda v: v < SPLIT_SD, sds_hi))
|
num_overshoots = len(filter(lambda v: v > MAX_SCALE, scales_hi))
|
||||||
sds_hi = sds_hi[num_undershoots:]
|
scales_hi = scales_hi[num_overshoots:]
|
||||||
sums_hi = sums_hi[num_undershoots:]
|
sums_hi = sums_hi[num_overshoots:]
|
||||||
|
|
||||||
num_undershoots = len(filter(lambda v: v < MIN_SD, sds_lo))
|
num_overshoots = len(filter(lambda v: v > SPLIT_SCALE, scales_lo))
|
||||||
sds_lo = sds_lo[num_undershoots:]
|
scales_lo = scales_lo[num_overshoots:]
|
||||||
sums_lo = sums_lo[num_undershoots:]
|
sums_lo = sums_lo[num_overshoots:]
|
||||||
|
|
||||||
polyf_hi = np.float32(poly_hi)
|
polyf_hi = np.float32(poly_hi)
|
||||||
vals_hi = np.polyval(polyf_hi, sds_hi)
|
vals_hi = np.polyval(polyf_hi, scales_hi)
|
||||||
polyf_lo = np.float32(poly_lo)
|
polyf_lo = np.float32(poly_lo)
|
||||||
vals_lo = np.polyval(polyf_lo, sds_lo)
|
vals_lo = np.polyval(polyf_lo, scales_lo)
|
||||||
|
|
||||||
def print_filt(filts):
|
def print_filt(filts):
|
||||||
print ' filtsum = %4.8ff;' % filts[0]
|
print ' filtsum = %4.8ef;' % filts[0]
|
||||||
for f in filts[1:]:
|
for f in filts[1:]:
|
||||||
print ' filtsum = filtsum * sd + % 16.8ff;' % f
|
print ' filtsum = filtsum * scale + % 16.8ef;' % f
|
||||||
|
|
||||||
print '\n\nFor your convenience:'
|
print '\n\nFor your convenience:'
|
||||||
print '#define MIN_SD %.8f' % MIN_SD
|
print '#define MIN_SCALE %.8gf' % MIN_SCALE
|
||||||
print '#define MAX_SD %.8f' % MAX_SD
|
print '#define MAX_SCALE %.8gf' % MAX_SCALE
|
||||||
print 'if (sd < %g) {' % SPLIT_SD
|
print 'if (scale < %gf) {' % SPLIT_SCALE
|
||||||
print_filt(polyf_lo)
|
print_filt(polyf_lo)
|
||||||
print '} else {'
|
print '} else {'
|
||||||
print_filt(polyf_hi)
|
print_filt(polyf_hi)
|
||||||
print '}'
|
print '}'
|
||||||
|
|
||||||
sds = np.concatenate([sds_lo, sds_hi])
|
scales = np.concatenate([scales_lo, scales_hi])
|
||||||
sums = np.concatenate([sums_lo, sums_hi])
|
sums = np.concatenate([sums_lo, sums_hi])
|
||||||
vals = np.concatenate([vals_lo, vals_hi])
|
vals = np.concatenate([vals_lo, vals_hi])
|
||||||
|
|
||||||
fig = plt.figure()
|
fig = plt.figure()
|
||||||
ax = fig.add_subplot(1,1,1)
|
ax = fig.add_subplot(1,1,1)
|
||||||
ax.plot(sds, sums)
|
ax.plot(scales, sums)
|
||||||
ax.plot(sds, vals)
|
ax.plot(scales, vals)
|
||||||
ax.set_xlabel('stdev')
|
ax.set_xlabel('stdev')
|
||||||
ax.set_ylabel('filter sum')
|
ax.set_ylabel('filter sum')
|
||||||
|
|
||||||
ax = ax.twinx()
|
ax = ax.twinx()
|
||||||
ax.plot(sds, [abs((s-v)/v) for s, v in zip(sums, vals)])
|
ax.plot(scales, [abs((s-v)/v) for s, v in zip(sums, vals)])
|
||||||
ax.set_ylabel('rel err')
|
ax.set_ylabel('rel err')
|
||||||
|
|
||||||
plt.show()
|
plt.show()
|
||||||
|
Loading…
Reference in New Issue
Block a user