diff --git a/cuburn/code/filtering.py b/cuburn/code/filtering.py index cea349c..cf7ec49 100644 --- a/cuburn/code/filtering.py +++ b/cuburn/code/filtering.py @@ -9,9 +9,6 @@ from pycuda.gpuarray import vec from cuburn.code.util import * _CODE = r''' -#include -#include - __global__ void colorclip(float4 *pixbuf, float gamma, float vibrance, float highpow, float linrange, float lingam, float3 bkgd, @@ -100,10 +97,11 @@ void colorclip(float4 *pixbuf, float gamma, float vibrance, float highpow, } __global__ -void logscale(float4 *pixbuf, float4 *outbuf, float k1, float k2) { +void logscale(float4 *outbuf, const float4 *pixbuf, float k1, float k2) { int i = blockDim.x * blockIdx.x + threadIdx.x; float4 pix = pixbuf[i]; + // float ls = fmaxf(0, k1 * logf(1.0f + pix.w * pix.w * k2 / (1 + pix.w)) / pix.w); float ls = fmaxf(0, k1 * logf(1.0f + pix.w * k2) / pix.w); pix.x *= ls; pix.y *= ls; @@ -113,173 +111,13 @@ void logscale(float4 *pixbuf, float4 *outbuf, float k1, float k2) { outbuf[i] = pix; } -// SS must be no greater than 4x +// Element-wise computation of ``dst[i]=dst[i]+src[i]*scale``. __global__ -void blur_density_h(float *scratch, const float4 *pixbuf, int astride) { - // TODO: respect supersampling? configurable blur width? - int x = blockIdx.x * blockDim.x + threadIdx.x; - if (x > astride) return; - int y = blockIdx.y; - const float *dsrc = - reinterpret_cast(&pixbuf[astride * y]) + 3; - - float den = 0.0f; - - den += dsrc[4*max(0, x-2)] * 0.00135f; - den += dsrc[4*max(0, x-1)] * 0.1573f; - den += dsrc[4*x] * 0.6827f; - den += dsrc[4*min(astride-1, x+1)] * 0.1573f; - den += dsrc[4*min(astride-1, x+2)] * 0.00135f; - scratch[astride * y + x] = den; -} - -__global__ -void blur_density_v(float4 *pixbuf, const float *scratch, - int astride, int aheight) { - // TODO: respect supersampling? configurable blur width? - int x = blockIdx.x * blockDim.x + threadIdx.x; - if (x > astride) return; - int y = blockIdx.y; - const float *dsrc = scratch + x; - - float den = 0.0f; - - den += dsrc[astride*max(0, y-2)] * 0.00135f; - den += dsrc[astride*max(0, y-1)] * 0.1573f; - den += dsrc[astride*y] * 0.6827f; - den += dsrc[astride*min(astride-1, y+1)] * 0.1573f; - den += dsrc[astride*min(astride-1, y+2)] * 0.00135f; - - float *ddst = reinterpret_cast(&pixbuf[astride * y + x]) + 3; - *ddst = min(*ddst, den); -} - -#define W 21 // Filter width (regardless of standard deviation chosen) -#define W2 10 // Half of filter width, rounded down -#define FW 53 // Width of local result storage (NW+W2+W2) -#define FW2 (FW*FW) - -__global__ -void density_est(float4 *outbuf, const float4 *pixbuf, - float scale_coeff, float est_curve, float edge_clamp, - float k1, float k2, int height, int stride) { - __shared__ float de_r[FW2], de_g[FW2], de_b[FW2], de_a[FW2]; - - // The max supported radius is really 7, but the extra units simplify the - // logic in the bottlenecked section below. - __shared__ int minradius[32]; - - for (int i = threadIdx.x + 32*threadIdx.y; i < FW2; i += 1024) - de_r[i] = de_g[i] = de_b[i] = de_a[i] = 0.0f; - __syncthreads(); - - for (int imrow = threadIdx.y; imrow < height; imrow += 32) { - // Prepare the shared voting buffer. Syncing afterward is - // almost unnecessary but we do it to be safe - if (threadIdx.y == 0) - minradius[threadIdx.x] = 0.0f; - __syncthreads(); - - int col = blockIdx.x * 32 + threadIdx.x; - int in_idx = stride * imrow + col; - float4 in = pixbuf[in_idx]; - float den = in.w; - - float ls = k1 * logf(1.0f + in.w * k2) / in.w; - - // The synchronization model used now prevents us from - // cutting early in the case of a zero point, so we just carry - // it along with us here - if (den <= 0) ls = 0.0f; - - in.x *= ls; - in.y *= ls; - in.z *= ls; - in.w *= ls; - - // Base index of destination for writes - int si = (threadIdx.y + W2) * FW + threadIdx.x + W2; - - // Calculate scaling coefficient for the Gaussian kernel. This - // does not match with a normal Gaussian; it just fits with - // flam3's implementation. - float scale = powf(den, est_curve) * scale_coeff; - - // Force a minimum blur radius. This works out to be a - // standard deviation of about 0.35px. Also force a maximum, - // which limits spherical error to about 2 quanta at 10 bit - // precision. - scale = max(0.30f, min(scale, 2.0f)); - - // Determine a minimum radius for this image section. - int radius = (int) min(ceilf(2.12132f / scale), 10.0f); - if (den <= 0) radius = 0; - minradius[radius] = radius; - - // Go bottlenecked to compute the maximum radius in this block - __syncthreads(); - if (threadIdx.y == 0) { - int blt = __ballot(minradius[threadIdx.x]); - minradius[0] = 31 - __clz(blt); - } - __syncthreads(); - - radius = minradius[0]; - - for (int jj = -radius; jj <= radius; jj++) { - float jjf = (jj - 0.5f) * scale; - float jdiff = erff(jjf + scale) - erff(jjf); - - for (int ii = -radius; ii <= radius; ii++) { - float iif = (ii - 0.5f) * scale; - float coeff = 0.25f * jdiff * (erff(iif + scale) - erff(iif)); - - int idx = si + FW * ii + jj; - de_r[idx] += in.x * coeff; - de_g[idx] += in.y * coeff; - de_b[idx] += in.z * coeff; - de_a[idx] += in.w * coeff; - __syncthreads(); - } - } - __syncthreads(); - // TODO: could coalesce this, but what a pain - for (int i = threadIdx.x; i < FW; i += 32) { - int out_idx = stride * imrow + blockIdx.x * 32 + i; - int si = threadIdx.y * FW + i; - float *out = reinterpret_cast(&outbuf[out_idx]); - atomicAdd(out, de_r[si]); - atomicAdd(out+1, de_g[si]); - atomicAdd(out+2, de_b[si]); - atomicAdd(out+3, de_a[si]); - } - - __syncthreads(); - int tid = threadIdx.y * 32 + threadIdx.x; - for (int i = tid; i < FW*(W2+W2); i += 1024) { - de_r[i] = de_r[i+FW*32]; - de_g[i] = de_g[i+FW*32]; - de_b[i] = de_b[i+FW*32]; - de_a[i] = de_a[i+FW*32]; - } - - __syncthreads(); - for (int i = tid + FW*(W2+W2); i < FW2; i += 1024) { - de_r[i] = 0.0f; - de_g[i] = 0.0f; - de_b[i] = 0.0f; - de_a[i] = 0.0f; - } - __syncthreads(); - } -} - -__global__ -void fma_buf(float4 *dst, const float4 *sub, int astride, float scale) { +void fma_buf(float4 *dst, const float4 *src, int astride, float scale) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; int i = y * astride + x; - float4 d = dst[i], s = sub[i]; + float4 d = dst[i], s = src[i]; d.x += s.x * scale; d.y += s.y * scale; d.z += s.z * scale; @@ -288,125 +126,250 @@ void fma_buf(float4 *dst, const float4 *sub, int astride, float scale) { } texture bilateral_src; +texture blur_src; +// Apply a Gaussian-esque blur to the density channel of the texture in +// ``bilateral_src`` in the horizontal direction, and write it to ``dst``, a +// one-channel buffer +__global__ void blur_h(float *dst) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + float den = 0.0f; + den += tex2D(bilateral_src, x - 2, y).w * 0.00135f; + den += tex2D(bilateral_src, x - 1, y).w * 0.1573f; + den += tex2D(bilateral_src, x, y).w * 0.6827f; + den += tex2D(bilateral_src, x + 1, y).w * 0.1573f; + den += tex2D(bilateral_src, x + 2, y).w * 0.00135f; + dst[y * (blockDim.x * gridDim.x) + x] = den; +} + +// As above, but with a one-channel texture as source +__global__ void blur_h_1cp(float *dst) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + float den = 0.0f; + den += tex2D(blur_src, x - 2, y) * 0.00135f; + den += tex2D(blur_src, x - 1, y) * 0.1573f; + den += tex2D(blur_src, x, y) * 0.6827f; + den += tex2D(blur_src, x + 1, y) * 0.1573f; + den += tex2D(blur_src, x + 2, y) * 0.00135f; + dst[y * (blockDim.x * gridDim.x) + x] = den; +} + +// As above, but in the vertical direction +__global__ void blur_v(float *dst) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + float den = 0.0f; + den += tex2D(blur_src, x, y - 2) * 0.00135f; + den += tex2D(blur_src, x, y - 1) * 0.1573f; + den += tex2D(blur_src, x, y ) * 0.6827f; + den += tex2D(blur_src, x, y + 1) * 0.1573f; + den += tex2D(blur_src, x, y + 2) * 0.00135f; + dst[y * (blockDim.x * gridDim.x) + x] = den; +} + +/* sstd: spatial standard deviation (Gaussian filter) + * cstd: color standard deviation (Gaussian on the range [0, 1], where 1 + * represents an "opposite" color). + * + * Density is controlled by a power-of-two Gompertz distribution: + * v = 1 - 2^(-sum^dpow * 2^((dhalfpt - x) * dspeed)) + * + * dhalfpt: The difference in density values between two points at which the + * filter admits 50% of the spatial and color kernels, when dpow + * is 0. `3` seems to be a good fit for most images at decent + * sampling levels. + * dspeed: The sharpness of the filter's cutoff around dhalfpt. At `1`, the + * filter admits 75% of a point that differs by one fewer than + * `dhalfpt` density steps from the current point (when dpow is 0); + * at `2`, it admits 93.75% of the same. `0.5` works pretty well. + * dpow: The change of filter intensity as density scales. This should be + * set automatically in response to changes in expected density per + * cell. + */ __global__ -void bilateral(float4 *dst, int astride, float dstd, float rstd) { - int xi = blockIdx.x * blockDim.x + threadIdx.x; - int yi = blockIdx.y * blockDim.y + threadIdx.y; - float x = xi, y = yi; +void bilateral(float4 *dst, float sstd, float cstd, + float dhalfpt, float dspeed, float dpow) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; - const float r = 9.0f; + const int r = 7; + + // Precalculate the spatial coeffecients. + __shared__ float spa_coefs[32]; + if (threadIdx.y == 0) { + float df = threadIdx.x; + spa_coefs[threadIdx.x] = expf(df * df / (-M_SQRT2 * sstd)); + } + + // 3.0f compensates for [0,3] range of `cdiff` + float cscale = 1.0f / (-M_SQRT2 * 3.0f * cstd); + + // Gather the center point, and pre-average the color values for easier + // comparison. + float4 cen = tex2D(bilateral_src, x, y); + if (cen.w > 0.0f) { + float cdrcp = 1.0f / cen.w; + cen.x *= cdrcp; + cen.y *= cdrcp; + cen.z *= cdrcp; + } + + // Compute the Sobel directional derivative of a pre-blurred version of + // the density channel at the center point. + float nw = tex2D(blur_src, x - 1, y - 1); + float ne = tex2D(blur_src, x + 1, y - 1); + float sw = tex2D(blur_src, x - 1, y + 1); + float se = tex2D(blur_src, x + 1, y + 1); + float h = ne + se + 2 * tex2D(blur_src, x + 1, y) + -(nw + sw + 2 * tex2D(blur_src, x - 1, y)); + float v = se + sw + 2 * tex2D(blur_src, x, y + 1) + -(ne + nw + 2 * tex2D(blur_src, x, y - 1)); + + // TODO: figure out how to work `mag` in to scaling the degree to which + // the directionality filter clamps + float mag = sqrtf(h * h + v * v); - float cen_den = tex2D(bilateral_src, x, y).w; float4 out = make_float4(0, 0, 0, 0); float weightsum = 0.0f; - for (float i = -r; i <= r; i++) { - for (float j = -r; j <= r; j++) { + // Be extra-sure spatial coeffecients have been written + __syncthreads(); + + for (int i = -r; i <= r; i++) { + for (int j = -r; j <= r; j++) { float4 pix = tex2D(bilateral_src, x + i, y + j); - float den_diff = log2f(fabsf(pix.w - cen_den) + 1.0f); - float factor = expf(i * i * dstd) - * expf(j * j * dstd) - * expf(den_diff * den_diff * rstd); + float cdiff = 0.5f; + + if (pix.w > 0.0f && cen.w > 0.0f) { + float pdrcp = 1.0f / pix.w; + float yd = pix.x * pdrcp - cen.x; + float ud = pix.y * pdrcp - cen.y; + float vd = pix.z * pdrcp - cen.z; + cdiff = yd * yd + ud * ud + vd * vd; + } + float ddiff = dspeed * (dhalfpt - fabsf(pix.w - cen.w) - 1.0f); + float dsum = -powf(0.5f * (pix.w + cen.w + 1.0f), dpow); + float dfact = 1.0f - exp2f(dsum * exp2f(ddiff)); + float angfact = (h * i + v * j) / (sqrtf(i*i + j*j) * mag + 1.0e-10f); + + // Oh, this is ridiculous. But it works! + float factor = spa_coefs[abs(i)] * spa_coefs[abs(j)] + * expf(cscale * cdiff) * dfact + * exp2f(2.0f * (angfact - 1.0f)); + + weightsum += factor; out.x += factor * pix.x; out.y += factor * pix.y; out.z += factor * pix.z; out.w += factor * pix.w; - weightsum += factor; } } - float weightrcp = 1.0f / weightsum; + + float weightrcp = 1.0f / (weightsum + 1e-10f); out.x *= weightrcp; out.y *= weightrcp; out.z *= weightrcp; out.w *= weightrcp; - dst[yi * astride + xi] = out; -} + // Uncomment to write directional gradient using YUV colors + /* + out.x = mag; + out.y = h; + out.z = v; + out.w = mag; + */ + const int astride = blockDim.x * gridDim.x; + dst[y * astride + x] = out; +} ''' -class Filtering(object): - +class Filtering(HunkOCode): mod = None + defs = _CODE @classmethod def init_mod(cls): if cls.mod is None: - cls.mod = pycuda.compiler.SourceModule(_CODE, + cls.mod = pycuda.compiler.SourceModule(assemble_code(BaseCode, cls), options=['-use_fast_math', '-maxrregcount', '32']) def __init__(self): self.init_mod() - def blur_density(self, ddst, dscratch, dim, stream=None): - blur_h = self.mod.get_function("blur_density_h") - blur_h(dscratch, ddst, i32(dim.astride), block=(192, 1, 1), - grid=(dim.astride / 192, dim.ah), stream=stream) - blur_v = self.mod.get_function("blur_density_v") - blur_v(ddst, dscratch, i32(dim.astride), i32(dim.ah), block=(192, 1, 1), - grid=(dim.astride / 192, dim.ah), stream=stream) + def de(self, ddst, dsrc, dscratch, gnm, dim, tc, nxf, stream=None): + # Helper variables and functions to keep it clean + sb = 16 * dim.astride + bs = sb * dim.ah + bl, gr = (32, 8, 1), (dim.astride / 32, dim.ah / 8) - def de(self, ddst, dsrc, gnm, dim, tc, nxf, stream=None): + def launch(f, *args, **kwargs): + f(*args, block=bl, grid=gr, stream=stream, **kwargs) + mkdsc = lambda c: argset(cuda.ArrayDescriptor(), height=dim.ah, + width=dim.astride, num_channels=c, + format=cuda.array_format.FLOAT) + def mktref(n): + tref = self.mod.get_texref(n) + tref.set_filter_mode(cuda.filter_mode.POINT) + tref.set_address_mode(0, cuda.address_mode.WRAP) + tref.set_address_mode(1, cuda.address_mode.WRAP) + return tref + + dsc = mkdsc(4) + tref = mktref('bilateral_src') + grad_dsc = mkdsc(1) + grad_tref = mktref('blur_src') + + bilateral, blur_h, blur_h_1cp, blur_v, fma_buf = map( + self.mod.get_function, + 'bilateral blur_h blur_h_1cp blur_v fma_buf'.split()) + ROUNDS = 2 # TODO: user customizable? + + def do_bilateral(bsrc, bdst): + tref.set_address_2d(bsrc, dsc, sb) + launch(blur_h, np.intp(bdst), texrefs=[tref]) + grad_tref.set_address_2d(bdst, grad_dsc, sb / 4) + launch(blur_v, dscratch, texrefs=[grad_tref]) + grad_tref.set_address_2d(dscratch, grad_dsc, sb / 4) + launch(blur_h_1cp, np.intp(bdst), texrefs=[tref]) + grad_tref.set_address_2d(bdst, grad_dsc, sb / 4) + launch(blur_v, dscratch, texrefs=[grad_tref]) + grad_tref.set_address_2d(dscratch, grad_dsc, sb / 4) + launch(bilateral, np.intp(bdst), f32(4), f32(0.1), + f32(5), f32(0.4), f32(0.6), texrefs=[tref, grad_tref]) + return bdst, bsrc + + # Filter the first xform, using `ddst` as an intermediate buffer. + # End result is the filtered copy in `dsrc`. + a, b = dsrc, ddst + for i in range(ROUNDS): + a, b = do_bilateral(a, b) + if ROUNDS % 2: + cuda.memcpy_dtod_async(b, a, bs, stream) + + # Filter the remaining xforms, using `ddst` as an intermediate + # buffer, then add the result to `dsrc` (at the zero'th xform). + for x in range(1, nxf): + a, b = int(dsrc) + x * bs, ddst + for i in range(ROUNDS): + a, b = do_bilateral(a, b) + launch(fma_buf, dsrc, np.intp(a), i32(dim.astride), f32(1)) + + # Log-scale the accumulated buffer in `dsrc`. k1 = f32(gnm.color.brightness(tc) * 268 / 256) # Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now # s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w)) area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w) k2 = f32(1.0 / (area * gnm.spp(tc))) - - if gnm.de.radius(tc) == 0 or True: - # Stride in bytes, and buffer size - sb = 16 * dim.astride - bs = sb * dim.ah - - - dsc = argset(cuda.ArrayDescriptor(), height=dim.ah, - width=dim.astride, format=cuda.array_format.FLOAT, - num_channels=4) - tref = self.mod.get_texref('bilateral_src') - tref.set_filter_mode(cuda.filter_mode.POINT) - tref.set_address_mode(0, cuda.address_mode.WRAP) - tref.set_address_mode(1, cuda.address_mode.WRAP) - - bilateral = self.mod.get_function('bilateral') - fma_buf = self.mod.get_function('fma_buf') - for i in range(nxf): - tref.set_address_2d(int(dsrc) + bs * i, dsc, sb) - bilateral(ddst, i32(dim.astride), f32(-0.1), f32(-0.1), - block=(32, 8, 1), grid=(dim.astride / 32, dim.ah / 8), - texrefs=[tref], stream=stream) - if i == 0: - cuda.memcpy_dtod_async(dsrc, ddst, bs, stream) - else: - fma_buf(dsrc, ddst, i32(dim.astride), f32(1.0), - block=(32, 8, 1), grid=(dim.astride / 32, dim.ah / 8), - stream=stream) - - tref.set_address_2d(dsrc, dsc, sb) - ROUNDS = 3 - - a, b = dsrc, ddst - for i in range(ROUNDS): - bilateral(b, i32(dim.astride), f32(-0.1), f32(-0.2), - block=(32, 8, 1), grid=(dim.astride / 32, dim.ah / 8), - texrefs=[tref], stream=stream) - a, b = b, a - - nbins = dim.ah * dim.astride - fun = self.mod.get_function("logscale") - # This function only looks at one pixel, so using the same - # parameter for input and output is OK (if e.g. ROUNDS is odd) - t = fun(a, ddst, k1, k2, - block=(512, 1, 1), grid=(nbins/512, 1), stream=stream) - else: - scale_coeff = f32(1.0 / gnm.de.radius(tc)) - est_curve = f32(gnm.de.curve(tc)) - # TODO: experiment with this - edge_clamp = f32(1.2) - fun = self.mod.get_function("density_est") - fun(ddst, dsrc, scale_coeff, est_curve, edge_clamp, k1, k2, - i32(dim.ah), i32(dim.astride), - block=(32, 32, 1), grid=(dim.aw/32, 1), stream=stream) + nbins = dim.ah * dim.astride + logscale = self.mod.get_function("logscale") + t = logscale(ddst, dsrc, k1, k2, + block=(512, 1, 1), grid=(nbins/512, 1), stream=stream) def colorclip(self, dbuf, gnm, dim, tc, blend, stream=None): nbins = dim.ah * dim.astride diff --git a/cuburn/render.py b/cuburn/render.py index c666926..af0aaca 100644 --- a/cuburn/render.py +++ b/cuburn/render.py @@ -198,6 +198,10 @@ class Renderer(object): if self.acc_mode == 'atomic': d_atom = cuda.mem_alloc(8 * nbins * nxf) flush_fun = self.mod.get_function("flush_atom") + else: + # d_atom is also used as a scratch buffer during filtering, so we + # need it at least this size + d_atom = cuda.mem_alloc(4 * nbins) obuf_copy = util.argset(cuda.Memcpy2D(), src_y=self.gutter, src_x_in_bytes=16*self.gutter, @@ -350,7 +354,8 @@ class Renderer(object): util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream) _sync_stream(filt_stream, write_stream) - filt.de(d_out, d_accum, genome, dim, tc, nxf, stream=filt_stream) + filt.de(d_out, d_accum, d_atom, genome, dim, tc, nxf, + stream=filt_stream) _sync_stream(write_stream, filt_stream) filt.colorclip(d_out, genome, dim, tc, blend, stream=filt_stream) obuf_copy.set_dst_host(h_out_a)