From 387dfd9f8c32da919dcca67b726d00faa37c2d0a Mon Sep 17 00:00:00 2001 From: Steven Robertson Date: Sat, 28 Jan 2012 13:20:48 -0500 Subject: [PATCH] More experiments --- cuburn/code/filtering.py | 289 +++++++++++++++++++++++---------------- 1 file changed, 172 insertions(+), 117 deletions(-) diff --git a/cuburn/code/filtering.py b/cuburn/code/filtering.py index f44e5a9..33984fb 100644 --- a/cuburn/code/filtering.py +++ b/cuburn/code/filtering.py @@ -8,6 +8,38 @@ from pycuda.gpuarray import vec from cuburn.code.util import * _CODE = r''' +// Filter directions specified in degrees, using image/texture addressing +// [(0,0) is upper left corner, 90 degrees is down]. + +__constant__ float2 addressing_patterns[16] = { + { 1.0f, 0.0f}, { 0.0f, 1.0f}, // 0, 1: 0, 90 + { 1.0f, 1.0f}, {-1.0f, 1.0f}, // 2, 3: 45, 135 + { 1.0f, 0.5f}, {-0.5f, 1.0f}, // 4, 5: 22.5, 112.5 + { 1.0f, -0.5f}, { 0.5f, 1.0f}, // 6, 7: -22.5, 67.5 + { 1.0f, 0.666667f}, {-0.666667f, 1.0f}, // 8, 9: 30, 120 + { 1.0f, -0.666667f}, { 0.666667f, 1.0f}, // 10, 11: -30, 60 + { 1.0f, 0.333333f}, {-0.333333f, 1.0f}, // 12, 13: 15, 105 + { 1.0f, -0.333333f}, { 0.333333f, 1.0f}, // 14, 15: -15, 75 +}; + +// Mon dieu! A C++ feature? +template __device__ T +tex_shear(texture ref, int pattern, + float x, float y, float radius) { + float2 scale = addressing_patterns[pattern]; + float i = scale.x * radius, j = scale.y * radius; + // Round i and j to the nearest integer, choosing the nearest even when + // equidistant. It's critical that this be done before adding 'x' and 'y', + // so that addressing patterns remain consistent across the grid. + asm("{\n\t" + "cvt.rni.ftz.f32.f32 %0, %0;\n\t" + "cvt.rni.ftz.f32.f32 %1, %1;\n\t" + "}\n" : "+f"(i), "+f"(j)); + return tex2D(ref, x + i, y + j); +} + +extern "C" { + __global__ void colorclip(float4 *pixbuf, float gamma, float vibrance, float highpow, float linrange, float lingam, float3 bkgd, @@ -127,48 +159,49 @@ void fma_buf(float4 *dst, const float4 *src, int astride, float scale) { texture bilateral_src; texture blur_src; +__global__ void logscale_den(float *dst, float k2) { + int xi = blockIdx.x * blockDim.x + threadIdx.x; + int yi = blockIdx.y * blockDim.y + threadIdx.y; + float4 pix = tex2D(bilateral_src, xi, yi); + float out = logf(1.0f + pix.w * k2); + dst[yi * (blockDim.x * gridDim.x) + xi] = out; +} + +__constant__ float gauss_coefs[9] = { + 0.00443305f, 0.05400558f, 0.24203623f, 0.39905028f, + 0.24203623f, 0.05400558f, 0.00443305f +}; + // Apply a Gaussian-esque blur to the density channel of the texture in // ``bilateral_src`` in the horizontal direction, and write it to ``dst``, a -// one-channel buffer -__global__ void blur_h(float *dst) { - int x = blockIdx.x * blockDim.x + threadIdx.x; - int y = blockIdx.y * blockDim.y + threadIdx.y; +// one-channel buffer. +__global__ void den_blur(float *dst, int pattern, int upsample) { + int xi = blockIdx.x * blockDim.x + threadIdx.x; + int yi = blockIdx.y * blockDim.y + threadIdx.y; + float x = xi, y = yi; float den = 0.0f; - den += tex2D(bilateral_src, x - 2, y).w * 0.00135f; - den += tex2D(bilateral_src, x - 1, y).w * 0.1573f; - den += tex2D(bilateral_src, x, y).w * 0.6827f; - den += tex2D(bilateral_src, x + 1, y).w * 0.1573f; - den += tex2D(bilateral_src, x + 2, y).w * 0.00135f; - dst[y * (blockDim.x * gridDim.x) + x] = den; + + #pragma unroll + for (int i = 0; i < 9; i++) + den += tex_shear(bilateral_src, pattern, x, y, (i - 4) << upsample).w + * gauss_coefs[i]; + dst[yi * (blockDim.x * gridDim.x) + xi] = den; } -// As above, but with a one-channel texture as source -__global__ void blur_h_1cp(float *dst) { - int x = blockIdx.x * blockDim.x + threadIdx.x; - int y = blockIdx.y * blockDim.y + threadIdx.y; +// As above, but with the one-channel texture as source +__global__ void den_blur_1c(float *dst, int pattern, int upsample) { + int xi = blockIdx.x * blockDim.x + threadIdx.x; + int yi = blockIdx.y * blockDim.y + threadIdx.y; + float x = xi, y = yi; float den = 0.0f; - den += tex2D(blur_src, x - 2, y) * 0.00135f; - den += tex2D(blur_src, x - 1, y) * 0.1573f; - den += tex2D(blur_src, x, y) * 0.6827f; - den += tex2D(blur_src, x + 1, y) * 0.1573f; - den += tex2D(blur_src, x + 2, y) * 0.00135f; - dst[y * (blockDim.x * gridDim.x) + x] = den; -} -// As above, but in the vertical direction -__global__ void blur_v(float *dst) { - int x = blockIdx.x * blockDim.x + threadIdx.x; - int y = blockIdx.y * blockDim.y + threadIdx.y; - - float den = 0.0f; - den += tex2D(blur_src, x, y - 2) * 0.00135f; - den += tex2D(blur_src, x, y - 1) * 0.1573f; - den += tex2D(blur_src, x, y ) * 0.6827f; - den += tex2D(blur_src, x, y + 1) * 0.1573f; - den += tex2D(blur_src, x, y + 2) * 0.00135f; - dst[y * (blockDim.x * gridDim.x) + x] = den; + #pragma unroll + for (int i = 0; i < 9; i++) + den += tex_shear(blur_src, pattern, x, y, (i - 4) << upsample) + * gauss_coefs[i]; + dst[yi * (blockDim.x * gridDim.x) + xi] = den; } /* sstd: spatial standard deviation (Gaussian filter) @@ -194,13 +227,17 @@ __global__ void blur_v(float *dst) { * cell. */ __global__ -void bilateral(float4 *dst, int r, float sstd, float cstd, float angscale, - float dhalfpt, float dspeed, float dpow) { - int x = blockIdx.x * blockDim.x + threadIdx.x; - int y = blockIdx.y * blockDim.y + threadIdx.y; +void bilateral(float4 *dst, int pattern, int radius, + float sstd, float cstd, float angscale, + float dhalfpt, float dspeed, float dpow, float k2 +) { + int xi = blockIdx.x * blockDim.x + threadIdx.x; + int yi = blockIdx.y * blockDim.y + threadIdx.y; + float x = xi, y = yi; // Precalculate the spatial coeffecients. __shared__ float spa_coefs[32]; + if (threadIdx.y == 0) { float df = threadIdx.x; spa_coefs[threadIdx.x] = expf(df * df / (-M_SQRT2 * sstd)); @@ -212,27 +249,24 @@ void bilateral(float4 *dst, int r, float sstd, float cstd, float angscale, // Gather the center point, and pre-average the color values for easier // comparison. float4 cen = tex2D(bilateral_src, x, y); - if (cen.w > 0.0f) { - float cdrcp = 1.0f / cen.w; - cen.x *= cdrcp; - cen.y *= cdrcp; - cen.z *= cdrcp; - } + float cdrcp = 1.0f / (cen.w + 1.0e-6f); + cen.x *= cdrcp; + cen.y *= cdrcp; + cen.z *= cdrcp; - // Compute the Sobel directional derivative of a pre-blurred version of - // the density channel at the center point. - float nw = tex2D(blur_src, x - 1, y - 1); - float ne = tex2D(blur_src, x + 1, y - 1); - float sw = tex2D(blur_src, x - 1, y + 1); - float se = tex2D(blur_src, x + 1, y + 1); - float h = ne + se + 2 * tex2D(blur_src, x + 1, y) - -(nw + sw + 2 * tex2D(blur_src, x - 1, y)); - float v = se + sw + 2 * tex2D(blur_src, x, y + 1) - -(ne + nw + 2 * tex2D(blur_src, x, y - 1)); + float clogden = powf(cen.w, 0.8); + //logf(1.0f + cen.w * k2); - // TODO: figure out how to work `mag` in to scaling the degree to which - // the directionality filter clamps - float mag = sqrtf(h * h + v * v); + /* + // Calculate the gradient from the pre-blurred density texture in the + // "forward" and "crosswise" directions (separated by 90 degrees) + float cgrad_f = tex_shear(blur_src, pattern, x, y, 1) + - tex_shear(blur_src, pattern, x, y, -1); + float cgrad_c = tex_shear(blur_src, pattern ^ 1, x, y, 1) + - tex_shear(blur_src, pattern ^ 1, x, y, -1); + float gradrcp = 1.0f / sqrtf(cgrad_f * cgrad_f + cgrad_c * cgrad_c + 1.0e-6f); + float gradfact = cgrad_f * gradrcp; + */ float4 out = make_float4(0, 0, 0, 0); float weightsum = 0.0f; @@ -240,33 +274,41 @@ void bilateral(float4 *dst, int r, float sstd, float cstd, float angscale, // Be extra-sure spatial coeffecients have been written __syncthreads(); - for (int i = -r; i <= r; i++) { - for (int j = -r; j <= r; j++) { - float4 pix = tex2D(bilateral_src, x + i, y + j); - float cdiff = 0.5f; + float4 pix = tex_shear(bilateral_src, pattern, x, y, -radius - 1.0f); + float4 next = tex_shear(bilateral_src, pattern, x, y, -radius); - if (pix.w > 0.0f && cen.w > 0.0f) { - float pdrcp = 1.0f / pix.w; - float yd = pix.x * pdrcp - cen.x; - float ud = pix.y * pdrcp - cen.y; - float vd = pix.z * pdrcp - cen.z; - cdiff = yd * yd + ud * ud + vd * vd; - } - float ddiff = dspeed * (dhalfpt - fabsf(pix.w - cen.w) - 1.0f); - float dsum = -powf(0.5f * (pix.w + cen.w + 1.0f), dpow); - float dfact = 1.0f - exp2f(dsum * exp2f(ddiff)); - float angfact = (h * i + v * j) / (sqrtf(i*i + j*j) * mag + 1.0e-10f); + for (float r = -radius; r <= radius; r++) { + float prev = pix.w; + pix = next; + next = tex_shear(bilateral_src, pattern, x, y, r + 1.0f); - // Oh, this is ridiculous. But it works! - float factor = spa_coefs[abs(i)] * spa_coefs[abs(j)] - * expf(cscale * cdiff) * dfact - * exp2f(angscale * (-angfact - 1.0f)); - weightsum += factor; - out.x += factor * pix.x; - out.y += factor * pix.y; - out.z += factor * pix.z; - out.w += factor * pix.w; + float cdiff = 0.5f; + + if (pix.w > 0.0f && cen.w > 0.0f) { + float pdrcp = 1.0f / pix.w; + float yd = pix.x * pdrcp - cen.x; + float ud = pix.y * pdrcp - cen.y; + float vd = pix.z * pdrcp - cen.z; + cdiff = yd * yd + ud * ud + vd * vd; } + + //float logden = logf(1.0f + pix.w * k2); + float logden = powf(pix.w, 0.8); + float dfact = exp2f(-0.5f * fabsf(clogden - logden) * dhalfpt); + + float avg = tex_shear(blur_src, pattern, x, y, r); + float yayfact = (prev - next.w) / (avg + 1.0e-6f); + yayfact = expf(-expf(0.5f * yayfact)); + + // Oh, this is ridiculous. + float factor = spa_coefs[(int) fabsf(r)]; + if (r != 0) factor *= expf(cscale * cdiff) * dfact * yayfact; + // * expf(-cdrcp * expf((gradfact - 1.0f) * r)); + weightsum += factor; + out.x += factor * pix.x; + out.y += factor * pix.y; + out.z += factor * pix.z; + out.w += factor * pix.w; } float weightrcp = 1.0f / (weightsum + 1e-10f); @@ -275,17 +317,16 @@ void bilateral(float4 *dst, int r, float sstd, float cstd, float angscale, out.z *= weightrcp; out.w *= weightrcp; - // Uncomment to write directional gradient using YUV colors - /* - out.x = mag; - out.y = h; - out.z = v; - out.w = mag; - */ + //out.x = out.w = tex_shear(blur_src, pattern, x, y, 0); + //out.y = cgrad_f; + //out.z = cgrad_c; + //out.y = gradfact * out.w; const int astride = blockDim.x * gridDim.x; - dst[y * astride + x] = out; + dst[yi * astride + xi] = out; } + +} // end extern "C" ''' class Filtering(HunkOCode): @@ -296,12 +337,20 @@ class Filtering(HunkOCode): def init_mod(cls): if cls.mod is None: cls.mod = pycuda.compiler.SourceModule(assemble_code(BaseCode, cls), - options=['-use_fast_math', '-maxrregcount', '32']) + options=['-use_fast_math', '-maxrregcount', '32'], + no_extern_c=True) def __init__(self): self.init_mod() def de(self, ddst, dsrc, dscratch, gnm, dim, tc, nxf, stream=None): + # Log-scale the accumulated buffer in `dsrc`. + k1 = f32(gnm.color.brightness(tc) * 268 / 256) + # Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now + # s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w)) + area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w) + k2 = f32(1.0 / (area * gnm.spp(tc))) + # Helper variables and functions to keep it clean sb = 16 * dim.astride bs = sb * dim.ah @@ -324,47 +373,53 @@ class Filtering(HunkOCode): grad_dsc = mkdsc(1) grad_tref = mktref('blur_src') - bilateral, blur_h, blur_h_1cp, blur_v, fma_buf = map( + bilateral, logscale_den, den_blur, den_blur_1c, fma_buf = map( self.mod.get_function, - 'bilateral blur_h blur_h_1cp blur_v fma_buf'.split()) - ROUNDS = 1 # TODO: user customizable? + 'bilateral logscale_den den_blur den_blur_1c fma_buf'.split()) + + # Number of different shear patterns to use when filtering. Must be + # even, since we depend on buffer bouncing (but I don't think that it's + # a requirement for the filter itself to get decent results). + DIRECTIONS = 8 + + def do_bilateral(bsrc, bdst, pattern, r=15, sstd=3, cstd=0.1, + angscale=2.5, dhalfpt=1, dspeed=2000000, dpow=0.6): + # Scale spatial parameters so that a "pixel" is equivalent to an + # actual pixel at 1080p + sstd *= 1920. / dim.w - def do_bilateral(bsrc, bdst): tref.set_address_2d(bsrc, dsc, sb) - launch(blur_h, np.intp(bdst), texrefs=[tref]) + launch(den_blur, np.intp(bdst), i32(pattern), i32(0), + texrefs=[tref]) grad_tref.set_address_2d(bdst, grad_dsc, sb / 4) - launch(blur_v, dscratch, texrefs=[grad_tref]) + launch(den_blur_1c, dscratch, i32(pattern), i32(1), + texrefs=[grad_tref]) grad_tref.set_address_2d(dscratch, grad_dsc, sb / 4) - launch(blur_h_1cp, np.intp(bdst), texrefs=[tref]) - grad_tref.set_address_2d(bdst, grad_dsc, sb / 4) - launch(blur_v, dscratch, texrefs=[grad_tref]) - grad_tref.set_address_2d(dscratch, grad_dsc, sb / 4) - launch(bilateral, np.intp(bdst), f32(4), f32(0.1), - f32(5), f32(0.4), f32(0.6), texrefs=[tref, grad_tref]) - return bdst, bsrc + launch(bilateral, np.intp(bdst), i32(pattern), i32(r), + f32(sstd), f32(cstd), f32(angscale), + f32(dhalfpt), f32(dspeed), f32(dpow), k2, + texrefs=[tref, grad_tref]) + + def do_bilateral_range(bsrc, bdst, npats, *args, **kwargs): + + + for i in range(npats): + do_bilateral(bsrc, bdst, i, *args, **kwargs) + bdst, bsrc = bsrc, bdst + if npats % 2: + cuda.memcpy_dtod_async(bdst, bsrc, bs, stream) # Filter the first xform, using `ddst` as an intermediate buffer. # End result is the filtered copy in `dsrc`. - a, b = dsrc, ddst - for i in range(ROUNDS): - a, b = do_bilateral(a, b) - if ROUNDS % 2: - cuda.memcpy_dtod_async(b, a, bs, stream) + do_bilateral_range(dsrc, ddst, DIRECTIONS) # Filter the remaining xforms, using `ddst` as an intermediate # buffer, then add the result to `dsrc` (at the zero'th xform). for x in range(1, nxf): - a, b = int(dsrc) + x * bs, ddst - for i in range(ROUNDS): - a, b = do_bilateral(a, b) - launch(fma_buf, dsrc, np.intp(a), i32(dim.astride), f32(1)) + src = int(dsrc) + x * bs + do_bilateral_range(src, ddst, DIRECTIONS) + launch(fma_buf, dsrc, np.intp(src), i32(dim.astride), f32(1)) - # Log-scale the accumulated buffer in `dsrc`. - k1 = f32(gnm.color.brightness(tc) * 268 / 256) - # Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now - # s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w)) - area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w) - k2 = f32(1.0 / (area * gnm.spp(tc))) nbins = dim.ah * dim.astride logscale = self.mod.get_function("logscale") t = logscale(ddst, dsrc, k1, k2,