From 3d94c256a957e6d529fff4ee1ce79395e752bed9 Mon Sep 17 00:00:00 2001 From: Steven Robertson Date: Thu, 5 May 2011 23:35:54 -0400 Subject: [PATCH] Another non-working checkin --- cuburn/code/filter.py | 260 ++++++++++++++++++++++++++---------------- cuburn/code/iter.py | 14 +-- 2 files changed, 169 insertions(+), 105 deletions(-) diff --git a/cuburn/code/filter.py b/cuburn/code/filter.py index f9802df..f4f102b 100644 --- a/cuburn/code/filter.py +++ b/cuburn/code/filter.py @@ -4,32 +4,22 @@ from cuburn.code.util import * class ColorClip(HunkOCode): defs = """ __global__ -void logfilt(float4 *pixbuf, float k1, float k2, - float gamma, float vibrancy, float highpow) { +void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow) { // TODO: test if over an edge of the framebuffer int i = blockDim.x * blockIdx.x + threadIdx.x; float4 pix = pixbuf[i]; if (pix.w <= 0) return; - float ls = k1 * logf(1.0 + pix.w * k2) / pix.w; - pix.x *= ls; - pix.y *= ls; - pix.z *= ls; - pix.w *= ls; - float4 opix = pix; // TODO: linearized bottom range float alpha = powf(pix.w, gamma); - ls = vibrancy * alpha / pix.w; + float ls = vibrancy * alpha / pix.w; float maxc = fmaxf(pix.x, fmaxf(pix.y, pix.z)); float newls = 1 / maxc; - // TODO: detect if highlight power is globally disabled and drop - // this branch - if (maxc * ls > 1 && highpow >= 0) { // TODO: does CUDA autopromote the int here to a float before GPU? float lsratio = powf(newls / ls, highpow); @@ -65,15 +55,18 @@ void logfilt(float4 *pixbuf, float k1, float k2, class DensityEst(HunkOCode): """ - NOTE: for now, this *must* be invoked with a block size of (32,16,1), and - a grid size of (W/32) for vertical filtering or (H/32) for horizontal. - - It will probably fail for images that are not multiples of 32. + NOTE: for now, this *must* be invoked with a block size of (32,32,1), and + a grid size of (W/32,1). At least 7 pixel gutters are required, and the + stride and height probably need to be multiples of 32. """ + # Note, changing this does not yet have any effect, it's just informational + MAX_WIDTH=15 + def __init__(self, features, cp): self.features, self.cp = features, cp + headers = "#include\n" @property def defs(self): return self.defs_tmpl.substitute(features=self.features, cp=self.cp) @@ -81,109 +74,184 @@ class DensityEst(HunkOCode): defs_tmpl = Template(""" #define W 15 // Filter width (regardless of standard deviation chosen) #define W2 7 // Half of filter width, rounded down -#define NW 16 // Number of warps in each set of points -#define FW 30 // Width of local result storage per-lane (NW+W2+W2) -#define BX 32 // The size of a block's X dimension (== 1 warp) +#define FW 46 // Width of local result storage (NW+W2+W2) +#define FW2 (FW*FW) + +__shared__ float de_r[FW2], de_g[FW2], de_b[FW2], de_a[FW2]; + +__device__ void de_add(int ibase, int ii, int jj, float4 scaled) { + int idx = ibase + FW * ii + jj; + atomicAdd(de_r+idx, scaled.x); + atomicAdd(de_g+idx, scaled.y); + atomicAdd(de_b+idx, scaled.z); + atomicAdd(de_a+idx, scaled.w); +} __global__ -void density_est(float4 *pixbuf, float *denbuf, int vertical) { - __shared__ float r[BX*FW], g[BX*FW], b[BX*FW], a[BX*FW]; +void logscale(float4 *pixbuf, float4 *outbuf, float k1, float k2) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + float4 pix = pixbuf[i]; - int ibase; // The index of the first element within this lane's strip - int imax; // The maximum offset from the first element in the strip - int istride; // Number of indices until next point to filter + float ls = fmaxf(0, k1 * logf(1.0 + pix.w * k2) / pix.w); + pix.x *= ls; + pix.y *= ls; + pix.z *= ls; + pix.w *= ls; - if (vertical) { - ibase = threadIdx.x + blockIdx.x * BX; - imax = {{features.acc_height}}; - istride = {{features.acc_stride}}; - } else { - ibase = (blockIdx.x * BX + threadIdx.x) * {{features.acc_stride}}; - imax = {{features.acc_width}}; - istride = 1; - } + outbuf[i] = pix; +} - for (int i = threadIdx.x + BX*threadIdx.y; i < BX*FW; i += NW * BX) - r[i] = g[i] = b[i] = a[i] = 0.0f; +__global__ +void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf, + float k1, float k2) { + for (int i = threadIdx.x + 32*threadIdx.y; i < FW2; i += 32) + de_r[i] = de_g[i] = de_b[i] = de_a[i] = 0.0f; + __syncthreads(); + + for (int imrow = threadIdx.y + W2; imrow < {{features.acc_height}}; imrow += 32) + { + int idx = {{features.acc_stride}} * imrow + + + blockIdx.x * 32 + threadIdx.x + W2; - for (int i = threadIdx.y; i < imax; i += NW) { - int idx = ibase+i*istride; float4 in = pixbuf[idx]; float den = denbuf[idx]; - int j = (threadIdx.y + W2) * 32 + threadIdx.x; + if (in.w > 0 && den > 0) { + float ls = k1 * 12 * logf(1.0 + in.w * k2) / in.w; + in.x *= ls; + in.y *= ls; + in.z *= ls; + in.w *= ls; - float sd = {{0.35 * cp.estimator}} / powf(den+1.0f, {{cp.estimator_curve}}); - {{if cp.estimator_minimum > 1}} - sd = fmaxf(sd, {{cp.estimator_minimum}}); - {{endif}} - sd *= sd; + // Calculate standard deviation of Gaussian kernel. + // flam3_gaussian_filter() uses an implicit standard deviation of 0.5, + // but the DE filters scale filter distance by the default spatial + // support factor of 1.5, so the effective base SD is (0.5/1.5)=1/3. + float sd = {{cp.estimator / 3.}}; - // TODO: log scaling here? matches flam3, but, ick - // TODO: investigate harm caused by varying standard deviation in a - // separable environment - float coeff = rsqrtf(2.0f*M_PI*sd*sd); - atomicAdd(r+j, in.x * coeff); - atomicAdd(g+j, in.y * coeff); - atomicAdd(b+j, in.z * coeff); - atomicAdd(a+j, in.w * coeff); - sd = -0.5/sd; + // The base SD is then scaled in inverse proportion to the density of + // the point being scaled. + sd *= powf(den+1.0f, {{-cp.estimator_curve}}); - // #pragma unroll - for (int k = 1; k <= W2; k++) { - float scale = exp(sd*k*k)*coeff; - idx = j+k*32; - atomicAdd(r+idx, in.x * scale); - atomicAdd(g+idx, in.y * scale); - atomicAdd(b+idx, in.z * scale); - atomicAdd(a+idx, in.w * scale); - idx = j-k*32; - atomicAdd(r+idx, in.x * scale); - atomicAdd(g+idx, in.y * scale); - atomicAdd(b+idx, in.z * scale); - atomicAdd(a+idx, in.w * scale); + // Clamp the final standard deviation. Things will go badly if the + // minimum is undershot. + sd = fmaxf(sd, {{max(cp.estimator_minimum / 3., 0.3)}} ); + + // This five-term polynomial approximates the sum of the filters with + // the clamping logic used here. See helpers/filt_err.py. + float filtsum; + filtsum = -0.20885075f * sd + 0.90557721f; + filtsum = filtsum * sd + 5.28363054f; + filtsum = filtsum * sd + -0.11733533f; + filtsum = filtsum * sd + 0.35670333f; + float filtscale = 1 / filtsum; + + // The reciprocal SD scaling coeffecient in the Gaussian exponent. + // exp(-x^2/(2*sd^2)) = exp2f(x^2*rsd) + float rsd = -0.5f * CUDART_L2E_F / (sd * sd); + + int si = (threadIdx.y + W2) * FW + threadIdx.x + W2; + for (int jj = 0; jj <= W2; jj++) { + float jj2f = jj; + jj2f *= jj2f; + + float iif = 0; + for (int ii = 0; ii <= jj; ii++) { + iif += 1; + + float coeff = exp2f((jj2f + iif * iif) * rsd) * filtscale; + if (coeff < 0.0001f) break; + + float4 scaled; + scaled.x = in.x * coeff; + scaled.y = in.y * coeff; + scaled.z = in.z * coeff; + scaled.w = in.w * coeff; + + de_add(si, ii, jj, scaled); + if (jj == 0) continue; + de_add(si, ii, -jj, scaled); + if (ii != 0) { + de_add(si, -ii, jj, scaled); + de_add(si, -ii, -jj, scaled); + if (ii == jj) continue; + } + de_add(si, jj, ii, scaled); + de_add(si, -jj, ii, scaled); + + if (ii == 0) continue; + de_add(si, -jj, -ii, scaled); + de_add(si, jj, -ii, scaled); + + // TODO: validate that the above avoids bank conflicts + } + } } __syncthreads(); - float4 out; - j = threadIdx.y * BX + threadIdx.x; - out.x = r[j]; - out.y = g[j]; - out.z = b[j]; - out.w = a[j]; - idx = ibase+(i-W2)*istride; - if (idx > 0) - pixbuf[idx] = out; - __syncthreads(); + // TODO: could coalesce this, but what a pain + for (int i = threadIdx.x; i < FW; i += 32) { + idx = {{features.acc_stride}} * imrow + blockIdx.x * 32 + i + W2; + int si = threadIdx.y * FW + i; + float *out = reinterpret_cast(&outbuf[idx]); + atomicAdd(out, de_r[si]); + atomicAdd(out+1, de_g[si]); + atomicAdd(out+2, de_b[si]); + atomicAdd(out+3, de_a[si]); + } + if (threadIdx.y == 5000) { + for (int i = threadIdx.x; i < FW; i += 32) { + idx = {{features.acc_stride}} * (imrow + 32) + + blockIdx.x * 32 + i + W2; + int si = 32 * FW + i; + float *out = reinterpret_cast(&outbuf[idx]); + atomicAdd(out, 0.2 + de_r[si]); + atomicAdd(out+1, de_g[si]); + atomicAdd(out+2, de_b[si]); + atomicAdd(out+3, de_a[si]); + } + } + + __syncthreads(); // TODO: shift instead of copying - idx = threadIdx.x + BX * threadIdx.y; - if (threadIdx.y < NW-2) { - r[idx] = r[idx + BX*NW]; - g[idx] = g[idx + BX*NW]; - b[idx] = b[idx + BX*NW]; - a[idx] = a[idx + BX*NW]; + int tid = threadIdx.y * 32 + threadIdx.x; + for (int i = tid; i < FW*(W2+W2); i += 512) { + de_r[i] = de_r[i+FW*32]; + de_g[i] = de_g[i+FW*32]; + de_b[i] = de_b[i+FW*32]; + de_a[i] = de_a[i+FW*32]; } - __syncthreads(); - r[idx + BX*(NW-2)] = 0.0f; - g[idx + BX*(NW-2)] = 0.0f; - b[idx + BX*(NW-2)] = 0.0f; - a[idx + BX*(NW-2)] = 0.0f; + __syncthreads(); + for (int i = tid + FW*(W2+W2); i < FW2; i += 512) { + de_r[i] = 0.0f; + de_g[i] = 0.0f; + de_b[i] = 0.0f; + de_a[i] = 0.0f; + } __syncthreads(); } } """) - def invoke(self, mod, abufd, dbufd): - fun = mod.get_function("density_est") - t = fun(abufd, dbufd, np.int32(0), - block=(32, 16, 1), grid=(self.features.acc_height/32,1), - time_kernel=True) - print "Horizontal density estimation: %g" % t + def invoke(self, mod, abufd, obufd, dbufd): + # TODO: add no-est version + # TODO: come up with a general way to average these parameters + k1 = self.cp.brightness * 268 / 256 + area = self.features.width * self.features.height / self.cp.ppu ** 2 + k2 = 1 / (area * self.cp.adj_density) + + if self.cp.estimator == 0: + fun = mod.get_function("logscale") + t = fun(abufd, obufd, np.float32(k1), np.float32(k2), + block=(self.features.acc_width, 1, 1), + grid=(self.features.acc_height, 1), time_kernel=True) + else: + fun = mod.get_function("density_est") + t = fun(abufd, obufd, dbufd, np.float32(k1), np.float32(k2), + block=(32, 32, 1), grid=(self.features.acc_stride/32 - 1, 1), + time_kernel=True) + print "Density estimation: %g" % t - t = fun(abufd, dbufd, np.int32(1), - block=(32, 16, 1), grid=(self.features.acc_width/32,1), - time_kernel=True) - print "Vertical density estimation: %g" % t diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py index e5158fd..c8e980c 100644 --- a/cuburn/code/iter.py +++ b/cuburn/code/iter.py @@ -220,19 +220,15 @@ def render(features, cps): npix = features.width * features.height - k1 = cp.brightness * 268 / 256 - area = features.width * features.height / cp.ppu ** 2 - k2 = 1 / (area * cp.adj_density) + obufd = cuda.to_device(abuf) + de.invoke(mod, abufd, obufd, dbufd) - de.invoke(mod, abufd, dbufd) - - fun = mod.get_function("logfilt") - t = fun(abufd, f(k1), f(k2), - f(1 / cp.gamma), f(cp.vibrancy), f(cp.highlight_power), + fun = mod.get_function("colorclip") + t = fun(obufd, f(1 / cp.gamma), f(cp.vibrancy), f(cp.highlight_power), block=(256,1,1), grid=(npix/256,1), time_kernel=True) print "Completed color filtering in %g seconds" % t - abuf = cuda.from_device_like(abufd, abuf) + abuf = cuda.from_device_like(obufd, abuf) return abuf, dbuf