diff --git a/cuburn/code/filter.py b/cuburn/code/filter.py index 5da4b14..f9802df 100644 --- a/cuburn/code/filter.py +++ b/cuburn/code/filter.py @@ -63,4 +63,127 @@ void logfilt(float4 *pixbuf, float k1, float k2, } """ +class DensityEst(HunkOCode): + """ + NOTE: for now, this *must* be invoked with a block size of (32,16,1), and + a grid size of (W/32) for vertical filtering or (H/32) for horizontal. + It will probably fail for images that are not multiples of 32. + """ + + def __init__(self, features, cp): + self.features, self.cp = features, cp + + @property + def defs(self): + return self.defs_tmpl.substitute(features=self.features, cp=self.cp) + + defs_tmpl = Template(""" +#define W 15 // Filter width (regardless of standard deviation chosen) +#define W2 7 // Half of filter width, rounded down +#define NW 16 // Number of warps in each set of points +#define FW 30 // Width of local result storage per-lane (NW+W2+W2) +#define BX 32 // The size of a block's X dimension (== 1 warp) + +__global__ +void density_est(float4 *pixbuf, float *denbuf, int vertical) { + __shared__ float r[BX*FW], g[BX*FW], b[BX*FW], a[BX*FW]; + + int ibase; // The index of the first element within this lane's strip + int imax; // The maximum offset from the first element in the strip + int istride; // Number of indices until next point to filter + + if (vertical) { + ibase = threadIdx.x + blockIdx.x * BX; + imax = {{features.acc_height}}; + istride = {{features.acc_stride}}; + } else { + ibase = (blockIdx.x * BX + threadIdx.x) * {{features.acc_stride}}; + imax = {{features.acc_width}}; + istride = 1; + } + + for (int i = threadIdx.x + BX*threadIdx.y; i < BX*FW; i += NW * BX) + r[i] = g[i] = b[i] = a[i] = 0.0f; + + for (int i = threadIdx.y; i < imax; i += NW) { + int idx = ibase+i*istride; + float4 in = pixbuf[idx]; + float den = denbuf[idx]; + + int j = (threadIdx.y + W2) * 32 + threadIdx.x; + + float sd = {{0.35 * cp.estimator}} / powf(den+1.0f, {{cp.estimator_curve}}); + {{if cp.estimator_minimum > 1}} + sd = fmaxf(sd, {{cp.estimator_minimum}}); + {{endif}} + sd *= sd; + + // TODO: log scaling here? matches flam3, but, ick + // TODO: investigate harm caused by varying standard deviation in a + // separable environment + float coeff = rsqrtf(2.0f*M_PI*sd*sd); + atomicAdd(r+j, in.x * coeff); + atomicAdd(g+j, in.y * coeff); + atomicAdd(b+j, in.z * coeff); + atomicAdd(a+j, in.w * coeff); + sd = -0.5/sd; + + // #pragma unroll + for (int k = 1; k <= W2; k++) { + float scale = exp(sd*k*k)*coeff; + idx = j+k*32; + atomicAdd(r+idx, in.x * scale); + atomicAdd(g+idx, in.y * scale); + atomicAdd(b+idx, in.z * scale); + atomicAdd(a+idx, in.w * scale); + idx = j-k*32; + atomicAdd(r+idx, in.x * scale); + atomicAdd(g+idx, in.y * scale); + atomicAdd(b+idx, in.z * scale); + atomicAdd(a+idx, in.w * scale); + } + + __syncthreads(); + float4 out; + j = threadIdx.y * BX + threadIdx.x; + out.x = r[j]; + out.y = g[j]; + out.z = b[j]; + out.w = a[j]; + idx = ibase+(i-W2)*istride; + if (idx > 0) + pixbuf[idx] = out; + __syncthreads(); + + // TODO: shift instead of copying + idx = threadIdx.x + BX * threadIdx.y; + if (threadIdx.y < NW-2) { + r[idx] = r[idx + BX*NW]; + g[idx] = g[idx + BX*NW]; + b[idx] = b[idx + BX*NW]; + a[idx] = a[idx + BX*NW]; + } + __syncthreads(); + + r[idx + BX*(NW-2)] = 0.0f; + g[idx + BX*(NW-2)] = 0.0f; + b[idx + BX*(NW-2)] = 0.0f; + a[idx + BX*(NW-2)] = 0.0f; + __syncthreads(); + } +} + +""") + + def invoke(self, mod, abufd, dbufd): + fun = mod.get_function("density_est") + t = fun(abufd, dbufd, np.int32(0), + block=(32, 16, 1), grid=(self.features.acc_height/32,1), + time_kernel=True) + print "Horizontal density estimation: %g" % t + + t = fun(abufd, dbufd, np.int32(1), + block=(32, 16, 1), grid=(self.features.acc_width/32,1), + time_kernel=True) + print "Vertical density estimation: %g" % t diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py index 610a246..e5158fd 100644 --- a/cuburn/code/iter.py +++ b/cuburn/code/iter.py @@ -160,11 +160,13 @@ def render(features, cps): seeds = mwc.MWC.make_seeds(512 * nsteps) iter = IterCode(features) - code = assemble_code(BaseCode, mwc.MWC, iter.packer, iter, filter.ColorClip) + de = filter.DensityEst(features, cps[0]) + code = assemble_code(BaseCode, mwc.MWC, iter.packer, iter, + filter.ColorClip, de) for lno, line in enumerate(code.split('\n')): print '%3d %s' % (lno, line) - mod = SourceModule(code, keep=True, + mod = SourceModule(code, options=['-use_fast_math', '-maxrregcount', '32']) cps_as_array = (Genome * len(cps))() @@ -222,6 +224,8 @@ def render(features, cps): area = features.width * features.height / cp.ppu ** 2 k2 = 1 / (area * cp.adj_density) + de.invoke(mod, abufd, dbufd) + fun = mod.get_function("logfilt") t = fun(abufd, f(k1), f(k2), f(1 / cp.gamma), f(cp.vibrancy), f(cp.highlight_power), @@ -229,10 +233,10 @@ def render(features, cps): print "Completed color filtering in %g seconds" % t abuf = cuda.from_device_like(abufd, abuf) - dbuf = cuda.from_device_like(dbufd, dbuf) return abuf, dbuf + # TODO: find a better place to stick this code class MemBench(HunkOCode): decls = """ diff --git a/cuburn/render.py b/cuburn/render.py index 9285b92..f025b27 100644 --- a/cuburn/render.py +++ b/cuburn/render.py @@ -112,6 +112,9 @@ class Features(object): self.width = genomes[0].width self.height = genomes[0].height + self.acc_width = genomes[0].width + self.acc_height = genomes[0].height + self.acc_stride = genomes[0].width class XFormFeatures(object): def __init__(self, xforms, xform_id):