diff --git a/cuburn/code/filter.py b/cuburn/code/filter.py index f4f102b..a41674d 100644 --- a/cuburn/code/filter.py +++ b/cuburn/code/filter.py @@ -117,7 +117,7 @@ void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf, float den = denbuf[idx]; if (in.w > 0 && den > 0) { - float ls = k1 * 12 * logf(1.0 + in.w * k2) / in.w; + float ls = k1 * logf(1.0f + in.w * k2) / in.w; in.x *= ls; in.y *= ls; in.z *= ls; @@ -200,19 +200,6 @@ void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf, atomicAdd(out+3, de_a[si]); } - if (threadIdx.y == 5000) { - for (int i = threadIdx.x; i < FW; i += 32) { - idx = {{features.acc_stride}} * (imrow + 32) - + blockIdx.x * 32 + i + W2; - int si = 32 * FW + i; - float *out = reinterpret_cast(&outbuf[idx]); - atomicAdd(out, 0.2 + de_r[si]); - atomicAdd(out+1, de_g[si]); - atomicAdd(out+2, de_b[si]); - atomicAdd(out+3, de_a[si]); - } - } - __syncthreads(); // TODO: shift instead of copying int tid = threadIdx.y * 32 + threadIdx.x; @@ -239,9 +226,11 @@ void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf, def invoke(self, mod, abufd, obufd, dbufd): # TODO: add no-est version # TODO: come up with a general way to average these parameters + k1 = self.cp.brightness * 268 / 256 - area = self.features.width * self.features.height / self.cp.ppu ** 2 + area = self.features.acc_width * self.features.acc_height / self.cp.ppu ** 2 k2 = 1 / (area * self.cp.adj_density) + print k1, k2, area if self.cp.estimator == 0: fun = mod.get_function("logscale") @@ -251,7 +240,7 @@ void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf, else: fun = mod.get_function("density_est") t = fun(abufd, obufd, dbufd, np.float32(k1), np.float32(k2), - block=(32, 32, 1), grid=(self.features.acc_stride/32 - 1, 1), + block=(32, 32, 1), grid=(self.features.acc_width/32, 1), time_kernel=True) print "Density estimation: %g" % t diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py index 15f7ce4..b714569 100644 --- a/cuburn/code/iter.py +++ b/cuburn/code/iter.py @@ -8,6 +8,7 @@ import pycuda.driver as cuda from pycuda.driver import In, Out, InOut from pycuda.compiler import SourceModule import numpy as np +from scipy import ndimage from fr0stlib.pyflam3 import flam3_interpolate from cuburn.code import mwc, variations, filter @@ -125,8 +126,8 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) { int ix = trunca(cx+ditherx), iy = trunca(cy+dithery); - if (ix < 0 || ix >= {{features.width}} || - iy < 0 || iy >= {{features.height}} ) { + if (ix < 0 || ix >= {{features.acc_width}} || + iy < 0 || iy >= {{features.acc_height}} ) { consec_bad++; if (consec_bad > {{features.max_oob}}) { x = mwc_next_11(&rctx); @@ -137,7 +138,7 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) { continue; } - int i = iy * {{features.width}} + ix; + int i = iy * {{features.acc_stride}} + ix; float4 outcol = tex2D(palTex, color, {{packer.get('cp_step_frac')}}); float4 pix = accbuf[i]; @@ -148,6 +149,7 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) { accbuf[i] = pix; // TODO: atomic operations (or better) denbuf[i] += 1.0f; } + asm volatile ("membar.cta;"); } """) return tmpl.substitute( @@ -158,8 +160,8 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) { def render(features, cps): # TODO: make this adjustable via genome nsteps = 1000 - abuf = np.zeros((features.height, features.width, 4), dtype=np.float32) - dbuf = np.zeros((features.height, features.width), dtype=np.float32) + abuf = np.zeros((features.acc_height, features.acc_stride, 4), dtype=np.float32) + dbuf = np.zeros((features.acc_height, features.acc_stride), dtype=np.float32) seeds = mwc.MWC.make_seeds(512 * nsteps) iter = IterCode(features) @@ -221,9 +223,13 @@ def render(features, cps): f = np.float32 - npix = features.width * features.height + npix = features.acc_width * features.acc_height + # TODO: just allocate obufd = cuda.to_device(abuf) + dbuf = cuda.from_device_like(dbufd, dbuf) + dbuf = ndimage.filters.gaussian_filter(dbuf, 0.6) + dbufd = cuda.to_device(dbuf) de.invoke(mod, abufd, obufd, dbufd) fun = mod.get_function("colorclip") diff --git a/cuburn/render.py b/cuburn/render.py index f025b27..9267f14 100644 --- a/cuburn/render.py +++ b/cuburn/render.py @@ -38,7 +38,9 @@ class Genome(pyflam3.Genome): """ # TODO: when reading as a property during packing, this may be # calculated 6 times instead of 1 - return ( affine.translate(0.5 * cp.width, 0.5 * cp.height) + # TODO: also requires knowing gutter width + g = Features.gutter + return ( affine.translate(0.5 * cp.width + g, 0.5 * cp.height + g) * affine.scale(cp.ppu, cp.ppu) * affine.translate(-cp._center[0], -cp._center[1]) * affine.rotate(cp.rotate * 2 * np.pi / 360, @@ -90,6 +92,10 @@ class Features(object): # performance too much. Power-of-two, please. palette_height = 16 + # Maximum width of DE and other spatial filters, and thus in turn the + # amount of padding applied + gutter = 16 + def __init__(self, genomes): any = lambda l: bool(filter(None, map(l, genomes))) self.max_ntemporal_samples = max( @@ -112,9 +118,9 @@ class Features(object): self.width = genomes[0].width self.height = genomes[0].height - self.acc_width = genomes[0].width - self.acc_height = genomes[0].height - self.acc_stride = genomes[0].width + self.acc_width = genomes[0].width + 2 * self.gutter + self.acc_height = genomes[0].height + 2 * self.gutter + self.acc_stride = genomes[0].width + 2 * self.gutter class XFormFeatures(object): def __init__(self, xforms, xform_id): diff --git a/main.py b/main.py index 5316f5b..bf1ce85 100644 --- a/main.py +++ b/main.py @@ -44,6 +44,10 @@ def main(args): genomes = Genome.from_string(fp.read()) anim = Animation(genomes) accum, den = render(anim.features, genomes) + accum = np.delete(accum, np.s_[:16], axis=0) + accum = np.delete(accum, np.s_[:16], axis=1) + accum = np.delete(accum, np.s_[-16:], axis=0) + accum = np.delete(accum, np.s_[-16:], axis=1) noalpha = np.delete(accum, 3, axis=2) scipy.misc.imsave('rendered.png', noalpha)