diff --git a/cuburn/code/filtering.py b/cuburn/code/filtering.py
index 8f8ad63..0c184cf 100644
--- a/cuburn/code/filtering.py
+++ b/cuburn/code/filtering.py
@@ -2,6 +2,7 @@
 import numpy as np
 from numpy import float32 as f32, int32 as i32
 
+import pycuda.driver as cuda
 import pycuda.compiler
 from pycuda.gpuarray import vec
 
@@ -266,6 +267,57 @@ void density_est(float4 *outbuf, const float4 *pixbuf,
         __syncthreads();
     }
 }
+
+__global__
+void fma_buf(float4 *dst, const float4 *sub, int astride, float scale) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    int i = y * astride + x;
+    float4 d = dst[i], s = sub[i];
+    d.x += s.x * scale;
+    d.y += s.y * scale;
+    d.z += s.z * scale;
+    d.w += s.w * scale;
+    dst[i] = d;
+}
+
+texture<float4, cudaTextureType2D> bilateral_src;
+
+__global__
+void bilateral(float4 *dst, int astride, float dstd, float rstd) {
+    int xi = blockIdx.x * blockDim.x + threadIdx.x;
+    int yi = blockIdx.y * blockDim.y + threadIdx.y;
+    float x = xi, y = yi;
+
+    const float r = 9.0f;
+
+    float cen_den = tex2D(bilateral_src, x, y).w;
+    float4 out = make_float4(0, 0, 0, 0);
+    float weightsum = 0.0f;
+
+    for (float i = -r; i <= r; i++) {
+        for (float j = -r; j <= r; j++) {
+            float4 pix = tex2D(bilateral_src, x + i, y + j);
+            float den_diff = log2f(fabsf(pix.w - cen_den) + 1.0f);
+            float factor = expf(i * i * dstd)
+                         * expf(j * j * dstd)
+                         * expf(den_diff * den_diff * rstd);
+            out.x += factor * pix.x;
+            out.y += factor * pix.y;
+            out.z += factor * pix.z;
+            out.w += factor * pix.w;
+            weightsum += factor;
+        }
+    }
+    float weightrcp = 1.0f / weightsum;
+    out.x *= weightrcp;
+    out.y *= weightrcp;
+    out.z *= weightrcp;
+    out.w *= weightrcp;
+
+    dst[yi * astride + xi] = out;
+}
+
 '''
 
 class Filtering(object):
@@ -289,17 +341,56 @@ class Filtering(object):
         blur_v(ddst, dscratch, i32(dim.astride), i32(dim.ah),
                block=(192, 1, 1), grid=(dim.astride / 192, dim.ah), stream=stream)
 
-    def de(self, ddst, dsrc, gnm, dim, tc, stream=None):
+    def de(self, ddst, dsrc, gnm, dim, tc, nxf, stream=None):
         k1 = f32(gnm.color.brightness(tc) * 268 / 256)
         # Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
         # s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
         area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w)
         k2 = f32(1.0 / (area * gnm.spp(tc)))
 
-        if gnm.de.radius == 0:
+        if gnm.de.radius(tc) == 0 or True:
+            # Stride in bytes, and buffer size
+            sb = 16 * dim.astride
+            bs = sb * dim.ah
+
+
+            dsc = argset(cuda.ArrayDescriptor(), height=dim.ah,
+                         width=dim.astride, format=cuda.array_format.FLOAT,
+                         num_channels=4)
+            tref = self.mod.get_texref('bilateral_src')
+            tref.set_filter_mode(cuda.filter_mode.POINT)
+            tref.set_address_mode(0, cuda.address_mode.WRAP)
+            tref.set_address_mode(1, cuda.address_mode.WRAP)
+
+            bilateral = self.mod.get_function('bilateral')
+            fma_buf = self.mod.get_function('fma_buf')
+            for i in range(nxf):
+                tref.set_address_2d(int(dsrc) + bs * i, dsc, sb)
+                bilateral(ddst, i32(dim.astride), f32(-0.1), f32(-0.1),
+                          block=(32, 8, 1), grid=(dim.astride / 32, dim.ah / 8),
+                          texrefs=[tref], stream=stream)
+                if i == 0:
+                    cuda.memcpy_dtod_async(dsrc, ddst, bs, stream)
+                else:
+                    fma_buf(dsrc, ddst, i32(dim.astride), f32(1.0),
+                            block=(32, 8, 1), grid=(dim.astride / 32, dim.ah / 8),
+                            stream=stream)
+
+            tref.set_address_2d(dsrc, dsc, sb)
+            ROUNDS = 3
+
+            a, b = dsrc, ddst
+            for i in range(ROUNDS):
+                bilateral(b, i32(dim.astride), f32(-0.1), f32(-0.2),
+                          block=(32, 8, 1), grid=(dim.astride / 32, dim.ah / 8),
+                          texrefs=[tref], stream=stream)
+                a, b = b, a
+
             nbins = dim.ah * dim.astride
             fun = self.mod.get_function("logscale")
-            t = fun(dsrc, ddst, k1, k2,
+            # This function only looks at one pixel, so using the same
+            # parameter for input and output is OK (if e.g. ROUNDS is odd)
+            t = fun(a, ddst, k1, k2,
                     block=(512, 1, 1), grid=(nbins/512, 1), stream=stream)
         else:
             scale_coeff = f32(1.0 / gnm.de.radius(tc))
diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py
index 5ba9d68..03c8bc6 100644
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@@ -240,6 +240,7 @@ void iter(
     if (threadIdx.y == 0 && threadIdx.x < {{NWARPS*2}})
         cosel[threadIdx.x] = mwc_next_01(rctx);
     __syncthreads();
+    int last_xf_used = 0;
 {{endif}}
 
     bool fuse = false;
@@ -285,12 +286,16 @@ void iter(
     {{precalc_densities(pcp, std_xforms)}}
     float xfsel = cosel[threadIdx.y];
 
-    {{for xform_name in std_xforms[:-1]}}
+    {{for xform_idx, xform_name in enumerate(std_xforms[:-1])}}
     if (xfsel <= {{pcp['den_'+xform_name]}}) {
         apply_xf_{{xform_name}}(x, y, color, rctx);
+        last_xf_used = {{xform_idx}};
     } else
     {{endfor}}
+    {
         apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);
+        last_xf_used = {{len(std_xforms)-1}};
+    }
 
     // Rotate points between threads.
     int swr = threadIdx.y + threadIdx.x
@@ -298,7 +303,7 @@ void iter(
     int sw = (swr * 32 + threadIdx.x) & {{NTHREADS-1}};
     int sr = threadIdx.y * 32 + threadIdx.x;
 
-    swap[sw] = fuse ? 1.0f : 0.0f;
+    swap[sw] = fuse ? -1.0f : last_xf_used;
     swap[sw+{{NTHREADS}}] = x;
     swap[sw+{{2*NTHREADS}}] = y;
     swap[sw+{{3*NTHREADS}}] = color;
@@ -308,7 +313,8 @@ void iter(
     if (threadIdx.y == 0 && threadIdx.x < {{NWARPS*2}})
         cosel[threadIdx.x] = mwc_next_01(rctx);
 
-    fuse = swap[sr];
+    last_xf_used = swap[sr];
+    fuse = (last_xf_used < 0);
     x = swap[sr+{{NTHREADS}}];
     y = swap[sr+{{2*NTHREADS}}];
     color = swap[sr+{{3*NTHREADS}}];
@@ -352,7 +358,8 @@ void iter(
         continue;
     }
 
-    uint32_t i = iy * acc_size.astride + ix;
+    uint32_t i = (last_xf_used * acc_size.aheight + iy)
+               * acc_size.astride + ix;
 {{if info.acc_mode == 'atomic'}}
     asm volatile ({{crep("""
 {
diff --git a/cuburn/render.py b/cuburn/render.py
index 1dca997..2f0d09a 100644
--- a/cuburn/render.py
+++ b/cuburn/render.py
@@ -185,11 +185,11 @@ class Renderer(object):
         cuda.memcpy_htod_async(d_acc_size, u32(list(dim)), write_stream)
         nbins = astride * aheight
 
-        # Extra padding in accum helps with write_shmem overruns
-        d_accum = cuda.mem_alloc(16 * nbins + (1<<16))
-        d_out = cuda.mem_alloc(16 * aheight * astride)
+        nxf = len(filter(lambda g: g != 'final', genome.xforms))
+        d_accum = cuda.mem_alloc(16 * nbins * nxf)
+        d_out = cuda.mem_alloc(16 * nbins)
         if self.acc_mode == 'atomic':
-            d_atom = cuda.mem_alloc(8 * nbins)
+            d_atom = cuda.mem_alloc(8 * nbins * nxf)
             flush_fun = self.mod.get_function("flush_atom")
 
         obuf_copy = util.argset(cuda.Memcpy2D(),
@@ -310,9 +310,11 @@ class Renderer(object):
             #for i, n in zip(d_temp[5], self._iter.packer.packed):
                 #print '%60s %g' % ('_'.join(n), i)
 
-            util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, write_stream)
+            util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins * nxf,
+                                    write_stream)
             if self.acc_mode == 'atomic':
-                util.BaseCode.fill_dptr(self.mod, d_atom, 2 * nbins, write_stream)
+                util.BaseCode.fill_dptr(self.mod, d_atom, 2 * nbins * nxf,
+                                        write_stream)
             nrounds = int( (genome.spp(tc) * width * height)
                          / (ntemporal_samples * 256 * 256) ) + 1
             if self.acc_mode == 'deferred':
@@ -334,14 +336,14 @@ class Renderer(object):
                 iter_fun(*args, block=(32, self._iter.NTHREADS/32, 1),
                          grid=(ntemporal_samples, nrounds), stream=iter_stream)
                 if self.acc_mode == 'atomic':
-                    nblocks = int(np.ceil(np.sqrt(nbins/float(512))))
-                    flush_fun(u64(d_accum), u64(d_atom), i32(nbins),
+                    nblocks = int(np.ceil(np.sqrt(nbins*nxf/float(512))))
+                    flush_fun(u64(d_accum), u64(d_atom), i32(nbins*nxf),
                               block=(512, 1, 1), grid=(nblocks, nblocks),
                               stream=iter_stream)
 
             util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
             _sync_stream(filt_stream, write_stream)
-            filt.de(d_out, d_accum, genome, dim, tc, stream=filt_stream)
+            filt.de(d_out, d_accum, genome, dim, tc, nxf, stream=filt_stream)
             _sync_stream(write_stream, filt_stream)
             filt.colorclip(d_out, genome, dim, tc, blend, stream=filt_stream)
             obuf_copy.set_dst_host(h_out_a)
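
Reviewer note (not part of the patch): after this change, d_accum holds one full accumulation plane per non-final xform, stacked contiguously; the new index in iter.py writes each point into the plane of the xform that produced it, and de() in filtering.py bilateral-filters each plane before summing them back into a single buffer with fma_buf. The rough host-side NumPy sketch below illustrates that layout and reduction; the helper names (plane_offset, reduce_planes) are hypothetical and the per-plane filter is elided.

    import numpy as np

    def plane_offset(xf_idx, aheight, astride):
        # Byte offset of xform xf_idx's plane inside d_accum. Each plane is
        # aheight * astride float4 (16-byte) pixels, which is what the device
        # index (last_xf_used * acc_size.aheight + iy) * acc_size.astride + ix
        # addresses, and what de() steps over with int(dsrc) + bs * i.
        return 16 * aheight * astride * xf_idx

    def reduce_planes(accum, nxf, aheight, astride):
        # accum: flat float32 view of d_accum, length nxf * aheight * astride * 4.
        planes = accum.reshape(nxf, aheight, astride, 4)
        out = np.zeros((aheight, astride, 4), dtype=np.float32)
        for i in range(nxf):
            # On the device each plane is bilateral-filtered before this add;
            # fma_buf then accumulates the filtered plane with scale = 1.0.
            out += planes[i]
        return out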