From 1398706886afb2dadc8b3711f01191298bdae4f5 Mon Sep 17 00:00:00 2001 From: Steven Robertson Date: Fri, 20 Jan 2012 09:35:12 -0500 Subject: [PATCH] Remove SS from DE, and improve performance. --- cuburn/code/filtering.py | 118 +++++++++++++++++++++++---------------- cuburn/code/iter.py | 7 +-- cuburn/render.py | 25 ++++----- 3 files changed, 83 insertions(+), 67 deletions(-) diff --git a/cuburn/code/filtering.py b/cuburn/code/filtering.py index 32ab149..8f8ad63 100644 --- a/cuburn/code/filtering.py +++ b/cuburn/code/filtering.py @@ -147,78 +147,98 @@ void blur_density_v(float4 *pixbuf, const float *scratch, *ddst = min(*ddst, den); } -#define W 15 // Filter width (regardless of standard deviation chosen) -#define W2 7 // Half of filter width, rounded down -#define FW 46 // Width of local result storage (NW+W2+W2) +#define W 21 // Filter width (regardless of standard deviation chosen) +#define W2 10 // Half of filter width, rounded down +#define FW 53 // Width of local result storage (NW+W2+W2) #define FW2 (FW*FW) __global__ void density_est(float4 *outbuf, const float4 *pixbuf, float scale_coeff, float est_curve, float edge_clamp, - float k1, float k2, int height, int stride, int ss) { + float k1, float k2, int height, int stride) { __shared__ float de_r[FW2], de_g[FW2], de_b[FW2], de_a[FW2]; + // The max supported radius is really 7, but the extra units simplify the + // logic in the bottlenecked section below. + __shared__ int minradius[32]; + for (int i = threadIdx.x + 32*threadIdx.y; i < FW2; i += 1024) de_r[i] = de_g[i] = de_b[i] = de_a[i] = 0.0f; __syncthreads(); for (int imrow = threadIdx.y; imrow < height; imrow += 32) { - for (int ssx = 0; ssx < ss; ssx++) { - float ssxo = (0.5f + ssx) / ss; - for (int ssy = 0; ssy < ss; ssy++) { - float ssyo = (0.5f + ssy) / ss; + // Prepare the shared voting buffer. Syncing afterward is + // almost unnecessary but we do it to be safe + if (threadIdx.y == 0) + minradius[threadIdx.x] = 0.0f; + __syncthreads(); - int col = blockIdx.x * 32 + threadIdx.x; - int in_idx = ss * (stride * (imrow * ss + ssy) + col) + ssx; - float4 in = pixbuf[in_idx]; - // TODO: compute maximum filter radius needed across all warps - // to limit rounds in advance - float den = in.w; + int col = blockIdx.x * 32 + threadIdx.x; + int in_idx = stride * imrow + col; + float4 in = pixbuf[in_idx]; + float den = in.w; - float ls = k1 * logf(1.0f + in.w * k2) / in.w; + float ls = k1 * logf(1.0f + in.w * k2) / in.w; - // The synchronization model used now prevents us from - // cutting early in the case of a zero point, so we just carry - // it along with us here - if (den <= 0) ls = 0.0f; + // The synchronization model used now prevents us from + // cutting early in the case of a zero point, so we just carry + // it along with us here + if (den <= 0) ls = 0.0f; - in.x *= ls; - in.y *= ls; - in.z *= ls; - in.w *= ls; + in.x *= ls; + in.y *= ls; + in.z *= ls; + in.w *= ls; - // Base index of destination for writes - int si = (threadIdx.y + W2) * FW + threadIdx.x + W2; + // Base index of destination for writes + int si = (threadIdx.y + W2) * FW + threadIdx.x + W2; - // Calculate scaling coefficient for the Gaussian kernel. This - // does not match with a normal Gaussian; it just fits with - // flam3's implementation. - float scale = powf(den * ss, est_curve) * scale_coeff; - scale = min(scale, 2.0f); + // Calculate scaling coefficient for the Gaussian kernel. This + // does not match with a normal Gaussian; it just fits with + // flam3's implementation. + float scale = powf(den, est_curve) * scale_coeff; - for (int jj = -W2; jj < W2; jj++) { - float jjf = (jj - ssxo) * scale; - float jdiff = erff(jjf + scale) - erff(jjf); + // Force a minimum blur radius. This works out to be a + // standard deviation of about 0.35px. Also force a maximum, + // which limits spherical error to about 2 quanta at 10 bit + // precision. + scale = max(0.30f, min(scale, 2.0f)); - for (int ii = -W2; ii < W2; ii++) { - float iif = (ii - ssyo) * scale; - float coeff = 0.25f * jdiff * (erff(iif + scale) - erff(iif)); + // Determine a minimum radius for this image section. + int radius = (int) min(ceilf(2.12132f / scale), 10.0f); + if (den <= 0) radius = 0; + minradius[radius] = radius; - int idx = si + FW * ii + jj; - de_r[idx] += in.x * coeff; - de_g[idx] += in.y * coeff; - de_b[idx] += in.z * coeff; - de_a[idx] += in.w * coeff; - __syncthreads(); - } - } + // Go bottlenecked to compute the maximum radius in this block + __syncthreads(); + if (threadIdx.y == 0) { + int blt = __ballot(minradius[threadIdx.x]); + minradius[0] = 31 - __clz(blt); + } + __syncthreads(); + + radius = minradius[0]; + + for (int jj = -radius; jj <= radius; jj++) { + float jjf = (jj - 0.5f) * scale; + float jdiff = erff(jjf + scale) - erff(jjf); + + for (int ii = -radius; ii <= radius; ii++) { + float iif = (ii - 0.5f) * scale; + float coeff = 0.25f * jdiff * (erff(iif + scale) - erff(iif)); + + int idx = si + FW * ii + jj; + de_r[idx] += in.x * coeff; + de_g[idx] += in.y * coeff; + de_b[idx] += in.z * coeff; + de_a[idx] += in.w * coeff; + __syncthreads(); } } - __syncthreads(); // TODO: could coalesce this, but what a pain for (int i = threadIdx.x; i < FW; i += 32) { - int out_idx = stride * imrow + blockIdx.x * 32 + i + W2; + int out_idx = stride * imrow + blockIdx.x * 32 + i; int si = threadIdx.y * FW + i; float *out = reinterpret_cast(&outbuf[out_idx]); atomicAdd(out, de_r[si]); @@ -288,11 +308,11 @@ class Filtering(object): edge_clamp = f32(1.2) fun = self.mod.get_function("density_est") fun(ddst, dsrc, scale_coeff, est_curve, edge_clamp, k1, k2, - i32(dim.ah / dim.ss), i32(dim.astride / dim.ss), i32(dim.ss), - block=(32, 32, 1), grid=(dim.aw/(32*dim.ss), 1), stream=stream) + i32(dim.ah), i32(dim.astride), + block=(32, 32, 1), grid=(dim.aw/32, 1), stream=stream) def colorclip(self, dbuf, gnm, dim, tc, blend, stream=None): - nbins = dim.ah * dim.astride / dim.ss ** 2 + nbins = dim.ah * dim.astride # TODO: implement integration over cubic splines? gam = f32(1 / gnm.color.gamma(tc)) diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py index bd74874..5ba9d68 100644 --- a/cuburn/code/iter.py +++ b/cuburn/code/iter.py @@ -68,7 +68,7 @@ def precalc_camera(pcam): float rot = {{pre_cam.rotation}} * M_PI / 180.0f; float rotsin = sin(rot), rotcos = cos(rot); float cenx = {{pre_cam.center.x}}, ceny = {{pre_cam.center.y}}; - float scale = {{pre_cam.scale}} * acc_size.width * acc_size.ss; + float scale = {{pre_cam.scale}} * acc_size.width; {{pre_cam._set('xx')}} = scale * rotcos; {{pre_cam._set('xy')}} = scale * -rotsin; @@ -126,7 +126,6 @@ typedef struct { uint32_t awidth; uint32_t aheight; uint32_t astride; - uint32_t ss; } acc_size_t; __constant__ acc_size_t acc_size; @@ -204,7 +203,7 @@ void iter( mwc_st rctx = msts[this_rb_idx]; {{precalc_camera(pcp.camera)}} - if (acc_size.ss == 1 && threadIdx.y == 5 && threadIdx.x == 4) { + if (threadIdx.y == 5 && threadIdx.x == 4) { float ditherwidth = {{pcp.camera.dither_width}} * 0.33f; float u0 = mwc_next_01(rctx); float r = ditherwidth * sqrtf(-2.0f * log2f(u0) / M_LOG2E); @@ -346,7 +345,7 @@ void iter( uint32_t ix = trunca(cx), iy = trunca(cy); - if (ix >= acc_size.awidth || iy >= acc_size.aheight) { + if (ix >= acc_size.astride || iy >= acc_size.aheight) { {{if info.acc_mode == 'deferred'}} *log = 0xffffffff; {{endif}} diff --git a/cuburn/render.py b/cuburn/render.py index 0b42f02..e0ed295 100644 --- a/cuburn/render.py +++ b/cuburn/render.py @@ -20,7 +20,7 @@ from cuburn import affine from cuburn.code import util, mwc, iter, interp, filtering, sort RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time') -Dimensions = namedtuple('Dimensions', 'w h aw ah astride ss') +Dimensions = namedtuple('Dimensions', 'w h aw ah astride') def _sync_stream(dst, src): dst.wait_for_event(cuda.Event(cuda.event_flags.DISABLE_TIMING).record(src)) @@ -54,7 +54,7 @@ class Renderer(object): # Maximum width of DE and other spatial filters, and thus in turn the # amount of padding applied. Note that, for now, this must not be changed! # The filtering code makes deep assumptions about this value. - gutter = 15 + gutter = 10 # Accumulation mode. Leave it at 'atomic' for now. acc_mode = 'atomic' @@ -138,7 +138,7 @@ class Renderer(object): next(r) return ifilter(None, imap(r.send, chain(times, [None]))) - def render_gen(self, genome, width, height, ss=1, blend=True): + def render_gen(self, genome, width, height, blend=True): """ Render frames. This method is wrapped by the ``render()`` method; see its docstring for warnings and details. @@ -182,25 +182,24 @@ class Renderer(object): event_a = cuda.Event().record(filt_stream) event_b = None - owidth = width + 2 * self.gutter - oheight = height + 2 * self.gutter - ostride = 32 * int(np.ceil(owidth / 32.)) - awidth, aheight, astride = owidth * ss, oheight * ss, ostride * ss - dim = Dimensions(width, height, awidth, aheight, astride, ss) + awidth = width + 2 * self.gutter + aheight = 32 * int(np.ceil((height + 2 * self.gutter) / 32.)) + astride = 32 * int(np.ceil(awidth / 32.)) + dim = Dimensions(width, height, awidth, aheight, astride) d_acc_size = self.mod.get_global('acc_size')[0] cuda.memcpy_htod_async(d_acc_size, u32(list(dim)), write_stream) - nbins = awidth * aheight + nbins = astride * aheight # Extra padding in accum helps with write_shmem overruns d_accum = cuda.mem_alloc(16 * nbins + (1<<16)) - d_out = cuda.mem_alloc(16 * oheight * ostride) + d_out = cuda.mem_alloc(16 * aheight * astride) if self.acc_mode == 'atomic': d_atom = cuda.mem_alloc(8 * nbins) flush_fun = self.mod.get_function("flush_atom") obuf_copy = argset(cuda.Memcpy2D(), src_y=self.gutter, src_x_in_bytes=16*self.gutter, - src_pitch=16*ostride, dst_pitch=16*width, + src_pitch=16*astride, dst_pitch=16*width, width_in_bytes=16*width, height=height) obuf_copy.set_src_device(d_out) h_out_a = cuda.pagelocked_empty((height, width, 4), f32) @@ -344,10 +343,8 @@ class Renderer(object): block=(512, 1, 1), grid=(nblocks, nblocks), stream=iter_stream) + util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream) _sync_stream(filt_stream, write_stream) - filt.blur_density(d_accum, d_out, dim, stream=filt_stream) - util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins / ss ** 2, - filt_stream) filt.de(d_out, d_accum, genome, dim, tc, stream=filt_stream) _sync_stream(write_stream, filt_stream) filt.colorclip(d_out, genome, dim, tc, blend, stream=filt_stream)