From f58289af53e89b62cc1ac60b7e87284d3e06b2e2 Mon Sep 17 00:00:00 2001
From: Steven Robertson
Date: Tue, 9 May 2017 21:16:43 -0700
Subject: [PATCH] Hotspot writeback. 10x performance increase.

Create a map assigning two bits to every output bin. During the atomic
flush, compute a threshold for discarding writes altogether that would
keep us under 2% error - discard 1 of every 2 writes if we've already
accumulated 64 writes (hotspot value 1), 7 of 8 if we're above 256
(hotspot value 2), or 31 of 32 at 2048 (hotspot value 3). Pack this
value into a read-only buffer that can often be cached at L2, and for
particularly concentrated flames (which historically choke cuburn), L1.

During writeback, discard writes at the appropriate rate. During the
flush of the integer accumulator to the float accumulator, scale the
integer values by the discard rate.

This works because for most flames, there's not a lot of interesting
stuff in the middle regimes; either stuff is very well defined, in which
case we pretty much know exactly what the color is going to be
(remember, the max 2% relative error gets log-scaled as well), or it's
loosely defined, so we should keep it at full accuracy.

Of course, a 10x boost is best-case-ish - it assumes a long, high-res
render. I realized, though, that I really didn't care about low-quality
renders and should go for broke optimizing for my use case, which is
ridiculously high-res HDR output. (On pathological flames, on the other
hand, 10x is conservative; this easily gives us 100x.)
---
 cuburn/code/iter.py | 225 ++++++++++++++++++++++++++++----------------
 cuburn/render.py    |  78 +++++++++------
 2 files changed, 193 insertions(+), 110 deletions(-)

diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py
index 1df942d..b2ec107 100644
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@@ -157,8 +157,9 @@ def iter_xf_body(cp, xfid, px):
 iter_body_code = r'''
 __global__
 void iter(uint64_t out_ptr, uint64_t atom_ptr,
-          ringbuf *rb, mwc_st *msts, float4 *points, uint32_t *hotspots,
-          const iter_params *all_params)
+          ringbuf *rb, mwc_st *msts, float4 *points,
+          const __restrict__ uint32_t *hotspots,
+          const __restrict__ iter_params *all_params)
 {
     // load params to shared memory cooperatively
     const iter_params *global_params = &(all_params[blockIdx.x]);
@@ -177,12 +178,9 @@ iter(uint64_t out_ptr, uint64_t atom_ptr,
         {{cp.camera.yo}} += ditherwidth * mwc_next_11(rctx);
     }
 
-
-    // TODO: spare the register, reuse at call site?
    int time = blockIdx.x >> 4;
    float color_dither = 0.49f * mwc_next_11(rctx);
 
-    // TODO: 4th channel unused.
Kill or use for something helpful float4 old_point = points[this_rb_idx]; float x = old_point.x, y = old_point.y, color = old_point.z; @@ -315,8 +313,19 @@ iter(uint64_t out_ptr, uint64_t atom_ptr, continue; } - uint32_t i = iy * acc_size.astride + ix; + uint32_t hotspot_i = ( + ((iy >> 4) * acc_size.astride) + + (ix & 0xfffffff0) + (iy & 0xf)); + uint32_t hotspot_rshift = (ix & 0xf) << 1; + uint32_t hotspot_value = (hotspots[hotspot_i] >> hotspot_rshift) & 0x3; + float hotspot_mult = 1.0f; + if (hotspot_value > 0) { + hotspot_mult = ((1 << (hotspot_value << 1)) >> 1); + float prob = __frcp_rn(hotspot_mult); + if (mwc_next_01(rctx) > prob) continue; + } + uint32_t i = iy * acc_size.astride + ix; asm volatile ({{crep(""" { // To prevent overflow, we need to flush each pixel before the density @@ -327,25 +336,8 @@ iter(uint64_t out_ptr, uint64_t atom_ptr, .reg .pred p, h; .reg .u32 off, hotmap, color, hi, lo, d, y, u, v; - .reg .f32 colorf, yf, uf, vf, df, mult; - .reg .u64 hptr, ptr, val; - - // Calculate the address of the hotspot indicator - shr.b32 off, %3, 3; - and.b32 off, off, 0xfffffffc; - cvt.u64.u32 hptr, off; - add.u64 hptr, hptr, %7; - - // Load the hotspot map, and use it to set the hotflag predicate. - ld.cg.global.b32 hotmap, [hptr]; - and.b32 off, %3, 0x1f; - shl.b32 off, 1, off; - and.b32 hotmap, hotmap, off; - setp.gt.u32 h, hotmap, 0; - - // If the hotflag is set, skip this whole section 15/16 of the time. - setp.gt.and.f32 p, %8, 0.0625, h; -@p bra oflow_end; + .reg .f32 colorf, yf, uf, vf, df, mult, prob; + .reg .u64 ptr, val; // TODO: coord dithering better, or pre-supersampled palette? fma.rn.ftz.f32 colorf, %0, 255.0, %1; @@ -375,11 +367,6 @@ iter(uint64_t out_ptr, uint64_t atom_ptr, setp.lo.u32 p, hi, (256 << 23); @p bra oflow_end; - // Set the hotflag. - and.b32 off, %3, 0x1f; - shl.b32 off, 1, off; - red.global.or.b32 [hptr], off; - // Atomically swap the integer cell with 0 and read its current value atom.global.exch.b64 val, [ptr], 0; mov.b64 {lo, hi}, val; @@ -402,10 +389,8 @@ iter(uint64_t out_ptr, uint64_t atom_ptr, cvt.rn.f32.u32 vf, v; cvt.rn.f32.u32 df, d; - // If we already hotspotted, each point is worth 64 times as much. - selp.f32 mult, 16.0, 1.0, h; - mul.rn.ftz.f32 df, df, mult; - mul.rn.ftz.f32 mult, mult, (1.0/255.0); + mul.rn.ftz.f32 df, df, %7; + mul.rn.ftz.f32 mult, %7, (1.0/255.0); mul.rn.ftz.f32 yf, yf, mult; mul.rn.ftz.f32 uf, uf, mult; mul.rn.ftz.f32 vf, vf, mult; @@ -420,7 +405,7 @@ oflow_end: } """)}} :: "f"(cc), "f"(color_dither), "r"(time), "r"(i), "l"(atom_ptr), "f"(cosel[threadIdx.y + {{NWARPS}}]), - "l"(out_ptr), "l"(hotspots), "f"(mwc_next_01(rctx))); + "l"(out_ptr), "f"(hotspot_mult)); } this_rb_idx = rb_incr(rb->tail, blockDim.x * threadIdx.y + threadIdx.x); @@ -428,6 +413,132 @@ oflow_end: msts[this_rb_idx] = rctx; return; } + +__global__ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, + __restrict__ uint32_t *hotspot_ptr, int nbins) { + const int yi = (blockIdx.y * blockDim.y) + threadIdx.y; + const int xi = (blockIdx.x * blockDim.x) + threadIdx.x; + const int gi = yi * acc_size.astride + xi; + + const int hoti = ( + ((yi >> 4) * acc_size.astride) + + (xi & 0xfffffff0) + (yi & 0xf)); + asm volatile ({{crep(""" +{ + .reg .u32 off, hi, lo, d, y, u, v, hotflag, tidx; + .reg .u64 val, ptr; + .reg .f32 yf, uf, vf, df, yg, ug, vg, dg, mult; + .reg .pred p, q; + + // Load the hotflag and compute the scale factor. 
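For reference, the addressing and rate selection used in iter() above, and consumed again by flush_atom below, reduce to the following minimal Python sketch. The helper names are illustrative and not part of the patch; in cuburn this logic lives in the CUDA source and PTX. Each 32-bit word of the hotspot map covers 16 horizontally adjacent bins within a 16-row tile, holding a 2-bit code per bin; a nonzero code both thins the writes and sets the weight the flush applies to compensate.

    import random

    def hotspot_index(ix, iy, astride):
        # Word index and bit offset of the 2-bit hotspot field for bin
        # (ix, iy), mirroring hotspot_i / hotspot_rshift in iter().
        word = ((iy >> 4) * astride) + (ix & ~0xf) + (iy & 0xf)
        shift = (ix & 0xf) << 1          # 16 two-bit fields per 32-bit word
        return word, shift

    def writeback_decision(hotspots, ix, iy, astride, rng=random.random):
        # Returns (keep, mult): whether this sample is written at all, and
        # the factor later applied to surviving writes during flush_atom.
        word, shift = hotspot_index(ix, iy, astride)
        value = (hotspots[word] >> shift) & 0x3
        if value == 0:
            return True, 1.0             # cold bin: keep every write
        mult = (1 << (value << 1)) >> 1  # code 1, 2, 3 -> mult 2, 8, 32
        return rng() < 1.0 / mult, float(mult)

Because surviving writes are later scaled by the same factor, the expected accumulated density is unchanged; the thinning only adds variance in bins that are already dense, which is what the thresholds are chosen to keep small.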
+ shl.b32 off, %1, 2; + cvt.u64.u32 ptr, off; + add.u64 ptr, ptr, %4; + ld.global.b32 hotflag, [ptr]; + + mov.b32 tidx, %tid.x; + shl.b32 tidx, tidx, 1; + shr.b32 hotflag, hotflag, tidx; + and.b32 hotflag, hotflag, 0x3; + + shl.b32 hotflag, hotflag, 1; + shl.b32 hotflag, 1, hotflag; + shr.b32 hotflag, hotflag, 1; + cvt.rn.f32.u32 mult, hotflag; + setp.eq.f32 p, 0.0, mult; +@p mov.f32 mult, 1.0; + + shl.b32 off, %0, 3; + cvt.u64.u32 ptr, off; + add.u64 ptr, ptr, %2; + ld.cs.global.v2.u32 {lo, hi}, [ptr]; + st.cs.global.b64 [ptr], 0; + + shl.b32 off, %0, 4; + cvt.u64.u32 ptr, off; + add.u64 ptr, ptr, %3; + ld.cs.global.v4.f32 {yg,ug,vg,dg}, [ptr]; + shr.u32 d, hi, 22; + bfe.u32 y, hi, 4, 18; + bfe.u32 u, lo, 18, 14; + bfi.b32 u, hi, u, 14, 4; + and.b32 v, lo, ((1<<18)-1); + cvt.rn.f32.u32 yf, y; + cvt.rn.f32.u32 uf, u; + cvt.rn.f32.u32 vf, v; + cvt.rn.f32.u32 df, d; + fma.rn.ftz.f32 dg, df, mult, dg; + mul.rn.ftz.f32 mult, mult, (1.0/255.0); + fma.rn.ftz.f32 yg, yf, mult, yg; + fma.rn.ftz.f32 ug, uf, mult, ug; + fma.rn.ftz.f32 vg, vf, mult, vg; + + st.cs.global.v4.f32 [ptr], {yg,ug,vg,dg}; + + .reg .u32 bal1, bal2, bal3, tidy, mask, tbit, tbit2, tbit3; + .reg .u32 outlo, outhi; + + // Vote on whether this thread's point passes the (for now, predetermined) + // thresholds for each mask step + setp.gt.f32 p, dg, 128.0; + vote.ballot.b32 bal1, p; + setp.gt.f32 p, dg, 512.0; + vote.ballot.b32 bal2, p; + setp.gt.f32 p, dg, 2048.0; + vote.ballot.b32 bal3, p; + + // Set 'q' with true if we're a low bit + mov.b32 tidx, %laneid; + and.b32 mask, tidx, 1; + setp.eq.u32 q, mask, 0; + + // Set 'mask' with the bit that we're considering in the ballots: + // 1 << (laneid / 2) + shr.b32 mask, tidx, 1; + shl.b32 mask, 1, mask; + + // If this is a low bit, it's set if either the hotflag should be 1 or 3 + // (reduces to 1 & (2 = 3), since 2 will never be unset if 3 is set) + // If this is a high bit, it's set if it should be 2 or 3 + // Set lo bit on all threads, then overwrite it on just hi ones + // Vote to get the value we should write + and.b32 tbit, mask, bal1; + setp.gt.u32 p, tbit, 0; + and.b32 tbit, mask, bal2; + and.b32 tbit2, mask, bal3; + setp.and.eq.u32 p, tbit, tbit2, p; +@q setp.gt.u32 p, tbit, 0; + + vote.ballot.b32 outlo, p; + + // Repeat this process for the high half of the warp + shl.b32 mask, mask, 16; + and.b32 tbit, mask, bal1; + setp.gt.u32 p, tbit, 0; + and.b32 tbit, mask, bal2; + and.b32 tbit2, mask, bal3; + setp.and.eq.u32 p, tbit, tbit2, p; +@q setp.gt.u32 p, tbit, 0; + + vote.ballot.b32 outhi, p; + + // Set 'p' with whether we're the low or high bit of this warp + mov.b32 tidy, %tid.y; + and.b32 tidy, tidy, 1; + setp.gt.u32 p, tidy, 0; +@p mov.b32 outlo, outhi; + + shl.b32 off, %1, 2; + cvt.u64.u32 ptr, off; + add.u64 ptr, ptr, %4; + mov.b32 tidx, %tid.x; + // Set 'p' with whether we're thread 0 of tid.x + setp.eq.u32 p, tidx, 0; +@p st.cs.global.b32 [ptr], outlo; +} +""")}} :: "r"(gi), "r"(hoti), "l"(atom_ptr), "l"(out_ptr), "l"(hotspot_ptr), +"r"(xi), "r"(yi)); +} ''' def iter_body(cp): @@ -458,47 +569,3 @@ def mkiterlib(gnm): decls=iter_decls + interp.palintlib.decls, defs='\n'.join(bodies)) return packer, lib - -flushatomlib = devlib(defs=Template(r''' -__global__ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) { - int i = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - if (i >= nbins) return; - asm volatile ({{crep(""" -{ - .reg .u32 off, hi, lo, d, y, u, v; - .reg .u64 val, ptr; - .reg .f32 yf, uf, vf, df, yg, ug, vg, dg, mult; - .reg .pred 
p; - - // TODO: use explicit movs to handle this - shl.b32 off, %0, 3; - cvt.u64.u32 ptr, off; - add.u64 ptr, ptr, %1; - ld.global.v2.u32 {lo, hi}, [ptr]; - shl.b32 off, %0, 4; - cvt.u64.u32 ptr, off; - add.u64 ptr, ptr, %2; - ld.global.v4.f32 {yg,ug,vg,dg}, [ptr]; - setp.gt.f32 p, dg, 0.0; - selp.f32 mult, 16.0, 1.0, p; - shr.u32 d, hi, 22; - bfe.u32 y, hi, 4, 18; - bfe.u32 u, lo, 18, 14; - bfi.b32 u, hi, u, 14, 4; - and.b32 v, lo, ((1<<18)-1); - cvt.rn.f32.u32 yf, y; - cvt.rn.f32.u32 uf, u; - cvt.rn.f32.u32 vf, v; - cvt.rn.f32.u32 df, d; - mul.rn.ftz.f32 df, df, mult; - mul.rn.ftz.f32 mult, mult, (1.0/255.0); - fma.rn.ftz.f32 yg, yf, mult, yg; - fma.rn.ftz.f32 ug, uf, mult, ug; - fma.rn.ftz.f32 vg, vf, mult, vg; - - add.rn.ftz.f32 dg, df, dg; - st.global.v4.f32 [ptr], {yg,ug,vg,dg}; -} - """)}} :: "r"(i), "l"(atom_ptr), "l"(out_ptr)); -} -''', 'flush_atom').substitute()) diff --git a/cuburn/render.py b/cuburn/render.py index 62fb776..eba2ad2 100644 --- a/cuburn/render.py +++ b/cuburn/render.py @@ -43,8 +43,7 @@ class Framebuffers(object): ``d_front`` and ``d_back`` are separate buffers, each large enough to hold four float32 components per pixel (including any gutter pixels added for - alignment or padding). ``d_left`` is another buffer large enough to hold - two float32 components per pixel. + alignment or padding). Every user of this set of buffers may use and overwrite the buffers in any way, as long as the output for the next stage winds up in the front @@ -53,6 +52,10 @@ class Framebuffers(object): exists for the side buffer, you're free to do the same by taking local copies of the references and exchanging them yourself. + ``d_left`` and ``d_right`` and ``d_uleft`` and ``d_uright`` are similar, + but without strict dependencies. Each stage is free to stomp these buffers, + but must be done with them by the next stage. + There's one spot in the stream interleaving where the behavior is different: the ``Output.convert`` call must store its output to the back buffer, which will remain untouched until the dtoh copy of the converted @@ -78,10 +81,10 @@ class Framebuffers(object): """ Given a width and height, return a valid set of dimensions which include at least enough gutter to exceed the minimum, and where - (acc_width % 32) == 0 and (acc_height % 8) == 0. + (acc_width % 32) == 0 and (acc_height % 16) == 0. 
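The accumulation height is rounded to a multiple of 16 because the hotspot map is laid out in 16x16 tiles, one 32-bit word per 16 horizontally adjacent bins, and flush_atom is launched with 16x16 blocks over that grid. A rough host-side Python sketch of what the warp ballots in flush_atom compute when they rebuild the map follows; the function names are illustrative, and the thresholds are the ones compared against in the PTX above.

    def hotspot_code(density):
        # 2-bit code derived from the flushed float density of one bin.
        if density > 2048.0: return 3    # subsequently keep 1 write in 32
        if density > 512.0:  return 2    # keep 1 in 8
        if density > 128.0:  return 1    # keep 1 in 2
        return 0                         # keep everything

    def pack_hotspot_word(densities):
        # Pack the codes of 16 horizontally adjacent bins into one uint32,
        # lowest bin in the low bits, as the vote.ballot sequence does.
        word = 0
        for lane, d in enumerate(densities[:16]):
            word |= hotspot_code(d) << (lane * 2)
        return word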
""" awidth = width + 2 * cls.gutter - aheight = 8 * int(np.ceil((height + 2 * cls.gutter) / 8.)) + aheight = 16 * int(np.ceil((height + 2 * cls.gutter) / 16.)) astride = 32 * int(np.ceil(awidth / 32.)) return Dimensions(width, height, awidth, aheight, astride) @@ -102,14 +105,15 @@ class Framebuffers(object): def _clear(self): self.nbins = self.d_front = self.d_back = None - self.d_left = self.d_right = self.d_uchar = None + self.d_left = self.d_right = self.d_uleft = self.d_uright = None def free(self, stream=None): if stream is not None: stream.synchronize() else: cuda.Context.synchronize() - for p in (self.d_front, self.d_back, self.d_left, self.d_right, self.d_uchar): + for p in (self.d_front, self.d_back, self.d_left, self.d_right, + self.d_uleft, self.d_uright): if p is not None: p.free() self._clear() @@ -126,11 +130,12 @@ class Framebuffers(object): if self.nbins >= nbins: return if self.nbins is not None: self.free() try: - self.d_front = cuda.mem_alloc(16 * nbins) - self.d_back = cuda.mem_alloc(16 * nbins) - self.d_left = cuda.mem_alloc(16 * nbins) - self.d_right = cuda.mem_alloc(16 * nbins) - self.d_uchar = cuda.mem_alloc(2 * nbins) + self.d_front = cuda.mem_alloc(16 * nbins) + self.d_back = cuda.mem_alloc(16 * nbins) + self.d_left = cuda.mem_alloc(16 * nbins) + self.d_right = cuda.mem_alloc(16 * nbins) + self.d_uleft = cuda.mem_alloc(2 * nbins) + self.d_uright = cuda.mem_alloc(2 * nbins) self.nbins = nbins except cuda.MemoryError, e: # If a frame that's too large sneaks by the task distributor, we @@ -160,8 +165,9 @@ class Framebuffers(object): self.d_front, self.d_back = self.d_back, self.d_front def flip_side(self): - """Flip the left and right buffers.""" + """Flip the left and right buffers (float and uchar).""" self.d_left, self.d_right = self.d_right, self.d_left + self.d_uleft, self.d_uright = self.d_uright, self.d_uleft class DevSrc(object): """ @@ -245,7 +251,7 @@ class Renderer(object): self.out = output.get_output_for_profile(gprof) class RenderManager(ClsMod): - lib = devlib(deps=[interp.palintlib, filldptrlib, iter.flushatomlib]) + lib = devlib(deps=[interp.palintlib, filldptrlib]) def __init__(self): super(RenderManager, self).__init__() @@ -316,32 +322,42 @@ class RenderManager(ClsMod): self.mod, b, s, stream=self.stream_a, value=v) fill(self.fb.d_front, 4 * nbins) fill(self.fb.d_left, 4 * nbins) + fill(self.fb.d_right, 4 * nbins) fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan)) - fill(self.fb.d_uchar, nbins / 4) + fill(self.fb.d_uleft, nbins / 2) + fill(self.fb.d_uright, nbins / 2) nts = self.info_a.ntemporal_samples nsamps = (gprof.spp(tc) * dim.w * dim.h) nrounds = int(nsamps / (nts * 256. * 256)) + 1 - def launch_iter(n): - if n == 0: return - launch('iter', rdr.mod, self.stream_a, (32, 8, 1), (nts, n), - self.fb.d_front, self.fb.d_left, - self.fb.d_rb, self.fb.d_seeds, self.fb.d_points, - self.fb.d_uchar, self.info_a.d_params) + # Split the launch into multiple rounds, to prevent a system on older + # GPUs from locking up and to give us a chance to flush some stuff. + hidden_stream = cuda.Stream() + iter_stream_left, iter_stream_right = self.stream_a, hidden_stream + BLOCK_SIZE = 4 - # Split the launch into multiple rounds, possibly (slightly) reducing - # work overlap but avoiding stalls when working on a device with an - # active X session. 
TODO: characterize performance impact, autodetect - BLOCK_SIZE = 16 - for i in range(BLOCK_SIZE-1, nrounds, BLOCK_SIZE): - launch_iter(BLOCK_SIZE) - launch_iter(nrounds%BLOCK_SIZE) + while nrounds: + n = min(nrounds, BLOCK_SIZE) + launch('iter', rdr.mod, iter_stream_left, (32, 8, 1), (nts, n), + self.fb.d_front, self.fb.d_left, + self.fb.d_rb, self.fb.d_seeds, self.fb.d_points, + self.fb.d_uleft, self.info_a.d_params) - nblocks = int(np.ceil(np.sqrt(dim.ah*dim.astride/256.))) - launch('flush_atom', self.mod, self.stream_a, - 256, (nblocks, nblocks), - u64(self.fb.d_front), u64(self.fb.d_left), i32(nbins)) + # Make sure the other stream is done flushing before we start + iter_stream_left.wait_for_event(cuda.Event().record(iter_stream_right)) + + launch('flush_atom', rdr.mod, iter_stream_left, + (16, 16, 1), (dim.astride / 16, dim.ah / 16), + u64(self.fb.d_front), u64(self.fb.d_left), + u64(self.fb.d_uleft), i32(nbins)) + + self.fb.flip_side() + iter_stream_left, iter_stream_right = iter_stream_right, iter_stream_left + nrounds -= n + + # Always wait on all events in the hidden stream before continuing on A + self.stream_a.wait_for_event(cuda.Event().record(hidden_stream)) def queue_frame(self, rdr, gnm, gprof, tc, copy=True): """
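The host-side loop above interleaves iteration and flushing by ping-ponging between the two accumulator pairs and two streams. A plain-Python illustration of the resulting schedule, sketching only the ordering rather than the CUDA calls, with buffer names loosely following the patch:

    def interleave(nrounds, block_size=4):
        # Trace of the ping-pong loop in RenderManager: each batch of iter
        # launches writes one accumulator pair while the previous batch's
        # flush_atom (and hotspot-map rebuild) runs on the other pair.
        side, trace = 'left', []
        while nrounds:
            n = min(nrounds, block_size)
            trace.append('iter x%d -> d_%s / d_u%s' % (n, side, side))
            trace.append('wait for the other stream to finish its flush')
            trace.append('flush_atom(d_%s), rebuilding d_u%s' % (side, side))
            side = 'right' if side == 'left' else 'left'   # flip_side()
            nrounds -= n
        trace.append('stream A waits on the hidden stream before later stages')
        return trace

    # interleave(10) -> batches of 4, 4 and 2 rounds, alternating sides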