diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py
index 0fde8d6..1df942d 100644
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@@ -157,7 +157,7 @@ def iter_xf_body(cp, xfid, px):
 iter_body_code = r'''
 __global__ void
 iter(uint64_t out_ptr, uint64_t atom_ptr,
-     ringbuf *rb, mwc_st *msts, float4 *points,
+     ringbuf *rb, mwc_st *msts, float4 *points, uint32_t *hotspots,
      const iter_params *all_params)
 {
     // load params to shared memory cooperatively
@@ -325,10 +325,27 @@ iter(uint64_t out_ptr, uint64_t atom_ptr,
         // threshold, it zeros that cell in the integer buffer, converts the
         // former contents to floats, and adds them to the float4 buffer.
 
-        .reg .pred p;
-        .reg .u32 off, color, hi, lo, d, y, u, v;
-        .reg .f32 colorf, yf, uf, vf, df;
-        .reg .u64 ptr, val;
+        .reg .pred p, h;
+        .reg .u32 off, hotmap, color, hi, lo, d, y, u, v;
+        .reg .f32 colorf, yf, uf, vf, df, mult;
+        .reg .u64 hptr, ptr, val;
+
+        // Calculate the address of the hotspot indicator
+        shr.b32 off, %3, 3;
+        and.b32 off, off, 0xfffffffc;
+        cvt.u64.u32 hptr, off;
+        add.u64 hptr, hptr, %7;
+
+        // Load the hotspot map, and use it to set the hotflag predicate.
+        ld.cg.global.b32 hotmap, [hptr];
+        and.b32 off, %3, 0x1f;
+        shl.b32 off, 1, off;
+        and.b32 hotmap, hotmap, off;
+        setp.gt.u32 h, hotmap, 0;
+
+        // If the hotflag is set, skip this whole section 15/16 of the time.
+        setp.gt.and.f32 p, %8, 0.0625, h;
+@p      bra oflow_end;
 
         // TODO: coord dithering better, or pre-supersampled palette?
         fma.rn.ftz.f32 colorf, %0, 255.0, %1;
@@ -358,6 +375,11 @@ iter(uint64_t out_ptr, uint64_t atom_ptr,
         setp.lo.u32 p, hi, (256 << 23);
 @p      bra oflow_end;
 
+        // Set the hotflag.
+        and.b32 off, %3, 0x1f;
+        shl.b32 off, 1, off;
+        red.global.or.b32 [hptr], off;
+
         // Atomically swap the integer cell with 0 and read its current value
         atom.global.exch.b64 val, [ptr], 0;
         mov.b64 {lo, hi}, val;
@@ -379,9 +401,14 @@ iter(uint64_t out_ptr, uint64_t atom_ptr,
         cvt.rn.f32.u32 uf, u;
         cvt.rn.f32.u32 vf, v;
         cvt.rn.f32.u32 df, d;
-        mul.rn.ftz.f32 yf, yf, (1.0/255.0);
-        mul.rn.ftz.f32 uf, uf, (1.0/255.0);
-        mul.rn.ftz.f32 vf, vf, (1.0/255.0);
+
+        // If we already hotspotted, each point is worth 16 times as much.
+        selp.f32 mult, 16.0, 1.0, h;
+        mul.rn.ftz.f32 df, df, mult;
+        mul.rn.ftz.f32 mult, mult, (1.0/255.0);
+        mul.rn.ftz.f32 yf, yf, mult;
+        mul.rn.ftz.f32 uf, uf, mult;
+        mul.rn.ftz.f32 vf, vf, mult;
         shl.b32 off, %3, 4;
         cvt.u64.u32 ptr, off;
         add.u64 ptr, ptr, %6;
@@ -393,7 +420,7 @@ oflow_end:
 }
         """)}}
         ::  "f"(cc), "f"(color_dither), "r"(time), "r"(i), "l"(atom_ptr),
             "f"(cosel[threadIdx.y + {{NWARPS}}]),
-            "l"(out_ptr));
+            "l"(out_ptr), "l"(hotspots), "f"(mwc_next_01(rctx)));
     }
     this_rb_idx = rb_incr(rb->tail, blockDim.x * threadIdx.y + threadIdx.x);
@@ -440,7 +467,8 @@ __global__ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
 {
     .reg .u32 off, hi, lo, d, y, u, v;
     .reg .u64 val, ptr;
-    .reg .f32 yf, uf, vf, df, yg, ug, vg, dg;
+    .reg .f32 yf, uf, vf, df, yg, ug, vg, dg, mult;
+    .reg .pred p;
 
     // TODO: use explicit movs to handle this
     shl.b32 off, %0, 3;
@@ -451,6 +479,8 @@ __global__ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
     cvt.u64.u32 ptr, off;
     add.u64 ptr, ptr, %2;
     ld.global.v4.f32 {yg,ug,vg,dg}, [ptr];
+    setp.gt.f32 p, dg, 0.0;
+    selp.f32 mult, 16.0, 1.0, p;
     shr.u32 d, hi, 22;
     bfe.u32 y, hi, 4, 18;
     bfe.u32 u, lo, 18, 14;
@@ -460,9 +490,11 @@ __global__ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
     cvt.rn.f32.u32 uf, u;
    cvt.rn.f32.u32 vf, v;
     cvt.rn.f32.u32 df, d;
-    fma.rn.ftz.f32 yg, yf, (1.0/255.0), yg;
-    fma.rn.ftz.f32 ug, uf, (1.0/255.0), ug;
-    fma.rn.ftz.f32 vg, vf, (1.0/255.0), vg;
+    mul.rn.ftz.f32 df, df, mult;
+    mul.rn.ftz.f32 mult, mult, (1.0/255.0);
+    fma.rn.ftz.f32 yg, yf, mult, yg;
+    fma.rn.ftz.f32 ug, uf, mult, ug;
+    fma.rn.ftz.f32 vg, vf, mult, vg;
     add.rn.ftz.f32 dg, df, dg;
     st.global.v4.f32 [ptr], {yg,ug,vg,dg};
 
diff --git a/cuburn/render.py b/cuburn/render.py
index 736eee7..41fb2ed 100644
--- a/cuburn/render.py
+++ b/cuburn/render.py
@@ -101,14 +101,14 @@ class Framebuffers(object):
         self.d_points = cuda.mem_alloc(self._len_d_points)
 
     def _clear(self):
-        self.nbins = self.d_front = self.d_back = self.d_side = None
+        self.nbins = self.d_front = self.d_back = self.d_side = self.d_uchar = None
 
     def free(self, stream=None):
         if stream is not None:
             stream.synchronize()
         else:
             cuda.Context.synchronize()
-        for p in (self.d_front, self.d_back, self.d_side):
+        for p in (self.d_front, self.d_back, self.d_side, self.d_uchar):
             if p is not None:
                 p.free()
         self._clear()
@@ -128,6 +128,7 @@ class Framebuffers(object):
             self.d_front = cuda.mem_alloc(16 * nbins)
             self.d_back = cuda.mem_alloc(16 * nbins)
             self.d_side = cuda.mem_alloc(16 * nbins)
+            self.d_uchar = cuda.mem_alloc(nbins)
             self.nbins = nbins
         except cuda.MemoryError, e:
             # If a frame that's too large sneaks by the task distributor, we
@@ -308,8 +309,9 @@ class RenderManager(ClsMod):
         fill = lambda b, s, v=i32(0): util.fill_dptr(
                 self.mod, b, s, stream=self.stream_a, value=v)
         fill(self.fb.d_front, 4 * nbins)
-        fill(self.fb.d_side, 2 * nbins)
+        fill(self.fb.d_side, 4 * nbins)
         fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan))
+        fill(self.fb.d_uchar, nbins / 4)
 
         nts = self.info_a.ntemporal_samples
         nsamps = (gprof.spp(tc) * dim.w * dim.h)
@@ -318,9 +320,10 @@ class RenderManager(ClsMod):
         def launch_iter(n):
             if n == 0: return
             launch('iter', rdr.mod, self.stream_a, (32, 8, 1), (nts, n),
-                    self.fb.d_front, self.fb.d_side,
-                    self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
-                    self.info_a.d_params)
+                   self.fb.d_front, self.fb.d_side,
+                   self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
+                   self.fb.d_uchar, self.info_a.d_params)
+
         # Split the launch into multiple rounds, possibly (slightly) reducing
         # work overlap but avoiding stalls when working on a device with an
         # active X session. TODO: characterize performance impact, autodetect
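
For reference, here is a simplified host-side sketch (Python; the names hotspot_word_and_mask, mark_hot, accumulate, HOT_SKIP, and the dict-based buffers are hypothetical and not part of the patch) of what the PTX above does: a one-bit-per-bin hotspot map addressed in 32-bit words, with 1-in-16 sampling of hot cells and a compensating 16x weight. The real kernel applies the 16x factor when the packed integer cell is flushed into the float4 buffer (and again in flush_atom, where dg > 0.0 is used as the "already hot" test), not per sample as this sketch does.

    # Sketch only -- hypothetical host-side model of the hotspot bitmap and
    # the 1-in-16 importance sampling implemented in PTX above.
    import random

    HOT_SKIP = 16                               # keep 1 of every 16 hot-cell samples

    def hotspot_word_and_mask(bin_idx):
        # Mirrors the PTX: word offset = (i >> 3) & ~3, bit mask = 1 << (i & 0x1f)
        word_off = (bin_idx >> 3) & 0xfffffffc  # 4-byte-aligned word holding bit i
        bit_mask = 1 << (bin_idx & 0x1f)        # bit position within that word
        return word_off, bit_mask

    def mark_hot(hotmap, bin_idx):
        # Stand-in for the red.global.or.b32 issued when a cell's packed
        # integer accumulator crosses its overflow threshold.
        word, mask = hotspot_word_and_mask(bin_idx)
        hotmap[word] = hotmap.get(word, 0) | mask

    def accumulate(density, hotmap, bin_idx, rng=random.random):
        # Accumulate one sample; once a cell is marked hot, drop 15/16 of its
        # samples and weight survivors by 16 so expected density is unchanged.
        word, mask = hotspot_word_and_mask(bin_idx)
        hot = bool(hotmap.get(word, 0) & mask)
        if hot and rng() > 1.0 / HOT_SKIP:
            return                              # skipped, as in the early branch
        weight = HOT_SKIP if hot else 1.0
        density[bin_idx] = density.get(bin_idx, 0.0) + weight

Because each kept sample in a hot cell stands in for HOT_SKIP dropped ones, the expected contribution per incoming sample stays 1, which is why the diff can scale df and the (1.0/255.0) color factor by the same mult.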