Initial draft of hotspot deferral.
Build an array of one-bit flags, one per pixel (addressed as u32 data). Once at least 64 points have accumulated for a pixel, set its flag; thereafter write only 1/16 of that pixel's points, and multiply the points that do get written by 16. The theory is that after 64 points the color is pretty much locked in, so this lets us crank SPP up to get excellent coverage in dark areas, while the bright areas matter less since they're already fully resolved. Still needs a lot of tuning to reach peak performance, and the trigger threshold may need to be scaled along with the render size. It is also unlikely to scale as well to higher resolutions, because we rely on the L2 cache to make this fast.
parent 6b2b72a3fe
commit bdcaca1f97
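In outline, the scheme looks like the following CUDA sketch. accumulate() and density() are hypothetical stand-ins for the real packed-atomic accumulator machinery; the actual kernel implements all of this in the inline-PTX overflow path shown in the diff below.

    // Hypothetical helpers standing in for cuburn's accumulator machinery:
    __device__ void  accumulate(float4 *accum, uint32_t pix, float4 val, float w);
    __device__ float density(float4 *accum, uint32_t pix);

    __device__ void write_point(uint32_t *hotspots, float4 *accum,
                                uint32_t pix, float4 val, float rnd)
    {
        uint32_t word = pix >> 5;            // 32 one-bit flags per u32
        uint32_t mask = 1u << (pix & 0x1f);
        bool hot = hotspots[word] & mask;

        // Hot pixels: keep only 1/16 of points, but weight them 16x, so
        // the expected contribution is unchanged.
        if (hot && rnd > 0.0625f)
            return;
        accumulate(accum, pix, val, hot ? 16.0f : 1.0f);

        // Once enough points have landed (64 in this draft), mark the
        // pixel hot so later points are deferred.
        if (!hot && density(accum, pix) >= 64.0f)
            atomicOr(&hotspots[word], mask);
    }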
@@ -157,7 +157,7 @@ def iter_xf_body(cp, xfid, px):
 iter_body_code = r'''
 __global__ void
 iter(uint64_t out_ptr, uint64_t atom_ptr,
-     ringbuf *rb, mwc_st *msts, float4 *points,
+     ringbuf *rb, mwc_st *msts, float4 *points, uint32_t *hotspots,
      const iter_params *all_params)
 {
     // load params to shared memory cooperatively
@@ -325,10 +325,27 @@ iter(uint64_t out_ptr, uint64_t atom_ptr,
 // threshold, it zeros that cell in the integer buffer, converts the
 // former contents to floats, and adds them to the float4 buffer.

-.reg .pred p;
-.reg .u32 off, color, hi, lo, d, y, u, v;
-.reg .f32 colorf, yf, uf, vf, df;
-.reg .u64 ptr, val;
+.reg .pred p, h;
+.reg .u32 off, hotmap, color, hi, lo, d, y, u, v;
+.reg .f32 colorf, yf, uf, vf, df, mult;
+.reg .u64 hptr, ptr, val;

+// Calculate the address of the hotspot indicator
+shr.b32 off, %3, 3;
+and.b32 off, off, 0xfffffffc;
+cvt.u64.u32 hptr, off;
+add.u64 hptr, hptr, %7;
+
+// Load the hotspot map, and use it to set the hotflag predicate.
+ld.cg.global.b32 hotmap, [hptr];
+and.b32 off, %3, 0x1f;
+shl.b32 off, 1, off;
+and.b32 hotmap, hotmap, off;
+setp.gt.u32 h, hotmap, 0;
+
+// If the hotflag is set, skip this whole section 15/16 of the time.
+setp.gt.and.f32 p, %8, 0.0625, h;
+@p bra oflow_end;
+
 // TODO: coord dithering better, or pre-supersampled palette?
 fma.rn.ftz.f32 colorf, %0, 255.0, %1;
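For reference, the address arithmetic in the block above is equivalent to the following C, where i stands for the pixel index passed as %3:

    uint32_t byte_off = (i >> 3) & 0xfffffffc;             // == (i >> 5) * 4
    uint32_t word = *(uint32_t *) ((char *) hotspots + byte_off);
    uint32_t mask = 1u << (i & 0x1f);                      // bit within the word
    bool hot = (word & mask) != 0;                         // the 'h' predicate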
@@ -358,6 +375,11 @@ iter(uint64_t out_ptr, uint64_t atom_ptr,
 setp.lo.u32 p, hi, (256 << 23);
 @p bra oflow_end;

+// Set the hotflag.
+and.b32 off, %3, 0x1f;
+shl.b32 off, 1, off;
+red.global.or.b32 [hptr], off;
+
 // Atomically swap the integer cell with 0 and read its current value
 atom.global.exch.b64 val, [ptr], 0;
 mov.b64 {lo, hi}, val;
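The flag is set with red.global.or.b32, the reduction form of the atomic OR: unlike atom, it returns no old value, which is all that's needed here. In CUDA C this is roughly (i again standing in for %3):

    atomicOr(hotspots + (i >> 5), 1u << (i & 0x1f));   // fire-and-forget OR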
@@ -379,9 +401,14 @@ iter(uint64_t out_ptr, uint64_t atom_ptr,
 cvt.rn.f32.u32 uf, u;
 cvt.rn.f32.u32 vf, v;
 cvt.rn.f32.u32 df, d;
-mul.rn.ftz.f32 yf, yf, (1.0/255.0);
-mul.rn.ftz.f32 uf, uf, (1.0/255.0);
-mul.rn.ftz.f32 vf, vf, (1.0/255.0);
+
+// If we already hotspotted, each point is worth 16 times as much.
+selp.f32 mult, 16.0, 1.0, h;
+mul.rn.ftz.f32 df, df, mult;
+mul.rn.ftz.f32 mult, mult, (1.0/255.0);
+mul.rn.ftz.f32 yf, yf, mult;
+mul.rn.ftz.f32 uf, uf, mult;
+mul.rn.ftz.f32 vf, vf, mult;
 shl.b32 off, %3, 4;
 cvt.u64.u32 ptr, off;
 add.u64 ptr, ptr, %6;
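A plain-C restatement of the rescaling above, with hot standing in for the h predicate: the density channel takes the hotspot weight directly, and the usual 1/255 color normalization is then folded into the same multiplier register to save instructions.

    float mult = hot ? 16.0f : 1.0f;   // 16x weight once the pixel is hot
    df *= mult;                        // density gets the weight only
    mult *= 1.0f / 255.0f;             // fold in the color normalization
    yf *= mult;
    uf *= mult;
    vf *= mult;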
@@ -393,7 +420,7 @@ oflow_end:
 }
 """)}} :: "f"(cc), "f"(color_dither), "r"(time), "r"(i),
     "l"(atom_ptr), "f"(cosel[threadIdx.y + {{NWARPS}}]),
-    "l"(out_ptr));
+    "l"(out_ptr), "l"(hotspots), "f"(mwc_next_01(rctx)));
 }

 this_rb_idx = rb_incr(rb->tail, blockDim.x * threadIdx.y + threadIdx.x);
@@ -440,7 +467,8 @@ __global__ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
 {
 .reg .u32 off, hi, lo, d, y, u, v;
 .reg .u64 val, ptr;
-.reg .f32 yf, uf, vf, df, yg, ug, vg, dg;
+.reg .f32 yf, uf, vf, df, yg, ug, vg, dg, mult;
+.reg .pred p;

 // TODO: use explicit movs to handle this
 shl.b32 off, %0, 3;
@@ -451,6 +479,8 @@ __global__ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
 cvt.u64.u32 ptr, off;
 add.u64 ptr, ptr, %2;
 ld.global.v4.f32 {yg,ug,vg,dg}, [ptr];
+setp.gt.f32 p, dg, 0.0;
+selp.f32 mult, 16.0, 1.0, p;
 shr.u32 d, hi, 22;
 bfe.u32 y, hi, 4, 18;
 bfe.u32 u, lo, 18, 14;
@@ -460,9 +490,11 @@ __global__ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
 cvt.rn.f32.u32 uf, u;
 cvt.rn.f32.u32 vf, v;
 cvt.rn.f32.u32 df, d;
-fma.rn.ftz.f32 yg, yf, (1.0/255.0), yg;
-fma.rn.ftz.f32 ug, uf, (1.0/255.0), ug;
-fma.rn.ftz.f32 vg, vf, (1.0/255.0), vg;
+mul.rn.ftz.f32 df, df, mult;
+mul.rn.ftz.f32 mult, mult, (1.0/255.0);
+fma.rn.ftz.f32 yg, yf, mult, yg;
+fma.rn.ftz.f32 ug, uf, mult, ug;
+fma.rn.ftz.f32 vg, vf, mult, vg;

 add.rn.ftz.f32 dg, df, dg;
 st.global.v4.f32 [ptr], {yg,ug,vg,dg};
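A note on the flush_atom changes above: this kernel takes no hotspot map, so it appears to infer hot status from the float buffer itself. A nonzero accumulated density (dg > 0) means the cell has overflowed and flushed at least once, which is also when the hotflag gets set, so the leftover integer contents are treated as deferred points:

    // dg > 0 stands in for the hotflag during the final flush.
    float mult = (dg > 0.0f) ? 16.0f : 1.0f;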
@@ -101,14 +101,14 @@ class Framebuffers(object):
         self.d_points = cuda.mem_alloc(self._len_d_points)

     def _clear(self):
-        self.nbins = self.d_front = self.d_back = self.d_side = None
+        self.nbins = self.d_front = self.d_back = self.d_side = self.d_uchar = None

     def free(self, stream=None):
         if stream is not None:
             stream.synchronize()
         else:
             cuda.Context.synchronize()
-        for p in (self.d_front, self.d_back, self.d_side):
+        for p in (self.d_front, self.d_back, self.d_side, self.d_uchar):
             if p is not None:
                 p.free()
         self._clear()
@@ -128,6 +128,7 @@ class Framebuffers(object):
             self.d_front = cuda.mem_alloc(16 * nbins)
             self.d_back = cuda.mem_alloc(16 * nbins)
             self.d_side = cuda.mem_alloc(16 * nbins)
+            self.d_uchar = cuda.mem_alloc(nbins)
             self.nbins = nbins
         except cuda.MemoryError, e:
             # If a frame that's too large sneaks by the task distributor, we
@@ -308,8 +309,9 @@ class RenderManager(ClsMod):
         fill = lambda b, s, v=i32(0): util.fill_dptr(
                 self.mod, b, s, stream=self.stream_a, value=v)
         fill(self.fb.d_front, 4 * nbins)
-        fill(self.fb.d_side, 2 * nbins)
+        fill(self.fb.d_side, 4 * nbins)
         fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan))
+        fill(self.fb.d_uchar, nbins / 4)

         nts = self.info_a.ntemporal_samples
         nsamps = (gprof.spp(tc) * dim.w * dim.h)
@@ -320,7 +322,8 @@ class RenderManager(ClsMod):
         launch('iter', rdr.mod, self.stream_a, (32, 8, 1), (nts, n),
                self.fb.d_front, self.fb.d_side,
                self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
-               self.info_a.d_params)
+               self.fb.d_uchar, self.info_a.d_params)
+
         # Split the launch into multiple rounds, possibly (slightly) reducing
         # work overlap but avoiding stalls when working on a device with an
         # active X session. TODO: characterize performance impact, autodetect