From 97180003a417a9175fe3dc5a6ab6d12206bcfaa7 Mon Sep 17 00:00:00 2001 From: Steven Robertson Date: Sat, 9 Oct 2010 11:18:58 -0400 Subject: [PATCH] Broken: Variations, CP stream implemented --- cuburn/device_code.py | 282 ++++++++++++++++-------------------------- cuburn/render.py | 10 +- cuburn/variations.py | 131 +++++++------------- main.py | 6 +- 4 files changed, 160 insertions(+), 269 deletions(-) diff --git a/cuburn/device_code.py b/cuburn/device_code.py index a07045e..04e458a 100644 --- a/cuburn/device_code.py +++ b/cuburn/device_code.py @@ -9,199 +9,131 @@ import struct import pycuda.driver as cuda import numpy as np -from pyptx import ptx, run +from pyptx import ptx, run, util from cuburn.variations import Variations class IterThread(object): - entry_name = 'iter_thread' - entry_params = [] + def __init__(self, entry, features): + self.features = features + self.mwc = MWCRNG(entry) + self.cp = util.DataStream(entry) + self.vars = Variations(features) - def __init__(self): - self.cps_uploaded = False + entry.add_param('u32', 'num_cps') + entry.add_ptr_param('u32', 'cp_started_count') + entry.add_ptr_param('u8', 'cp_data') - def deps(self): - return [MWCRNG, CPDataStream, HistScatter, Variations, ShufflePoints, - Timeouter] + with entry.body(): + self.entry_body(entry) - def module_setup(self): - mem.global_.u32('g_cp_array', - cp.stream_size*features.max_ntemporal_samples) - mem.global_.u32('g_num_cps') - mem.global_.u32('g_num_cps_started') - # TODO move into debug statement - mem.global_.u32('g_num_rounds', ctx.nthreads) - mem.global_.u32('g_num_writes', ctx.nthreads) - mem.global_.b32('g_whatever', ctx.nthreads) - - def entry(self): - # Index number of current CP, shared across CTA - mem.shared.u32('s_cp_idx') + def entry_body(self, entry): + e, r, o, m, p, s = entry.locals + # Index of this CTA's current CP + e.declare_mem('shared', 'u32', 'cp_idx') # Number of samples that have been generated so far in this CTA # If this number is negative, we're still fusing points, so this # behaves slightly differently (see ``fuse_loop_start``) # TODO: replace (or at least simplify) this logic - mem.shared.s32('s_num_samples') - mem.shared.f32('s_xf_sel', ctx.warps_per_cta) + e.declare_mem('shared', 'f32', 'num_samples') - # TODO: temporary, for testing - mem.local.u32('l_num_rounds') - mem.local.u32('l_num_writes') - op.st.local.u32(addr(l_num_rounds), 0) - op.st.local.u32(addr(l_num_writes), 0) + # The per-warp transform selection indices + e.declare_mem('shared', 'f32', 'xf_sel', e.nwarps_cta) - reg.f32('x y color consec_bad') - mwc.next_f32_11(x) - mwc.next_f32_11(y) - mwc.next_f32_01(color) - op.mov.f32(consec_bad, float(-features.fuse)) + # TODO: re-add this logic using the printf formatter. + #mem.local.u32('l_num_rounds') + #mem.local.u32('l_num_writes') + #op.st.local.u32(addr(l_num_rounds), 0) + #op.st.local.u32(addr(l_num_writes), 0) - comment("Ensure all init is done") - op.bar.sync(0) + # Declare IFS-space coordinates for doing iterations + r.x, r.y, r.color = r.f32(), r.f32(), r.f32() + r.x, r.y = self.mwc.next_f32_11(), self.mwc.next_f32_11() + r.color = self.mwc.next_f32_01() + # This thread's sample's good/bad/fusing state + r.consec_bad = r.f32(-self.features.fuse) + e.comment("The main loop entry point") + cp_loop_start = e.label() + with s.tid_x == 0: + o.st(m.cp_idx.addr, o.atom.add(p.cp_started_count[0], 1)) + o.st(m.num_samples.addr, 0) - label('cp_loop_start') - reg.u32('cp_idx cpA') - with block("Claim a CP"): - std.set_is_first_thread(reg.pred('p_is_first')) - op.atom.add.u32(cp_idx, addr(g_num_cps_started), 1, ifp=p_is_first) - op.st.volatile.shared.u32(addr(s_cp_idx), cp_idx, ifp=p_is_first) - op.st.volatile.shared.s32(addr(s_num_samples), 0) + e.comment("Load the CP index in all threads") + o.bar.sync(0) + cp_idx = o.ld.volatile(m.cp_idx.addr) - comment("Load the CP index in all threads") - op.bar.sync(0) - op.ld.volatile.shared.u32(cp_idx, addr(s_cp_idx)) + e.comment("Check to see if this CP is valid (if not, we're done)") + all_cps_done = e.forward_label() + with cp_idx < p.num_cps: + o.bra.uni(all_cps_done) + self.cp.addr = p.cp_data[cp_idx * self.cp.stream_size] - with block("Check to see if this CP is valid (if not, we're done)"): - reg.u32('num_cps') - reg.pred('p_last_cp') - op.ldu.u32(num_cps, addr(g_num_cps)) - op.setp.ge.u32(p_last_cp, cp_idx, num_cps) - op.bra('all_cps_done', ifp=p_last_cp) + loop_start = e.forward_label() + with s.tid_x < e.nwarps_cta: + o.bra(loop_start) - with block('Load CP address'): - op.mov.u32(cpA, g_cp_array) - op.mad.lo.u32(cpA, cp_idx, cp.stream_size, cpA) + e.comment("Choose the xform for each warp") + choose_xform = e.label() + o.st.volatile(m.xf_sel[s.tid_x], self.mwc.next_f32_01()) + e.declare_label(loop_start) + e.comment("Execute the xform given by xf_sel") + xf_labels = [e.forward_label() for xf in self.features.xforms] + xf_sel = o.ld.volatile(m.xf_sel[s.tid_x >> 5]) + for i, xf in enumerate(self.features.xforms): + xf_density = self.cp.get.f32('cp.xforms[%d].cweight'%xf.id) + with xf_density <= xf_sel: + o.bra.uni(xf_labels[i]) + e.comment("This code should be unreachable") + o.trap() - label('iter_loop_choose_xform') - with block("Choose the xform for each warp"): - timeout.check_time(5) - comment("On subsequent runs, only warp 0 will hit this code") - reg.u32('x_addr x_offset') - reg.f32('xf_sel') - op.mov.u32(x_addr, s_xf_sel) - op.mov.u32(x_offset, '%tid.x') - op.and_.b32(x_offset, x_offset, ctx.warps_per_cta-1) - op.mad.lo.u32(x_addr, x_offset, 4, x_addr) - mwc.next_f32_01(xf_sel) - op.st.volatile.shared.f32(addr(x_addr), xf_sel) + xforms_done = e.forward_label() + for i, xf in enumerate(self.features.xforms): + e.declare_label(xf_labels[i]) + r.x, r.y, r.color = self.vars.apply_xform( + e, self.cp, r.x, r.y, r.color, xf.id) + o.bra.uni(xforms_done) - label('iter_loop_start') + e.comment("Determine write location, and whether point is valid") + e.declare_label(xforms_done) + histidx, is_valid = self.camera.get_index(r.x, r.y) + is_valid &= (r.consec_bad >= 0) - #timeout.check_time(10) + e.comment("Scatter point to pointbuffer") + self.hist.scatter(histidx, r.color, 0, is_valid) - with block(): - reg.u32('num_rounds') - reg.pred('overload') - op.ld.local.u32(num_rounds, addr(l_num_rounds)) - op.add.u32(num_rounds, num_rounds, 1) - op.st.local.u32(addr(l_num_rounds), num_rounds) + done_picking_new_point = e.forward_label() + with ~is_valid: + r.consec_bad += 1 + with r.consec_bad < self.features.max_bad: + o.bra(done_picking_new_point) - with block("Select an xform"): - reg.f32('xf_sel') - reg.u32('warp_offset xf_sel_addr') - op.mov.u32(warp_offset, '%tid.x') - op.mov.u32(xf_sel_addr, s_xf_sel) - op.shr.u32(warp_offset, warp_offset, 5) - op.mad.lo.u32(xf_sel_addr, warp_offset, 4, xf_sel_addr) - op.ld.volatile.shared.f32(xf_sel, addr(xf_sel_addr)) + e.comment("If too many consecutive bad values, pick a new point") + r.x, r.y = self.mwc.next_f32_11(), self.mwc.next_f32_11() + r.color = self.mwc.next_f32_01() + r.consec_bad = -self.features.fuse - reg.f32('xf_density') - reg.pred('xf_jump') - for xf in features.xforms: - cp.get(cpA, xf_density, 'cp.xforms[%d].cweight' % xf.id) - op.setp.le.f32(xf_jump, xf_sel, xf_density) - op.bra('XFORM_%d' % xf.id, ifp=xf_jump) - std.asrt("Reached end of xforms without choosing one") + e.declare_label(done_picking_new_point) - for xf in features.xforms: - label('XFORM_%d' % xf.id) - variations.apply_xform(x, y, color, x, y, color, xf.id) - op.bra("xform_done") - - label("xform_done") - - reg.pred('p_valid_pt') - with block("Write the result"): - reg.u32('hist_index') - camera.get_index(hist_index, x, y, p_valid_pt) - comment('if consec_bad < 0, point is fusing; treat as invalid') - op.setp.and_.ge.f32(p_valid_pt, consec_bad, 0., p_valid_pt) - # TODO: save and pass correct xform value here - hist.scatter(hist_index, color, 0, p_valid_pt, 'ldst') - with block(): - reg.u32('num_writes') - op.ld.local.u32(num_writes, addr(l_num_writes)) - op.add.u32(num_writes, num_writes, 1, ifp=p_valid_pt) - op.st.local.u32(addr(l_num_writes), num_writes) - - with block("If the result was invalid, handle badvals"): - reg.pred('need_new_point') - op.add.f32(consec_bad, consec_bad, 1., ifnotp=p_valid_pt) - op.setp.ge.f32(need_new_point, consec_bad, float(features.max_bad)) - op.bra('badval_done', ifnotp=need_new_point) - - comment('If consec_bad > 5, pick a new random point') - mwc.next_f32_11(x) - mwc.next_f32_11(y) - mwc.next_f32_01(color) - op.mov.f32(consec_bad, float(-features.fuse)) - label('badval_done') - - with block("Increment number of samples by number of good values"): - reg.b32('good_samples laneid') - reg.pred('p_is_first') - op.vote.ballot.b32(good_samples, p_valid_pt) - op.popc.b32(good_samples, good_samples) - op.mov.u32(laneid, '%laneid') - op.setp.eq.u32(p_is_first, laneid, 0) - op.red.shared.add.s32(addr(s_num_samples), good_samples, - ifp=p_is_first) - - with block("Check to see if we're done with this CP"): - reg.pred('p_cp_done') - reg.s32('num_samples num_samples_needed') - comment('Sync before making decision to prevent divergence') - op.bar.sync(3) - op.ld.volatile.shared.s32(num_samples, addr(s_num_samples)) - cp.get(cpA, num_samples_needed, 'cp.nsamples') - op.setp.ge.s32(p_cp_done, num_samples, num_samples_needed) - op.bra.uni(cp_loop_start, ifp=p_cp_done) + e.comment("Determine number of good samples, and whether we're done") + num_samples = o.ld(m.num_samples) + num_samples += o.bar.red.popc(0, is_valid) + with s.tid_x == 0: + o.st(m.num_samples, num_samples) + with num_samples >= self.cp.get('nsamples'): + o.bra.uni(cp_loop_start) comment('Shuffle points between threads') shuf.shuffle(x, y, color, consec_bad) - with block("If in first warp, pick new offset"): - reg.u32('tid') - reg.pred('first_warp') - op.mov.u32(tid, '%tid.x') - assert ctx.warps_per_cta <= 32, \ - "Special-case for CTAs with >1024 threads not implemented" - op.setp.lo.u32(first_warp, tid, 32) - op.bra(iter_loop_choose_xform, ifp=first_warp) - op.bra(iter_loop_start) + with s.tid_x < e.nwarps_cta: + o.bra(choose_xform) + o.bra(loop_start) - label('all_cps_done') - # TODO this is for testing, move it to a debug statement - with block(): - reg.u32('num_rounds num_writes') - op.ld.local.u32(num_rounds, addr(l_num_rounds)) - op.ld.local.u32(num_writes, addr(l_num_writes)) - std.store_per_thread(g_num_rounds, num_rounds, - g_num_writes, num_writes) + e.declare_label(all_cps_done) def upload_cp_stream(self, ctx, cp_stream, num_cps): cp_array_dp, cp_array_l = ctx.mod.get_global('g_cp_array') @@ -525,40 +457,41 @@ class MWCRNG(object): raise EnvironmentError('primes.bin not found') self.nthreads_ready = 0 self.mults, self.state = None, None + self.entry = entry - entry.add_ptr_param('mwc_mults', 'u32') - entry.add_ptr_param('mwc_states', 'u32') + entry.add_ptr_param('u32', 'mwc_mults') + entry.add_ptr_param('u32', 'mwc_states') with entry.head(): - self.entry_head(entry) - entry.tail_callback(self.entry_tail, entry) + self.entry_head() + entry.tail_callback(self.entry_tail) - def entry_head(self, entry): - e, r, o, m, p, s = entry.locals + def entry_head(self): + e, r, o, m, p, s = self.entry.locals gtid = s.ctaid_x * s.ntid_x + s.tid_x r.mwc_mult, r.mwc_state, r.mwc_carry = r.u32(), r.u32(), r.u32() r.mwc_mult = o.ld(p.mwc_mults[gtid]) r.mwc_state, r.mwc_carry = o.ld.v2(p.mwc_states[2*gtid]) - def entry_tail(self, entry): - e, r, o, m, p, s = entry.locals + def entry_tail(self): + e, r, o, m, p, s = self.entry.locals gtid = s.ctaid_x * s.ntid_x + s.tid_x o.st.v2.u32(p.mwc_states[2*gtid], r.mwc_state, r.mwc_carry) - def next_b32(self, entry): - e, r, o, m, p, s = entry.locals + def next_b32(self): + e, r, o, m, p, s = self.entry.locals carry = o.cvt.u64(r.mwc_carry) mwc_out = o.mad.wide(r.mwc_mult, r.mwc_state, carry) r.mwc_state, r.mwc_carry = o.split.v2(mwc_out) return r.mwc_state - def next_f32_01(self, entry): - e, r, o, m, p, s = entry.locals + def next_f32_01(self): + e, r, o, m, p, s = self.entry.locals mwc_float = o.cvt.rn.f32.u32(self.next_b32()) return o.mul.f32(mwc_float, 1./(1<<32)) - def next_f32_11(self, entry): - e, r, o, m, p, s = entry.locals + def next_f32_11(self): + e, r, o, m, p, s = self.entry.locals mwc_float = o.cvt.rn.f32.s32(self.next_b32()) return o.mul.f32(mwc_float, 1./(1<<31)) @@ -610,7 +543,7 @@ class MWCRNGTest(object): def __init__(self, entry): self.mwc = MWCRNG(entry) - entry.add_ptr_param('mwc_test_sums', 'u64') + entry.add_ptr_param('u64', 'mwc_test_sums') with entry.body(): self.entry_body(entry) @@ -649,7 +582,6 @@ class MWCRNGTest(object): dsums = cuda.mem_alloc(8*ctx.nthreads) ctx.set_param('mwc_test_sums', dsums) print "Took %g seconds." % ctx.call_timed() - print ctx.nthreads dsums = cuda.from_device(dsums, ctx.nthreads, np.uint64) if not np.all(np.equal(sums, dsums)): print "Sum discrepancy!" diff --git a/cuburn/render.py b/cuburn/render.py index 5f71860..23fd577 100644 --- a/cuburn/render.py +++ b/cuburn/render.py @@ -144,20 +144,16 @@ class Animation(object): self.filters = Filters(self._frame, genomes[0]) self.features = Features(genomes, self.filters) - self.ctx = None - def compile(self): """ Create a PTX kernel optimized for this animation, compile it, and attach it to a LaunchContext with a thread distribution optimized for the active device. """ - # TODO: user-configurable test control - self.ctx = LaunchContext([IterThread], block=(512,1,1), grid=(28,1), - tests=True) - # TODO: user-configurable verbosity control - self.ctx.compile(verbose=3, anim=self, features=self.features) # TODO: automatic optimization of block parameters + entry = ptx.Entry("iterate", 512) + iter = IterThread(entry, self.features) + self.mod = run.Module([entry]) def render_frame(self, time=0): # TODO: support more nuanced frame control than just 'time' diff --git a/cuburn/variations.py b/cuburn/variations.py index 355abb8..3540872 100644 --- a/cuburn/variations.py +++ b/cuburn/variations.py @@ -8,8 +8,8 @@ class Variations(object): shortname = "variations" - def __init__(self): - self.xform_idx = None + def __init__(self, features): + self.features = features names = [ "linear", "sinusoidal", "spherical", "swirl", "horseshoe", "polar", "handkerchief", "heart", "disc", "spiral", "hyperbolic", @@ -27,100 +27,65 @@ class Variations(object): "waves2", "exp", "log", "sin", "cos", "tan", "sec", "csc", "cot", "sinh", "cosh", "tanh", "sech", "csch", "coth", "auger", "flux", ] - def xfg(self, dst, expr): - """ - Convenience wrapper around cp.get which loads the given property from - the current CP and XF. - """ - # xform_idx is set by apply_xform on the current instance, but the - # expression will be evaluated using each CP in stream packing. - cp.get(cpA, dst, 'cp.xforms[%d].%s' % (self.xform_idx, expr)) - - def xfg_v2(self, dst1, expr1, dst2, expr2): - cp.get_v2(cpA, dst1, 'cp.xforms[%d].%s' % (self.xform_idx, expr1), - dst2, 'cp.xforms[%d].%s' % (self.xform_idx, expr2)) - - def xfg_v4(self, d1, e1, d2, e2, d3, e3, d4, e4): - cp.get_v4(cpA, d1, 'cp.xforms[%d].%s' % (self.xform_idx, e1), - d2, 'cp.xforms[%d].%s' % (self.xform_idx, e2), - d3, 'cp.xforms[%d].%s' % (self.xform_idx, e3), - d4, 'cp.xforms[%d].%s' % (self.xform_idx, e4)) - - def apply_xform(self, xo, yo, co, xi, yi, ci, xform_idx): + def apply_xform(self, entry, cp, x, y, color, xform_idx): """ Apply a transform. This function necessarily makes a copy of the input variables, so it's safe to use the same registers for input and output. """ - with block("Apply xform %d" % xform_idx): - self.xform_idx = xform_idx + e, r, o, m, p, s = entry.locals - with block('Modify color'): - reg.f32('c_speed c_new') - cp.get_v2(cpA, - c_speed, '(1.0 - cp.xforms[%d].color_speed)' % xform_idx, - c_new, 'cp.xforms[%d].color * cp.xforms[%d].color_speed' % - (xform_idx, xform_idx)) - op.fma.rn.ftz.f32(co, ci, c_speed, c_new) + # For use in retrieving properties from the control point datastream + xfs = lambda stval: 'cp.xforms[%d].%s' % (xform_idx, stval) - reg.f32('xt yt') - with block("Do affine transformation"): - # TODO: verify that this is the best performance (register - # usage vs number of loads) - reg.f32('c00 c10 c20 c01 c11 c21') - self.xfg_v4(c00, 'coefs[0][0]', c01, 'coefs[0][1]', - c20, 'coefs[2][0]', c21, 'coefs[2][1]') - op.fma.rn.ftz.f32(xt, c00, xi, c20) - op.fma.rn.ftz.f32(yt, c01, xi, c21) - self.xfg_v2(c10, 'coefs[1][0]', c11, 'coefs[1][1]') - op.fma.rn.ftz.f32(xt, c10, yi, xt) - op.fma.rn.ftz.f32(yt, c11, yi, yt) + e.comment('Color transformation') + c_speed, c_val = cp.get.v2.f32('1.0 - %s' % xfs('color_speed'), + '%s * %s' % (xfs('color'), xfs('color_speed'))) + color = color * c_speed + c_val - op.mov.f32(xo, '0.0') - op.mov.f32(yo, '0.0') + e.comment('Affine transformation') + c00, c20 = cp.get.v2.f32(xfs('coefs[0][0]'), xfs('coefs[2][0]')) + xt = x * c00 + c20 + c01, c21 = cp.get.v2.f32(xfs('coefs[0][1]'), xfs('coefs[2][1]')) + yt = x * c01 + c21 + c10, c11 = cp.get.v2.f32(xfs('coefs[1][0]'), xfs('coefs[1][1]')) + xt += y * c10 + yt += y * c11 - for var_name in sorted(features.xforms[xform_idx].vars): - func = getattr(self, var_name, None) - if not func: - raise NotImplementedError( - "Haven't implemented %s yet" % var_name) - with block('%s variation' % var_name): - reg.f32('wgt') - self.xfg(wgt, var_name) - func(xo, yo, xt, yt, wgt) + xo, yo = o.mov.f32(0), o.mov.f32(0) + for var_name in sorted(self.features.xforms[xform_idx].vars): + func = getattr(self, var_name, None) + if not func: + raise NotImplementedError( + "Haven't implemented %s yet" % var_name) + e.comment('%s variation' % var_name) + xtemp, ytemp = func(o, xt, yt, cp.get.f32(xfs(var_name))) + xo += xtemp + yo += ytemp - if features.xforms[xform_idx].has_post: - with block("Affine post-transformation"): - op.mov.f32(xt, xo) - op.mov.f32(yt, yo) - reg.f32('c00 c10 c20 c01 c11 c21') - self.xfg_v4(c00, 'post[0][0]', c01, 'post[0][1]', - c20, 'post[2][0]', c21, 'post[2][1]') - op.fma.rn.ftz.f32(xo, c00, xt, c20) - op.fma.rn.ftz.f32(yo, c01, xt, c21) - self.xfg_v2(c10, 'post[1][0]', c11, 'post[1][1]') - op.fma.rn.ftz.f32(xo, c10, yt, xo) - op.fma.rn.ftz.f32(yo, c11, yt, yo) + if self.features.xforms[xform_idx].has_post: + e.comment('Affine post-transformation') + c00, c20 = cp.get.v2.f32(xfs('post[0][0]'), xfs('post[2][0]')) + xt = xo * c00 + c20 + c01, c21 = cp.get.v2.f32(xfs('post[0][1]'), xfs('post[2][1]')) + yt = xo * c01 + c21 + c10, c11 = cp.get.v2.f32(xfs('post[1][0]'), xfs('post[1][1]')) + xt += yo * c10 + yt += yo * c11 + xo, yo = xt, yt - def linear(self, xo, yo, xi, yi, wgt): - op.fma.rn.ftz.f32(xo, xi, wgt, xo) - op.fma.rn.ftz.f32(yo, yi, wgt, yo) + self.xform_idx = None + return xo, yo, color - def sinusoidal(self, xo, yo, xi, yi, wgt): - reg.f32('sinval') - op.sin.approx.ftz.f32(sinval, xi) - op.fma.rn.ftz.f32(xo, sinval, wgt, xo) - op.sin.approx.ftz.f32(sinval, yi) - op.fma.rn.ftz.f32(yo, sinval, wgt, yo) + def linear(self, o, x, y, wgt): + return x * wgt, y * wgt - def spherical(self, xo, yo, xi, yi, wgt): - reg.f32('r2') - op.fma.rn.ftz.f32(r2, xi, xi, '1e-30') - op.fma.rn.ftz.f32(r2, yi, yi, r2) - op.rcp.approx.f32(r2, r2) - op.mul.rn.ftz.f32(r2, r2, wgt) - op.fma.rn.ftz.f32(xo, xi, r2, xo) - op.fma.rn.ftz.f32(yo, yi, r2, yo) + def sinusoidal(self, o, x, y, wgt): + return o.sin(x) * wgt, o.sin(y) * wgt + def spherical(self, o, x, y, wgt): + rsquared = x * x + y * y + rrcp = o.rcp(rsquared) * wgt + return x * wgt, y * wgt diff --git a/main.py b/main.py index 6b3aa0d..69013f7 100644 --- a/main.py +++ b/main.py @@ -42,7 +42,7 @@ def disass(mod): subprocess.check_call('/home/steven/code/decuda/elfToCubin.py --nouveau ' '/tmp/elf.o'.split()) -def main(args): +def mwctest(): mwcent = ptx.Entry("mwc_test", 512) mwctest = MWCRNGTest(mwcent) @@ -57,9 +57,7 @@ def main(args): ctx = mod.get_context('mwc_test', 14) mwctest.run_test(ctx) - return - - +def main(args): with open(args[-1]) as fp: genomes = Genome.from_string(fp.read()) anim = Animation(genomes)