Broken: Variations, CP stream implemented

This commit is contained in:
Steven Robertson 2010-10-09 11:18:58 -04:00
parent 576d2fa683
commit 97180003a4
4 changed files with 160 additions and 269 deletions

View File

@ -9,199 +9,131 @@ import struct
import pycuda.driver as cuda import pycuda.driver as cuda
import numpy as np import numpy as np
from pyptx import ptx, run from pyptx import ptx, run, util
from cuburn.variations import Variations from cuburn.variations import Variations
class IterThread(object): class IterThread(object):
entry_name = 'iter_thread' def __init__(self, entry, features):
entry_params = [] self.features = features
self.mwc = MWCRNG(entry)
self.cp = util.DataStream(entry)
self.vars = Variations(features)
def __init__(self): entry.add_param('u32', 'num_cps')
self.cps_uploaded = False entry.add_ptr_param('u32', 'cp_started_count')
entry.add_ptr_param('u8', 'cp_data')
def deps(self): with entry.body():
return [MWCRNG, CPDataStream, HistScatter, Variations, ShufflePoints, self.entry_body(entry)
Timeouter]
def module_setup(self): def entry_body(self, entry):
mem.global_.u32('g_cp_array', e, r, o, m, p, s = entry.locals
cp.stream_size*features.max_ntemporal_samples) # Index of this CTA's current CP
mem.global_.u32('g_num_cps') e.declare_mem('shared', 'u32', 'cp_idx')
mem.global_.u32('g_num_cps_started')
# TODO move into debug statement
mem.global_.u32('g_num_rounds', ctx.nthreads)
mem.global_.u32('g_num_writes', ctx.nthreads)
mem.global_.b32('g_whatever', ctx.nthreads)
def entry(self):
# Index number of current CP, shared across CTA
mem.shared.u32('s_cp_idx')
# Number of samples that have been generated so far in this CTA # Number of samples that have been generated so far in this CTA
# If this number is negative, we're still fusing points, so this # If this number is negative, we're still fusing points, so this
# behaves slightly differently (see ``fuse_loop_start``) # behaves slightly differently (see ``fuse_loop_start``)
# TODO: replace (or at least simplify) this logic # TODO: replace (or at least simplify) this logic
mem.shared.s32('s_num_samples') e.declare_mem('shared', 'f32', 'num_samples')
mem.shared.f32('s_xf_sel', ctx.warps_per_cta)
# TODO: temporary, for testing # The per-warp transform selection indices
mem.local.u32('l_num_rounds') e.declare_mem('shared', 'f32', 'xf_sel', e.nwarps_cta)
mem.local.u32('l_num_writes')
op.st.local.u32(addr(l_num_rounds), 0)
op.st.local.u32(addr(l_num_writes), 0)
reg.f32('x y color consec_bad') # TODO: re-add this logic using the printf formatter.
mwc.next_f32_11(x) #mem.local.u32('l_num_rounds')
mwc.next_f32_11(y) #mem.local.u32('l_num_writes')
mwc.next_f32_01(color) #op.st.local.u32(addr(l_num_rounds), 0)
op.mov.f32(consec_bad, float(-features.fuse)) #op.st.local.u32(addr(l_num_writes), 0)
comment("Ensure all init is done") # Declare IFS-space coordinates for doing iterations
op.bar.sync(0) r.x, r.y, r.color = r.f32(), r.f32(), r.f32()
r.x, r.y = self.mwc.next_f32_11(), self.mwc.next_f32_11()
r.color = self.mwc.next_f32_01()
# This thread's sample's good/bad/fusing state
r.consec_bad = r.f32(-self.features.fuse)
e.comment("The main loop entry point")
cp_loop_start = e.label()
with s.tid_x == 0:
o.st(m.cp_idx.addr, o.atom.add(p.cp_started_count[0], 1))
o.st(m.num_samples.addr, 0)
label('cp_loop_start') e.comment("Load the CP index in all threads")
reg.u32('cp_idx cpA') o.bar.sync(0)
with block("Claim a CP"): cp_idx = o.ld.volatile(m.cp_idx.addr)
std.set_is_first_thread(reg.pred('p_is_first'))
op.atom.add.u32(cp_idx, addr(g_num_cps_started), 1, ifp=p_is_first)
op.st.volatile.shared.u32(addr(s_cp_idx), cp_idx, ifp=p_is_first)
op.st.volatile.shared.s32(addr(s_num_samples), 0)
comment("Load the CP index in all threads") e.comment("Check to see if this CP is valid (if not, we're done)")
op.bar.sync(0) all_cps_done = e.forward_label()
op.ld.volatile.shared.u32(cp_idx, addr(s_cp_idx)) with cp_idx < p.num_cps:
o.bra.uni(all_cps_done)
self.cp.addr = p.cp_data[cp_idx * self.cp.stream_size]
with block("Check to see if this CP is valid (if not, we're done)"): loop_start = e.forward_label()
reg.u32('num_cps') with s.tid_x < e.nwarps_cta:
reg.pred('p_last_cp') o.bra(loop_start)
op.ldu.u32(num_cps, addr(g_num_cps))
op.setp.ge.u32(p_last_cp, cp_idx, num_cps)
op.bra('all_cps_done', ifp=p_last_cp)
with block('Load CP address'): e.comment("Choose the xform for each warp")
op.mov.u32(cpA, g_cp_array) choose_xform = e.label()
op.mad.lo.u32(cpA, cp_idx, cp.stream_size, cpA) o.st.volatile(m.xf_sel[s.tid_x], self.mwc.next_f32_01())
e.declare_label(loop_start)
e.comment("Execute the xform given by xf_sel")
xf_labels = [e.forward_label() for xf in self.features.xforms]
xf_sel = o.ld.volatile(m.xf_sel[s.tid_x >> 5])
for i, xf in enumerate(self.features.xforms):
xf_density = self.cp.get.f32('cp.xforms[%d].cweight'%xf.id)
with xf_density <= xf_sel:
o.bra.uni(xf_labels[i])
e.comment("This code should be unreachable")
o.trap()
label('iter_loop_choose_xform') xforms_done = e.forward_label()
with block("Choose the xform for each warp"): for i, xf in enumerate(self.features.xforms):
timeout.check_time(5) e.declare_label(xf_labels[i])
comment("On subsequent runs, only warp 0 will hit this code") r.x, r.y, r.color = self.vars.apply_xform(
reg.u32('x_addr x_offset') e, self.cp, r.x, r.y, r.color, xf.id)
reg.f32('xf_sel') o.bra.uni(xforms_done)
op.mov.u32(x_addr, s_xf_sel)
op.mov.u32(x_offset, '%tid.x')
op.and_.b32(x_offset, x_offset, ctx.warps_per_cta-1)
op.mad.lo.u32(x_addr, x_offset, 4, x_addr)
mwc.next_f32_01(xf_sel)
op.st.volatile.shared.f32(addr(x_addr), xf_sel)
label('iter_loop_start') e.comment("Determine write location, and whether point is valid")
e.declare_label(xforms_done)
histidx, is_valid = self.camera.get_index(r.x, r.y)
is_valid &= (r.consec_bad >= 0)
#timeout.check_time(10) e.comment("Scatter point to pointbuffer")
self.hist.scatter(histidx, r.color, 0, is_valid)
with block(): done_picking_new_point = e.forward_label()
reg.u32('num_rounds') with ~is_valid:
reg.pred('overload') r.consec_bad += 1
op.ld.local.u32(num_rounds, addr(l_num_rounds)) with r.consec_bad < self.features.max_bad:
op.add.u32(num_rounds, num_rounds, 1) o.bra(done_picking_new_point)
op.st.local.u32(addr(l_num_rounds), num_rounds)
with block("Select an xform"): e.comment("If too many consecutive bad values, pick a new point")
reg.f32('xf_sel') r.x, r.y = self.mwc.next_f32_11(), self.mwc.next_f32_11()
reg.u32('warp_offset xf_sel_addr') r.color = self.mwc.next_f32_01()
op.mov.u32(warp_offset, '%tid.x') r.consec_bad = -self.features.fuse
op.mov.u32(xf_sel_addr, s_xf_sel)
op.shr.u32(warp_offset, warp_offset, 5)
op.mad.lo.u32(xf_sel_addr, warp_offset, 4, xf_sel_addr)
op.ld.volatile.shared.f32(xf_sel, addr(xf_sel_addr))
reg.f32('xf_density') e.declare_label(done_picking_new_point)
reg.pred('xf_jump')
for xf in features.xforms:
cp.get(cpA, xf_density, 'cp.xforms[%d].cweight' % xf.id)
op.setp.le.f32(xf_jump, xf_sel, xf_density)
op.bra('XFORM_%d' % xf.id, ifp=xf_jump)
std.asrt("Reached end of xforms without choosing one")
for xf in features.xforms: e.comment("Determine number of good samples, and whether we're done")
label('XFORM_%d' % xf.id) num_samples = o.ld(m.num_samples)
variations.apply_xform(x, y, color, x, y, color, xf.id) num_samples += o.bar.red.popc(0, is_valid)
op.bra("xform_done") with s.tid_x == 0:
o.st(m.num_samples, num_samples)
label("xform_done") with num_samples >= self.cp.get('nsamples'):
o.bra.uni(cp_loop_start)
reg.pred('p_valid_pt')
with block("Write the result"):
reg.u32('hist_index')
camera.get_index(hist_index, x, y, p_valid_pt)
comment('if consec_bad < 0, point is fusing; treat as invalid')
op.setp.and_.ge.f32(p_valid_pt, consec_bad, 0., p_valid_pt)
# TODO: save and pass correct xform value here
hist.scatter(hist_index, color, 0, p_valid_pt, 'ldst')
with block():
reg.u32('num_writes')
op.ld.local.u32(num_writes, addr(l_num_writes))
op.add.u32(num_writes, num_writes, 1, ifp=p_valid_pt)
op.st.local.u32(addr(l_num_writes), num_writes)
with block("If the result was invalid, handle badvals"):
reg.pred('need_new_point')
op.add.f32(consec_bad, consec_bad, 1., ifnotp=p_valid_pt)
op.setp.ge.f32(need_new_point, consec_bad, float(features.max_bad))
op.bra('badval_done', ifnotp=need_new_point)
comment('If consec_bad > 5, pick a new random point')
mwc.next_f32_11(x)
mwc.next_f32_11(y)
mwc.next_f32_01(color)
op.mov.f32(consec_bad, float(-features.fuse))
label('badval_done')
with block("Increment number of samples by number of good values"):
reg.b32('good_samples laneid')
reg.pred('p_is_first')
op.vote.ballot.b32(good_samples, p_valid_pt)
op.popc.b32(good_samples, good_samples)
op.mov.u32(laneid, '%laneid')
op.setp.eq.u32(p_is_first, laneid, 0)
op.red.shared.add.s32(addr(s_num_samples), good_samples,
ifp=p_is_first)
with block("Check to see if we're done with this CP"):
reg.pred('p_cp_done')
reg.s32('num_samples num_samples_needed')
comment('Sync before making decision to prevent divergence')
op.bar.sync(3)
op.ld.volatile.shared.s32(num_samples, addr(s_num_samples))
cp.get(cpA, num_samples_needed, 'cp.nsamples')
op.setp.ge.s32(p_cp_done, num_samples, num_samples_needed)
op.bra.uni(cp_loop_start, ifp=p_cp_done)
comment('Shuffle points between threads') comment('Shuffle points between threads')
shuf.shuffle(x, y, color, consec_bad) shuf.shuffle(x, y, color, consec_bad)
with block("If in first warp, pick new offset"): with s.tid_x < e.nwarps_cta:
reg.u32('tid') o.bra(choose_xform)
reg.pred('first_warp') o.bra(loop_start)
op.mov.u32(tid, '%tid.x')
assert ctx.warps_per_cta <= 32, \
"Special-case for CTAs with >1024 threads not implemented"
op.setp.lo.u32(first_warp, tid, 32)
op.bra(iter_loop_choose_xform, ifp=first_warp)
op.bra(iter_loop_start)
label('all_cps_done') e.declare_label(all_cps_done)
# TODO this is for testing, move it to a debug statement
with block():
reg.u32('num_rounds num_writes')
op.ld.local.u32(num_rounds, addr(l_num_rounds))
op.ld.local.u32(num_writes, addr(l_num_writes))
std.store_per_thread(g_num_rounds, num_rounds,
g_num_writes, num_writes)
def upload_cp_stream(self, ctx, cp_stream, num_cps): def upload_cp_stream(self, ctx, cp_stream, num_cps):
cp_array_dp, cp_array_l = ctx.mod.get_global('g_cp_array') cp_array_dp, cp_array_l = ctx.mod.get_global('g_cp_array')
@ -525,40 +457,41 @@ class MWCRNG(object):
raise EnvironmentError('primes.bin not found') raise EnvironmentError('primes.bin not found')
self.nthreads_ready = 0 self.nthreads_ready = 0
self.mults, self.state = None, None self.mults, self.state = None, None
self.entry = entry
entry.add_ptr_param('mwc_mults', 'u32') entry.add_ptr_param('u32', 'mwc_mults')
entry.add_ptr_param('mwc_states', 'u32') entry.add_ptr_param('u32', 'mwc_states')
with entry.head(): with entry.head():
self.entry_head(entry) self.entry_head()
entry.tail_callback(self.entry_tail, entry) entry.tail_callback(self.entry_tail)
def entry_head(self, entry): def entry_head(self):
e, r, o, m, p, s = entry.locals e, r, o, m, p, s = self.entry.locals
gtid = s.ctaid_x * s.ntid_x + s.tid_x gtid = s.ctaid_x * s.ntid_x + s.tid_x
r.mwc_mult, r.mwc_state, r.mwc_carry = r.u32(), r.u32(), r.u32() r.mwc_mult, r.mwc_state, r.mwc_carry = r.u32(), r.u32(), r.u32()
r.mwc_mult = o.ld(p.mwc_mults[gtid]) r.mwc_mult = o.ld(p.mwc_mults[gtid])
r.mwc_state, r.mwc_carry = o.ld.v2(p.mwc_states[2*gtid]) r.mwc_state, r.mwc_carry = o.ld.v2(p.mwc_states[2*gtid])
def entry_tail(self, entry): def entry_tail(self):
e, r, o, m, p, s = entry.locals e, r, o, m, p, s = self.entry.locals
gtid = s.ctaid_x * s.ntid_x + s.tid_x gtid = s.ctaid_x * s.ntid_x + s.tid_x
o.st.v2.u32(p.mwc_states[2*gtid], r.mwc_state, r.mwc_carry) o.st.v2.u32(p.mwc_states[2*gtid], r.mwc_state, r.mwc_carry)
def next_b32(self, entry): def next_b32(self):
e, r, o, m, p, s = entry.locals e, r, o, m, p, s = self.entry.locals
carry = o.cvt.u64(r.mwc_carry) carry = o.cvt.u64(r.mwc_carry)
mwc_out = o.mad.wide(r.mwc_mult, r.mwc_state, carry) mwc_out = o.mad.wide(r.mwc_mult, r.mwc_state, carry)
r.mwc_state, r.mwc_carry = o.split.v2(mwc_out) r.mwc_state, r.mwc_carry = o.split.v2(mwc_out)
return r.mwc_state return r.mwc_state
def next_f32_01(self, entry): def next_f32_01(self):
e, r, o, m, p, s = entry.locals e, r, o, m, p, s = self.entry.locals
mwc_float = o.cvt.rn.f32.u32(self.next_b32()) mwc_float = o.cvt.rn.f32.u32(self.next_b32())
return o.mul.f32(mwc_float, 1./(1<<32)) return o.mul.f32(mwc_float, 1./(1<<32))
def next_f32_11(self, entry): def next_f32_11(self):
e, r, o, m, p, s = entry.locals e, r, o, m, p, s = self.entry.locals
mwc_float = o.cvt.rn.f32.s32(self.next_b32()) mwc_float = o.cvt.rn.f32.s32(self.next_b32())
return o.mul.f32(mwc_float, 1./(1<<31)) return o.mul.f32(mwc_float, 1./(1<<31))
@ -610,7 +543,7 @@ class MWCRNGTest(object):
def __init__(self, entry): def __init__(self, entry):
self.mwc = MWCRNG(entry) self.mwc = MWCRNG(entry)
entry.add_ptr_param('mwc_test_sums', 'u64') entry.add_ptr_param('u64', 'mwc_test_sums')
with entry.body(): with entry.body():
self.entry_body(entry) self.entry_body(entry)
@ -649,7 +582,6 @@ class MWCRNGTest(object):
dsums = cuda.mem_alloc(8*ctx.nthreads) dsums = cuda.mem_alloc(8*ctx.nthreads)
ctx.set_param('mwc_test_sums', dsums) ctx.set_param('mwc_test_sums', dsums)
print "Took %g seconds." % ctx.call_timed() print "Took %g seconds." % ctx.call_timed()
print ctx.nthreads
dsums = cuda.from_device(dsums, ctx.nthreads, np.uint64) dsums = cuda.from_device(dsums, ctx.nthreads, np.uint64)
if not np.all(np.equal(sums, dsums)): if not np.all(np.equal(sums, dsums)):
print "Sum discrepancy!" print "Sum discrepancy!"

View File

@ -144,20 +144,16 @@ class Animation(object):
self.filters = Filters(self._frame, genomes[0]) self.filters = Filters(self._frame, genomes[0])
self.features = Features(genomes, self.filters) self.features = Features(genomes, self.filters)
self.ctx = None
def compile(self): def compile(self):
""" """
Create a PTX kernel optimized for this animation, compile it, and Create a PTX kernel optimized for this animation, compile it, and
attach it to a LaunchContext with a thread distribution optimized for attach it to a LaunchContext with a thread distribution optimized for
the active device. the active device.
""" """
# TODO: user-configurable test control
self.ctx = LaunchContext([IterThread], block=(512,1,1), grid=(28,1),
tests=True)
# TODO: user-configurable verbosity control
self.ctx.compile(verbose=3, anim=self, features=self.features)
# TODO: automatic optimization of block parameters # TODO: automatic optimization of block parameters
entry = ptx.Entry("iterate", 512)
iter = IterThread(entry, self.features)
self.mod = run.Module([entry])
def render_frame(self, time=0): def render_frame(self, time=0):
# TODO: support more nuanced frame control than just 'time' # TODO: support more nuanced frame control than just 'time'

View File

@ -8,8 +8,8 @@ class Variations(object):
shortname = "variations" shortname = "variations"
def __init__(self): def __init__(self, features):
self.xform_idx = None self.features = features
names = [ "linear", "sinusoidal", "spherical", "swirl", "horseshoe", names = [ "linear", "sinusoidal", "spherical", "swirl", "horseshoe",
"polar", "handkerchief", "heart", "disc", "spiral", "hyperbolic", "polar", "handkerchief", "heart", "disc", "spiral", "hyperbolic",
@ -27,100 +27,65 @@ class Variations(object):
"waves2", "exp", "log", "sin", "cos", "tan", "sec", "csc", "cot", "waves2", "exp", "log", "sin", "cos", "tan", "sec", "csc", "cot",
"sinh", "cosh", "tanh", "sech", "csch", "coth", "auger", "flux", ] "sinh", "cosh", "tanh", "sech", "csch", "coth", "auger", "flux", ]
def xfg(self, dst, expr): def apply_xform(self, entry, cp, x, y, color, xform_idx):
"""
Convenience wrapper around cp.get which loads the given property from
the current CP and XF.
"""
# xform_idx is set by apply_xform on the current instance, but the
# expression will be evaluated using each CP in stream packing.
cp.get(cpA, dst, 'cp.xforms[%d].%s' % (self.xform_idx, expr))
def xfg_v2(self, dst1, expr1, dst2, expr2):
cp.get_v2(cpA, dst1, 'cp.xforms[%d].%s' % (self.xform_idx, expr1),
dst2, 'cp.xforms[%d].%s' % (self.xform_idx, expr2))
def xfg_v4(self, d1, e1, d2, e2, d3, e3, d4, e4):
cp.get_v4(cpA, d1, 'cp.xforms[%d].%s' % (self.xform_idx, e1),
d2, 'cp.xforms[%d].%s' % (self.xform_idx, e2),
d3, 'cp.xforms[%d].%s' % (self.xform_idx, e3),
d4, 'cp.xforms[%d].%s' % (self.xform_idx, e4))
def apply_xform(self, xo, yo, co, xi, yi, ci, xform_idx):
""" """
Apply a transform. Apply a transform.
This function necessarily makes a copy of the input variables, so it's This function necessarily makes a copy of the input variables, so it's
safe to use the same registers for input and output. safe to use the same registers for input and output.
""" """
with block("Apply xform %d" % xform_idx): e, r, o, m, p, s = entry.locals
self.xform_idx = xform_idx
with block('Modify color'): # For use in retrieving properties from the control point datastream
reg.f32('c_speed c_new') xfs = lambda stval: 'cp.xforms[%d].%s' % (xform_idx, stval)
cp.get_v2(cpA,
c_speed, '(1.0 - cp.xforms[%d].color_speed)' % xform_idx,
c_new, 'cp.xforms[%d].color * cp.xforms[%d].color_speed' %
(xform_idx, xform_idx))
op.fma.rn.ftz.f32(co, ci, c_speed, c_new)
reg.f32('xt yt') e.comment('Color transformation')
with block("Do affine transformation"): c_speed, c_val = cp.get.v2.f32('1.0 - %s' % xfs('color_speed'),
# TODO: verify that this is the best performance (register '%s * %s' % (xfs('color'), xfs('color_speed')))
# usage vs number of loads) color = color * c_speed + c_val
reg.f32('c00 c10 c20 c01 c11 c21')
self.xfg_v4(c00, 'coefs[0][0]', c01, 'coefs[0][1]',
c20, 'coefs[2][0]', c21, 'coefs[2][1]')
op.fma.rn.ftz.f32(xt, c00, xi, c20)
op.fma.rn.ftz.f32(yt, c01, xi, c21)
self.xfg_v2(c10, 'coefs[1][0]', c11, 'coefs[1][1]')
op.fma.rn.ftz.f32(xt, c10, yi, xt)
op.fma.rn.ftz.f32(yt, c11, yi, yt)
op.mov.f32(xo, '0.0') e.comment('Affine transformation')
op.mov.f32(yo, '0.0') c00, c20 = cp.get.v2.f32(xfs('coefs[0][0]'), xfs('coefs[2][0]'))
xt = x * c00 + c20
c01, c21 = cp.get.v2.f32(xfs('coefs[0][1]'), xfs('coefs[2][1]'))
yt = x * c01 + c21
c10, c11 = cp.get.v2.f32(xfs('coefs[1][0]'), xfs('coefs[1][1]'))
xt += y * c10
yt += y * c11
for var_name in sorted(features.xforms[xform_idx].vars): xo, yo = o.mov.f32(0), o.mov.f32(0)
for var_name in sorted(self.features.xforms[xform_idx].vars):
func = getattr(self, var_name, None) func = getattr(self, var_name, None)
if not func: if not func:
raise NotImplementedError( raise NotImplementedError(
"Haven't implemented %s yet" % var_name) "Haven't implemented %s yet" % var_name)
with block('%s variation' % var_name): e.comment('%s variation' % var_name)
reg.f32('wgt') xtemp, ytemp = func(o, xt, yt, cp.get.f32(xfs(var_name)))
self.xfg(wgt, var_name) xo += xtemp
func(xo, yo, xt, yt, wgt) yo += ytemp
if features.xforms[xform_idx].has_post: if self.features.xforms[xform_idx].has_post:
with block("Affine post-transformation"): e.comment('Affine post-transformation')
op.mov.f32(xt, xo) c00, c20 = cp.get.v2.f32(xfs('post[0][0]'), xfs('post[2][0]'))
op.mov.f32(yt, yo) xt = xo * c00 + c20
reg.f32('c00 c10 c20 c01 c11 c21') c01, c21 = cp.get.v2.f32(xfs('post[0][1]'), xfs('post[2][1]'))
self.xfg_v4(c00, 'post[0][0]', c01, 'post[0][1]', yt = xo * c01 + c21
c20, 'post[2][0]', c21, 'post[2][1]') c10, c11 = cp.get.v2.f32(xfs('post[1][0]'), xfs('post[1][1]'))
op.fma.rn.ftz.f32(xo, c00, xt, c20) xt += yo * c10
op.fma.rn.ftz.f32(yo, c01, xt, c21) yt += yo * c11
self.xfg_v2(c10, 'post[1][0]', c11, 'post[1][1]') xo, yo = xt, yt
op.fma.rn.ftz.f32(xo, c10, yt, xo)
op.fma.rn.ftz.f32(yo, c11, yt, yo)
def linear(self, xo, yo, xi, yi, wgt): self.xform_idx = None
op.fma.rn.ftz.f32(xo, xi, wgt, xo) return xo, yo, color
op.fma.rn.ftz.f32(yo, yi, wgt, yo)
def sinusoidal(self, xo, yo, xi, yi, wgt): def linear(self, o, x, y, wgt):
reg.f32('sinval') return x * wgt, y * wgt
op.sin.approx.ftz.f32(sinval, xi)
op.fma.rn.ftz.f32(xo, sinval, wgt, xo)
op.sin.approx.ftz.f32(sinval, yi)
op.fma.rn.ftz.f32(yo, sinval, wgt, yo)
def spherical(self, xo, yo, xi, yi, wgt): def sinusoidal(self, o, x, y, wgt):
reg.f32('r2') return o.sin(x) * wgt, o.sin(y) * wgt
op.fma.rn.ftz.f32(r2, xi, xi, '1e-30')
op.fma.rn.ftz.f32(r2, yi, yi, r2)
op.rcp.approx.f32(r2, r2)
op.mul.rn.ftz.f32(r2, r2, wgt)
op.fma.rn.ftz.f32(xo, xi, r2, xo)
op.fma.rn.ftz.f32(yo, yi, r2, yo)
def spherical(self, o, x, y, wgt):
    """
    Spherical variation: scale the input point by ``wgt / (x*x + y*y)``.

    ``o`` is the op emitter; only its ``rcp`` (reciprocal) operation is
    used here. ``x`` and ``y`` are the affine-transformed coordinates and
    ``wgt`` is this variation's weight. Returns the ``(x, y)`` contribution
    of this variation, which the caller accumulates into its output.
    """
    rsquared = x * x + y * y
    # NOTE(review): the earlier emitter-style implementation seeded r^2
    # with 1e-30 before taking the reciprocal, presumably to avoid
    # dividing by zero at the origin -- confirm whether that bias should
    # be restored here.
    rrcp = o.rcp(rsquared) * wgt
    # Scale by the weighted reciprocal; scaling by ``wgt`` alone would make
    # this identical to the linear variation.
    return x * rrcp, y * rrcp

View File

@ -42,7 +42,7 @@ def disass(mod):
subprocess.check_call('/home/steven/code/decuda/elfToCubin.py --nouveau ' subprocess.check_call('/home/steven/code/decuda/elfToCubin.py --nouveau '
'/tmp/elf.o'.split()) '/tmp/elf.o'.split())
def main(args): def mwctest():
mwcent = ptx.Entry("mwc_test", 512) mwcent = ptx.Entry("mwc_test", 512)
mwctest = MWCRNGTest(mwcent) mwctest = MWCRNGTest(mwcent)
@ -57,9 +57,7 @@ def main(args):
ctx = mod.get_context('mwc_test', 14) ctx = mod.get_context('mwc_test', 14)
mwctest.run_test(ctx) mwctest.run_test(ctx)
return def main(args):
with open(args[-1]) as fp: with open(args[-1]) as fp:
genomes = Genome.from_string(fp.read()) genomes = Genome.from_string(fp.read())
anim = Animation(genomes) anim = Animation(genomes)