mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 11:40:04 -05:00
Broken: Variations, CP stream implemented
This commit is contained in:
parent
576d2fa683
commit
97180003a4
@ -9,199 +9,131 @@ import struct
|
|||||||
import pycuda.driver as cuda
|
import pycuda.driver as cuda
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from pyptx import ptx, run
|
from pyptx import ptx, run, util
|
||||||
from cuburn.variations import Variations
|
from cuburn.variations import Variations
|
||||||
|
|
||||||
class IterThread(object):
|
class IterThread(object):
|
||||||
entry_name = 'iter_thread'
|
def __init__(self, entry, features):
|
||||||
entry_params = []
|
self.features = features
|
||||||
|
self.mwc = MWCRNG(entry)
|
||||||
|
self.cp = util.DataStream(entry)
|
||||||
|
self.vars = Variations(features)
|
||||||
|
|
||||||
def __init__(self):
|
entry.add_param('u32', 'num_cps')
|
||||||
self.cps_uploaded = False
|
entry.add_ptr_param('u32', 'cp_started_count')
|
||||||
|
entry.add_ptr_param('u8', 'cp_data')
|
||||||
|
|
||||||
def deps(self):
|
with entry.body():
|
||||||
return [MWCRNG, CPDataStream, HistScatter, Variations, ShufflePoints,
|
self.entry_body(entry)
|
||||||
Timeouter]
|
|
||||||
|
|
||||||
def module_setup(self):
|
def entry_body(self, entry):
|
||||||
mem.global_.u32('g_cp_array',
|
e, r, o, m, p, s = entry.locals
|
||||||
cp.stream_size*features.max_ntemporal_samples)
|
# Index of this CTA's current CP
|
||||||
mem.global_.u32('g_num_cps')
|
e.declare_mem('shared', 'u32', 'cp_idx')
|
||||||
mem.global_.u32('g_num_cps_started')
|
|
||||||
# TODO move into debug statement
|
|
||||||
mem.global_.u32('g_num_rounds', ctx.nthreads)
|
|
||||||
mem.global_.u32('g_num_writes', ctx.nthreads)
|
|
||||||
mem.global_.b32('g_whatever', ctx.nthreads)
|
|
||||||
|
|
||||||
def entry(self):
|
|
||||||
# Index number of current CP, shared across CTA
|
|
||||||
mem.shared.u32('s_cp_idx')
|
|
||||||
|
|
||||||
# Number of samples that have been generated so far in this CTA
|
# Number of samples that have been generated so far in this CTA
|
||||||
# If this number is negative, we're still fusing points, so this
|
# If this number is negative, we're still fusing points, so this
|
||||||
# behaves slightly differently (see ``fuse_loop_start``)
|
# behaves slightly differently (see ``fuse_loop_start``)
|
||||||
# TODO: replace (or at least simplify) this logic
|
# TODO: replace (or at least simplify) this logic
|
||||||
mem.shared.s32('s_num_samples')
|
e.declare_mem('shared', 'f32', 'num_samples')
|
||||||
mem.shared.f32('s_xf_sel', ctx.warps_per_cta)
|
|
||||||
|
|
||||||
# TODO: temporary, for testing
|
# The per-warp transform selection indices
|
||||||
mem.local.u32('l_num_rounds')
|
e.declare_mem('shared', 'f32', 'xf_sel', e.nwarps_cta)
|
||||||
mem.local.u32('l_num_writes')
|
|
||||||
op.st.local.u32(addr(l_num_rounds), 0)
|
|
||||||
op.st.local.u32(addr(l_num_writes), 0)
|
|
||||||
|
|
||||||
reg.f32('x y color consec_bad')
|
# TODO: re-add this logic using the printf formatter.
|
||||||
mwc.next_f32_11(x)
|
#mem.local.u32('l_num_rounds')
|
||||||
mwc.next_f32_11(y)
|
#mem.local.u32('l_num_writes')
|
||||||
mwc.next_f32_01(color)
|
#op.st.local.u32(addr(l_num_rounds), 0)
|
||||||
op.mov.f32(consec_bad, float(-features.fuse))
|
#op.st.local.u32(addr(l_num_writes), 0)
|
||||||
|
|
||||||
comment("Ensure all init is done")
|
# Declare IFS-space coordinates for doing iterations
|
||||||
op.bar.sync(0)
|
r.x, r.y, r.color = r.f32(), r.f32(), r.f32()
|
||||||
|
r.x, r.y = self.mwc.next_f32_11(), self.mwc.next_f32_11()
|
||||||
|
r.color = self.mwc.next_f32_01()
|
||||||
|
|
||||||
|
# This thread's sample's good/bad/fusing state
|
||||||
|
r.consec_bad = r.f32(-self.features.fuse)
|
||||||
|
|
||||||
|
e.comment("The main loop entry point")
|
||||||
|
cp_loop_start = e.label()
|
||||||
|
with s.tid_x == 0:
|
||||||
|
o.st(m.cp_idx.addr, o.atom.add(p.cp_started_count[0], 1))
|
||||||
|
o.st(m.num_samples.addr, 0)
|
||||||
|
|
||||||
label('cp_loop_start')
|
e.comment("Load the CP index in all threads")
|
||||||
reg.u32('cp_idx cpA')
|
o.bar.sync(0)
|
||||||
with block("Claim a CP"):
|
cp_idx = o.ld.volatile(m.cp_idx.addr)
|
||||||
std.set_is_first_thread(reg.pred('p_is_first'))
|
|
||||||
op.atom.add.u32(cp_idx, addr(g_num_cps_started), 1, ifp=p_is_first)
|
|
||||||
op.st.volatile.shared.u32(addr(s_cp_idx), cp_idx, ifp=p_is_first)
|
|
||||||
op.st.volatile.shared.s32(addr(s_num_samples), 0)
|
|
||||||
|
|
||||||
comment("Load the CP index in all threads")
|
e.comment("Check to see if this CP is valid (if not, we're done)")
|
||||||
op.bar.sync(0)
|
all_cps_done = e.forward_label()
|
||||||
op.ld.volatile.shared.u32(cp_idx, addr(s_cp_idx))
|
with cp_idx < p.num_cps:
|
||||||
|
o.bra.uni(all_cps_done)
|
||||||
|
self.cp.addr = p.cp_data[cp_idx * self.cp.stream_size]
|
||||||
|
|
||||||
with block("Check to see if this CP is valid (if not, we're done)"):
|
loop_start = e.forward_label()
|
||||||
reg.u32('num_cps')
|
with s.tid_x < e.nwarps_cta:
|
||||||
reg.pred('p_last_cp')
|
o.bra(loop_start)
|
||||||
op.ldu.u32(num_cps, addr(g_num_cps))
|
|
||||||
op.setp.ge.u32(p_last_cp, cp_idx, num_cps)
|
|
||||||
op.bra('all_cps_done', ifp=p_last_cp)
|
|
||||||
|
|
||||||
with block('Load CP address'):
|
e.comment("Choose the xform for each warp")
|
||||||
op.mov.u32(cpA, g_cp_array)
|
choose_xform = e.label()
|
||||||
op.mad.lo.u32(cpA, cp_idx, cp.stream_size, cpA)
|
o.st.volatile(m.xf_sel[s.tid_x], self.mwc.next_f32_01())
|
||||||
|
|
||||||
|
e.declare_label(loop_start)
|
||||||
|
e.comment("Execute the xform given by xf_sel")
|
||||||
|
xf_labels = [e.forward_label() for xf in self.features.xforms]
|
||||||
|
xf_sel = o.ld.volatile(m.xf_sel[s.tid_x >> 5])
|
||||||
|
for i, xf in enumerate(self.features.xforms):
|
||||||
|
xf_density = self.cp.get.f32('cp.xforms[%d].cweight'%xf.id)
|
||||||
|
with xf_density <= xf_sel:
|
||||||
|
o.bra.uni(xf_labels[i])
|
||||||
|
|
||||||
|
e.comment("This code should be unreachable")
|
||||||
|
o.trap()
|
||||||
|
|
||||||
label('iter_loop_choose_xform')
|
xforms_done = e.forward_label()
|
||||||
with block("Choose the xform for each warp"):
|
for i, xf in enumerate(self.features.xforms):
|
||||||
timeout.check_time(5)
|
e.declare_label(xf_labels[i])
|
||||||
comment("On subsequent runs, only warp 0 will hit this code")
|
r.x, r.y, r.color = self.vars.apply_xform(
|
||||||
reg.u32('x_addr x_offset')
|
e, self.cp, r.x, r.y, r.color, xf.id)
|
||||||
reg.f32('xf_sel')
|
o.bra.uni(xforms_done)
|
||||||
op.mov.u32(x_addr, s_xf_sel)
|
|
||||||
op.mov.u32(x_offset, '%tid.x')
|
|
||||||
op.and_.b32(x_offset, x_offset, ctx.warps_per_cta-1)
|
|
||||||
op.mad.lo.u32(x_addr, x_offset, 4, x_addr)
|
|
||||||
mwc.next_f32_01(xf_sel)
|
|
||||||
op.st.volatile.shared.f32(addr(x_addr), xf_sel)
|
|
||||||
|
|
||||||
label('iter_loop_start')
|
e.comment("Determine write location, and whether point is valid")
|
||||||
|
e.declare_label(xforms_done)
|
||||||
|
histidx, is_valid = self.camera.get_index(r.x, r.y)
|
||||||
|
is_valid &= (r.consec_bad >= 0)
|
||||||
|
|
||||||
#timeout.check_time(10)
|
e.comment("Scatter point to pointbuffer")
|
||||||
|
self.hist.scatter(histidx, r.color, 0, is_valid)
|
||||||
|
|
||||||
with block():
|
done_picking_new_point = e.forward_label()
|
||||||
reg.u32('num_rounds')
|
with ~is_valid:
|
||||||
reg.pred('overload')
|
r.consec_bad += 1
|
||||||
op.ld.local.u32(num_rounds, addr(l_num_rounds))
|
with r.consec_bad < self.features.max_bad:
|
||||||
op.add.u32(num_rounds, num_rounds, 1)
|
o.bra(done_picking_new_point)
|
||||||
op.st.local.u32(addr(l_num_rounds), num_rounds)
|
|
||||||
|
|
||||||
with block("Select an xform"):
|
e.comment("If too many consecutive bad values, pick a new point")
|
||||||
reg.f32('xf_sel')
|
r.x, r.y = self.mwc.next_f32_11(), self.mwc.next_f32_11()
|
||||||
reg.u32('warp_offset xf_sel_addr')
|
r.color = self.mwc.next_f32_01()
|
||||||
op.mov.u32(warp_offset, '%tid.x')
|
r.consec_bad = -self.features.fuse
|
||||||
op.mov.u32(xf_sel_addr, s_xf_sel)
|
|
||||||
op.shr.u32(warp_offset, warp_offset, 5)
|
|
||||||
op.mad.lo.u32(xf_sel_addr, warp_offset, 4, xf_sel_addr)
|
|
||||||
op.ld.volatile.shared.f32(xf_sel, addr(xf_sel_addr))
|
|
||||||
|
|
||||||
reg.f32('xf_density')
|
e.declare_label(done_picking_new_point)
|
||||||
reg.pred('xf_jump')
|
|
||||||
for xf in features.xforms:
|
|
||||||
cp.get(cpA, xf_density, 'cp.xforms[%d].cweight' % xf.id)
|
|
||||||
op.setp.le.f32(xf_jump, xf_sel, xf_density)
|
|
||||||
op.bra('XFORM_%d' % xf.id, ifp=xf_jump)
|
|
||||||
std.asrt("Reached end of xforms without choosing one")
|
|
||||||
|
|
||||||
for xf in features.xforms:
|
e.comment("Determine number of good samples, and whether we're done")
|
||||||
label('XFORM_%d' % xf.id)
|
num_samples = o.ld(m.num_samples)
|
||||||
variations.apply_xform(x, y, color, x, y, color, xf.id)
|
num_samples += o.bar.red.popc(0, is_valid)
|
||||||
op.bra("xform_done")
|
with s.tid_x == 0:
|
||||||
|
o.st(m.num_samples, num_samples)
|
||||||
label("xform_done")
|
with num_samples >= self.cp.get('nsamples'):
|
||||||
|
o.bra.uni(cp_loop_start)
|
||||||
reg.pred('p_valid_pt')
|
|
||||||
with block("Write the result"):
|
|
||||||
reg.u32('hist_index')
|
|
||||||
camera.get_index(hist_index, x, y, p_valid_pt)
|
|
||||||
comment('if consec_bad < 0, point is fusing; treat as invalid')
|
|
||||||
op.setp.and_.ge.f32(p_valid_pt, consec_bad, 0., p_valid_pt)
|
|
||||||
# TODO: save and pass correct xform value here
|
|
||||||
hist.scatter(hist_index, color, 0, p_valid_pt, 'ldst')
|
|
||||||
with block():
|
|
||||||
reg.u32('num_writes')
|
|
||||||
op.ld.local.u32(num_writes, addr(l_num_writes))
|
|
||||||
op.add.u32(num_writes, num_writes, 1, ifp=p_valid_pt)
|
|
||||||
op.st.local.u32(addr(l_num_writes), num_writes)
|
|
||||||
|
|
||||||
with block("If the result was invalid, handle badvals"):
|
|
||||||
reg.pred('need_new_point')
|
|
||||||
op.add.f32(consec_bad, consec_bad, 1., ifnotp=p_valid_pt)
|
|
||||||
op.setp.ge.f32(need_new_point, consec_bad, float(features.max_bad))
|
|
||||||
op.bra('badval_done', ifnotp=need_new_point)
|
|
||||||
|
|
||||||
comment('If consec_bad > 5, pick a new random point')
|
|
||||||
mwc.next_f32_11(x)
|
|
||||||
mwc.next_f32_11(y)
|
|
||||||
mwc.next_f32_01(color)
|
|
||||||
op.mov.f32(consec_bad, float(-features.fuse))
|
|
||||||
label('badval_done')
|
|
||||||
|
|
||||||
with block("Increment number of samples by number of good values"):
|
|
||||||
reg.b32('good_samples laneid')
|
|
||||||
reg.pred('p_is_first')
|
|
||||||
op.vote.ballot.b32(good_samples, p_valid_pt)
|
|
||||||
op.popc.b32(good_samples, good_samples)
|
|
||||||
op.mov.u32(laneid, '%laneid')
|
|
||||||
op.setp.eq.u32(p_is_first, laneid, 0)
|
|
||||||
op.red.shared.add.s32(addr(s_num_samples), good_samples,
|
|
||||||
ifp=p_is_first)
|
|
||||||
|
|
||||||
with block("Check to see if we're done with this CP"):
|
|
||||||
reg.pred('p_cp_done')
|
|
||||||
reg.s32('num_samples num_samples_needed')
|
|
||||||
comment('Sync before making decision to prevent divergence')
|
|
||||||
op.bar.sync(3)
|
|
||||||
op.ld.volatile.shared.s32(num_samples, addr(s_num_samples))
|
|
||||||
cp.get(cpA, num_samples_needed, 'cp.nsamples')
|
|
||||||
op.setp.ge.s32(p_cp_done, num_samples, num_samples_needed)
|
|
||||||
op.bra.uni(cp_loop_start, ifp=p_cp_done)
|
|
||||||
|
|
||||||
comment('Shuffle points between threads')
|
comment('Shuffle points between threads')
|
||||||
shuf.shuffle(x, y, color, consec_bad)
|
shuf.shuffle(x, y, color, consec_bad)
|
||||||
|
|
||||||
with block("If in first warp, pick new offset"):
|
with s.tid_x < e.nwarps_cta:
|
||||||
reg.u32('tid')
|
o.bra(choose_xform)
|
||||||
reg.pred('first_warp')
|
o.bra(loop_start)
|
||||||
op.mov.u32(tid, '%tid.x')
|
|
||||||
assert ctx.warps_per_cta <= 32, \
|
|
||||||
"Special-case for CTAs with >1024 threads not implemented"
|
|
||||||
op.setp.lo.u32(first_warp, tid, 32)
|
|
||||||
op.bra(iter_loop_choose_xform, ifp=first_warp)
|
|
||||||
op.bra(iter_loop_start)
|
|
||||||
|
|
||||||
label('all_cps_done')
|
e.declare_label(all_cps_done)
|
||||||
# TODO this is for testing, move it to a debug statement
|
|
||||||
with block():
|
|
||||||
reg.u32('num_rounds num_writes')
|
|
||||||
op.ld.local.u32(num_rounds, addr(l_num_rounds))
|
|
||||||
op.ld.local.u32(num_writes, addr(l_num_writes))
|
|
||||||
std.store_per_thread(g_num_rounds, num_rounds,
|
|
||||||
g_num_writes, num_writes)
|
|
||||||
|
|
||||||
def upload_cp_stream(self, ctx, cp_stream, num_cps):
|
def upload_cp_stream(self, ctx, cp_stream, num_cps):
|
||||||
cp_array_dp, cp_array_l = ctx.mod.get_global('g_cp_array')
|
cp_array_dp, cp_array_l = ctx.mod.get_global('g_cp_array')
|
||||||
@ -525,40 +457,41 @@ class MWCRNG(object):
|
|||||||
raise EnvironmentError('primes.bin not found')
|
raise EnvironmentError('primes.bin not found')
|
||||||
self.nthreads_ready = 0
|
self.nthreads_ready = 0
|
||||||
self.mults, self.state = None, None
|
self.mults, self.state = None, None
|
||||||
|
self.entry = entry
|
||||||
|
|
||||||
entry.add_ptr_param('mwc_mults', 'u32')
|
entry.add_ptr_param('u32', 'mwc_mults')
|
||||||
entry.add_ptr_param('mwc_states', 'u32')
|
entry.add_ptr_param('u32', 'mwc_states')
|
||||||
|
|
||||||
with entry.head():
|
with entry.head():
|
||||||
self.entry_head(entry)
|
self.entry_head()
|
||||||
entry.tail_callback(self.entry_tail, entry)
|
entry.tail_callback(self.entry_tail)
|
||||||
|
|
||||||
def entry_head(self, entry):
|
def entry_head(self):
|
||||||
e, r, o, m, p, s = entry.locals
|
e, r, o, m, p, s = self.entry.locals
|
||||||
gtid = s.ctaid_x * s.ntid_x + s.tid_x
|
gtid = s.ctaid_x * s.ntid_x + s.tid_x
|
||||||
r.mwc_mult, r.mwc_state, r.mwc_carry = r.u32(), r.u32(), r.u32()
|
r.mwc_mult, r.mwc_state, r.mwc_carry = r.u32(), r.u32(), r.u32()
|
||||||
r.mwc_mult = o.ld(p.mwc_mults[gtid])
|
r.mwc_mult = o.ld(p.mwc_mults[gtid])
|
||||||
r.mwc_state, r.mwc_carry = o.ld.v2(p.mwc_states[2*gtid])
|
r.mwc_state, r.mwc_carry = o.ld.v2(p.mwc_states[2*gtid])
|
||||||
|
|
||||||
def entry_tail(self, entry):
|
def entry_tail(self):
|
||||||
e, r, o, m, p, s = entry.locals
|
e, r, o, m, p, s = self.entry.locals
|
||||||
gtid = s.ctaid_x * s.ntid_x + s.tid_x
|
gtid = s.ctaid_x * s.ntid_x + s.tid_x
|
||||||
o.st.v2.u32(p.mwc_states[2*gtid], r.mwc_state, r.mwc_carry)
|
o.st.v2.u32(p.mwc_states[2*gtid], r.mwc_state, r.mwc_carry)
|
||||||
|
|
||||||
def next_b32(self, entry):
|
def next_b32(self):
|
||||||
e, r, o, m, p, s = entry.locals
|
e, r, o, m, p, s = self.entry.locals
|
||||||
carry = o.cvt.u64(r.mwc_carry)
|
carry = o.cvt.u64(r.mwc_carry)
|
||||||
mwc_out = o.mad.wide(r.mwc_mult, r.mwc_state, carry)
|
mwc_out = o.mad.wide(r.mwc_mult, r.mwc_state, carry)
|
||||||
r.mwc_state, r.mwc_carry = o.split.v2(mwc_out)
|
r.mwc_state, r.mwc_carry = o.split.v2(mwc_out)
|
||||||
return r.mwc_state
|
return r.mwc_state
|
||||||
|
|
||||||
def next_f32_01(self, entry):
|
def next_f32_01(self):
|
||||||
e, r, o, m, p, s = entry.locals
|
e, r, o, m, p, s = self.entry.locals
|
||||||
mwc_float = o.cvt.rn.f32.u32(self.next_b32())
|
mwc_float = o.cvt.rn.f32.u32(self.next_b32())
|
||||||
return o.mul.f32(mwc_float, 1./(1<<32))
|
return o.mul.f32(mwc_float, 1./(1<<32))
|
||||||
|
|
||||||
def next_f32_11(self, entry):
|
def next_f32_11(self):
|
||||||
e, r, o, m, p, s = entry.locals
|
e, r, o, m, p, s = self.entry.locals
|
||||||
mwc_float = o.cvt.rn.f32.s32(self.next_b32())
|
mwc_float = o.cvt.rn.f32.s32(self.next_b32())
|
||||||
return o.mul.f32(mwc_float, 1./(1<<31))
|
return o.mul.f32(mwc_float, 1./(1<<31))
|
||||||
|
|
||||||
@ -610,7 +543,7 @@ class MWCRNGTest(object):
|
|||||||
|
|
||||||
def __init__(self, entry):
|
def __init__(self, entry):
|
||||||
self.mwc = MWCRNG(entry)
|
self.mwc = MWCRNG(entry)
|
||||||
entry.add_ptr_param('mwc_test_sums', 'u64')
|
entry.add_ptr_param('u64', 'mwc_test_sums')
|
||||||
|
|
||||||
with entry.body():
|
with entry.body():
|
||||||
self.entry_body(entry)
|
self.entry_body(entry)
|
||||||
@ -649,7 +582,6 @@ class MWCRNGTest(object):
|
|||||||
dsums = cuda.mem_alloc(8*ctx.nthreads)
|
dsums = cuda.mem_alloc(8*ctx.nthreads)
|
||||||
ctx.set_param('mwc_test_sums', dsums)
|
ctx.set_param('mwc_test_sums', dsums)
|
||||||
print "Took %g seconds." % ctx.call_timed()
|
print "Took %g seconds." % ctx.call_timed()
|
||||||
print ctx.nthreads
|
|
||||||
dsums = cuda.from_device(dsums, ctx.nthreads, np.uint64)
|
dsums = cuda.from_device(dsums, ctx.nthreads, np.uint64)
|
||||||
if not np.all(np.equal(sums, dsums)):
|
if not np.all(np.equal(sums, dsums)):
|
||||||
print "Sum discrepancy!"
|
print "Sum discrepancy!"
|
||||||
|
@ -144,20 +144,16 @@ class Animation(object):
|
|||||||
self.filters = Filters(self._frame, genomes[0])
|
self.filters = Filters(self._frame, genomes[0])
|
||||||
self.features = Features(genomes, self.filters)
|
self.features = Features(genomes, self.filters)
|
||||||
|
|
||||||
self.ctx = None
|
|
||||||
|
|
||||||
def compile(self):
|
def compile(self):
|
||||||
"""
|
"""
|
||||||
Create a PTX kernel optimized for this animation, compile it, and
|
Create a PTX kernel optimized for this animation, compile it, and
|
||||||
attach it to a LaunchContext with a thread distribution optimized for
|
attach it to a LaunchContext with a thread distribution optimized for
|
||||||
the active device.
|
the active device.
|
||||||
"""
|
"""
|
||||||
# TODO: user-configurable test control
|
|
||||||
self.ctx = LaunchContext([IterThread], block=(512,1,1), grid=(28,1),
|
|
||||||
tests=True)
|
|
||||||
# TODO: user-configurable verbosity control
|
|
||||||
self.ctx.compile(verbose=3, anim=self, features=self.features)
|
|
||||||
# TODO: automatic optimization of block parameters
|
# TODO: automatic optimization of block parameters
|
||||||
|
entry = ptx.Entry("iterate", 512)
|
||||||
|
iter = IterThread(entry, self.features)
|
||||||
|
self.mod = run.Module([entry])
|
||||||
|
|
||||||
def render_frame(self, time=0):
|
def render_frame(self, time=0):
|
||||||
# TODO: support more nuanced frame control than just 'time'
|
# TODO: support more nuanced frame control than just 'time'
|
||||||
|
@ -8,8 +8,8 @@ class Variations(object):
|
|||||||
|
|
||||||
shortname = "variations"
|
shortname = "variations"
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, features):
|
||||||
self.xform_idx = None
|
self.features = features
|
||||||
|
|
||||||
names = [ "linear", "sinusoidal", "spherical", "swirl", "horseshoe",
|
names = [ "linear", "sinusoidal", "spherical", "swirl", "horseshoe",
|
||||||
"polar", "handkerchief", "heart", "disc", "spiral", "hyperbolic",
|
"polar", "handkerchief", "heart", "disc", "spiral", "hyperbolic",
|
||||||
@ -27,100 +27,65 @@ class Variations(object):
|
|||||||
"waves2", "exp", "log", "sin", "cos", "tan", "sec", "csc", "cot",
|
"waves2", "exp", "log", "sin", "cos", "tan", "sec", "csc", "cot",
|
||||||
"sinh", "cosh", "tanh", "sech", "csch", "coth", "auger", "flux", ]
|
"sinh", "cosh", "tanh", "sech", "csch", "coth", "auger", "flux", ]
|
||||||
|
|
||||||
def xfg(self, dst, expr):
|
def apply_xform(self, entry, cp, x, y, color, xform_idx):
|
||||||
"""
|
|
||||||
Convenience wrapper around cp.get which loads the given property from
|
|
||||||
the current CP and XF.
|
|
||||||
"""
|
|
||||||
# xform_idx is set by apply_xform on the current instance, but the
|
|
||||||
# expression will be evaluated using each CP in stream packing.
|
|
||||||
cp.get(cpA, dst, 'cp.xforms[%d].%s' % (self.xform_idx, expr))
|
|
||||||
|
|
||||||
def xfg_v2(self, dst1, expr1, dst2, expr2):
|
|
||||||
cp.get_v2(cpA, dst1, 'cp.xforms[%d].%s' % (self.xform_idx, expr1),
|
|
||||||
dst2, 'cp.xforms[%d].%s' % (self.xform_idx, expr2))
|
|
||||||
|
|
||||||
def xfg_v4(self, d1, e1, d2, e2, d3, e3, d4, e4):
|
|
||||||
cp.get_v4(cpA, d1, 'cp.xforms[%d].%s' % (self.xform_idx, e1),
|
|
||||||
d2, 'cp.xforms[%d].%s' % (self.xform_idx, e2),
|
|
||||||
d3, 'cp.xforms[%d].%s' % (self.xform_idx, e3),
|
|
||||||
d4, 'cp.xforms[%d].%s' % (self.xform_idx, e4))
|
|
||||||
|
|
||||||
def apply_xform(self, xo, yo, co, xi, yi, ci, xform_idx):
|
|
||||||
"""
|
"""
|
||||||
Apply a transform.
|
Apply a transform.
|
||||||
|
|
||||||
This function necessarily makes a copy of the input variables, so it's
|
This function necessarily makes a copy of the input variables, so it's
|
||||||
safe to use the same registers for input and output.
|
safe to use the same registers for input and output.
|
||||||
"""
|
"""
|
||||||
with block("Apply xform %d" % xform_idx):
|
e, r, o, m, p, s = entry.locals
|
||||||
self.xform_idx = xform_idx
|
|
||||||
|
|
||||||
with block('Modify color'):
|
# For use in retrieving properties from the control point datastream
|
||||||
reg.f32('c_speed c_new')
|
xfs = lambda stval: 'cp.xforms[%d].%s' % (xform_idx, stval)
|
||||||
cp.get_v2(cpA,
|
|
||||||
c_speed, '(1.0 - cp.xforms[%d].color_speed)' % xform_idx,
|
|
||||||
c_new, 'cp.xforms[%d].color * cp.xforms[%d].color_speed' %
|
|
||||||
(xform_idx, xform_idx))
|
|
||||||
op.fma.rn.ftz.f32(co, ci, c_speed, c_new)
|
|
||||||
|
|
||||||
reg.f32('xt yt')
|
e.comment('Color transformation')
|
||||||
with block("Do affine transformation"):
|
c_speed, c_val = cp.get.v2.f32('1.0 - %s' % xfs('color_speed'),
|
||||||
# TODO: verify that this is the best performance (register
|
'%s * %s' % (xfs('color'), xfs('color_speed')))
|
||||||
# usage vs number of loads)
|
color = color * c_speed + c_val
|
||||||
reg.f32('c00 c10 c20 c01 c11 c21')
|
|
||||||
self.xfg_v4(c00, 'coefs[0][0]', c01, 'coefs[0][1]',
|
|
||||||
c20, 'coefs[2][0]', c21, 'coefs[2][1]')
|
|
||||||
op.fma.rn.ftz.f32(xt, c00, xi, c20)
|
|
||||||
op.fma.rn.ftz.f32(yt, c01, xi, c21)
|
|
||||||
self.xfg_v2(c10, 'coefs[1][0]', c11, 'coefs[1][1]')
|
|
||||||
op.fma.rn.ftz.f32(xt, c10, yi, xt)
|
|
||||||
op.fma.rn.ftz.f32(yt, c11, yi, yt)
|
|
||||||
|
|
||||||
op.mov.f32(xo, '0.0')
|
e.comment('Affine transformation')
|
||||||
op.mov.f32(yo, '0.0')
|
c00, c20 = cp.get.v2.f32(xfs('coefs[0][0]'), xfs('coefs[2][0]'))
|
||||||
|
xt = x * c00 + c20
|
||||||
|
c01, c21 = cp.get.v2.f32(xfs('coefs[0][1]'), xfs('coefs[2][1]'))
|
||||||
|
yt = x * c01 + c21
|
||||||
|
c10, c11 = cp.get.v2.f32(xfs('coefs[1][0]'), xfs('coefs[1][1]'))
|
||||||
|
xt += y * c10
|
||||||
|
yt += y * c11
|
||||||
|
|
||||||
for var_name in sorted(features.xforms[xform_idx].vars):
|
xo, yo = o.mov.f32(0), o.mov.f32(0)
|
||||||
|
for var_name in sorted(self.features.xforms[xform_idx].vars):
|
||||||
func = getattr(self, var_name, None)
|
func = getattr(self, var_name, None)
|
||||||
if not func:
|
if not func:
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
"Haven't implemented %s yet" % var_name)
|
"Haven't implemented %s yet" % var_name)
|
||||||
with block('%s variation' % var_name):
|
e.comment('%s variation' % var_name)
|
||||||
reg.f32('wgt')
|
xtemp, ytemp = func(o, xt, yt, cp.get.f32(xfs(var_name)))
|
||||||
self.xfg(wgt, var_name)
|
xo += xtemp
|
||||||
func(xo, yo, xt, yt, wgt)
|
yo += ytemp
|
||||||
|
|
||||||
if features.xforms[xform_idx].has_post:
|
if self.features.xforms[xform_idx].has_post:
|
||||||
with block("Affine post-transformation"):
|
e.comment('Affine post-transformation')
|
||||||
op.mov.f32(xt, xo)
|
c00, c20 = cp.get.v2.f32(xfs('post[0][0]'), xfs('post[2][0]'))
|
||||||
op.mov.f32(yt, yo)
|
xt = xo * c00 + c20
|
||||||
reg.f32('c00 c10 c20 c01 c11 c21')
|
c01, c21 = cp.get.v2.f32(xfs('post[0][1]'), xfs('post[2][1]'))
|
||||||
self.xfg_v4(c00, 'post[0][0]', c01, 'post[0][1]',
|
yt = xo * c01 + c21
|
||||||
c20, 'post[2][0]', c21, 'post[2][1]')
|
c10, c11 = cp.get.v2.f32(xfs('post[1][0]'), xfs('post[1][1]'))
|
||||||
op.fma.rn.ftz.f32(xo, c00, xt, c20)
|
xt += yo * c10
|
||||||
op.fma.rn.ftz.f32(yo, c01, xt, c21)
|
yt += yo * c11
|
||||||
self.xfg_v2(c10, 'post[1][0]', c11, 'post[1][1]')
|
xo, yo = xt, yt
|
||||||
op.fma.rn.ftz.f32(xo, c10, yt, xo)
|
|
||||||
op.fma.rn.ftz.f32(yo, c11, yt, yo)
|
|
||||||
|
|
||||||
def linear(self, xo, yo, xi, yi, wgt):
|
self.xform_idx = None
|
||||||
op.fma.rn.ftz.f32(xo, xi, wgt, xo)
|
return xo, yo, color
|
||||||
op.fma.rn.ftz.f32(yo, yi, wgt, yo)
|
|
||||||
|
|
||||||
def sinusoidal(self, xo, yo, xi, yi, wgt):
|
def linear(self, o, x, y, wgt):
|
||||||
reg.f32('sinval')
|
return x * wgt, y * wgt
|
||||||
op.sin.approx.ftz.f32(sinval, xi)
|
|
||||||
op.fma.rn.ftz.f32(xo, sinval, wgt, xo)
|
|
||||||
op.sin.approx.ftz.f32(sinval, yi)
|
|
||||||
op.fma.rn.ftz.f32(yo, sinval, wgt, yo)
|
|
||||||
|
|
||||||
def spherical(self, xo, yo, xi, yi, wgt):
|
def sinusoidal(self, o, x, y, wgt):
|
||||||
reg.f32('r2')
|
return o.sin(x) * wgt, o.sin(y) * wgt
|
||||||
op.fma.rn.ftz.f32(r2, xi, xi, '1e-30')
|
|
||||||
op.fma.rn.ftz.f32(r2, yi, yi, r2)
|
|
||||||
op.rcp.approx.f32(r2, r2)
|
|
||||||
op.mul.rn.ftz.f32(r2, r2, wgt)
|
|
||||||
op.fma.rn.ftz.f32(xo, xi, r2, xo)
|
|
||||||
op.fma.rn.ftz.f32(yo, yi, r2, yo)
|
|
||||||
|
|
||||||
|
def spherical(self, o, x, y, wgt):
|
||||||
|
rsquared = x * x + y * y
|
||||||
|
rrcp = o.rcp(rsquared) * wgt
|
||||||
|
return x * wgt, y * wgt
|
||||||
|
|
||||||
|
6
main.py
6
main.py
@ -42,7 +42,7 @@ def disass(mod):
|
|||||||
subprocess.check_call('/home/steven/code/decuda/elfToCubin.py --nouveau '
|
subprocess.check_call('/home/steven/code/decuda/elfToCubin.py --nouveau '
|
||||||
'/tmp/elf.o'.split())
|
'/tmp/elf.o'.split())
|
||||||
|
|
||||||
def main(args):
|
def mwctest():
|
||||||
mwcent = ptx.Entry("mwc_test", 512)
|
mwcent = ptx.Entry("mwc_test", 512)
|
||||||
mwctest = MWCRNGTest(mwcent)
|
mwctest = MWCRNGTest(mwcent)
|
||||||
|
|
||||||
@ -57,9 +57,7 @@ def main(args):
|
|||||||
ctx = mod.get_context('mwc_test', 14)
|
ctx = mod.get_context('mwc_test', 14)
|
||||||
mwctest.run_test(ctx)
|
mwctest.run_test(ctx)
|
||||||
|
|
||||||
return
|
def main(args):
|
||||||
|
|
||||||
|
|
||||||
with open(args[-1]) as fp:
|
with open(args[-1]) as fp:
|
||||||
genomes = Genome.from_string(fp.read())
|
genomes = Genome.from_string(fp.read())
|
||||||
anim = Animation(genomes)
|
anim = Animation(genomes)
|
||||||
|
Loading…
Reference in New Issue
Block a user