mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 03:30:05 -05:00
Finally runs again
This commit is contained in:
parent
27e7fd82a3
commit
f3298e0bed
@ -15,7 +15,8 @@ from cuburnlib.ptx import PTXModule
|
||||
class LaunchContext(object):
|
||||
"""
|
||||
Context collecting the information needed to create, run, and gather the
|
||||
results of a device computation.
|
||||
results of a device computation. This may eventually also include an actual
|
||||
CUDA context, but for now it just uses the global one.
|
||||
|
||||
To create the fastest device code across multiple device families, this
|
||||
context may decide to iteratively refine the final PTX by regenerating
|
||||
@ -32,34 +33,27 @@ class LaunchContext(object):
|
||||
`mod`: Final compiled module. Unavailable during assembly.
|
||||
|
||||
"""
|
||||
def __init__(self, entries, block=(1,1,1), grid=(1,1), seed=None,
|
||||
tests=False):
|
||||
def __init__(self, entries, block=(1,1,1), grid=(1,1), tests=False):
|
||||
self.entry_types = entries
|
||||
self.block, self.grid, self.build_tests = block, grid, tests
|
||||
self.rand = np.random.mtrand.RandomState(seed)
|
||||
self.setup_done = False
|
||||
|
||||
@property
|
||||
def threads(self):
|
||||
return reduce(lambda a, b: a*b, self.block + self.grid)
|
||||
|
||||
def print_source(self):
|
||||
print '\n'.join(["%03d %s" % (i+1, l) for (i, l) in
|
||||
enumerate(self.ptx.source.split('\n'))])
|
||||
|
||||
def compile(self, to_inject={}, verbose=False):
|
||||
inj = dict(to_inject)
|
||||
inj['ctx'] = self
|
||||
self.ptx = PTXModule(self.entry_types, inj, self.build_tests)
|
||||
def compile(self, verbose=False, **kwargs):
|
||||
kwargs['ctx'] = self
|
||||
self.ptx = PTXModule(self.entry_types, kwargs, self.build_tests)
|
||||
try:
|
||||
self.mod = cuda.module_from_buffer(self.ptx.source)
|
||||
except (cuda.CompileError, cuda.RuntimeError), e:
|
||||
print "Aww, dang, compile error. Here's the source:"
|
||||
self.print_source()
|
||||
self.ptx.print_source()
|
||||
raise e
|
||||
if verbose:
|
||||
if verbose >= 3:
|
||||
self.print_source()
|
||||
self.ptx.print_source()
|
||||
for entry in self.ptx.entries:
|
||||
func = self.mod.get_function(entry.entry_name)
|
||||
print "Compiled %s: used %d regs, %d sm, %d local" % (
|
||||
|
@ -4,126 +4,34 @@ Contains the PTX fragments which will drive the device.
|
||||
|
||||
import os
|
||||
import time
|
||||
import struct
|
||||
|
||||
import pycuda.driver as cuda
|
||||
import numpy as np
|
||||
|
||||
from cuburnlib.ptx import *
|
||||
|
||||
"""
|
||||
Here's the current draft of the full algorithm implementation.
|
||||
|
||||
declare xform jump table
|
||||
|
||||
load random state
|
||||
|
||||
clear x_coord, y_coord, z_coord, w_coord;
|
||||
store -(FUSE+1) to shared (per-warp) num_samples_sh
|
||||
clear badvals [1]
|
||||
|
||||
load param (global_cp_idx_addr)
|
||||
index table start (global_cp_idx) [2]
|
||||
load count of indexes from global cp index =>
|
||||
store to qlocal current_cp_num [3]
|
||||
|
||||
outermost loop start:
|
||||
load current_cp_num
|
||||
if current_cp_num <= 0:
|
||||
exit
|
||||
|
||||
load param global_cp_idx_addr
|
||||
calculate offset into address with current_cp_num, global_cp_idx_addr
|
||||
load cp_base_address
|
||||
stream_start (cp_base, cp_base_addr) [4]
|
||||
|
||||
FUSE_START:
|
||||
num_samples += 1
|
||||
if num_samples >= 0:
|
||||
# Okay, we're done FUSEing, prepare to enter normal loop
|
||||
load num_samples => store to shared (per-warp) num_samples
|
||||
|
||||
|
||||
ITER_LOOP_START:
|
||||
reg xform_addr, xform_stream_addr, xform_select
|
||||
|
||||
mwc_next_u32 to xform_select
|
||||
# Performance test: roll/unroll this loop?
|
||||
stream_load xform_prob (cp_stream)
|
||||
if xform_select <= xform_prob:
|
||||
bra.uni XFORM_1_LBL
|
||||
...
|
||||
stream_load xform_prob (cp_stream)
|
||||
if xform_select <= xform_prob:
|
||||
bra.uni XFORM_N_LBL
|
||||
|
||||
XFORM_1_LBL:
|
||||
stream_load xform_1_ (cp_stream)
|
||||
...
|
||||
bra.uni XFORM_POST
|
||||
|
||||
XFORM_POST:
|
||||
[if final_xform:]
|
||||
[do final_xform]
|
||||
|
||||
if num_samples < 0:
|
||||
# FUSE still in progress
|
||||
bra.uni FUSE_START
|
||||
|
||||
FRAGMENT_WRITEBACK:
|
||||
# Unknown at this time.
|
||||
|
||||
SHUFFLE:
|
||||
# Unknown at this time.
|
||||
|
||||
load num_samples from num_samples_sh
|
||||
num_samples -= 1
|
||||
if num_samples > 0:
|
||||
bra.uni ITER_LOOP_START
|
||||
|
||||
|
||||
[1] Tracking 'badvals' can put a pretty large hit on performance, particularly
|
||||
for images that sample a small amount of the grid. So this might be cut
|
||||
when rendering for performance. On the other hand, it might actually help
|
||||
tune the algorithm later, so it'll definitely be an option.
|
||||
|
||||
[2] Control points for each temporal sample will be preloaded to the
|
||||
device in the compact DataStream format (more on this later). Their
|
||||
locations are represented in an index table, which starts with a single
|
||||
`.u32 length`, followed by `length` pointers. To avoid having to keep
|
||||
reloading `length`, or worse, using a register to hold it in memory, we
|
||||
instead count *down* to zero. This is a very common idiom.
|
||||
|
||||
[3] 'qlocal' is quasi-local storage. it could easily be actual local storage,
|
||||
depending on how local storage is implemented, but the extra 128-byte loads
|
||||
for such values might make a performance difference. qlocal variables may
|
||||
be identical across a warp or even a CTA, and so variables noted as
|
||||
"qlocal" here might end up in shared memory or even a small per-warp or
|
||||
per-CTA buffer in global memory created specifically for this purpose,
|
||||
after benchmarking is done.
|
||||
|
||||
[4] DataStreams are "opaque" data serialization structures defined below. The
|
||||
structure of a stream is actually created while parsing the DSL by the load
|
||||
statements themselves. Some benchmarks need to be done before DataStreams
|
||||
stop being "opaque" and become simply "dynamic".
|
||||
"""
|
||||
|
||||
class IterThread(PTXTest):
|
||||
entry_name = 'iter_thread'
|
||||
entry_params = []
|
||||
|
||||
def __init__(self):
|
||||
self.cps_uploaded = False
|
||||
|
||||
def deps(self):
|
||||
return [MWCRNG, CPDataStream]
|
||||
|
||||
@ptx_func
|
||||
def module_setup(self):
|
||||
mem.global_.u32('g_cp_array',
|
||||
[features.max_ntemporal_samples,'*',cp_stream_size])
|
||||
cp_stream_size*features.max_ntemporal_samples)
|
||||
mem.global_.u32('g_num_cps')
|
||||
# TODO move into debug statement
|
||||
mem.global_.u32('g_num_rounds', ctx.threads)
|
||||
mem.global_.u32('g_num_writes', ctx.threads)
|
||||
|
||||
@ptx_func
|
||||
def entry():
|
||||
def entry(self):
|
||||
reg.f32('x_coord y_coord color_coord alpha_coord')
|
||||
|
||||
# TODO: temporary, for testing
|
||||
@ -158,8 +66,8 @@ class IterThread(PTXTest):
|
||||
op.mov.s32(num_samples, -(features.num_fuse_samples+1))
|
||||
|
||||
# TODO: Move cp_num to qlocal storage (or spill it, rarely accessed)
|
||||
reg.u32('cp_num cpA')
|
||||
mov.u32(cp_num, 0)
|
||||
reg.u32('cp_idx cpA')
|
||||
op.mov.u32(cp_idx, 0)
|
||||
|
||||
label('cp_loop_start')
|
||||
op.bar.sync(0)
|
||||
@ -168,19 +76,19 @@ class IterThread(PTXTest):
|
||||
reg.u32('num_cps')
|
||||
reg.pred('p_last_cp')
|
||||
op.ldu.u32(num_cps, addr(g_num_cps))
|
||||
op.setp.lt.u32(p_last_cp, cp_num, num_cps)
|
||||
op.setp.ge.u32(p_last_cp, cp_idx, num_cps)
|
||||
op.bra.uni('all_cps_done', ifp=p_last_cp)
|
||||
|
||||
with block('Load CP address'):
|
||||
op.mov.u32(cpA, g_cp_array)
|
||||
op.mad.lo.u32(cpA, cp_num, cp_stream_size, cpA)
|
||||
op.mad.lo.u32(cpA, cp_idx, cp_stream_size, cpA)
|
||||
|
||||
with block('Increment CP number, load num_samples (unless in fuse)'):
|
||||
reg.pred('p_in_fuse')
|
||||
op.setp.lt.s32(p_in_fuse, num_samples, 0)
|
||||
op.add.u32(cp_num, cp_num, 1, ifp=p_in_fuse)
|
||||
cp_stream_get(cpA, num_samples, 'cp.samples_per_thread',
|
||||
ifp=p_in_fuse)
|
||||
with block('Increment CP index, load num_samples (unless in fuse)'):
|
||||
reg.pred('p_not_in_fuse')
|
||||
op.setp.ge.s32(p_not_in_fuse, num_samples, 0)
|
||||
op.add.u32(cp_idx, cp_idx, 1, ifp=p_not_in_fuse)
|
||||
cp_stream_get(cpA, num_samples, 'samples_per_thread',
|
||||
ifp=p_not_in_fuse)
|
||||
|
||||
label('fuse_loop_start')
|
||||
with block('FUSE-specific stuff'):
|
||||
@ -188,7 +96,7 @@ class IterThread(PTXTest):
|
||||
comment('If num_samples == -1, set it to 0 and jump back up')
|
||||
comment('This will start the normal CP loading machinery')
|
||||
op.setp.eq.s32(p_fuse, num_samples, -1)
|
||||
op.mov.s32(p_fuse, 0, ifp=p_fuse)
|
||||
op.mov.s32(num_samples, 0, ifp=p_fuse)
|
||||
op.bra.uni(cp_loop_start, ifp=p_fuse)
|
||||
|
||||
comment('If num_samples < -1, still fusing, so increment')
|
||||
@ -204,33 +112,55 @@ class IterThread(PTXTest):
|
||||
with block("Test if we're still in FUSE"):
|
||||
reg.pred('p_in_fuse')
|
||||
op.setp.lt.s32(p_in_fuse, num_samples, 0)
|
||||
op.bra.uni(fuse_start, ifp=p_in_fuse)
|
||||
op.bra.uni(fuse_loop_start, ifp=p_in_fuse)
|
||||
|
||||
with block("Ordinarily, we'd write the result here"):
|
||||
op.add.u32(num_writes, num_writes, 1)
|
||||
|
||||
with block("Check to see if we're done with this CP"):
|
||||
reg.pred('p_cp_done')
|
||||
op.add.s32(num_samples, num_samples, -1)
|
||||
op.setp.eq.s32(p_cp_done, num_samples, 0)
|
||||
op.bra.uni(cp_loop_start, ifp=p_cp_done)
|
||||
|
||||
op.bra.uni(iter_loop_start)
|
||||
|
||||
|
||||
label('all_cps_done')
|
||||
# TODO this is for testing, move it to a debug statement
|
||||
store_per_thread(g_num_rounds, num_rounds)
|
||||
store_per_thread(g_num_writes, num_writes)
|
||||
|
||||
def call(self, ctx):
|
||||
raise HorribleDeathError("Okay I'm going to bed now")
|
||||
def upload_cp_stream(self, ctx, cp_stream, num_cps):
|
||||
cp_array_dp, cp_array_l = ctx.mod.get_global('g_cp_array')
|
||||
assert len(cp_stream) <= cp_array_l, "Stream too big!"
|
||||
cuda.memcpy_htod_async(cp_array_dp, cp_stream)
|
||||
num_cps_dp, num_cps_l = ctx.mod.get_global('g_num_cps')
|
||||
cuda.memcpy_htod_async(num_cps_dp, struct.pack('i', num_cps))
|
||||
self.cps_uploaded = True
|
||||
|
||||
def call(self, ctx):
|
||||
if not self.cps_uploaded:
|
||||
raise Error("Cannot call IterThread before uploading CPs")
|
||||
func = ctx.mod.get_function('iter_thread')
|
||||
dtime = func(block=ctx.block, grid=ctx.grid, time_kernel=True)
|
||||
|
||||
num_rounds_dp, num_rounds_l = ctx.mod.get_global('g_num_rounds')
|
||||
num_writes_dp, num_writes_l = ctx.mod.get_global('g_num_writes')
|
||||
rounds = cuda.from_device(num_rounds_dp, ctx.threads, np.uint32)
|
||||
writes = cuda.from_device(num_writes_dp, ctx.threads, np.uint32)
|
||||
print "Rounds:", rounds
|
||||
print "Writes:", writes
|
||||
|
||||
class MWCRNG(PTXFragment):
|
||||
def __init__(self):
|
||||
self.rand = np.random
|
||||
self.threads_ready = 0
|
||||
if not os.path.isfile('primes.bin'):
|
||||
raise EnvironmentError('primes.bin not found')
|
||||
|
||||
def set_seed(self, seed):
|
||||
self.rand = np.random.mtrand.RandomState(seed)
|
||||
|
||||
@ptx_func
|
||||
def module_setup(self):
|
||||
mem.global_.u32('mwc_rng_mults', ctx.threads)
|
||||
@ -284,13 +214,13 @@ class MWCRNG(PTXFragment):
|
||||
# Randomness in choosing multipliers is good, but larger multipliers
|
||||
# have longer periods, which is also good. This is a compromise.
|
||||
mults = np.array(mults[:ctx.threads*4])
|
||||
ctx.rand.shuffle(mults)
|
||||
self.rand.shuffle(mults)
|
||||
# Copy multipliers and seeds to the device
|
||||
multdp, multl = ctx.mod.get_global('mwc_rng_mults')
|
||||
cuda.memcpy_htod_async(multdp, mults.tostring()[:multl])
|
||||
# Intentionally excludes both 0 and (2^32-1), as they can lead to
|
||||
# degenerate sequences of period 0
|
||||
states = np.array(ctx.rand.randint(1, 0xffffffff, size=2*ctx.threads),
|
||||
states = np.array(self.rand.randint(1, 0xffffffff, size=2*ctx.threads),
|
||||
dtype=np.uint32)
|
||||
statedp, statel = ctx.mod.get_global('mwc_rng_state')
|
||||
cuda.memcpy_htod_async(statedp, states.tostring())
|
||||
@ -376,7 +306,7 @@ class MWCRNGTest(PTXTest):
|
||||
class CameraCoordTransform(PTXFragment):
|
||||
pass
|
||||
|
||||
class CPDataStream(PTXFragment):
|
||||
class CPDataStream(DataStream):
|
||||
"""DataStream which stores the control points."""
|
||||
prefix = 'cp'
|
||||
|
||||
|
125
cuburnlib/ptx.py
125
cuburnlib/ptx.py
@ -11,7 +11,7 @@ easier to maintain using this system.
|
||||
# If you see 'import inspect', you know you're in for a good time
|
||||
import inspect
|
||||
import types
|
||||
import traceback
|
||||
import struct
|
||||
from cStringIO import StringIO
|
||||
from collections import namedtuple
|
||||
|
||||
@ -116,6 +116,8 @@ class _BlockInjector(object):
|
||||
self.dead = False
|
||||
map(self.inject, self.to_inject.items())
|
||||
def __exit__(self, exc_type, exc_val, tb):
|
||||
# Do some real exceptorin'
|
||||
if exc_type is not None: return
|
||||
for k in self.injected:
|
||||
del self.inject_into[k]
|
||||
self.dead = True
|
||||
@ -137,17 +139,27 @@ class _Block(object):
|
||||
inj = self.stack[-1].injectors
|
||||
[inj.remove(i) for i in inj if i.dead]
|
||||
def push_ctx(self):
|
||||
# Move most recent active injector to new context
|
||||
self.clean_injectors()
|
||||
last_inj = self.stack[-1].injectors.pop()
|
||||
self.stack.append(BlockCtx(dict(self.stack[-1].locals), [],
|
||||
[last_inj]))
|
||||
self.stack.append(BlockCtx(dict(self.stack[-1].locals), [], []))
|
||||
# The only reason we should have no injectors in the previous block is
|
||||
# if we are hitting a new ptx_func entry point or global declaration at
|
||||
# PTX module scope, which means the stack only contains the outer
|
||||
# context and the current one (i.e. len(stack) == 2)
|
||||
if len(self.stack[-2].injectors) == 0:
|
||||
assert len(self.stack) == 2, "Empty injector list too early!"
|
||||
# Otherwise, the active injector in the previous block is the one for
|
||||
# the Python function which is currently creating a new PTX block, and
|
||||
# and it needs to be promoted to the current block
|
||||
else:
|
||||
self.stack[-1].injectors.append(self.stack[-2].injectors.pop())
|
||||
def pop_ctx(self):
|
||||
self.clean_injectors()
|
||||
bs = self.stack.pop()
|
||||
# TODO: figure out why this next line is needed
|
||||
[bs.injectors.remove(i) for i in bs.injectors if i.dead]
|
||||
self.stack[-1].code.extend(bs.code)
|
||||
if len(self.stack) == 1:
|
||||
# We're on outer_ctx, so all injectors should be gone
|
||||
# We're on outer_ctx, so all injectors should be gone.
|
||||
assert len(bs.injectors) == 0, "Injector/context mismatch"
|
||||
return
|
||||
# The only injector should be the one added in push_ctx
|
||||
@ -186,7 +198,7 @@ class _Block(object):
|
||||
spacing. To keep things simple, nested lists and tuples will be reduced
|
||||
in this manner (but not other iterable types). Coercion will not happen
|
||||
until after the entire DSL call tree has been walked. This allows a
|
||||
class to submit a mutable type (e.g. the trivial `StrVar`) when first
|
||||
class to submit a mutable type (e.g. ``DelayVar``) when first
|
||||
walked with an undefined value, then substitute the correct value on
|
||||
being finalized.
|
||||
|
||||
@ -196,14 +208,23 @@ class _Block(object):
|
||||
"""
|
||||
self.stack[-1].code.append(PTXStmt(prefix, op, vars, semi, indent))
|
||||
|
||||
class StrVar(object):
|
||||
class DelayVar(object):
|
||||
"""
|
||||
Trivial wrapper to allow deferred variable substitution.
|
||||
"""
|
||||
def __init__(self, val=None):
|
||||
self.val = val
|
||||
def __str__(self):
|
||||
return str(val)
|
||||
return str(self.val)
|
||||
def __mul__(self, other):
|
||||
# Oh this is truly egregious
|
||||
return DelayVarProxy(self, "self.other.val*" + str(other))
|
||||
|
||||
class DelayVarProxy(object):
|
||||
def __init__(self, other, expr):
|
||||
self.other, self.expr = other, expr
|
||||
def __str__(self):
|
||||
return str(eval(self.expr))
|
||||
|
||||
class _PTXFuncWrapper(object):
|
||||
"""Enables ptx_func"""
|
||||
@ -298,6 +319,9 @@ class Block(object):
|
||||
self.block.code(op=['// ', self.comment], semi=False)
|
||||
self.comment = None
|
||||
def __exit__(self, exc_type, exc_value, tb):
|
||||
# Allow exceptions to be propagated; things get really messy if we try
|
||||
# to pop the stack if things aren't ordered correctly
|
||||
if exc_type is not None: return
|
||||
self.block.code(indent=-1)
|
||||
self.block.code(op='}', semi=False)
|
||||
self.block.pop_ctx()
|
||||
@ -370,12 +394,14 @@ class Op(_CallChain):
|
||||
"""
|
||||
def _call(self, op, *args, **kwargs):
|
||||
pred = ''
|
||||
if 'ifp' in kwargs:
|
||||
if 'ifnotp' in kwargs:
|
||||
ifp = kwargs.get('ifp')
|
||||
ifnotp = kwargs.get('ifnotp')
|
||||
if ifp:
|
||||
if ifnotp:
|
||||
raise SyntaxError("can't use both, fool")
|
||||
pred = ['@', kwargs['ifp']]
|
||||
if 'ifnotp' in kwargs:
|
||||
pred = ['@!', kwargs['ifnotp']]
|
||||
pred = ['@', ifp]
|
||||
if ifnotp:
|
||||
pred = ['@!', ifnotp]
|
||||
self.block.code(pred, '.'.join(op), _softjoin(args, ','))
|
||||
|
||||
class Mem(object):
|
||||
@ -421,7 +447,7 @@ class Mem(object):
|
||||
>>> op.st.global.v2.u32(addr(areg), vec(reg1, reg2))
|
||||
>>> op.ld.global.v2.u32(vec(reg1, reg2), addr(areg, 8))
|
||||
"""
|
||||
return ['[', areg, aoffset and '+' or '', aoffset, ']']
|
||||
return ['[', areg, aoffset is not '' and '+' or '', aoffset, ']']
|
||||
|
||||
class _MemFactory(_CallChain):
|
||||
"""Actual `mem` object"""
|
||||
@ -538,8 +564,8 @@ class PTXFragment(object):
|
||||
"""
|
||||
Called after running all PTX DSL functions, but before code generation,
|
||||
to allow fragments which postponed variable evaluation (e.g. using
|
||||
`StrVar`) to fill in the resulting values. Most fragments should not
|
||||
use this.
|
||||
``DelayVar``) to fill in the resulting values. Most fragments should
|
||||
not use this.
|
||||
|
||||
If implemented, this function *may* use an @ptx_func decorator to
|
||||
access the global DSL scope, but pretty please don't emit any code
|
||||
@ -796,6 +822,13 @@ class PTXModule(object):
|
||||
raise ValueError("Too many recompiles scheduled!")
|
||||
self.__needs_recompilation = True
|
||||
|
||||
def print_source(self):
|
||||
if not hasattr(self, 'source'):
|
||||
raise ValueError("Not assembled yet!")
|
||||
print '\n'.join(["%03d %s" % (i+1, l) for (i, l) in
|
||||
enumerate(self.source.split('\n'))])
|
||||
|
||||
|
||||
def _flatten(val):
|
||||
if isinstance(val, (list, tuple)):
|
||||
return ''.join(map(_flatten, val))
|
||||
@ -806,7 +839,7 @@ class PTXFormatter(object):
|
||||
Formats PTXStmt items into beautiful code. Well, the beautiful part is
|
||||
postponed for now.
|
||||
"""
|
||||
def __init__(self, indent_amt=2, oplen_max=20, varlen_max=12):
|
||||
def __init__(self, indent_amt=4, oplen_max=20, varlen_max=12):
|
||||
self.idamt, self.opm, self.vm = indent_amt, oplen_max, varlen_max
|
||||
def format(self, code):
|
||||
out = []
|
||||
@ -844,7 +877,7 @@ class PTXFormatter(object):
|
||||
_TExp = namedtuple('_TExp', 'type exprlist')
|
||||
_DataCell = namedtuple('_DataCell', 'offset size texp')
|
||||
|
||||
class DataStream(object):
|
||||
class DataStream(PTXFragment):
|
||||
"""
|
||||
Simple interface between Python and PTX, designed to create and tightly
|
||||
pack control structs.
|
||||
@ -914,19 +947,19 @@ class DataStream(object):
|
||||
self.cells = []
|
||||
self.stream_size = 0
|
||||
self.free = {}
|
||||
self.size_strvar = StrVar("not_yet_determined")
|
||||
self.size_delayvars = []
|
||||
|
||||
_types = dict(s8='b', u8='B', s16='h', u16='H', s32='i', u32='I', f32='f',
|
||||
s64='l', u64='L', f64='d')
|
||||
def _get_type(self, *regs):
|
||||
def _get_type(self, regs):
|
||||
size = int(regs[0].type[1:])
|
||||
for r in regs:
|
||||
for reg in regs:
|
||||
if reg.type not in self._types:
|
||||
raise TypeError("Register %s of type %s not supported" %
|
||||
(reg.name, reg.type))
|
||||
if int(r.type[1:]) != size:
|
||||
if int(reg.type[1:]) != size:
|
||||
raise TypeError("Can't vector-load different size regs")
|
||||
return size, ''.join([self._types.get(r.type) for r in regs])
|
||||
return size/8, ''.join([self._types.get(r.type) for r in regs])
|
||||
|
||||
def _alloc(self, vsize, texp):
|
||||
# A really crappy allocator. May later include optimizations for
|
||||
@ -939,7 +972,7 @@ class DataStream(object):
|
||||
if idx is None:
|
||||
# No aligned free cells, allocate a new `align`-byte free cell
|
||||
assert alloc not in self.free
|
||||
self.free[alloc] = idx = len(self.stream_size)
|
||||
self.free[alloc] = idx = len(self.cells)
|
||||
self.cells.append(_DataCell(self.stream_size, alloc, None))
|
||||
self.stream_size += alloc
|
||||
# Overwrite the free cell at `idx` with texp
|
||||
@ -958,27 +991,28 @@ class DataStream(object):
|
||||
self.cells.insert(fidx, _DataCell(foffset, fsize, None))
|
||||
foffset += fsize
|
||||
self.free[fsize] = fidx
|
||||
fsize *= 2
|
||||
# Adjust indexes. This is ugly, but evidently unavoidable
|
||||
if fidx-idx:
|
||||
for k, v in filter(lambda k, v: v > idx, self.free.items()):
|
||||
for k, v in filter(lambda (k, v): v > idx, self.free.items()):
|
||||
self.free[k] = v+(fidx-idx)
|
||||
return self.offset
|
||||
return offset
|
||||
|
||||
@ptx_func
|
||||
def _stream_get_internal(self, areg, dregs, exprs, ifp, ifnotp):
|
||||
size, type = self._get_type(dregs)
|
||||
vsize = size * len(dregs)
|
||||
texp = _TExp(type, [expr])
|
||||
if texp in self.expr_map:
|
||||
texp = _TExp(type, tuple(exprs))
|
||||
if texp in self.texp_map:
|
||||
offset = self.texp_map[texp]
|
||||
else:
|
||||
offset = self._alloc(vsize, texp)
|
||||
self.texp_map[texp] = offset
|
||||
vtype = {1: '', 2: '.v2', 4: '.v4'}.get(len(dregs))
|
||||
if len(dregs) > 0:
|
||||
opname = ['ldu', 'b%d' % (size*8)]
|
||||
if len(dregs) > 1:
|
||||
opname.insert(1, 'v%d' % len(dregs))
|
||||
dregs = vec(dregs)
|
||||
op._call('ldu%s.b%d' % (vtype, size), dregs, addr(areg+off),
|
||||
ifp=ifp, ifnotp=ifnotp)
|
||||
op._call(opname, dregs, addr(areg, offset), ifp=ifp, ifnotp=ifnotp)
|
||||
|
||||
@ptx_func
|
||||
def _stream_get(self, areg, dreg, expr, ifp=None, ifnotp=None):
|
||||
@ -991,16 +1025,20 @@ class DataStream(object):
|
||||
ifp, ifnotp)
|
||||
|
||||
@ptx_func
|
||||
def _stream_get_v2(self, areg, d1, e1, d2, e2, d3, e3, d4, e4,
|
||||
def _stream_get_v4(self, areg, d1, e1, d2, e2, d3, e3, d4, e4,
|
||||
ifp=None, ifnotp=None):
|
||||
self._stream_get_internal(areg, [d1, d2, d3, d4], [e1, e2, e3, e4],
|
||||
ifp, ifnotp)
|
||||
|
||||
@property
|
||||
def _stream_size(self):
|
||||
return self.size_strvar
|
||||
x = DelayVar("not_yet_determined")
|
||||
self.size_delayvars.append(x)
|
||||
return x
|
||||
|
||||
def finalize_code(self):
|
||||
self.size_strvar.val = str(self.stream_size)
|
||||
for dv in self.size_delayvars:
|
||||
dv.val = self.stream_size
|
||||
|
||||
def to_inject(self):
|
||||
return {self.prefix + '_stream_get': self._stream_get,
|
||||
@ -1039,9 +1077,20 @@ class DataStream(object):
|
||||
for offset, size, texp in self.cells:
|
||||
if texp:
|
||||
type = texp.type
|
||||
vals = [eval(e, globals(), kwargs) for e in texp.expr_list]
|
||||
vals = [eval(e, globals(), kwargs) for e in texp.exprlist]
|
||||
else:
|
||||
type = 'x'*size # Padding bytes
|
||||
vals = []
|
||||
out.write(struct.pack(type, *vals))
|
||||
outfile.write(struct.pack(type, *vals))
|
||||
|
||||
def print_record(self):
|
||||
for cell in self.cells:
|
||||
if cell.texp is None:
|
||||
print '%3d %2d --' % (cell.offset, cell.size)
|
||||
continue
|
||||
print '%3d %2d %4s %s' % (cell.offset, cell.size, cell.texp.type,
|
||||
cell.texp.exprlist[0])
|
||||
for exp in cell.texp.exprlist[1:]:
|
||||
print '%12s %s' % ('', exp)
|
||||
|
||||
|
||||
|
@ -1,12 +1,62 @@
|
||||
|
||||
from ctypes import *
|
||||
from cStringIO import StringIO
|
||||
import numpy as np
|
||||
from fr0stlib.pyflam3 import Genome, Frame
|
||||
|
||||
from fr0stlib import pyflam3
|
||||
from fr0stlib.pyflam3._flam3 import *
|
||||
from fr0stlib.pyflam3.constants import *
|
||||
|
||||
from cuburnlib.cuda import LaunchContext
|
||||
from cuburnlib.device_code import IterThread, CPDataStream
|
||||
|
||||
Point = lambda x, y: np.array([x, y], dtype=np.double)
|
||||
|
||||
class Genome(pyflam3.Genome):
|
||||
pass
|
||||
|
||||
class Frame(pyflam3.Frame):
|
||||
def interpolate(self, time, cp):
|
||||
flam3_interpolate(self.genomes, self.ngenomes, time, 0, byref(cp))
|
||||
|
||||
def pack_stream(self, ctx, time):
|
||||
"""
|
||||
Pack and return the control point data stream to render this frame.
|
||||
"""
|
||||
# Get the central control point, and calculate parameters that change
|
||||
# once per frame
|
||||
cp = BaseGenome()
|
||||
self.interpolate(time, cp)
|
||||
self.filt = Filters(self, cp)
|
||||
rw = cp.spatial_oversample * cp.width + 2 * self.filt.gutter
|
||||
rh = cp.spatial_oversample * cp.height + 2 * self.filt.gutter
|
||||
|
||||
# Interpolate each time step, calculate per-step variables, and pack
|
||||
# into the stream
|
||||
cp_streamer = ctx.ptx.instances[CPDataStream]
|
||||
stream = StringIO()
|
||||
print "Data stream contents:"
|
||||
cp_streamer.print_record()
|
||||
tcp = BaseGenome()
|
||||
for batch_idx in range(cp.nbatches):
|
||||
for time_idx in range(cp.ntemporal_samples):
|
||||
idx = time_idx + batch_idx * cp.nbatches
|
||||
cp_time = time + self.filt.temporal_deltas[idx]
|
||||
self.interpolate(time, tcp)
|
||||
tcp.camera = Camera(self, tcp, self.filt)
|
||||
|
||||
# TODO: figure out which object to pack this into
|
||||
nsamples = ((tcp.camera.sample_density * cp.width * cp.height) /
|
||||
(cp.nbatches * cp.ntemporal_samples))
|
||||
samples_per_thread = nsamples / ctx.threads + 15
|
||||
|
||||
cp_streamer.pack_into(stream,
|
||||
frame=self,
|
||||
cp=tcp,
|
||||
cp_idx=idx,
|
||||
samples_per_thread=samples_per_thread)
|
||||
stream.seek(0)
|
||||
return (stream.read(), cp.nbatches * cp.ntemporal_samples)
|
||||
|
||||
class Animation(object):
|
||||
"""
|
||||
Control structure for rendering a series of frames.
|
||||
@ -31,46 +81,46 @@ class Animation(object):
|
||||
memmove(byref(self.genomes[i]), byref(genomes[i]),
|
||||
sizeof(BaseGenome))
|
||||
|
||||
self._frame = Frame()
|
||||
self._frame.genomes = cast(self.genomes, POINTER(BaseGenome))
|
||||
self._frame.ngenomes = len(genomes)
|
||||
self.features = Features(genomes)
|
||||
self.frame = Frame()
|
||||
self.frame.genomes = cast(self.genomes, POINTER(BaseGenome))
|
||||
self.frame.ngenomes = len(genomes)
|
||||
|
||||
self.ctx = None
|
||||
|
||||
def compile(self):
|
||||
"""
|
||||
Create a PTX kernel optimized for this animation, compile it, and
|
||||
attach it to a LaunchContext with a thread distribution optimized for
|
||||
the active device.
|
||||
"""
|
||||
# TODO: user-configurable test control
|
||||
self.ctx = LaunchContext([IterThread], block=(256,1,1), grid=(54,1),
|
||||
tests=True)
|
||||
# TODO: user-configurable verbosity control
|
||||
self.ctx.compile(verbose=3, anim=self, features=self.features)
|
||||
# TODO: automatic optimization of block parameters
|
||||
|
||||
def render_frame(self, time=0):
|
||||
# TODO: support more nuanced frame control than just 'time'
|
||||
# TODO: reuse more information between frames
|
||||
# TODO: allow animation-long override of certain parameters (size, etc)
|
||||
cp_stream, num_cps = self.frame.pack_stream(self.ctx, time)
|
||||
iter_thread = self.ctx.ptx.instances[IterThread]
|
||||
iter_thread.upload_cp_stream(self.ctx, cp_stream, num_cps)
|
||||
iter_thread.call(self.ctx)
|
||||
|
||||
cp = BaseGenome()
|
||||
flam3_interpolate(self.frame.genomes, len(self.genomes), time, 0,
|
||||
byref(cp))
|
||||
filt = Filters(self.frame, cp)
|
||||
rw = cp.spatial_oversample * cp.width + 2 * filt.gutter
|
||||
rh = cp.spatial_oversample * cp.height + 2 * filt.gutter
|
||||
class Features(object):
|
||||
"""
|
||||
Determine features and constants required to render a particular set of
|
||||
genomes. The values of this class are fixed before compilation begins.
|
||||
"""
|
||||
# Constant; number of rounds spent fusing points on first CP of a frame
|
||||
num_fuse_samples = 25
|
||||
|
||||
# Allocate buckets, accumulator
|
||||
# Loop over all batches:
|
||||
# [density estimation]
|
||||
# Loop over all temporal samples:
|
||||
# Color scalar = temporal filter at index
|
||||
# Interpolate and get control point
|
||||
# Precalculate
|
||||
# Prepare xforms
|
||||
# Compute colormap
|
||||
# Run iterations
|
||||
# Accumulate vibrancy, gamma, background
|
||||
# Calculate k1, k2
|
||||
# If not DE, then do log filtering to accumulator
|
||||
# Else, [density estimation]
|
||||
# Do final clip and filter
|
||||
|
||||
# For now:
|
||||
# Loop over all batches:
|
||||
# Loop over all temporal samples:
|
||||
# Interpolate and get control point
|
||||
# Read the
|
||||
# Dump noise into buckets
|
||||
# Do log filtering to accumulator
|
||||
# Do simplified final clip
|
||||
def __init__(self, genomes):
|
||||
self.max_ntemporal_samples = max(
|
||||
[cp.nbatches * cp.ntemporal_samples for cp in genomes]) + 1
|
||||
|
||||
class Filters(object):
|
||||
def __init__(self, frame, cp):
|
||||
@ -115,7 +165,7 @@ class Camera(object):
|
||||
scale = 2.0 ** cp.zoom
|
||||
self.sample_density = cp.sample_density * scale * scale
|
||||
|
||||
center = Point(cp.center[0], cp.center[1])
|
||||
center = Point(cp._center[0], cp._center[1])
|
||||
size = Point(cp.width, cp.height)
|
||||
# pix per unit, where 'unit' is '1.0' in IFS space
|
||||
self.ppu = Point(
|
||||
@ -129,4 +179,3 @@ class Camera(object):
|
||||
self.ifs_space_size = 1.0 / (self.upper_bounds - self.lower_bounds)
|
||||
# TODO: coordinate transforms in concert with GPU (rotation, size)
|
||||
|
||||
|
||||
|
5
main.py
5
main.py
@ -25,15 +25,14 @@ def main(args):
|
||||
verbose = 1
|
||||
if '-d' in args:
|
||||
verbose = 3
|
||||
ctx = LaunchContext([IterThread], block=(256,1,1), grid=(64,1), tests=True)
|
||||
ctx.compile(verbose=verbose)
|
||||
ctx.run_tests()
|
||||
|
||||
with open(args[-1]) as fp:
|
||||
genomes = Genome.from_string(fp.read())
|
||||
anim = Animation(genomes)
|
||||
anim.compile()
|
||||
anim.render_frame()
|
||||
|
||||
|
||||
#genome.width, genome.height = 512, 512
|
||||
#genome.sample_density = 1000
|
||||
#obuf, stats, frame = genome.render(estimator=3)
|
||||
|
Loading…
Reference in New Issue
Block a user