Finally runs again

This commit is contained in:
Steven Robertson 2010-09-06 11:18:20 -04:00
parent 27e7fd82a3
commit f3298e0bed
5 changed files with 230 additions and 209 deletions

View File

@ -15,7 +15,8 @@ from cuburnlib.ptx import PTXModule
class LaunchContext(object):
"""
Context collecting the information needed to create, run, and gather the
results of a device computation.
results of a device computation. This may eventually also include an actual
CUDA context, but for now it just uses the global one.
To create the fastest device code across multiple device families, this
context may decide to iteratively refine the final PTX by regenerating
@ -32,34 +33,27 @@ class LaunchContext(object):
`mod`: Final compiled module. Unavailable during assembly.
"""
def __init__(self, entries, block=(1,1,1), grid=(1,1), seed=None,
tests=False):
def __init__(self, entries, block=(1,1,1), grid=(1,1), tests=False):
self.entry_types = entries
self.block, self.grid, self.build_tests = block, grid, tests
self.rand = np.random.mtrand.RandomState(seed)
self.setup_done = False
@property
def threads(self):
return reduce(lambda a, b: a*b, self.block + self.grid)
def print_source(self):
print '\n'.join(["%03d %s" % (i+1, l) for (i, l) in
enumerate(self.ptx.source.split('\n'))])
def compile(self, to_inject={}, verbose=False):
inj = dict(to_inject)
inj['ctx'] = self
self.ptx = PTXModule(self.entry_types, inj, self.build_tests)
def compile(self, verbose=False, **kwargs):
kwargs['ctx'] = self
self.ptx = PTXModule(self.entry_types, kwargs, self.build_tests)
try:
self.mod = cuda.module_from_buffer(self.ptx.source)
except (cuda.CompileError, cuda.RuntimeError), e:
print "Aww, dang, compile error. Here's the source:"
self.print_source()
self.ptx.print_source()
raise e
if verbose:
if verbose >= 3:
self.print_source()
self.ptx.print_source()
for entry in self.ptx.entries:
func = self.mod.get_function(entry.entry_name)
print "Compiled %s: used %d regs, %d sm, %d local" % (

View File

@ -4,126 +4,34 @@ Contains the PTX fragments which will drive the device.
import os
import time
import struct
import pycuda.driver as cuda
import numpy as np
from cuburnlib.ptx import *
"""
Here's the current draft of the full algorithm implementation.
declare xform jump table
load random state
clear x_coord, y_coord, z_coord, w_coord;
store -(FUSE+1) to shared (per-warp) num_samples_sh
clear badvals [1]
load param (global_cp_idx_addr)
index table start (global_cp_idx) [2]
load count of indexes from global cp index =>
store to qlocal current_cp_num [3]
outermost loop start:
load current_cp_num
if current_cp_num <= 0:
exit
load param global_cp_idx_addr
calculate offset into address with current_cp_num, global_cp_idx_addr
load cp_base_address
stream_start (cp_base, cp_base_addr) [4]
FUSE_START:
num_samples += 1
if num_samples >= 0:
# Okay, we're done FUSEing, prepare to enter normal loop
load num_samples => store to shared (per-warp) num_samples
ITER_LOOP_START:
reg xform_addr, xform_stream_addr, xform_select
mwc_next_u32 to xform_select
# Performance test: roll/unroll this loop?
stream_load xform_prob (cp_stream)
if xform_select <= xform_prob:
bra.uni XFORM_1_LBL
...
stream_load xform_prob (cp_stream)
if xform_select <= xform_prob:
bra.uni XFORM_N_LBL
XFORM_1_LBL:
stream_load xform_1_ (cp_stream)
...
bra.uni XFORM_POST
XFORM_POST:
[if final_xform:]
[do final_xform]
if num_samples < 0:
# FUSE still in progress
bra.uni FUSE_START
FRAGMENT_WRITEBACK:
# Unknown at this time.
SHUFFLE:
# Unknown at this time.
load num_samples from num_samples_sh
num_samples -= 1
if num_samples > 0:
bra.uni ITER_LOOP_START
[1] Tracking 'badvals' can put a pretty large hit on performance, particularly
for images that sample a small amount of the grid. So this might be cut
when rendering for performance. On the other hand, it might actually help
tune the algorithm later, so it'll definitely be an option.
[2] Control points for each temporal sample will be preloaded to the
device in the compact DataStream format (more on this later). Their
locations are represented in an index table, which starts with a single
`.u32 length`, followed by `length` pointers. To avoid having to keep
reloading `length`, or worse, using a register to hold it in memory, we
instead count *down* to zero. This is a very common idiom.
[3] 'qlocal' is quasi-local storage. It could easily be actual local storage,
depending on how local storage is implemented, but the extra 128-byte loads
for such values might make a performance difference. qlocal variables may
be identical across a warp or even a CTA, and so variables noted as
"qlocal" here might end up in shared memory or even a small per-warp or
per-CTA buffer in global memory created specifically for this purpose,
after benchmarking is done.
[4] DataStreams are "opaque" data serialization structures defined below. The
structure of a stream is actually created while parsing the DSL by the load
statements themselves. Some benchmarks need to be done before DataStreams
stop being "opaque" and become simply "dynamic".
"""
class IterThread(PTXTest):
entry_name = 'iter_thread'
entry_params = []
def __init__(self):
self.cps_uploaded = False
def deps(self):
return [MWCRNG, CPDataStream]
@ptx_func
def module_setup(self):
mem.global_.u32('g_cp_array',
[features.max_ntemporal_samples,'*',cp_stream_size])
cp_stream_size*features.max_ntemporal_samples)
mem.global_.u32('g_num_cps')
# TODO move into debug statement
mem.global_.u32('g_num_rounds', ctx.threads)
mem.global_.u32('g_num_writes', ctx.threads)
@ptx_func
def entry():
def entry(self):
reg.f32('x_coord y_coord color_coord alpha_coord')
# TODO: temporary, for testing
@ -158,8 +66,8 @@ class IterThread(PTXTest):
op.mov.s32(num_samples, -(features.num_fuse_samples+1))
# TODO: Move cp_num to qlocal storage (or spill it, rarely accessed)
reg.u32('cp_num cpA')
mov.u32(cp_num, 0)
reg.u32('cp_idx cpA')
op.mov.u32(cp_idx, 0)
label('cp_loop_start')
op.bar.sync(0)
@ -168,19 +76,19 @@ class IterThread(PTXTest):
reg.u32('num_cps')
reg.pred('p_last_cp')
op.ldu.u32(num_cps, addr(g_num_cps))
op.setp.lt.u32(p_last_cp, cp_num, num_cps)
op.setp.ge.u32(p_last_cp, cp_idx, num_cps)
op.bra.uni('all_cps_done', ifp=p_last_cp)
with block('Load CP address'):
op.mov.u32(cpA, g_cp_array)
op.mad.lo.u32(cpA, cp_num, cp_stream_size, cpA)
op.mad.lo.u32(cpA, cp_idx, cp_stream_size, cpA)
with block('Increment CP number, load num_samples (unless in fuse)'):
reg.pred('p_in_fuse')
op.setp.lt.s32(p_in_fuse, num_samples, 0)
op.add.u32(cp_num, cp_num, 1, ifp=p_in_fuse)
cp_stream_get(cpA, num_samples, 'cp.samples_per_thread',
ifp=p_in_fuse)
with block('Increment CP index, load num_samples (unless in fuse)'):
reg.pred('p_not_in_fuse')
op.setp.ge.s32(p_not_in_fuse, num_samples, 0)
op.add.u32(cp_idx, cp_idx, 1, ifp=p_not_in_fuse)
cp_stream_get(cpA, num_samples, 'samples_per_thread',
ifp=p_not_in_fuse)
label('fuse_loop_start')
with block('FUSE-specific stuff'):
@ -188,7 +96,7 @@ class IterThread(PTXTest):
comment('If num_samples == -1, set it to 0 and jump back up')
comment('This will start the normal CP loading machinery')
op.setp.eq.s32(p_fuse, num_samples, -1)
op.mov.s32(p_fuse, 0, ifp=p_fuse)
op.mov.s32(num_samples, 0, ifp=p_fuse)
op.bra.uni(cp_loop_start, ifp=p_fuse)
comment('If num_samples < -1, still fusing, so increment')
@ -204,33 +112,55 @@ class IterThread(PTXTest):
with block("Test if we're still in FUSE"):
reg.pred('p_in_fuse')
op.setp.lt.s32(p_in_fuse, num_samples, 0)
op.bra.uni(fuse_start, ifp=p_in_fuse)
op.bra.uni(fuse_loop_start, ifp=p_in_fuse)
with block("Ordinarily, we'd write the result here"):
op.add.u32(num_writes, num_writes, 1)
with block("Check to see if we're done with this CP"):
reg.pred('p_cp_done')
op.add.s32(num_samples, num_samples, -1)
op.setp.eq.s32(p_cp_done, num_samples, 0)
op.bra.uni(cp_loop_start, ifp=p_cp_done)
op.bra.uni(iter_loop_start)
label('all_cps_done')
# TODO this is for testing, move it to a debug statement
store_per_thread(g_num_rounds, num_rounds)
store_per_thread(g_num_writes, num_writes)
def call(self, ctx):
raise HorribleDeathError("Okay I'm going to bed now")
def upload_cp_stream(self, ctx, cp_stream, num_cps):
    """
    Copy a packed control point stream and its CP count to the device.

    `ctx` is the launch context whose compiled module (`ctx.mod`) holds the
    `g_cp_array` and `g_num_cps` globals, `cp_stream` is the packed stream
    buffer, and `num_cps` is the number of control points in it. On success
    the instance is flagged so that `call()` will accept being run.
    """
    cp_array_dp, cp_array_l = ctx.mod.get_global('g_cp_array')
    assert len(cp_stream) <= cp_array_l, "Stream too big!"
    cuda.memcpy_htod_async(cp_array_dp, cp_stream)
    num_cps_dp, num_cps_l = ctx.mod.get_global('g_num_cps')
    # g_num_cps is declared .u32 and compared with an unsigned setp, so pack
    # it unsigned: a negative count now fails here instead of being uploaded
    # as a huge unsigned value.
    cuda.memcpy_htod_async(num_cps_dp, struct.pack('I', num_cps))
    self.cps_uploaded = True
def call(self, ctx):
if not self.cps_uploaded:
raise Error("Cannot call IterThread before uploading CPs")
func = ctx.mod.get_function('iter_thread')
dtime = func(block=ctx.block, grid=ctx.grid, time_kernel=True)
num_rounds_dp, num_rounds_l = ctx.mod.get_global('g_num_rounds')
num_writes_dp, num_writes_l = ctx.mod.get_global('g_num_writes')
rounds = cuda.from_device(num_rounds_dp, ctx.threads, np.uint32)
writes = cuda.from_device(num_writes_dp, ctx.threads, np.uint32)
print "Rounds:", rounds
print "Writes:", writes
class MWCRNG(PTXFragment):
def __init__(self):
self.rand = np.random
self.threads_ready = 0
if not os.path.isfile('primes.bin'):
raise EnvironmentError('primes.bin not found')
def set_seed(self, seed):
    """Replace this fragment's random source with one seeded by `seed`."""
    seeded = np.random.mtrand.RandomState(seed)
    self.rand = seeded
@ptx_func
def module_setup(self):
mem.global_.u32('mwc_rng_mults', ctx.threads)
@ -284,13 +214,13 @@ class MWCRNG(PTXFragment):
# Randomness in choosing multipliers is good, but larger multipliers
# have longer periods, which is also good. This is a compromise.
mults = np.array(mults[:ctx.threads*4])
ctx.rand.shuffle(mults)
self.rand.shuffle(mults)
# Copy multipliers and seeds to the device
multdp, multl = ctx.mod.get_global('mwc_rng_mults')
cuda.memcpy_htod_async(multdp, mults.tostring()[:multl])
# Intentionally excludes both 0 and (2^32-1), as they can lead to
# degenerate sequences of period 0
states = np.array(ctx.rand.randint(1, 0xffffffff, size=2*ctx.threads),
states = np.array(self.rand.randint(1, 0xffffffff, size=2*ctx.threads),
dtype=np.uint32)
statedp, statel = ctx.mod.get_global('mwc_rng_state')
cuda.memcpy_htod_async(statedp, states.tostring())
@ -376,7 +306,7 @@ class MWCRNGTest(PTXTest):
class CameraCoordTransform(PTXFragment):
pass
class CPDataStream(PTXFragment):
class CPDataStream(DataStream):
"""DataStream which stores the control points."""
prefix = 'cp'

View File

@ -11,7 +11,7 @@ easier to maintain using this system.
# If you see 'import inspect', you know you're in for a good time
import inspect
import types
import traceback
import struct
from cStringIO import StringIO
from collections import namedtuple
@ -116,6 +116,8 @@ class _BlockInjector(object):
self.dead = False
map(self.inject, self.to_inject.items())
def __exit__(self, exc_type, exc_val, tb):
# Do some real exceptorin'
if exc_type is not None: return
for k in self.injected:
del self.inject_into[k]
self.dead = True
@ -137,17 +139,27 @@ class _Block(object):
inj = self.stack[-1].injectors
[inj.remove(i) for i in inj if i.dead]
def push_ctx(self):
# Move most recent active injector to new context
self.clean_injectors()
last_inj = self.stack[-1].injectors.pop()
self.stack.append(BlockCtx(dict(self.stack[-1].locals), [],
[last_inj]))
self.stack.append(BlockCtx(dict(self.stack[-1].locals), [], []))
# The only reason we should have no injectors in the previous block is
# if we are hitting a new ptx_func entry point or global declaration at
# PTX module scope, which means the stack only contains the outer
# context and the current one (i.e. len(stack) == 2)
if len(self.stack[-2].injectors) == 0:
assert len(self.stack) == 2, "Empty injector list too early!"
# Otherwise, the active injector in the previous block is the one for
# the Python function which is currently creating a new PTX block, and
# it needs to be promoted to the current block
else:
self.stack[-1].injectors.append(self.stack[-2].injectors.pop())
def pop_ctx(self):
self.clean_injectors()
bs = self.stack.pop()
# TODO: figure out why this next line is needed
[bs.injectors.remove(i) for i in bs.injectors if i.dead]
self.stack[-1].code.extend(bs.code)
if len(self.stack) == 1:
# We're on outer_ctx, so all injectors should be gone
# We're on outer_ctx, so all injectors should be gone.
assert len(bs.injectors) == 0, "Injector/context mismatch"
return
# The only injector should be the one added in push_ctx
@ -186,7 +198,7 @@ class _Block(object):
spacing. To keep things simple, nested lists and tuples will be reduced
in this manner (but not other iterable types). Coercion will not happen
until after the entire DSL call tree has been walked. This allows a
class to submit a mutable type (e.g. the trivial `StrVar`) when first
class to submit a mutable type (e.g. ``DelayVar``) when first
walked with an undefined value, then substitute the correct value on
being finalized.
@ -196,14 +208,23 @@ class _Block(object):
"""
self.stack[-1].code.append(PTXStmt(prefix, op, vars, semi, indent))
class StrVar(object):
class DelayVar(object):
"""
Trivial wrapper to allow deferred variable substitution.
"""
def __init__(self, val=None):
self.val = val
def __str__(self):
return str(val)
return str(self.val)
def __mul__(self, other):
# Oh this is truly egregious
return DelayVarProxy(self, "self.other.val*" + str(other))
class DelayVarProxy(object):
    """
    Deferred arithmetic on a delayed variable.

    Holds the wrapped object as `other` and a Python expression string
    `expr` that refers to it as ``self.other``. The expression is only
    evaluated when the proxy is converted to a string, so it sees whatever
    value the wrapped object holds at that moment.
    """
    def __init__(self, other, expr):
        self.other, self.expr = other, expr

    def __str__(self):
        # Evaluate with `self` as the only visible name and no builtins, so
        # the expression cannot pick up or touch unrelated module state.
        return str(eval(self.expr, {'__builtins__': {}}, {'self': self}))
class _PTXFuncWrapper(object):
"""Enables ptx_func"""
@ -298,6 +319,9 @@ class Block(object):
self.block.code(op=['// ', self.comment], semi=False)
self.comment = None
def __exit__(self, exc_type, exc_value, tb):
# Allow exceptions to be propagated; things get really messy if we try
# to pop the stack if things aren't ordered correctly
if exc_type is not None: return
self.block.code(indent=-1)
self.block.code(op='}', semi=False)
self.block.pop_ctx()
@ -370,12 +394,14 @@ class Op(_CallChain):
"""
def _call(self, op, *args, **kwargs):
pred = ''
if 'ifp' in kwargs:
if 'ifnotp' in kwargs:
ifp = kwargs.get('ifp')
ifnotp = kwargs.get('ifnotp')
if ifp:
if ifnotp:
raise SyntaxError("can't use both, fool")
pred = ['@', kwargs['ifp']]
if 'ifnotp' in kwargs:
pred = ['@!', kwargs['ifnotp']]
pred = ['@', ifp]
if ifnotp:
pred = ['@!', ifnotp]
self.block.code(pred, '.'.join(op), _softjoin(args, ','))
class Mem(object):
@ -421,7 +447,7 @@ class Mem(object):
>>> op.st.global.v2.u32(addr(areg), vec(reg1, reg2))
>>> op.ld.global.v2.u32(vec(reg1, reg2), addr(areg, 8))
"""
return ['[', areg, aoffset and '+' or '', aoffset, ']']
return ['[', areg, aoffset is not '' and '+' or '', aoffset, ']']
class _MemFactory(_CallChain):
"""Actual `mem` object"""
@ -538,8 +564,8 @@ class PTXFragment(object):
"""
Called after running all PTX DSL functions, but before code generation,
to allow fragments which postponed variable evaluation (e.g. using
`StrVar`) to fill in the resulting values. Most fragments should not
use this.
``DelayVar``) to fill in the resulting values. Most fragments should
not use this.
If implemented, this function *may* use an @ptx_func decorator to
access the global DSL scope, but pretty please don't emit any code
@ -796,6 +822,13 @@ class PTXModule(object):
raise ValueError("Too many recompiles scheduled!")
self.__needs_recompilation = True
def print_source(self):
    """
    Print the module source with 1-based, zero-padded line numbers.

    Raises ValueError if no `source` attribute has been set yet.
    """
    if not hasattr(self, 'source'):
        raise ValueError("Not assembled yet!")
    numbered = []
    for lineno, text in enumerate(self.source.split('\n')):
        numbered.append("%03d %s" % (lineno + 1, text))
    print('\n'.join(numbered))
def _flatten(val):
if isinstance(val, (list, tuple)):
return ''.join(map(_flatten, val))
@ -806,7 +839,7 @@ class PTXFormatter(object):
Formats PTXStmt items into beautiful code. Well, the beautiful part is
postponed for now.
"""
def __init__(self, indent_amt=2, oplen_max=20, varlen_max=12):
def __init__(self, indent_amt=4, oplen_max=20, varlen_max=12):
self.idamt, self.opm, self.vm = indent_amt, oplen_max, varlen_max
def format(self, code):
out = []
@ -844,7 +877,7 @@ class PTXFormatter(object):
_TExp = namedtuple('_TExp', 'type exprlist')
_DataCell = namedtuple('_DataCell', 'offset size texp')
class DataStream(object):
class DataStream(PTXFragment):
"""
Simple interface between Python and PTX, designed to create and tightly
pack control structs.
@ -914,19 +947,19 @@ class DataStream(object):
self.cells = []
self.stream_size = 0
self.free = {}
self.size_strvar = StrVar("not_yet_determined")
self.size_delayvars = []
_types = dict(s8='b', u8='B', s16='h', u16='H', s32='i', u32='I', f32='f',
s64='l', u64='L', f64='d')
def _get_type(self, *regs):
def _get_type(self, regs):
size = int(regs[0].type[1:])
for r in regs:
for reg in regs:
if reg.type not in self._types:
raise TypeError("Register %s of type %s not supported" %
(reg.name, reg.type))
if int(r.type[1:]) != size:
if int(reg.type[1:]) != size:
raise TypeError("Can't vector-load different size regs")
return size, ''.join([self._types.get(r.type) for r in regs])
return size/8, ''.join([self._types.get(r.type) for r in regs])
def _alloc(self, vsize, texp):
# A really crappy allocator. May later include optimizations for
@ -939,7 +972,7 @@ class DataStream(object):
if idx is None:
# No aligned free cells, allocate a new `align`-byte free cell
assert alloc not in self.free
self.free[alloc] = idx = len(self.stream_size)
self.free[alloc] = idx = len(self.cells)
self.cells.append(_DataCell(self.stream_size, alloc, None))
self.stream_size += alloc
# Overwrite the free cell at `idx` with texp
@ -958,27 +991,28 @@ class DataStream(object):
self.cells.insert(fidx, _DataCell(foffset, fsize, None))
foffset += fsize
self.free[fsize] = fidx
fsize *= 2
# Adjust indexes. This is ugly, but evidently unavoidable
if fidx-idx:
for k, v in filter(lambda k, v: v > idx, self.free.items()):
for k, v in filter(lambda (k, v): v > idx, self.free.items()):
self.free[k] = v+(fidx-idx)
return self.offset
return offset
@ptx_func
def _stream_get_internal(self, areg, dregs, exprs, ifp, ifnotp):
size, type = self._get_type(dregs)
vsize = size * len(dregs)
texp = _TExp(type, [expr])
if texp in self.expr_map:
texp = _TExp(type, tuple(exprs))
if texp in self.texp_map:
offset = self.texp_map[texp]
else:
offset = self._alloc(vsize, texp)
self.texp_map[texp] = offset
vtype = {1: '', 2: '.v2', 4: '.v4'}.get(len(dregs))
if len(dregs) > 0:
opname = ['ldu', 'b%d' % (size*8)]
if len(dregs) > 1:
opname.insert(1, 'v%d' % len(dregs))
dregs = vec(dregs)
op._call('ldu%s.b%d' % (vtype, size), dregs, addr(areg+off),
ifp=ifp, ifnotp=ifnotp)
op._call(opname, dregs, addr(areg, offset), ifp=ifp, ifnotp=ifnotp)
@ptx_func
def _stream_get(self, areg, dreg, expr, ifp=None, ifnotp=None):
@ -991,16 +1025,20 @@ class DataStream(object):
ifp, ifnotp)
@ptx_func
def _stream_get_v2(self, areg, d1, e1, d2, e2, d3, e3, d4, e4,
def _stream_get_v4(self, areg, d1, e1, d2, e2, d3, e3, d4, e4,
ifp=None, ifnotp=None):
self._stream_get_internal(areg, [d1, d2, d3, d4], [e1, e2, e3, e4],
ifp, ifnotp)
@property
def _stream_size(self):
return self.size_strvar
x = DelayVar("not_yet_determined")
self.size_delayvars.append(x)
return x
def finalize_code(self):
self.size_strvar.val = str(self.stream_size)
for dv in self.size_delayvars:
dv.val = self.stream_size
def to_inject(self):
return {self.prefix + '_stream_get': self._stream_get,
@ -1039,9 +1077,20 @@ class DataStream(object):
for offset, size, texp in self.cells:
if texp:
type = texp.type
vals = [eval(e, globals(), kwargs) for e in texp.expr_list]
vals = [eval(e, globals(), kwargs) for e in texp.exprlist]
else:
type = 'x'*size # Padding bytes
vals = []
out.write(struct.pack(type, *vals))
outfile.write(struct.pack(type, *vals))
def print_record(self):
    """
    Print one line per cell: offset, size and, for occupied cells, the type
    code and first expression, with any further expressions on aligned
    continuation lines. Cells without a texp are shown as ``--``.
    """
    for cell in self.cells:
        texp = cell.texp
        if texp is not None:
            exprs = texp.exprlist
            print('%3d %2d %4s %s' % (cell.offset, cell.size, texp.type,
                                      exprs[0]))
            for extra in exprs[1:]:
                print('%12s %s' % ('', extra))
        else:
            print('%3d %2d --' % (cell.offset, cell.size))

View File

@ -1,12 +1,62 @@
from ctypes import *
from cStringIO import StringIO
import numpy as np
from fr0stlib.pyflam3 import Genome, Frame
from fr0stlib import pyflam3
from fr0stlib.pyflam3._flam3 import *
from fr0stlib.pyflam3.constants import *
from cuburnlib.cuda import LaunchContext
from cuburnlib.device_code import IterThread, CPDataStream
Point = lambda x, y: np.array([x, y], dtype=np.double)
class Genome(pyflam3.Genome):
    """Local subclass of pyflam3's Genome; adds nothing of its own."""
class Frame(pyflam3.Frame):
    """
    pyflam3 frame extended with control point interpolation and packing of
    the per-sample control points into the device data stream.
    """
    def interpolate(self, time, cp):
        """Interpolate this frame's genomes at `time`, writing into `cp`."""
        flam3_interpolate(self.genomes, self.ngenomes, time, 0, byref(cp))

    def pack_stream(self, ctx, time):
        """
        Pack and return the control point data stream to render this frame.

        Returns a ``(stream_bytes, num_cps)`` tuple, where `num_cps` is
        ``nbatches * ntemporal_samples`` of the control point interpolated
        at `time`.
        """
        # Get the central control point, and calculate parameters that change
        # once per frame
        cp = BaseGenome()
        self.interpolate(time, cp)
        self.filt = Filters(self, cp)
        # Oversampled dimensions plus the filter gutter on both sides;
        # computed here but not consumed yet
        rw = cp.spatial_oversample * cp.width + 2 * self.filt.gutter
        rh = cp.spatial_oversample * cp.height + 2 * self.filt.gutter

        # Interpolate each time step, calculate per-step variables, and pack
        # into the stream
        cp_streamer = ctx.ptx.instances[CPDataStream]
        stream = StringIO()
        print("Data stream contents:")
        cp_streamer.print_record()
        tcp = BaseGenome()
        for batch_idx in range(cp.nbatches):
            for time_idx in range(cp.ntemporal_samples):
                # Flatten (batch, sample) with the inner loop's length as the
                # stride so every pair gets a distinct index
                idx = batch_idx * cp.ntemporal_samples + time_idx
                cp_time = time + self.filt.temporal_deltas[idx]
                # Interpolate at this sample's own time, not the frame time
                self.interpolate(cp_time, tcp)
                tcp.camera = Camera(self, tcp, self.filt)

                # TODO: figure out which object to pack this into
                nsamples = ((tcp.camera.sample_density * cp.width * cp.height) /
                            (cp.nbatches * cp.ntemporal_samples))
                samples_per_thread = nsamples / ctx.threads + 15

                cp_streamer.pack_into(stream,
                                      frame=self,
                                      cp=tcp,
                                      cp_idx=idx,
                                      samples_per_thread=samples_per_thread)
        stream.seek(0)
        return (stream.read(), cp.nbatches * cp.ntemporal_samples)
class Animation(object):
"""
Control structure for rendering a series of frames.
@ -31,46 +81,46 @@ class Animation(object):
memmove(byref(self.genomes[i]), byref(genomes[i]),
sizeof(BaseGenome))
self._frame = Frame()
self._frame.genomes = cast(self.genomes, POINTER(BaseGenome))
self._frame.ngenomes = len(genomes)
self.features = Features(genomes)
self.frame = Frame()
self.frame.genomes = cast(self.genomes, POINTER(BaseGenome))
self.frame.ngenomes = len(genomes)
self.ctx = None
def compile(self):
    """
    Build the launch context for this animation and compile its kernel.

    A LaunchContext for IterThread is created with a fixed block/grid layout
    and tests enabled, stored on ``self.ctx``, and then compiled with this
    animation and its features passed through as keyword arguments.
    """
    # TODO: user-configurable test control
    # TODO: automatic optimization of block parameters
    launch = LaunchContext([IterThread], tests=True,
                           block=(256,1,1), grid=(54,1))
    self.ctx = launch
    # TODO: user-configurable verbosity control
    launch.compile(verbose=3, anim=self, features=self.features)
def render_frame(self, time=0):
# TODO: support more nuanced frame control than just 'time'
# TODO: reuse more information between frames
# TODO: allow animation-long override of certain parameters (size, etc)
cp_stream, num_cps = self.frame.pack_stream(self.ctx, time)
iter_thread = self.ctx.ptx.instances[IterThread]
iter_thread.upload_cp_stream(self.ctx, cp_stream, num_cps)
iter_thread.call(self.ctx)
cp = BaseGenome()
flam3_interpolate(self.frame.genomes, len(self.genomes), time, 0,
byref(cp))
filt = Filters(self.frame, cp)
rw = cp.spatial_oversample * cp.width + 2 * filt.gutter
rh = cp.spatial_oversample * cp.height + 2 * filt.gutter
class Features(object):
"""
Determine features and constants required to render a particular set of
genomes. The values of this class are fixed before compilation begins.
"""
# Constant; number of rounds spent fusing points on first CP of a frame
num_fuse_samples = 25
# Allocate buckets, accumulator
# Loop over all batches:
# [density estimation]
# Loop over all temporal samples:
# Color scalar = temporal filter at index
# Interpolate and get control point
# Precalculate
# Prepare xforms
# Compute colormap
# Run iterations
# Accumulate vibrancy, gamma, background
# Calculate k1, k2
# If not DE, then do log filtering to accumulator
# Else, [density estimation]
# Do final clip and filter
# For now:
# Loop over all batches:
# Loop over all temporal samples:
# Interpolate and get control point
# Read the
# Dump noise into buckets
# Do log filtering to accumulator
# Do simplified final clip
def __init__(self, genomes):
self.max_ntemporal_samples = max(
[cp.nbatches * cp.ntemporal_samples for cp in genomes]) + 1
class Filters(object):
def __init__(self, frame, cp):
@ -115,7 +165,7 @@ class Camera(object):
scale = 2.0 ** cp.zoom
self.sample_density = cp.sample_density * scale * scale
center = Point(cp.center[0], cp.center[1])
center = Point(cp._center[0], cp._center[1])
size = Point(cp.width, cp.height)
# pix per unit, where 'unit' is '1.0' in IFS space
self.ppu = Point(
@ -129,4 +179,3 @@ class Camera(object):
self.ifs_space_size = 1.0 / (self.upper_bounds - self.lower_bounds)
# TODO: coordinate transforms in concert with GPU (rotation, size)

View File

@ -25,15 +25,14 @@ def main(args):
verbose = 1
if '-d' in args:
verbose = 3
ctx = LaunchContext([IterThread], block=(256,1,1), grid=(64,1), tests=True)
ctx.compile(verbose=verbose)
ctx.run_tests()
with open(args[-1]) as fp:
genomes = Genome.from_string(fp.read())
anim = Animation(genomes)
anim.compile()
anim.render_frame()
#genome.width, genome.height = 512, 512
#genome.sample_density = 1000
#obuf, stats, frame = genome.render(estimator=3)