Commit f3298e0bed (parent 27e7fd82a3): Finally runs again
Mirror of https://github.com/stevenrobertson/cuburn.git, synced 2025-02-05 11:40:04 -05:00
@@ -15,7 +15,8 @@ from cuburnlib.ptx import PTXModule
 class LaunchContext(object):
     """
     Context collecting the information needed to create, run, and gather the
-    results of a device computation.
+    results of a device computation. This may eventually also include an actual
+    CUDA context, but for now it just uses the global one.
 
     To create the fastest device code across multiple device families, this
     context may decide to iteratively refine the final PTX by regenerating
@@ -32,34 +33,27 @@ class LaunchContext(object):
     `mod`: Final compiled module. Unavailable during assembly.
 
     """
-    def __init__(self, entries, block=(1,1,1), grid=(1,1), seed=None,
-                 tests=False):
+    def __init__(self, entries, block=(1,1,1), grid=(1,1), tests=False):
         self.entry_types = entries
         self.block, self.grid, self.build_tests = block, grid, tests
-        self.rand = np.random.mtrand.RandomState(seed)
         self.setup_done = False
 
     @property
     def threads(self):
         return reduce(lambda a, b: a*b, self.block + self.grid)
 
-    def print_source(self):
-        print '\n'.join(["%03d %s" % (i+1, l) for (i, l) in
-                         enumerate(self.ptx.source.split('\n'))])
+    def compile(self, verbose=False, **kwargs):
+        kwargs['ctx'] = self
+        self.ptx = PTXModule(self.entry_types, kwargs, self.build_tests)
 
-    def compile(self, to_inject={}, verbose=False):
-        inj = dict(to_inject)
-        inj['ctx'] = self
-        self.ptx = PTXModule(self.entry_types, inj, self.build_tests)
         try:
             self.mod = cuda.module_from_buffer(self.ptx.source)
         except (cuda.CompileError, cuda.RuntimeError), e:
             print "Aww, dang, compile error. Here's the source:"
-            self.print_source()
+            self.ptx.print_source()
             raise e
         if verbose:
            if verbose >= 3:
-                self.print_source()
+                self.ptx.print_source()
            for entry in self.ptx.entries:
                func = self.mod.get_function(entry.entry_name)
                print "Compiled %s: used %d regs, %d sm, %d local" % (
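The compile() signature change above means callers now pass DSL namespace values as keywords rather than a to_inject dict. This is exercised later in this commit by Animation.compile(); trimmed to its essentials, the intended host-side usage looks like the sketch below (here `features` stands for the Features instance built elsewhere in the commit):

    from cuburnlib.cuda import LaunchContext
    from cuburnlib.device_code import IterThread

    ctx = LaunchContext([IterThread], block=(256,1,1), grid=(54,1), tests=True)
    # extra keywords land in the PTX DSL namespace (kwargs['ctx'] = self)
    ctx.compile(verbose=3, features=features)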
@@ -4,126 +4,34 @@ Contains the PTX fragments which will drive the device.
 
 import os
 import time
+import struct
 
 import pycuda.driver as cuda
 import numpy as np
 
 from cuburnlib.ptx import *
 
-"""
-Here's the current draft of the full algorithm implementation.
-
-declare xform jump table
-
-load random state
-
-clear x_coord, y_coord, z_coord, w_coord;
-store -(FUSE+1) to shared (per-warp) num_samples_sh
-clear badvals [1]
-
-load param (global_cp_idx_addr)
-index table start (global_cp_idx) [2]
-load count of indexes from global cp index =>
-store to qlocal current_cp_num [3]
-
-outermost loop start:
-    load current_cp_num
-    if current_cp_num <= 0:
-        exit
-
-    load param global_cp_idx_addr
-    calculate offset into address with current_cp_num, global_cp_idx_addr
-    load cp_base_address
-    stream_start (cp_base, cp_base_addr) [4]
-
-    FUSE_START:
-    num_samples += 1
-    if num_samples >= 0:
-        # Okay, we're done FUSEing, prepare to enter normal loop
-        load num_samples => store to shared (per-warp) num_samples
-
-
-    ITER_LOOP_START:
-    reg xform_addr, xform_stream_addr, xform_select
-
-    mwc_next_u32 to xform_select
-    # Performance test: roll/unroll this loop?
-    stream_load xform_prob (cp_stream)
-    if xform_select <= xform_prob:
-        bra.uni XFORM_1_LBL
-    ...
-    stream_load xform_prob (cp_stream)
-    if xform_select <= xform_prob:
-        bra.uni XFORM_N_LBL
-
-    XFORM_1_LBL:
-    stream_load xform_1_ (cp_stream)
-    ...
-    bra.uni XFORM_POST
-
-    XFORM_POST:
-    [if final_xform:]
-    [do final_xform]
-
-    if num_samples < 0:
-        # FUSE still in progress
-        bra.uni FUSE_START
-
-    FRAGMENT_WRITEBACK:
-    # Unknown at this time.
-
-    SHUFFLE:
-    # Unknown at this time.
-
-    load num_samples from num_samples_sh
-    num_samples -= 1
-    if num_samples > 0:
-        bra.uni ITER_LOOP_START
-
-
-[1] Tracking 'badvals' can put a pretty large hit on performance, particularly
-    for images that sample a small amount of the grid. So this might be cut
-    when rendering for performance. On the other hand, it might actually help
-    tune the algorithm later, so it'll definitely be an option.
-
-[2] Control points for each temporal sample will be preloaded to the
-    device in the compact DataStream format (more on this later). Their
-    locations are represented in an index table, which starts with a single
-    `.u32 length`, followed by `length` pointers. To avoid having to keep
-    reloading `length`, or worse, using a register to hold it in memory, we
-    instead count *down* to zero. This is a very common idiom.
-
-[3] 'qlocal' is quasi-local storage. it could easily be actual local storage,
-    depending on how local storage is implemented, but the extra 128-byte loads
-    for such values might make a performance difference. qlocal variables may
-    be identical across a warp or even a CTA, and so variables noted as
-    "qlocal" here might end up in shared memory or even a small per-warp or
-    per-CTA buffer in global memory created specifically for this purpose,
-    after benchmarking is done.
-
-[4] DataStreams are "opaque" data serialization structures defined below. The
-    structure of a stream is actually created while parsing the DSL by the load
-    statements themselves. Some benchmarks need to be done before DataStreams
-    stop being "opaque" and become simply "dynamic".
-"""
-
 class IterThread(PTXTest):
     entry_name = 'iter_thread'
     entry_params = []
 
+    def __init__(self):
+        self.cps_uploaded = False
+
     def deps(self):
         return [MWCRNG, CPDataStream]
 
     @ptx_func
     def module_setup(self):
         mem.global_.u32('g_cp_array',
-                        [features.max_ntemporal_samples,'*',cp_stream_size])
+                        cp_stream_size*features.max_ntemporal_samples)
         mem.global_.u32('g_num_cps')
         # TODO move into debug statement
         mem.global_.u32('g_num_rounds', ctx.threads)
         mem.global_.u32('g_num_writes', ctx.threads)
 
     @ptx_func
-    def entry():
+    def entry(self):
         reg.f32('x_coord y_coord color_coord alpha_coord')
 
         # TODO: temporary, for testing
@@ -158,8 +66,8 @@ class IterThread(PTXTest):
         op.mov.s32(num_samples, -(features.num_fuse_samples+1))
 
         # TODO: Move cp_num to qlocal storage (or spill it, rarely accessed)
-        reg.u32('cp_num cpA')
-        mov.u32(cp_num, 0)
+        reg.u32('cp_idx cpA')
+        op.mov.u32(cp_idx, 0)
 
         label('cp_loop_start')
         op.bar.sync(0)
@@ -168,19 +76,19 @@ class IterThread(PTXTest):
             reg.u32('num_cps')
             reg.pred('p_last_cp')
             op.ldu.u32(num_cps, addr(g_num_cps))
-            op.setp.lt.u32(p_last_cp, cp_num, num_cps)
+            op.setp.ge.u32(p_last_cp, cp_idx, num_cps)
             op.bra.uni('all_cps_done', ifp=p_last_cp)
 
         with block('Load CP address'):
             op.mov.u32(cpA, g_cp_array)
-            op.mad.lo.u32(cpA, cp_num, cp_stream_size, cpA)
+            op.mad.lo.u32(cpA, cp_idx, cp_stream_size, cpA)
 
-        with block('Increment CP number, load num_samples (unless in fuse)'):
-            reg.pred('p_in_fuse')
-            op.setp.lt.s32(p_in_fuse, num_samples, 0)
-            op.add.u32(cp_num, cp_num, 1, ifp=p_in_fuse)
-            cp_stream_get(cpA, num_samples, 'cp.samples_per_thread',
-                          ifp=p_in_fuse)
+        with block('Increment CP index, load num_samples (unless in fuse)'):
+            reg.pred('p_not_in_fuse')
+            op.setp.ge.s32(p_not_in_fuse, num_samples, 0)
+            op.add.u32(cp_idx, cp_idx, 1, ifp=p_not_in_fuse)
+            cp_stream_get(cpA, num_samples, 'samples_per_thread',
+                          ifp=p_not_in_fuse)
 
         label('fuse_loop_start')
         with block('FUSE-specific stuff'):
@@ -188,7 +96,7 @@ class IterThread(PTXTest):
             comment('If num_samples == -1, set it to 0 and jump back up')
             comment('This will start the normal CP loading machinery')
             op.setp.eq.s32(p_fuse, num_samples, -1)
-            op.mov.s32(p_fuse, 0, ifp=p_fuse)
+            op.mov.s32(num_samples, 0, ifp=p_fuse)
             op.bra.uni(cp_loop_start, ifp=p_fuse)
 
             comment('If num_samples < -1, still fusing, so increment')
@@ -204,33 +112,55 @@ class IterThread(PTXTest):
         with block("Test if we're still in FUSE"):
             reg.pred('p_in_fuse')
             op.setp.lt.s32(p_in_fuse, num_samples, 0)
-            op.bra.uni(fuse_start, ifp=p_in_fuse)
+            op.bra.uni(fuse_loop_start, ifp=p_in_fuse)
 
         with block("Ordinarily, we'd write the result here"):
             op.add.u32(num_writes, num_writes, 1)
 
         with block("Check to see if we're done with this CP"):
             reg.pred('p_cp_done')
+            op.add.s32(num_samples, num_samples, -1)
             op.setp.eq.s32(p_cp_done, num_samples, 0)
             op.bra.uni(cp_loop_start, ifp=p_cp_done)
 
         op.bra.uni(iter_loop_start)
 
+        label('all_cps_done')
         # TODO this is for testing, move it to a debug statement
         store_per_thread(g_num_rounds, num_rounds)
         store_per_thread(g_num_writes, num_writes)
 
-    def call(self, ctx):
-        raise HorribleDeathError("Okay I'm going to bed now")
+    def upload_cp_stream(self, ctx, cp_stream, num_cps):
+        cp_array_dp, cp_array_l = ctx.mod.get_global('g_cp_array')
+        assert len(cp_stream) <= cp_array_l, "Stream too big!"
+        cuda.memcpy_htod_async(cp_array_dp, cp_stream)
+        num_cps_dp, num_cps_l = ctx.mod.get_global('g_num_cps')
+        cuda.memcpy_htod_async(num_cps_dp, struct.pack('i', num_cps))
+        self.cps_uploaded = True
+
+    def call(self, ctx):
+        if not self.cps_uploaded:
+            raise Error("Cannot call IterThread before uploading CPs")
+        func = ctx.mod.get_function('iter_thread')
+        dtime = func(block=ctx.block, grid=ctx.grid, time_kernel=True)
+
+        num_rounds_dp, num_rounds_l = ctx.mod.get_global('g_num_rounds')
+        num_writes_dp, num_writes_l = ctx.mod.get_global('g_num_writes')
+        rounds = cuda.from_device(num_rounds_dp, ctx.threads, np.uint32)
+        writes = cuda.from_device(num_writes_dp, ctx.threads, np.uint32)
+        print "Rounds:", rounds
+        print "Writes:", writes
 
 class MWCRNG(PTXFragment):
     def __init__(self):
+        self.rand = np.random
         self.threads_ready = 0
         if not os.path.isfile('primes.bin'):
             raise EnvironmentError('primes.bin not found')
 
+    def set_seed(self, seed):
+        self.rand = np.random.mtrand.RandomState(seed)
+
     @ptx_func
     def module_setup(self):
         mem.global_.u32('mwc_rng_mults', ctx.threads)
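The former call() stub is split into an upload step and a launch step. Mirroring what render_frame() does later in this commit, the host-side round trip reduces to:

    iter_thread = ctx.ptx.instances[IterThread]
    iter_thread.upload_cp_stream(ctx, cp_stream, num_cps)  # copies into g_cp_array / g_num_cps
    iter_thread.call(ctx)                                  # launches the 'iter_thread' kernel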
@@ -284,13 +214,13 @@ class MWCRNG(PTXFragment):
         # Randomness in choosing multipliers is good, but larger multipliers
         # have longer periods, which is also good. This is a compromise.
         mults = np.array(mults[:ctx.threads*4])
-        ctx.rand.shuffle(mults)
+        self.rand.shuffle(mults)
         # Copy multipliers and seeds to the device
         multdp, multl = ctx.mod.get_global('mwc_rng_mults')
         cuda.memcpy_htod_async(multdp, mults.tostring()[:multl])
         # Intentionally excludes both 0 and (2^32-1), as they can lead to
         # degenerate sequences of period 0
-        states = np.array(ctx.rand.randint(1, 0xffffffff, size=2*ctx.threads),
+        states = np.array(self.rand.randint(1, 0xffffffff, size=2*ctx.threads),
                           dtype=np.uint32)
         statedp, statel = ctx.mod.get_global('mwc_rng_state')
         cuda.memcpy_htod_async(statedp, states.tostring())
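For orientation (this is not part of the commit): mwc_rng_state holds two 32-bit words per thread and mwc_rng_mults one multiplier, which is the usual multiply-with-carry layout. A host-side sketch of a single MWC step, assuming that (state, carry) layout:

    def mwc_next(mult, state, carry):
        # 64-bit product plus carry; the low word becomes the new state,
        # the high word becomes the new carry
        t = mult * state + carry
        return t & 0xffffffff, t >> 32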
@@ -376,7 +306,7 @@ class MWCRNGTest(PTXTest):
 class CameraCoordTransform(PTXFragment):
     pass
 
-class CPDataStream(PTXFragment):
+class CPDataStream(DataStream):
     """DataStream which stores the control points."""
     prefix = 'cp'
 
cuburnlib/ptx.py (125 lines changed)
@@ -11,7 +11,7 @@ easier to maintain using this system.
 # If you see 'import inspect', you know you're in for a good time
 import inspect
 import types
-import traceback
+import struct
 from cStringIO import StringIO
 from collections import namedtuple
 
@@ -116,6 +116,8 @@ class _BlockInjector(object):
         self.dead = False
         map(self.inject, self.to_inject.items())
     def __exit__(self, exc_type, exc_val, tb):
+        # Do some real exceptorin'
+        if exc_type is not None: return
         for k in self.injected:
             del self.inject_into[k]
         self.dead = True
@@ -137,17 +139,27 @@ class _Block(object):
         inj = self.stack[-1].injectors
         [inj.remove(i) for i in inj if i.dead]
     def push_ctx(self):
-        # Move most recent active injector to new context
         self.clean_injectors()
-        last_inj = self.stack[-1].injectors.pop()
-        self.stack.append(BlockCtx(dict(self.stack[-1].locals), [],
-                          [last_inj]))
+        self.stack.append(BlockCtx(dict(self.stack[-1].locals), [], []))
+        # The only reason we should have no injectors in the previous block is
+        # if we are hitting a new ptx_func entry point or global declaration at
+        # PTX module scope, which means the stack only contains the outer
+        # context and the current one (i.e. len(stack) == 2)
+        if len(self.stack[-2].injectors) == 0:
+            assert len(self.stack) == 2, "Empty injector list too early!"
+        # Otherwise, the active injector in the previous block is the one for
+        # the Python function which is currently creating a new PTX block, and
+        # and it needs to be promoted to the current block
+        else:
+            self.stack[-1].injectors.append(self.stack[-2].injectors.pop())
     def pop_ctx(self):
         self.clean_injectors()
         bs = self.stack.pop()
+        # TODO: figure out why this next line is needed
+        [bs.injectors.remove(i) for i in bs.injectors if i.dead]
         self.stack[-1].code.extend(bs.code)
         if len(self.stack) == 1:
-            # We're on outer_ctx, so all injectors should be gone
+            # We're on outer_ctx, so all injectors should be gone.
             assert len(bs.injectors) == 0, "Injector/context mismatch"
             return
         # The only injector should be the one added in push_ctx
@@ -186,7 +198,7 @@ class _Block(object):
         spacing. To keep things simple, nested lists and tuples will be reduced
         in this manner (but not other iterable types). Coercion will not happen
         until after the entire DSL call tree has been walked. This allows a
-        class to submit a mutable type (e.g. the trivial `StrVar`) when first
+        class to submit a mutable type (e.g. ``DelayVar``) when first
         walked with an undefined value, then substitute the correct value on
         being finalized.
 
@@ -196,14 +208,23 @@ class _Block(object):
         """
         self.stack[-1].code.append(PTXStmt(prefix, op, vars, semi, indent))
 
-class StrVar(object):
+class DelayVar(object):
     """
     Trivial wrapper to allow deferred variable substitution.
     """
     def __init__(self, val=None):
         self.val = val
     def __str__(self):
-        return str(val)
+        return str(self.val)
+    def __mul__(self, other):
+        # Oh this is truly egregious
+        return DelayVarProxy(self, "self.other.val*" + str(other))
+
+class DelayVarProxy(object):
+    def __init__(self, other, expr):
+        self.other, self.expr = other, expr
+    def __str__(self):
+        return str(eval(self.expr))
 
 class _PTXFuncWrapper(object):
     """Enables ptx_func"""
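A minimal sketch (not in the commit) of how the renamed DelayVar is meant to behave: it can be embedded in PTX statements while its value is still unknown, then filled in before the formatter stringifies the code, with DelayVarProxy handling the deferred arithmetic.

    size = DelayVar("not_yet_determined")
    stmt = ['ldu.b32', 'rval', size]    # emitted while the value is unknown
    size.val = 48                       # later, e.g. in DataStream.finalize_code()
    print str(size), str(size * 4)      # -> 48 192 (the proxy evaluates lazily)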
@@ -298,6 +319,9 @@ class Block(object):
             self.block.code(op=['// ', self.comment], semi=False)
             self.comment = None
     def __exit__(self, exc_type, exc_value, tb):
+        # Allow exceptions to be propagated; things get really messy if we try
+        # to pop the stack if things aren't ordered correctly
+        if exc_type is not None: return
         self.block.code(indent=-1)
         self.block.code(op='}', semi=False)
         self.block.pop_ctx()
@@ -370,12 +394,14 @@ class Op(_CallChain):
     """
     def _call(self, op, *args, **kwargs):
         pred = ''
-        if 'ifp' in kwargs:
-            if 'ifnotp' in kwargs:
+        ifp = kwargs.get('ifp')
+        ifnotp = kwargs.get('ifnotp')
+        if ifp:
+            if ifnotp:
                 raise SyntaxError("can't use both, fool")
-            pred = ['@', kwargs['ifp']]
-        if 'ifnotp' in kwargs:
-            pred = ['@!', kwargs['ifnotp']]
+            pred = ['@', ifp]
+        if ifnotp:
+            pred = ['@!', ifnotp]
         self.block.code(pred, '.'.join(op), _softjoin(args, ','))
 
 class Mem(object):
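The ifp/ifnotp keywords become PTX guard predicates: the prefix list ['@', ifp] (or ['@!', ifnotp]) is emitted ahead of the opcode, so a DSL call such as

    op.bra.uni('all_cps_done', ifp=p_last_cp)

renders roughly as "@p_last_cp bra.uni all_cps_done;" in the generated source.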
@@ -421,7 +447,7 @@ class Mem(object):
         >>> op.st.global.v2.u32(addr(areg), vec(reg1, reg2))
         >>> op.ld.global.v2.u32(vec(reg1, reg2), addr(areg, 8))
         """
-        return ['[', areg, aoffset and '+' or '', aoffset, ']']
+        return ['[', areg, aoffset is not '' and '+' or '', aoffset, ']']
 
 class _MemFactory(_CallChain):
     """Actual `mem` object"""
@@ -538,8 +564,8 @@ class PTXFragment(object):
         """
         Called after running all PTX DSL functions, but before code generation,
         to allow fragments which postponed variable evaluation (e.g. using
-        `StrVar`) to fill in the resulting values. Most fragments should not
-        use this.
+        ``DelayVar``) to fill in the resulting values. Most fragments should
+        not use this.
 
         If implemented, this function *may* use an @ptx_func decorator to
         access the global DSL scope, but pretty please don't emit any code
@@ -796,6 +822,13 @@ class PTXModule(object):
             raise ValueError("Too many recompiles scheduled!")
         self.__needs_recompilation = True
 
+    def print_source(self):
+        if not hasattr(self, 'source'):
+            raise ValueError("Not assembled yet!")
+        print '\n'.join(["%03d %s" % (i+1, l) for (i, l) in
+                         enumerate(self.source.split('\n'))])
+
+
 def _flatten(val):
     if isinstance(val, (list, tuple)):
         return ''.join(map(_flatten, val))
@@ -806,7 +839,7 @@ class PTXFormatter(object):
     Formats PTXStmt items into beautiful code. Well, the beautiful part is
     postponed for now.
     """
-    def __init__(self, indent_amt=2, oplen_max=20, varlen_max=12):
+    def __init__(self, indent_amt=4, oplen_max=20, varlen_max=12):
         self.idamt, self.opm, self.vm = indent_amt, oplen_max, varlen_max
     def format(self, code):
         out = []
@@ -844,7 +877,7 @@ class PTXFormatter(object):
 _TExp = namedtuple('_TExp', 'type exprlist')
 _DataCell = namedtuple('_DataCell', 'offset size texp')
 
-class DataStream(object):
+class DataStream(PTXFragment):
     """
     Simple interface between Python and PTX, designed to create and tightly
     pack control structs.
@@ -914,19 +947,19 @@ class DataStream(object):
         self.cells = []
         self.stream_size = 0
         self.free = {}
-        self.size_strvar = StrVar("not_yet_determined")
+        self.size_delayvars = []
 
     _types = dict(s8='b', u8='B', s16='h', u16='H', s32='i', u32='I', f32='f',
                   s64='l', u64='L', f64='d')
-    def _get_type(self, *regs):
+    def _get_type(self, regs):
         size = int(regs[0].type[1:])
-        for r in regs:
+        for reg in regs:
             if reg.type not in self._types:
                 raise TypeError("Register %s of type %s not supported" %
                                 (reg.name, reg.type))
-            if int(r.type[1:]) != size:
+            if int(reg.type[1:]) != size:
                 raise TypeError("Can't vector-load different size regs")
-        return size, ''.join([self._types.get(r.type) for r in regs])
+        return size/8, ''.join([self._types.get(r.type) for r in regs])
 
     def _alloc(self, vsize, texp):
         # A really crappy allocator. May later include optimizations for
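With this change _get_type() returns the register width in bytes plus a struct format string, which is what pack_into() ultimately hands to struct.pack(). A hypothetical example, assuming one stream_get call requests two .f32 registers:

    import struct
    # two .f32 registers -> (4, 'ff'); the host side then packs matching values
    data = struct.pack('ff', 1.5, 2.5)   # 8 bytes appended to the control-point stream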
@@ -939,7 +972,7 @@ class DataStream(object):
         if idx is None:
             # No aligned free cells, allocate a new `align`-byte free cell
             assert alloc not in self.free
-            self.free[alloc] = idx = len(self.stream_size)
+            self.free[alloc] = idx = len(self.cells)
             self.cells.append(_DataCell(self.stream_size, alloc, None))
             self.stream_size += alloc
         # Overwrite the free cell at `idx` with texp
@@ -958,27 +991,28 @@ class DataStream(object):
             self.cells.insert(fidx, _DataCell(foffset, fsize, None))
             foffset += fsize
             self.free[fsize] = fidx
+            fsize *= 2
         # Adjust indexes. This is ugly, but evidently unavoidable
         if fidx-idx:
-            for k, v in filter(lambda k, v: v > idx, self.free.items()):
+            for k, v in filter(lambda (k, v): v > idx, self.free.items()):
                 self.free[k] = v+(fidx-idx)
-        return self.offset
+        return offset
 
     @ptx_func
     def _stream_get_internal(self, areg, dregs, exprs, ifp, ifnotp):
         size, type = self._get_type(dregs)
         vsize = size * len(dregs)
-        texp = _TExp(type, [expr])
-        if texp in self.expr_map:
+        texp = _TExp(type, tuple(exprs))
+        if texp in self.texp_map:
             offset = self.texp_map[texp]
         else:
             offset = self._alloc(vsize, texp)
             self.texp_map[texp] = offset
-        vtype = {1: '', 2: '.v2', 4: '.v4'}.get(len(dregs))
-        if len(dregs) > 0:
+        opname = ['ldu', 'b%d' % (size*8)]
+        if len(dregs) > 1:
+            opname.insert(1, 'v%d' % len(dregs))
             dregs = vec(dregs)
-        op._call('ldu%s.b%d' % (vtype, size), dregs, addr(areg+off),
-                 ifp=ifp, ifnotp=ifnotp)
+        op._call(opname, dregs, addr(areg, offset), ifp=ifp, ifnotp=ifnotp)
 
     @ptx_func
     def _stream_get(self, areg, dreg, expr, ifp=None, ifnotp=None):
@@ -991,16 +1025,20 @@ class DataStream(object):
                                   ifp, ifnotp)
 
     @ptx_func
-    def _stream_get_v2(self, areg, d1, e1, d2, e2, d3, e3, d4, e4,
+    def _stream_get_v4(self, areg, d1, e1, d2, e2, d3, e3, d4, e4,
                        ifp=None, ifnotp=None):
         self._stream_get_internal(areg, [d1, d2, d3, d4], [e1, e2, e3, e4],
                                   ifp, ifnotp)
 
+    @property
     def _stream_size(self):
-        return self.size_strvar
+        x = DelayVar("not_yet_determined")
+        self.size_delayvars.append(x)
+        return x
 
     def finalize_code(self):
-        self.size_strvar.val = str(self.stream_size)
+        for dv in self.size_delayvars:
+            dv.val = self.stream_size
 
     def to_inject(self):
         return {self.prefix + '_stream_get': self._stream_get,
@@ -1039,9 +1077,20 @@ class DataStream(object):
         for offset, size, texp in self.cells:
             if texp:
                 type = texp.type
-                vals = [eval(e, globals(), kwargs) for e in texp.expr_list]
+                vals = [eval(e, globals(), kwargs) for e in texp.exprlist]
             else:
                 type = 'x'*size # Padding bytes
                 vals = []
-            out.write(struct.pack(type, *vals))
+            outfile.write(struct.pack(type, *vals))
+
+    def print_record(self):
+        for cell in self.cells:
+            if cell.texp is None:
+                print '%3d %2d --' % (cell.offset, cell.size)
+                continue
+            print '%3d %2d %4s %s' % (cell.offset, cell.size, cell.texp.type,
+                                      cell.texp.exprlist[0])
+            for exp in cell.texp.exprlist[1:]:
+                print '%12s %s' % ('', exp)
+
 
@@ -1,12 +1,62 @@
 
 from ctypes import *
+from cStringIO import StringIO
 import numpy as np
-from fr0stlib.pyflam3 import Genome, Frame
+from fr0stlib import pyflam3
 from fr0stlib.pyflam3._flam3 import *
 from fr0stlib.pyflam3.constants import *
 
+from cuburnlib.cuda import LaunchContext
+from cuburnlib.device_code import IterThread, CPDataStream
+
 Point = lambda x, y: np.array([x, y], dtype=np.double)
 
+class Genome(pyflam3.Genome):
+    pass
+
+class Frame(pyflam3.Frame):
+    def interpolate(self, time, cp):
+        flam3_interpolate(self.genomes, self.ngenomes, time, 0, byref(cp))
+
+    def pack_stream(self, ctx, time):
+        """
+        Pack and return the control point data stream to render this frame.
+        """
+        # Get the central control point, and calculate parameters that change
+        # once per frame
+        cp = BaseGenome()
+        self.interpolate(time, cp)
+        self.filt = Filters(self, cp)
+        rw = cp.spatial_oversample * cp.width + 2 * self.filt.gutter
+        rh = cp.spatial_oversample * cp.height + 2 * self.filt.gutter
+
+        # Interpolate each time step, calculate per-step variables, and pack
+        # into the stream
+        cp_streamer = ctx.ptx.instances[CPDataStream]
+        stream = StringIO()
+        print "Data stream contents:"
+        cp_streamer.print_record()
+        tcp = BaseGenome()
+        for batch_idx in range(cp.nbatches):
+            for time_idx in range(cp.ntemporal_samples):
+                idx = time_idx + batch_idx * cp.nbatches
+                cp_time = time + self.filt.temporal_deltas[idx]
+                self.interpolate(time, tcp)
+                tcp.camera = Camera(self, tcp, self.filt)
+
+                # TODO: figure out which object to pack this into
+                nsamples = ((tcp.camera.sample_density * cp.width * cp.height) /
+                            (cp.nbatches * cp.ntemporal_samples))
+                samples_per_thread = nsamples / ctx.threads + 15
+
+                cp_streamer.pack_into(stream,
+                                      frame=self,
+                                      cp=tcp,
+                                      cp_idx=idx,
+                                      samples_per_thread=samples_per_thread)
+        stream.seek(0)
+        return (stream.read(), cp.nbatches * cp.ntemporal_samples)
+
 class Animation(object):
     """
     Control structure for rendering a series of frames.
@@ -31,46 +81,46 @@ class Animation(object):
             memmove(byref(self.genomes[i]), byref(genomes[i]),
                     sizeof(BaseGenome))
 
-        self._frame = Frame()
-        self._frame.genomes = cast(self.genomes, POINTER(BaseGenome))
-        self._frame.ngenomes = len(genomes)
+        self.features = Features(genomes)
+        self.frame = Frame()
+        self.frame.genomes = cast(self.genomes, POINTER(BaseGenome))
+        self.frame.ngenomes = len(genomes)
+
+        self.ctx = None
+
+    def compile(self):
+        """
+        Create a PTX kernel optimized for this animation, compile it, and
+        attach it to a LaunchContext with a thread distribution optimized for
+        the active device.
+        """
+        # TODO: user-configurable test control
+        self.ctx = LaunchContext([IterThread], block=(256,1,1), grid=(54,1),
+                                 tests=True)
+        # TODO: user-configurable verbosity control
+        self.ctx.compile(verbose=3, anim=self, features=self.features)
+        # TODO: automatic optimization of block parameters
 
     def render_frame(self, time=0):
         # TODO: support more nuanced frame control than just 'time'
         # TODO: reuse more information between frames
         # TODO: allow animation-long override of certain parameters (size, etc)
+        cp_stream, num_cps = self.frame.pack_stream(self.ctx, time)
+        iter_thread = self.ctx.ptx.instances[IterThread]
+        iter_thread.upload_cp_stream(self.ctx, cp_stream, num_cps)
+        iter_thread.call(self.ctx)
 
-        cp = BaseGenome()
-        flam3_interpolate(self.frame.genomes, len(self.genomes), time, 0,
-                          byref(cp))
-        filt = Filters(self.frame, cp)
-        rw = cp.spatial_oversample * cp.width + 2 * filt.gutter
-        rh = cp.spatial_oversample * cp.height + 2 * filt.gutter
-
-        # Allocate buckets, accumulator
-        # Loop over all batches:
-        #   [density estimation]
-        #   Loop over all temporal samples:
-        #     Color scalar = temporal filter at index
-        #     Interpolate and get control point
-        #     Precalculate
-        #     Prepare xforms
-        #     Compute colormap
-        #     Run iterations
-        #   Accumulate vibrancy, gamma, background
-        # Calculate k1, k2
-        # If not DE, then do log filtering to accumulator
-        # Else, [density estimation]
-        # Do final clip and filter
-
-        # For now:
-        # Loop over all batches:
-        #   Loop over all temporal samples:
-        #     Interpolate and get control point
-        #     Read the
-        #     Dump noise into buckets
-        #   Do log filtering to accumulator
-        #   Do simplified final clip
+class Features(object):
+    """
+    Determine features and constants required to render a particular set of
+    genomes. The values of this class are fixed before compilation begins.
+    """
+    # Constant; number of rounds spent fusing points on first CP of a frame
+    num_fuse_samples = 25
+
+    def __init__(self, genomes):
+        self.max_ntemporal_samples = max(
+            [cp.nbatches * cp.ntemporal_samples for cp in genomes]) + 1
 
 class Filters(object):
     def __init__(self, frame, cp):
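Features.max_ntemporal_samples is what sizes g_cp_array in IterThread.module_setup(). A small illustrative calculation with hypothetical genome values:

    # e.g. every genome has nbatches=1, ntemporal_samples=27
    max_nts = max(cp.nbatches * cp.ntemporal_samples for cp in genomes) + 1   # 28
    cp_array_bytes = max_nts * cp_stream_size   # global memory reserved for packed CPs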
@@ -115,7 +165,7 @@ class Camera(object):
         scale = 2.0 ** cp.zoom
         self.sample_density = cp.sample_density * scale * scale
 
-        center = Point(cp.center[0], cp.center[1])
+        center = Point(cp._center[0], cp._center[1])
         size = Point(cp.width, cp.height)
         # pix per unit, where 'unit' is '1.0' in IFS space
         self.ppu = Point(
@@ -129,4 +179,3 @@ class Camera(object):
         self.ifs_space_size = 1.0 / (self.upper_bounds - self.lower_bounds)
         # TODO: coordinate transforms in concert with GPU (rotation, size)
 
-
main.py (5 lines changed)
@@ -25,15 +25,14 @@ def main(args):
     verbose = 1
     if '-d' in args:
         verbose = 3
-    ctx = LaunchContext([IterThread], block=(256,1,1), grid=(64,1), tests=True)
-    ctx.compile(verbose=verbose)
-    ctx.run_tests()
 
     with open(args[-1]) as fp:
         genomes = Genome.from_string(fp.read())
     anim = Animation(genomes)
+    anim.compile()
     anim.render_frame()
 
 
     #genome.width, genome.height = 512, 512
     #genome.sample_density = 1000
     #obuf, stats, frame = genome.render(estimator=3)