From f3298e0bed71126205fb4e2f87c0fb3b8d7cb94d Mon Sep 17 00:00:00 2001
From: Steven Robertson <steven@strobe.cc>
Date: Mon, 6 Sep 2010 11:18:20 -0400
Subject: [PATCH] Finally runs again

---
 cuburnlib/cuda.py        |  22 ++----
 cuburnlib/device_code.py | 164 +++++++++++----------------------------
 cuburnlib/ptx.py         | 125 ++++++++++++++++++++---------
 cuburnlib/render.py      | 123 ++++++++++++++++++++---------
 main.py                  |   5 +-
 5 files changed, 230 insertions(+), 209 deletions(-)

diff --git a/cuburnlib/cuda.py b/cuburnlib/cuda.py
index 2aedc43..7acc673 100644
--- a/cuburnlib/cuda.py
+++ b/cuburnlib/cuda.py
@@ -15,7 +15,8 @@ from cuburnlib.ptx import PTXModule
 class LaunchContext(object):
     """
     Context collecting the information needed to create, run, and gather the
-    results of a device computation.
+    results of a device computation. This may eventually also include an actual
+    CUDA context, but for now it just uses the global one.
 
     To create the fastest device code across multiple device families, this
     context may decide to iteratively refine the final PTX by regenerating
@@ -32,34 +33,27 @@ class LaunchContext(object):
         `mod`:      Final compiled module. Unavailable during assembly.
 
     """
-    def __init__(self, entries, block=(1,1,1), grid=(1,1), seed=None,
-                 tests=False):
+    def __init__(self, entries, block=(1,1,1), grid=(1,1), tests=False):
         self.entry_types = entries
         self.block, self.grid, self.build_tests = block, grid, tests
-        self.rand = np.random.mtrand.RandomState(seed)
         self.setup_done = False
 
     @property
     def threads(self):
         return reduce(lambda a, b: a*b, self.block + self.grid)
 
-    def print_source(self):
-        print '\n'.join(["%03d %s" % (i+1, l) for (i, l) in
-                        enumerate(self.ptx.source.split('\n'))])
-
-    def compile(self, to_inject={}, verbose=False):
-        inj = dict(to_inject)
-        inj['ctx'] = self
-        self.ptx = PTXModule(self.entry_types, inj, self.build_tests)
+    def compile(self, verbose=False, **kwargs):
+        kwargs['ctx'] = self
+        self.ptx = PTXModule(self.entry_types, kwargs, self.build_tests)
         try:
             self.mod = cuda.module_from_buffer(self.ptx.source)
         except (cuda.CompileError, cuda.RuntimeError), e:
             print "Aww, dang, compile error. Here's the source:"
-            self.print_source()
+            self.ptx.print_source()
             raise e
         if verbose:
             if verbose >= 3:
-                self.print_source()
+                self.ptx.print_source()
             for entry in self.ptx.entries:
                 func = self.mod.get_function(entry.entry_name)
                 print "Compiled %s: used %d regs, %d sm, %d local" % (
diff --git a/cuburnlib/device_code.py b/cuburnlib/device_code.py
index e5bec0b..8401fcf 100644
--- a/cuburnlib/device_code.py
+++ b/cuburnlib/device_code.py
@@ -4,126 +4,34 @@ Contains the PTX fragments which will drive the device.
 
 import os
 import time
+import struct
 
 import pycuda.driver as cuda
 import numpy as np
 
 from cuburnlib.ptx import *
 
-"""
-Here's the current draft of the full algorithm implementation.
-
-declare xform jump table
-
-load random state
-
-clear x_coord, y_coord, z_coord, w_coord;
-store -(FUSE+1) to shared (per-warp) num_samples_sh
-clear badvals [1]
-
-load param (global_cp_idx_addr)
-index table start (global_cp_idx) [2]
-load count of indexes from global cp index =>
-    store to qlocal current_cp_num [3]
-
-outermost loop start:
-    load current_cp_num
-    if current_cp_num <= 0:
-        exit
-
-    load param global_cp_idx_addr
-    calculate offset into address with current_cp_num, global_cp_idx_addr
-    load cp_base_address
-    stream_start (cp_base, cp_base_addr) [4]
-
-FUSE_START:
-    num_samples += 1
-    if num_samples >= 0:
-        # Okay, we're done FUSEing, prepare to enter normal loop
-        load num_samples => store to shared (per-warp) num_samples
-
-
-ITER_LOOP_START:
-        reg xform_addr, xform_stream_addr, xform_select
-
-        mwc_next_u32 to xform_select
-        # Performance test: roll/unroll this loop?
-        stream_load xform_prob (cp_stream)
-        if xform_select <= xform_prob:
-            bra.uni XFORM_1_LBL
-        ...
-        stream_load xform_prob (cp_stream)
-        if xform_select <= xform_prob:
-            bra.uni XFORM_N_LBL
-
-XFORM_1_LBL:
-        stream_load xform_1_ (cp_stream)
-        ...
-        bra.uni XFORM_POST
-
-XFORM_POST:
-        [if final_xform:]
-            [do final_xform]
-
-        if num_samples < 0:
-            # FUSE still in progress
-            bra.uni FUSE_START
-
-FRAGMENT_WRITEBACK:
-        # Unknown at this time.
-
-SHUFFLE:
-        # Unknown at this time.
-
-        load num_samples from num_samples_sh
-        num_samples -= 1
-        if num_samples > 0:
-            bra.uni ITER_LOOP_START
-
-
-[1] Tracking 'badvals' can put a pretty large hit on performance, particularly
-    for images that sample a small amount of the grid. So this might be cut
-    when rendering for performance. On the other hand, it might actually help
-    tune the algorithm later, so it'll definitely be an option.
-
-[2] Control points for each temporal sample will be preloaded to the
-    device in the compact DataStream format (more on this later). Their
-    locations are represented in an index table, which starts with a single
-    `.u32 length`, followed by `length` pointers. To avoid having to keep
-    reloading `length`, or worse, using a register to hold it in memory, we
-    instead count *down* to zero. This is a very common idiom.
-
-[3] 'qlocal' is quasi-local storage. it could easily be actual local storage,
-    depending on how local storage is implemented, but the extra 128-byte loads
-    for such values might make a performance difference. qlocal variables may
-    be identical across a warp or even a CTA, and so variables noted as
-    "qlocal" here might end up in shared memory or even a small per-warp or
-    per-CTA buffer in global memory created specifically for this purpose,
-    after benchmarking is done.
-
-[4] DataStreams are "opaque" data serialization structures defined below.  The
-    structure of a stream is actually created while parsing the DSL by the load
-    statements themselves. Some benchmarks need to be done before DataStreams
-    stop being "opaque" and become simply "dynamic".
-"""
-
 class IterThread(PTXTest):
     entry_name = 'iter_thread'
     entry_params = []
+
+    def __init__(self):
+        self.cps_uploaded = False
+
     def deps(self):
         return [MWCRNG, CPDataStream]
 
     @ptx_func
     def module_setup(self):
         mem.global_.u32('g_cp_array',
-                [features.max_ntemporal_samples,'*',cp_stream_size])
+                        cp_stream_size*features.max_ntemporal_samples)
         mem.global_.u32('g_num_cps')
         # TODO move into debug statement
         mem.global_.u32('g_num_rounds', ctx.threads)
         mem.global_.u32('g_num_writes', ctx.threads)
 
     @ptx_func
-    def entry():
+    def entry(self):
         reg.f32('x_coord y_coord color_coord alpha_coord')
 
         # TODO: temporary, for testing
@@ -158,8 +66,8 @@ class IterThread(PTXTest):
         op.mov.s32(num_samples, -(features.num_fuse_samples+1))
 
         # TODO: Move cp_num to qlocal storage (or spill it, rarely accessed)
-        reg.u32('cp_num cpA')
-        mov.u32(cp_num, 0)
+        reg.u32('cp_idx cpA')
+        op.mov.u32(cp_idx, 0)
 
         label('cp_loop_start')
         op.bar.sync(0)
@@ -168,19 +76,19 @@ class IterThread(PTXTest):
             reg.u32('num_cps')
             reg.pred('p_last_cp')
             op.ldu.u32(num_cps, addr(g_num_cps))
-            op.setp.lt.u32(p_last_cp, cp_num, num_cps)
+            op.setp.ge.u32(p_last_cp, cp_idx, num_cps)
             op.bra.uni('all_cps_done', ifp=p_last_cp)
 
         with block('Load CP address'):
             op.mov.u32(cpA, g_cp_array)
-            op.mad.lo.u32(cpA, cp_num, cp_stream_size, cpA)
+            op.mad.lo.u32(cpA, cp_idx, cp_stream_size, cpA)
 
-        with block('Increment CP number, load num_samples (unless in fuse)'):
-            reg.pred('p_in_fuse')
-            op.setp.lt.s32(p_in_fuse, num_samples, 0)
-            op.add.u32(cp_num, cp_num, 1, ifp=p_in_fuse)
-            cp_stream_get(cpA, num_samples, 'cp.samples_per_thread',
-                          ifp=p_in_fuse)
+        with block('Increment CP index, load num_samples (unless in fuse)'):
+            reg.pred('p_not_in_fuse')
+            op.setp.ge.s32(p_not_in_fuse, num_samples, 0)
+            op.add.u32(cp_idx, cp_idx, 1, ifp=p_not_in_fuse)
+            cp_stream_get(cpA, num_samples, 'samples_per_thread',
+                          ifp=p_not_in_fuse)
 
         label('fuse_loop_start')
         with block('FUSE-specific stuff'):
@@ -188,7 +96,7 @@ class IterThread(PTXTest):
             comment('If num_samples == -1, set it to 0 and jump back up')
             comment('This will start the normal CP loading machinery')
             op.setp.eq.s32(p_fuse, num_samples, -1)
-            op.mov.s32(p_fuse, 0, ifp=p_fuse)
+            op.mov.s32(num_samples, 0, ifp=p_fuse)
             op.bra.uni(cp_loop_start, ifp=p_fuse)
 
             comment('If num_samples < -1, still fusing, so increment')
@@ -204,33 +112,55 @@ class IterThread(PTXTest):
         with block("Test if we're still in FUSE"):
             reg.pred('p_in_fuse')
             op.setp.lt.s32(p_in_fuse, num_samples, 0)
-            op.bra.uni(fuse_start, ifp=p_in_fuse)
+            op.bra.uni(fuse_loop_start, ifp=p_in_fuse)
 
         with block("Ordinarily, we'd write the result here"):
             op.add.u32(num_writes, num_writes, 1)
 
         with block("Check to see if we're done with this CP"):
             reg.pred('p_cp_done')
+            op.add.s32(num_samples, num_samples, -1)
             op.setp.eq.s32(p_cp_done, num_samples, 0)
             op.bra.uni(cp_loop_start, ifp=p_cp_done)
 
         op.bra.uni(iter_loop_start)
 
-
+        label('all_cps_done')
         # TODO this is for testing, move it to a debug statement
         store_per_thread(g_num_rounds, num_rounds)
         store_per_thread(g_num_writes, num_writes)
 
-    def call(self, ctx):
-        raise HorribleDeathError("Okay I'm going to bed now")
+    def upload_cp_stream(self, ctx, cp_stream, num_cps):
+        cp_array_dp, cp_array_l = ctx.mod.get_global('g_cp_array')
+        assert len(cp_stream) <= cp_array_l, "Stream too big!"
+        cuda.memcpy_htod_async(cp_array_dp, cp_stream)
+        num_cps_dp, num_cps_l = ctx.mod.get_global('g_num_cps')
+        cuda.memcpy_htod_async(num_cps_dp, struct.pack('i', num_cps))
+        self.cps_uploaded = True
 
+    def call(self, ctx):
+        if not self.cps_uploaded:
+            raise Error("Cannot call IterThread before uploading CPs")
+        func = ctx.mod.get_function('iter_thread')
+        dtime = func(block=ctx.block, grid=ctx.grid, time_kernel=True)
+
+        num_rounds_dp, num_rounds_l = ctx.mod.get_global('g_num_rounds')
+        num_writes_dp, num_writes_l = ctx.mod.get_global('g_num_writes')
+        rounds = cuda.from_device(num_rounds_dp, ctx.threads, np.uint32)
+        writes = cuda.from_device(num_writes_dp, ctx.threads, np.uint32)
+        print "Rounds:", rounds
+        print "Writes:", writes
 
 class MWCRNG(PTXFragment):
     def __init__(self):
+        self.rand = np.random
         self.threads_ready = 0
         if not os.path.isfile('primes.bin'):
             raise EnvironmentError('primes.bin not found')
 
+    def set_seed(self, seed):
+        self.rand = np.random.mtrand.RandomState(seed)
+
     @ptx_func
     def module_setup(self):
         mem.global_.u32('mwc_rng_mults', ctx.threads)
@@ -284,13 +214,13 @@ class MWCRNG(PTXFragment):
         # Randomness in choosing multipliers is good, but larger multipliers
         # have longer periods, which is also good. This is a compromise.
         mults = np.array(mults[:ctx.threads*4])
-        ctx.rand.shuffle(mults)
+        self.rand.shuffle(mults)
         # Copy multipliers and seeds to the device
         multdp, multl = ctx.mod.get_global('mwc_rng_mults')
         cuda.memcpy_htod_async(multdp, mults.tostring()[:multl])
         # Intentionally excludes both 0 and (2^32-1), as they can lead to
         # degenerate sequences of period 0
-        states = np.array(ctx.rand.randint(1, 0xffffffff, size=2*ctx.threads),
+        states = np.array(self.rand.randint(1, 0xffffffff, size=2*ctx.threads),
                           dtype=np.uint32)
         statedp, statel = ctx.mod.get_global('mwc_rng_state')
         cuda.memcpy_htod_async(statedp, states.tostring())
@@ -376,7 +306,7 @@ class MWCRNGTest(PTXTest):
 class CameraCoordTransform(PTXFragment):
     pass
 
-class CPDataStream(PTXFragment):
+class CPDataStream(DataStream):
     """DataStream which stores the control points."""
     prefix = 'cp'
 
diff --git a/cuburnlib/ptx.py b/cuburnlib/ptx.py
index 71ba80a..ffdf075 100644
--- a/cuburnlib/ptx.py
+++ b/cuburnlib/ptx.py
@@ -11,7 +11,7 @@ easier to maintain using this system.
 # If you see 'import inspect', you know you're in for a good time
 import inspect
 import types
-import traceback
+import struct
 from cStringIO import StringIO
 from collections import namedtuple
 
@@ -116,6 +116,8 @@ class _BlockInjector(object):
         self.dead = False
         map(self.inject, self.to_inject.items())
     def __exit__(self, exc_type, exc_val, tb):
+        # Let exceptions propagate; skip cleanup of the injected names
+        if exc_type is not None: return
         for k in self.injected:
             del self.inject_into[k]
         self.dead = True
@@ -137,17 +139,27 @@ class _Block(object):
         inj = self.stack[-1].injectors
         [inj.remove(i) for i in inj if i.dead]
     def push_ctx(self):
-        # Move most recent active injector to new context
         self.clean_injectors()
-        last_inj = self.stack[-1].injectors.pop()
-        self.stack.append(BlockCtx(dict(self.stack[-1].locals), [],
-                          [last_inj]))
+        self.stack.append(BlockCtx(dict(self.stack[-1].locals), [], []))
+        # The only reason we should have no injectors in the previous block is
+        # if we are hitting a new ptx_func entry point or global declaration at
+        # PTX module scope, which means the stack only contains the outer
+        # context and the current one (i.e. len(stack) == 2)
+        if len(self.stack[-2].injectors) == 0:
+            assert len(self.stack) == 2, "Empty injector list too early!"
+        # Otherwise, the active injector in the previous block is the one for
+        # the Python function which is currently creating a new PTX block, and
+        # it needs to be promoted to the current block
+        else:
+            self.stack[-1].injectors.append(self.stack[-2].injectors.pop())
     def pop_ctx(self):
         self.clean_injectors()
         bs = self.stack.pop()
+        # TODO: figure out why this next line is needed
+        [bs.injectors.remove(i) for i in bs.injectors if i.dead]
         self.stack[-1].code.extend(bs.code)
         if len(self.stack) == 1:
-            # We're on outer_ctx, so all injectors should be gone
+            # We're on outer_ctx, so all injectors should be gone.
             assert len(bs.injectors) == 0, "Injector/context mismatch"
             return
         # The only injector should be the one added in push_ctx
@@ -186,7 +198,7 @@ class _Block(object):
         spacing. To keep things simple, nested lists and tuples will be reduced
         in this manner (but not other iterable types). Coercion will not happen
         until after the entire DSL call tree has been walked. This allows a
-        class to submit a mutable type (e.g. the trivial `StrVar`) when first
+        class to submit a mutable type (e.g. ``DelayVar``) when first
         walked with an undefined value, then substitute the correct value on
         being finalized.
 
@@ -196,14 +208,23 @@ class _Block(object):
         """
         self.stack[-1].code.append(PTXStmt(prefix, op, vars, semi, indent))
 
-class StrVar(object):
+class DelayVar(object):
     """
     Trivial wrapper to allow deferred variable substitution.
     """
     def __init__(self, val=None):
         self.val = val
     def __str__(self):
-        return str(val)
+        return str(self.val)
+    def __mul__(self, other):
+        # HACK: builds an expression string that DelayVarProxy eval()s later
+        return DelayVarProxy(self, "self.other.val*" + str(other))
+
+class DelayVarProxy(object):
+    def __init__(self, other, expr):
+        self.other, self.expr = other, expr
+    def __str__(self):
+        return str(eval(self.expr))
 
 class _PTXFuncWrapper(object):
     """Enables ptx_func"""
@@ -298,6 +319,9 @@ class Block(object):
             self.block.code(op=['// ', self.comment], semi=False)
         self.comment = None
     def __exit__(self, exc_type, exc_value, tb):
+        # Allow exceptions to be propagated; things get really messy if we try
+        # to pop the stack if things aren't ordered correctly
+        if exc_type is not None: return
         self.block.code(indent=-1)
         self.block.code(op='}', semi=False)
         self.block.pop_ctx()
@@ -370,12 +394,14 @@ class Op(_CallChain):
     """
     def _call(self, op, *args, **kwargs):
         pred = ''
-        if 'ifp' in kwargs:
-            if 'ifnotp' in kwargs:
+        ifp = kwargs.get('ifp')
+        ifnotp = kwargs.get('ifnotp')
+        if ifp:
+            if ifnotp:
                 raise SyntaxError("can't use both, fool")
-            pred = ['@', kwargs['ifp']]
-        if 'ifnotp' in kwargs:
-            pred = ['@!', kwargs['ifnotp']]
+            pred = ['@', ifp]
+        if ifnotp:
+            pred = ['@!', ifnotp]
         self.block.code(pred, '.'.join(op), _softjoin(args, ','))
 
 class Mem(object):
@@ -421,7 +447,7 @@ class Mem(object):
         >>> op.st.global.v2.u32(addr(areg), vec(reg1, reg2))
         >>> op.ld.global.v2.u32(vec(reg1, reg2), addr(areg, 8))
         """
-        return ['[', areg, aoffset and '+' or '', aoffset, ']']
+        return ['[', areg, aoffset is not '' and '+' or '', aoffset, ']']
 
 class _MemFactory(_CallChain):
     """Actual `mem` object"""
@@ -538,8 +564,8 @@ class PTXFragment(object):
         """
         Called after running all PTX DSL functions, but before code generation,
         to allow fragments which postponed variable evaluation (e.g. using
-        `StrVar`) to fill in the resulting values. Most fragments should not
-        use this.
+        ``DelayVar``) to fill in the resulting values. Most fragments should
+        not use this.
 
         If implemented, this function *may* use an @ptx_func decorator to
         access the global DSL scope, but pretty please don't emit any code
@@ -796,6 +822,13 @@ class PTXModule(object):
                 raise ValueError("Too many recompiles scheduled!")
             self.__needs_recompilation = True
 
+    def print_source(self):
+        if not hasattr(self, 'source'):
+            raise ValueError("Not assembled yet!")
+        print '\n'.join(["%03d %s" % (i+1, l) for (i, l) in
+                        enumerate(self.source.split('\n'))])
+
+
 def _flatten(val):
     if isinstance(val, (list, tuple)):
         return ''.join(map(_flatten, val))
@@ -806,7 +839,7 @@ class PTXFormatter(object):
     Formats PTXStmt items into beautiful code. Well, the beautiful part is
     postponed for now.
     """
-    def __init__(self, indent_amt=2, oplen_max=20, varlen_max=12):
+    def __init__(self, indent_amt=4, oplen_max=20, varlen_max=12):
         self.idamt, self.opm, self.vm = indent_amt, oplen_max, varlen_max
     def format(self, code):
         out = []
@@ -844,7 +877,7 @@ class PTXFormatter(object):
 _TExp = namedtuple('_TExp', 'type exprlist')
 _DataCell = namedtuple('_DataCell', 'offset size texp')
 
-class DataStream(object):
+class DataStream(PTXFragment):
     """
     Simple interface between Python and PTX, designed to create and tightly
     pack control structs.
@@ -914,19 +947,19 @@ class DataStream(object):
         self.cells = []
         self.stream_size = 0
         self.free = {}
-        self.size_strvar = StrVar("not_yet_determined")
+        self.size_delayvars = []
 
     _types = dict(s8='b', u8='B', s16='h', u16='H', s32='i', u32='I', f32='f',
                   s64='l', u64='L', f64='d')
-    def _get_type(self, *regs):
+    def _get_type(self, regs):
         size = int(regs[0].type[1:])
-        for r in regs:
+        for reg in regs:
             if reg.type not in self._types:
                 raise TypeError("Register %s of type %s not supported" %
                                 (reg.name, reg.type))
-            if int(r.type[1:]) != size:
+            if int(reg.type[1:]) != size:
                 raise TypeError("Can't vector-load different size regs")
-        return size, ''.join([self._types.get(r.type) for r in regs])
+        return size/8, ''.join([self._types.get(r.type) for r in regs])
 
     def _alloc(self, vsize, texp):
         # A really crappy allocator. May later include optimizations for
@@ -939,7 +972,7 @@ class DataStream(object):
         if idx is None:
             # No aligned free cells, allocate a new `align`-byte free cell
             assert alloc not in self.free
-            self.free[alloc] = idx = len(self.stream_size)
+            self.free[alloc] = idx = len(self.cells)
             self.cells.append(_DataCell(self.stream_size, alloc, None))
             self.stream_size += alloc
         # Overwrite the free cell at `idx` with texp
@@ -958,27 +991,28 @@ class DataStream(object):
                 self.cells.insert(fidx, _DataCell(foffset, fsize, None))
                 foffset += fsize
                 self.free[fsize] = fidx
+            fsize *= 2
         # Adjust indexes. This is ugly, but evidently unavoidable
         if fidx-idx:
-            for k, v in filter(lambda k, v: v > idx, self.free.items()):
+            for k, v in filter(lambda (k, v): v > idx, self.free.items()):
                 self.free[k] = v+(fidx-idx)
-        return self.offset
+        return offset
 
     @ptx_func
     def _stream_get_internal(self, areg, dregs, exprs, ifp, ifnotp):
         size, type = self._get_type(dregs)
         vsize = size * len(dregs)
-        texp = _TExp(type, [expr])
-        if texp in self.expr_map:
+        texp = _TExp(type, tuple(exprs))
+        if texp in self.texp_map:
             offset = self.texp_map[texp]
         else:
             offset = self._alloc(vsize, texp)
             self.texp_map[texp] = offset
-        vtype = {1: '', 2: '.v2', 4: '.v4'}.get(len(dregs))
-        if len(dregs) > 0:
+        opname = ['ldu', 'b%d' % (size*8)]
+        if len(dregs) > 1:
+            opname.insert(1, 'v%d' % len(dregs))
             dregs = vec(dregs)
-        op._call('ldu%s.b%d' % (vtype, size), dregs, addr(areg+off),
-                 ifp=ifp, ifnotp=ifnotp)
+        op._call(opname, dregs, addr(areg, offset), ifp=ifp, ifnotp=ifnotp)
 
     @ptx_func
     def _stream_get(self, areg, dreg, expr, ifp=None, ifnotp=None):
@@ -991,16 +1025,20 @@ class DataStream(object):
                                   ifp, ifnotp)
 
     @ptx_func
-    def _stream_get_v2(self, areg, d1, e1, d2, e2, d3, e3, d4, e4,
+    def _stream_get_v4(self, areg, d1, e1, d2, e2, d3, e3, d4, e4,
                        ifp=None, ifnotp=None):
         self._stream_get_internal(areg, [d1, d2, d3, d4], [e1, e2, e3, e4],
                                   ifp, ifnotp)
 
+    @property
     def _stream_size(self):
-        return self.size_strvar
+        x = DelayVar("not_yet_determined")
+        self.size_delayvars.append(x)
+        return x
 
     def finalize_code(self):
-        self.size_strvar.val = str(self.stream_size)
+        for dv in self.size_delayvars:
+            dv.val = self.stream_size
 
     def to_inject(self):
         return {self.prefix + '_stream_get': self._stream_get,
@@ -1039,9 +1077,20 @@ class DataStream(object):
         for offset, size, texp in self.cells:
             if texp:
                 type = texp.type
-                vals = [eval(e, globals(), kwargs) for e in texp.expr_list]
+                vals = [eval(e, globals(), kwargs) for e in texp.exprlist]
             else:
                 type = 'x'*size # Padding bytes
                 vals = []
-            out.write(struct.pack(type, *vals))
+            outfile.write(struct.pack(type, *vals))
+
+    def print_record(self):
+        for cell in self.cells:
+            if cell.texp is None:
+                print '%3d %2d --' % (cell.offset, cell.size)
+                continue
+            print '%3d %2d %4s %s' % (cell.offset, cell.size, cell.texp.type,
+                                      cell.texp.exprlist[0])
+            for exp in cell.texp.exprlist[1:]:
+                print '%12s %s' % ('', exp)
+
 
diff --git a/cuburnlib/render.py b/cuburnlib/render.py
index db7932c..11ebf1a 100644
--- a/cuburnlib/render.py
+++ b/cuburnlib/render.py
@@ -1,12 +1,62 @@
-
 from ctypes import *
+from cStringIO import StringIO
 import numpy as np
-from fr0stlib.pyflam3 import Genome, Frame
+
+from fr0stlib import pyflam3
 from fr0stlib.pyflam3._flam3 import *
 from fr0stlib.pyflam3.constants import *
 
+from cuburnlib.cuda import LaunchContext
+from cuburnlib.device_code import IterThread, CPDataStream
+
 Point = lambda x, y: np.array([x, y], dtype=np.double)
 
+class Genome(pyflam3.Genome):
+    pass
+
+class Frame(pyflam3.Frame):
+    def interpolate(self, time, cp):
+        flam3_interpolate(self.genomes, self.ngenomes, time, 0, byref(cp))
+
+    def pack_stream(self, ctx, time):
+        """
+        Pack and return the control point data stream to render this frame.
+        """
+        # Get the central control point, and calculate parameters that change
+        # once per frame
+        cp = BaseGenome()
+        self.interpolate(time, cp)
+        self.filt = Filters(self, cp)
+        rw = cp.spatial_oversample * cp.width  + 2 * self.filt.gutter
+        rh = cp.spatial_oversample * cp.height + 2 * self.filt.gutter
+
+        # Interpolate each time step, calculate per-step variables, and pack
+        # into the stream
+        cp_streamer = ctx.ptx.instances[CPDataStream]
+        stream = StringIO()
+        print "Data stream contents:"
+        cp_streamer.print_record()
+        tcp = BaseGenome()
+        for batch_idx in range(cp.nbatches):
+            for time_idx in range(cp.ntemporal_samples):
+                idx = time_idx + batch_idx * cp.nbatches
+                cp_time = time + self.filt.temporal_deltas[idx]
+                self.interpolate(time, tcp)
+                tcp.camera = Camera(self, tcp, self.filt)
+
+                # TODO: figure out which object to pack this into
+                nsamples = ((tcp.camera.sample_density * cp.width * cp.height) /
+                            (cp.nbatches * cp.ntemporal_samples))
+                samples_per_thread = nsamples / ctx.threads + 15
+
+                cp_streamer.pack_into(stream,
+                        frame=self,
+                        cp=tcp,
+                        cp_idx=idx,
+                        samples_per_thread=samples_per_thread)
+        stream.seek(0)
+        return (stream.read(), cp.nbatches * cp.ntemporal_samples)
+
 class Animation(object):
     """
     Control structure for rendering a series of frames.
@@ -31,46 +81,46 @@ class Animation(object):
             memmove(byref(self.genomes[i]), byref(genomes[i]),
                     sizeof(BaseGenome))
 
-        self._frame = Frame()
-        self._frame.genomes = cast(self.genomes, POINTER(BaseGenome))
-        self._frame.ngenomes = len(genomes)
+        self.features = Features(genomes)
+        self.frame = Frame()
+        self.frame.genomes = cast(self.genomes, POINTER(BaseGenome))
+        self.frame.ngenomes = len(genomes)
+
+        self.ctx = None
+
+    def compile(self):
+        """
+        Create a PTX kernel optimized for this animation, compile it, and
+        attach it to a LaunchContext with a thread distribution optimized for
+        the active device.
+        """
+        # TODO: user-configurable test control
+        self.ctx = LaunchContext([IterThread], block=(256,1,1), grid=(54,1),
+                                 tests=True)
+        # TODO: user-configurable verbosity control
+        self.ctx.compile(verbose=3, anim=self, features=self.features)
+        # TODO: automatic optimization of block parameters
 
     def render_frame(self, time=0):
         # TODO: support more nuanced frame control than just 'time'
         # TODO: reuse more information between frames
         # TODO: allow animation-long override of certain parameters (size, etc)
+        cp_stream, num_cps = self.frame.pack_stream(self.ctx, time)
+        iter_thread = self.ctx.ptx.instances[IterThread]
+        iter_thread.upload_cp_stream(self.ctx, cp_stream, num_cps)
+        iter_thread.call(self.ctx)
 
-        cp = BaseGenome()
-        flam3_interpolate(self.frame.genomes, len(self.genomes), time, 0,
-                          byref(cp))
-        filt = Filters(self.frame, cp)
-        rw = cp.spatial_oversample * cp.width  + 2 * filt.gutter
-        rh = cp.spatial_oversample * cp.height + 2 * filt.gutter
+class Features(object):
+    """
+    Determine features and constants required to render a particular set of
+    genomes. The values of this class are fixed before compilation begins.
+    """
+    # Constant; number of rounds spent fusing points on first CP of a frame
+    num_fuse_samples = 25
 
-        # Allocate buckets, accumulator
-        # Loop over all batches:
-        #   [density estimation]
-        #   Loop over all temporal samples:
-        #     Color scalar = temporal filter at index
-        #     Interpolate and get control point
-        #     Precalculate
-        #     Prepare xforms
-        #     Compute colormap
-        #     Run iterations
-        #     Accumulate vibrancy, gamma, background
-        #   Calculate k1, k2
-        #   If not DE, then do log filtering to accumulator
-        #   Else, [density estimation]
-        # Do final clip and filter
-
-        # For now:
-        # Loop over all batches:
-        #   Loop over all temporal samples:
-        #     Interpolate and get control point
-        #     Read the
-        #     Dump noise into buckets
-        #   Do log filtering to accumulator
-        # Do simplified final clip
+    def __init__(self, genomes):
+        self.max_ntemporal_samples = max(
+                [cp.nbatches * cp.ntemporal_samples for cp in genomes]) + 1
 
 class Filters(object):
     def __init__(self, frame, cp):
@@ -115,7 +165,7 @@ class Camera(object):
         scale = 2.0 ** cp.zoom
         self.sample_density = cp.sample_density * scale * scale
 
-        center = Point(cp.center[0], cp.center[1])
+        center = Point(cp._center[0], cp._center[1])
         size = Point(cp.width, cp.height)
         # pix per unit, where 'unit' is '1.0' in IFS space
         self.ppu = Point(
@@ -129,4 +179,3 @@ class Camera(object):
         self.ifs_space_size = 1.0 / (self.upper_bounds - self.lower_bounds)
         # TODO: coordinate transforms in concert with GPU (rotation, size)
 
-
diff --git a/main.py b/main.py
index 2d12287..4e4c4b8 100644
--- a/main.py
+++ b/main.py
@@ -25,15 +25,14 @@ def main(args):
     verbose = 1
     if '-d' in args:
         verbose = 3
-    ctx = LaunchContext([IterThread], block=(256,1,1), grid=(64,1), tests=True)
-    ctx.compile(verbose=verbose)
-    ctx.run_tests()
 
     with open(args[-1]) as fp:
         genomes = Genome.from_string(fp.read())
     anim = Animation(genomes)
+    anim.compile()
     anim.render_frame()
 
+
     #genome.width, genome.height = 512, 512
     #genome.sample_density = 1000
     #obuf, stats, frame = genome.render(estimator=3)