Rename "cuburnlib" (stupid) to "cuburn" (stupid but shorter)

--HG--
rename : cuburnlib/__init__.py => cuburn/__init__.py
rename : cuburnlib/cuda.py => cuburn/cuda.py
rename : cuburnlib/device_code.py => cuburn/device_code.py
rename : cuburnlib/ptx.py => cuburn/ptx.py
rename : cuburnlib/render.py => cuburn/render.py
Steven Robertson, 2010-09-10 14:48:34 -04:00
parent 4552589b35
commit 36f1c1c056
7 changed files with 10 additions and 10 deletions

cuburn/__init__.py (new file, 0 lines)

cuburn/cuda.py (new file, 105 lines)
@@ -0,0 +1,105 @@
# These imports are order-sensitive!
import pyglet
import pyglet.gl as gl
gl.get_current_context()
import pycuda.driver as cuda
import pycuda.tools
import pycuda.gl as cudagl
import pycuda.gl.autoinit
import numpy as np
from cuburn.ptx import PTXModule, PTXTest, PTXTestFailure
class LaunchContext(object):
"""
Context collecting the information needed to create, run, and gather the
results of a device computation. This may eventually also include an actual
CUDA context, but for now it just uses the global one.
To create the fastest device code across multiple device families, this
context may decide to iteratively refine the final PTX by regenerating
and recompiling it several times to optimize certain parameters of the
launch, such as the distribution of threads throughout the device.
The launch properties which are tuned are listed below. Any PTX
fragments which use this information must emit valid PTX for any state
given below, but the PTX is only required to actually run with the final,
fixed values of all tuned parameters.
`block`: 3-tuple of (x,y,z); dimensions of each CTA.
`grid`: 2-tuple of (x,y); dimensions of the grid of CTAs.
`threads`: Number of active threads on device as a whole.
`mod`: Final compiled module. Unavailable during assembly.
"""
def __init__(self, entries, block=(1,1,1), grid=(1,1), tests=False):
self.entry_types = entries
self.block, self.grid, self.build_tests = block, grid, tests
self.setup_done = False
@property
def threads(self):
return reduce(lambda a, b: a*b, self.block + self.grid)
@property
def ctas(self):
return self.grid[0] * self.grid[1]
@property
def threads_per_cta(self):
return self.block[0] * self.block[1] * self.block[2]
@property
def warps_per_cta(self):
return self.threads_per_cta / 32
def compile(self, verbose=False, **kwargs):
kwargs['ctx'] = self
self.ptx = PTXModule(self.entry_types, kwargs, self.build_tests)
try:
self.mod = cuda.module_from_buffer(self.ptx.source)
except (cuda.CompileError, cuda.RuntimeError), e:
print "Aww, dang, compile error. Here's the source:"
self.ptx.print_source()
raise
if verbose:
if verbose >= 3:
self.ptx.print_source()
for entry in self.ptx.entries:
func = self.mod.get_function(entry.entry_name)
print "Compiled %s: used %d regs, %d sm, %d local" % (
entry.entry_name, func.num_regs,
func.shared_size_bytes, func.local_size_bytes)
def call_setup(self, entry_inst):
for inst in self.ptx.entry_deps[type(entry_inst)]:
inst.call_setup(self)
def call_teardown(self, entry_inst):
okay = True
for inst in reversed(self.ptx.entry_deps[type(entry_inst)]):
if inst is entry_inst and isinstance(entry_inst, PTXTest):
try:
inst.call_teardown(self)
except PTXTestFailure, e:
print "PTX Test %s failed!" % inst.entry_name, e
okay = False
else:
inst.call_teardown(self)
return okay
def run_tests(self):
if not self.ptx.tests:
print "No tests to run."
return True
all_okay = True
for test in self.ptx.tests:
cuda.Context.synchronize()
if test.call(self):
print "Test %s passed." % test.entry_name
else:
print "Test %s FAILED." % test.entry_name
all_okay = False
return all_okay
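For orientation, the launch arithmetic above reduces to plain products. A minimal host-side sketch (hypothetical block/grid values, no CUDA context required) of what ``threads``, ``ctas``, and ``warps_per_cta`` evaluate to:

# Sketch only; the block/grid values here are hypothetical examples.
block = (256, 1, 1)   # threads per CTA, as (x, y, z)
grid = (54, 1)        # CTAs in the grid, as (x, y)
threads_per_cta = block[0] * block[1] * block[2]   # 256
ctas = grid[0] * grid[1]                           # 54
threads = threads_per_cta * ctas                   # 13824 active threads
warps_per_cta = threads_per_cta / 32               # 8 warps of 32 threads
print "threads=%d ctas=%d warps/cta=%d" % (threads, ctas, warps_per_cta)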

cuburn/device_code.py (new file, 557 lines)
@@ -0,0 +1,557 @@
"""
Contains the PTX fragments which will drive the device.
"""
import os
import time
import struct
import pycuda.driver as cuda
import numpy as np
from cuburn.ptx import *
class IterThread(PTXEntryPoint):
entry_name = 'iter_thread'
entry_params = []
def __init__(self):
self.cps_uploaded = False
def deps(self):
return [MWCRNG, CPDataStream, HistScatter]
@ptx_func
def module_setup(self):
mem.global_.u32('g_cp_array',
cp.stream_size*features.max_ntemporal_samples)
mem.global_.u32('g_num_cps')
mem.global_.u32('g_num_cps_started')
# TODO move into debug statement
mem.global_.u32('g_num_rounds', ctx.threads)
mem.global_.u32('g_num_writes', ctx.threads)
@ptx_func
def entry(self):
# For now, we indulge in the luxury of shared memory.
# Index number of current CP, shared across CTA
mem.shared.u32('s_cp_idx')
# Number of samples that have been generated so far in this CTA
# If this number is negative, we're still fusing points, so this
# behaves slightly differently (see ``fuse_loop_start``)
mem.shared.u32('s_num_samples')
op.st.shared.u32(addr(s_num_samples), -(features.num_fuse_samples+1))
# TODO: temporary, for testing
reg.u32('num_rounds num_writes')
op.mov.u32(num_rounds, 0)
op.mov.u32(num_writes, 0)
reg.f32('x_coord y_coord color_coord')
mwc.next_f32_11(x_coord)
mwc.next_f32_11(y_coord)
mwc.next_f32_01(color_coord)
comment("Ensure all init is done")
op.bar.sync(0)
label('cp_loop_start')
reg.u32('cp_idx cpA')
with block("Claim a CP"):
std.set_is_first_thread(reg.pred('p_is_first'))
op.atom.add.u32(cp_idx, addr(g_num_cps_started), 1, ifp=p_is_first)
op.st.shared.u32(addr(s_cp_idx), cp_idx, ifp=p_is_first)
op.st.shared.u32(addr(s_num_samples), 0, ifp=p_is_first)
comment("Load the CP index in all threads")
op.bar.sync(1)
op.ld.shared.u32(cp_idx, addr(s_cp_idx))
with block("Check to see if this CP is valid (if not, we're done)"):
reg.u32('num_cps')
reg.pred('p_last_cp')
op.ldu.u32(num_cps, addr(g_num_cps))
op.setp.ge.u32(p_last_cp, cp_idx, num_cps)
op.bra.uni('all_cps_done', ifp=p_last_cp)
with block('Load CP address'):
op.mov.u32(cpA, g_cp_array)
op.mad.lo.u32(cpA, cp_idx, cp.stream_size, cpA)
label('fuse_loop_start')
# When fusing, num_samples holds the (negative) number of iterations
# left across the CP, rather than the number of samples in total.
with block("If still fusing, increment count unconditionally"):
std.set_is_first_thread(reg.pred('p_is_first'))
op.red.shared.add.s32(addr(s_num_samples), 1, ifp=p_is_first)
op.bar.sync(2)
label('iter_loop_start')
comment('Do... well, most of everything')
mwc.next_f32_11(x_coord)
mwc.next_f32_11(y_coord)
mwc.next_f32_01(color_coord)
op.add.u32(num_rounds, num_rounds, 1)
with block("Test if we're still in FUSE"):
reg.s32('num_samples')
reg.pred('p_in_fuse')
op.ld.shared.s32(num_samples, addr(s_num_samples))
op.setp.lt.s32(p_in_fuse, num_samples, 0)
op.bra.uni(fuse_loop_start, ifp=p_in_fuse)
reg.pred('p_point_is_valid')
with block("Write the result"):
hist.scatter(x_coord, y_coord, color_coord, 0, p_point_is_valid)
op.add.u32(num_writes, num_writes, 1, ifp=p_point_is_valid)
with block("Increment number of samples by number of good values"):
reg.b32('good_samples laneid')
reg.pred('p_is_first')
op.vote.ballot.b32(good_samples, p_point_is_valid)
op.popc.b32(good_samples, good_samples)
op.mov.u32(laneid, '%laneid')
op.setp.eq.u32(p_is_first, laneid, 0)
op.red.shared.add.s32(addr(s_num_samples), good_samples,
ifp=p_is_first)
with block("Check to see if we're done with this CP"):
reg.pred('p_cp_done')
reg.s32('num_samples num_samples_needed')
op.ld.shared.s32(num_samples, addr(s_num_samples))
cp.get(cpA, num_samples_needed, 'cp.nsamples')
op.setp.ge.s32(p_cp_done, num_samples, num_samples_needed)
op.bra.uni(cp_loop_start, ifp=p_cp_done)
op.bra.uni(iter_loop_start)
label('all_cps_done')
# TODO this is for testing, move it to a debug statement
std.store_per_thread(g_num_rounds, num_rounds)
std.store_per_thread(g_num_writes, num_writes)
@instmethod
def upload_cp_stream(self, ctx, cp_stream, num_cps):
cp_array_dp, cp_array_l = ctx.mod.get_global('g_cp_array')
assert len(cp_stream) <= cp_array_l, "Stream too big!"
cuda.memcpy_htod_async(cp_array_dp, cp_stream)
num_cps_dp, num_cps_l = ctx.mod.get_global('g_num_cps')
cuda.memset_d32(num_cps_dp, num_cps, 1)
# TODO: "if debug >= 3"
print "Uploaded stream to card:"
CPDataStream.print_record(ctx, cp_stream, 5)
self.cps_uploaded = True
def call_setup(self, ctx):
if not self.cps_uploaded:
raise ValueError("Cannot call IterThread before uploading CPs")
num_cps_st_dp, num_cps_st_l = ctx.mod.get_global('g_num_cps_started')
cuda.memset_d32(num_cps_st_dp, 0, 1)
def _call(self, ctx, func):
# Get texture reference from the Palette
# TODO: more elegant method than reaching into ctx.ptx?
tr = ctx.ptx.instances[PaletteLookup].texref
super(IterThread, self)._call(ctx, func, texrefs=[tr])
def call_teardown(self, ctx):
shape = (ctx.grid[0], ctx.block[0]/32, 32)
num_rounds_dp, num_rounds_l = ctx.mod.get_global('g_num_rounds')
num_writes_dp, num_writes_l = ctx.mod.get_global('g_num_writes')
rounds = cuda.from_device(num_rounds_dp, shape, np.int32)
writes = cuda.from_device(num_writes_dp, shape, np.int32)
print "Rounds:", sum(rounds)
print "Writes:", sum(writes)
print rounds
print writes
class CameraTransform(PTXFragment):
shortname = 'camera'
def deps(self):
return [CPDataStream]
@ptx_func
def rotate(self, rotated_x, rotated_y, x, y):
"""
Rotate an IFS-space coordinate as defined by the camera.
"""
if features.camera_rotation:
assert rotated_x.name != x.name and rotated_y.name != y.name
with block("Rotate %s, %s to camera alignment" % (x, y)):
reg.f32('rot_center_x rot_center_y')
cp.get_v2(cpA, rot_center_x, 'cp.rot_center[0]',
rot_center_y, 'cp.rot_center[1]')
op.sub.f32(x, x, rot_center_x)
op.sub.f32(y, y, rot_center_y)
reg.f32('rot_sin_t rot_cos_t rot_old_x rot_old_y')
cp.get_v2(cpA, rot_cos_t, 'cos(cp.rotate * 2 * pi / 360.)',
rot_sin_t, '-sin(cp.rotate * 2 * pi / 360.)')
comment('rotated_x = x * cos(t) - y * sin(t) + rot_center_x')
op.fma.rn.f32(rotated_x, x, rot_cos_t, rot_center_x)
op.fma.rn.f32(rotated_x, y, rot_sin_t, rotated_x)
op.neg.f32(rot_sin_t, rot_sin_t)
comment('rotated_y = x * sin(t) + y * cos(t) + rot_center_y')
op.fma.rn.f32(rotated_y, x, rot_sin_t, rot_center_y)
op.fma.rn.f32(rotated_y, y, rot_cos_t, rotated_y)
# TODO: if this is a register-critical section, reloading
# rot_center_[xy] here should save two regs. OTOH, if this is
# *not* reg-crit, moving the subtraction above to new variables
# may save a few clocks
op.add.f32(x, x, rot_center_x)
op.add.f32(y, y, rot_center_y)
else:
comment("No camera rotation in this kernel")
op.mov.f32(rotated_x, x)
op.mov.f32(rotated_y, y)
@ptx_func
def get_norm(self, norm_x, norm_y, x, y):
"""
Find the [0,1]-normalized floating-point histogram coordinates
``norm_x, norm_y`` from the given IFS-space coordinates ``x, y``.
"""
self.rotate(norm_x, norm_y, x, y)
with block("Scale rotated points to [0,1]-normalized coordinates"):
reg.f32('cam_scale cam_offset')
cp.get_v2(cpA, cam_scale, 'cp.camera.norm_scale[0]',
cam_offset, 'cp.camera.norm_offset[0]')
op.fma.rn.f32(norm_x, norm_x, cam_scale, cam_offset)
cp.get_v2(cpA, cam_scale, 'cp.camera.norm_scale[1]',
cam_offset, 'cp.camera.norm_offset[1]')
op.fma.rn.f32(norm_y, norm_y, cam_scale, cam_offset)
@ptx_func
def get_index(self, index, x, y, pred=None):
"""
Find the histogram index (as a u32) from the IFS spatial coordinate in
``x, y``.
If the coordinates are out of bounds, 0xffffffff will be stored to
``index``. If ``pred`` is given, it will be set if the point is valid,
and cleared if not.
"""
# A few instructions could probably be shaved off of this one
with block("Find histogram index"):
reg.f32('norm_x norm_y')
self.rotate(norm_x, norm_y, x, y)
comment('Scale and offset from IFS to index coordinates')
reg.f32('cam_scale cam_offset')
cp.get_v2(cpA, cam_scale, 'cp.camera.idx_scale[0]',
cam_offset, 'cp.camera.idx_offset[0]')
op.fma.rn.f32(norm_x, norm_x, cam_scale, cam_offset)
cp.get_v2(cpA, cam_scale, 'cp.camera.idx_scale[1]',
cam_offset, 'cp.camera.idx_offset[1]')
op.fma.rn.f32(norm_y, norm_y, cam_scale, cam_offset)
comment('Check for bad value')
reg.u32('index_x index_y')
if not pred:
pred = reg.pred('p_valid')
op.cvt.rzi.s32.f32(index_x, norm_x)
op.setp.ge.s32(pred, index_x, 0)
op.setp.lt.and_.s32(pred, index_x, features.hist_width, pred)
op.cvt.rzi.s32.f32(index_y, norm_y)
op.setp.ge.and_.s32(pred, index_y, 0, pred)
op.setp.lt.and_.s32(pred, index_y, features.hist_height, pred)
op.mad.lo.u32(index, index_y, features.hist_stride, index_x)
op.mov.u32(index, 0xffffffff, ifnotp=pred)
class PaletteLookup(PTXFragment):
shortname = "palette"
# Resolution of texture on device. Bigger = more palette rez, maybe slower
texheight = 16
def __init__(self):
self.texref = None
def deps(self):
return [CPDataStream]
@ptx_func
def module_setup(self):
mem.global_.texref('t_palette')
@ptx_func
def look_up(self, r, g, b, a, color, norm_time):
"""
Look up the values of ``r, g, b, a`` corresponding to ``color`` at the
CP time given by ``norm_time``. Note that both ``color`` and
``norm_time`` should be [0,1]-normalized floats.
"""
op.tex._2d.v4.f32.f32(vec(r, g, b, a),
addr([t_palette, ', ', vec(norm_time, color)]))
if features.non_box_temporal_filter:
raise NotImplementedError("Non-box temporal filters not supported")
@instmethod
def upload_palette(self, ctx, frame, cp_list):
"""
Extract the palette from the given list of interpolated CPs, and upload
it to the device as a texture.
"""
# TODO: figure out if storing the full list is an actual drag on
# performance/memory
if frame.center_cp.temporal_filter_type != 0:
# TODO: make texture sample based on time, not on CP index
raise NotImplementedError("Use box temporal filters for now")
pal = np.ndarray((self.texheight, 256, 4), dtype=np.float32)
inv = float(len(cp_list) - 1) / (self.texheight - 1)
for y in range(self.texheight):
for x in range(256):
for c in range(4):
# TODO: interpolate here?
cy = int(round(y * inv))
pal[y][x][c] = cp_list[cy].palette.entries[x].color[c]
dev_array = cuda.make_multichannel_2d_array(pal, "C")
self.texref = ctx.mod.get_texref('t_palette')
# TODO: float16? or can we still use interp with int storage?
self.texref.set_format(cuda.array_format.FLOAT, 4)
self.texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
self.texref.set_filter_mode(cuda.filter_mode.LINEAR)
self.texref.set_address_mode(0, cuda.address_mode.CLAMP)
self.texref.set_address_mode(1, cuda.address_mode.CLAMP)
self.texref.set_array(dev_array)
def call_setup(self, ctx):
assert self.texref, "Must upload palette texture before launch!"
class HistScatter(PTXFragment):
shortname = "hist"
def deps(self):
return [CPDataStream, CameraTransform, PaletteLookup]
@ptx_func
def module_setup(self):
mem.global_.f32('g_hist_bins',
features.hist_height * features.hist_stride * 4)
@ptx_func
def entry_setup(self):
comment("For now, assume histogram bins have been cleared by host")
@ptx_func
def scatter(self, x, y, color, xf_idx, p_valid=None):
"""
Scatter the given point directly to the histogram bins. I think this
technique has the worst performance of all of 'em. Accesses ``cpA``
directly.
"""
with block("Scatter directly to buffer"):
if p_valid is None:
p_valid = reg.pred('p_valid')
reg.u32('hist_index')
camera.get_index(hist_index, x, y, p_valid)
reg.u32('hist_bin_addr')
op.mov.u32(hist_bin_addr, g_hist_bins)
op.mad.lo.u32(hist_bin_addr, hist_index, 16, hist_bin_addr)
reg.f32('r g b a norm_time')
cp.get(cpA, norm_time, 'cp.norm_time')
palette.look_up(r, g, b, a, color, norm_time)
# TODO: look up, scale by xform visibility
op.red.add.f32(addr(hist_bin_addr), r)
op.red.add.f32(addr(hist_bin_addr,4), g)
op.red.add.f32(addr(hist_bin_addr,8), b)
op.red.add.f32(addr(hist_bin_addr,12), a)
def call_setup(self, ctx):
hist_bins_dp, hist_bins_l = ctx.mod.get_global('g_hist_bins')
cuda.memset_d32(hist_bins_dp, 0, hist_bins_l/4)
@instmethod
def get_bins(self, ctx, features):
hist_bins_dp, hist_bins_l = ctx.mod.get_global('g_hist_bins')
return cuda.from_device(hist_bins_dp,
(features.hist_height, features.hist_stride, 4),
dtype=np.float32)
class MWCRNG(PTXFragment):
shortname = "mwc"
def __init__(self):
self.threads_ready = 0
if not os.path.isfile('primes.bin'):
raise EnvironmentError('primes.bin not found')
@ptx_func
def module_setup(self):
mem.global_.u32('mwc_rng_mults', ctx.threads)
mem.global_.u64('mwc_rng_state', ctx.threads)
@ptx_func
def entry_setup(self):
reg.u32('mwc_st mwc_mult mwc_car')
with block('Load MWC multipliers and states'):
reg.u32('mwc_off mwc_addr')
std.get_gtid(mwc_off)
op.mov.u32(mwc_addr, mwc_rng_mults)
op.mad.lo.u32(mwc_addr, mwc_off, 4, mwc_addr)
op.ld.global_.u32(mwc_mult, addr(mwc_addr))
op.mov.u32(mwc_addr, mwc_rng_state)
op.mad.lo.u32(mwc_addr, mwc_off, 8, mwc_addr)
op.ld.global_.v2.u32(vec(mwc_st, mwc_car), addr(mwc_addr))
@ptx_func
def entry_teardown(self):
with block('Save MWC states'):
reg.u32('mwc_off mwc_addr')
std.get_gtid(mwc_off)
op.mov.u32(mwc_addr, mwc_rng_state)
op.mad.lo.u32(mwc_addr, mwc_off, 8, mwc_addr)
op.st.global_.v2.u32(addr(mwc_addr), vec(mwc_st, mwc_car))
@ptx_func
def _next(self):
# Call from inside a block!
reg.u64('mwc_out')
op.cvt.u64.u32(mwc_out, mwc_car)
op.mad.wide.u32(mwc_out, mwc_st, mwc_mult, mwc_out)
op.mov.b64(vec(mwc_st, mwc_car), mwc_out)
@ptx_func
def next_b32(self, dst_reg):
with block('Load next random u32 into ' + dst_reg.name):
self._next()
op.mov.u32(dst_reg, mwc_st)
@ptx_func
def next_f32_01(self, dst_reg):
# TODO: verify that this is the fastest-performance method
# TODO: verify that this actually does what I think it does
with block('Load random float [0,1] into ' + dst_reg.name):
self._next()
op.cvt.rn.f32.u32(dst_reg, mwc_st)
op.mul.f32(dst_reg, dst_reg, '0f2F800000') # 1./(1<<32)
@ptx_func
def next_f32_11(self, dst_reg):
with block('Load random float [-1,1) into ' + dst_reg.name):
self._next()
op.cvt.rn.f32.s32(dst_reg, mwc_st)
op.mul.f32(dst_reg, dst_reg, '0f30000000') # 1./(1<<31)
@instmethod
def seed(self, ctx, rand=np.random):
"""
Seed the random number generators with values taken from a
``np.random`` instance.
"""
# Load raw big-endian u32 multipliers from primes.bin.
with open('primes.bin') as primefp:
dt = np.dtype(np.uint32).newbyteorder('B')
mults = np.frombuffer(primefp.read(), dtype=dt)
stream = cuda.Stream()
# Randomness in choosing multipliers is good, but larger multipliers
# have longer periods, which is also good. This is a compromise.
mults = np.array(mults[:ctx.threads*4])
rand.shuffle(mults)
# Copy multipliers and seeds to the device
multdp, multl = ctx.mod.get_global('mwc_rng_mults')
cuda.memcpy_htod_async(multdp, mults.tostring()[:multl])
# Intentionally excludes both 0 and (2^32-1), as they can lead to
# degenerate sequences of period 0
states = np.array(rand.randint(1, 0xffffffff, size=2*ctx.threads),
dtype=np.uint32)
statedp, statel = ctx.mod.get_global('mwc_rng_state')
cuda.memcpy_htod_async(statedp, states.tostring())
self.threads_ready = ctx.threads
def call_setup(self, ctx):
if self.threads_ready < ctx.threads:
self.seed(ctx)
def tests(self):
return [MWCRNGTest]
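As a host-side cross-check of the PTX above, the multiply-with-carry step and the [0,1] float mapping are easy to mirror in a few lines. A sketch (the multiplier below is an arbitrary example value, not one drawn from primes.bin):

# Host sketch of one MWC step, mirroring _next() and next_f32_01().
def mwc_next(state, carry, mult):
    # 64-bit product-plus-carry; the low word is the new state, the high
    # word the new carry (op.mad.wide.u32 followed by op.mov.b64 above).
    out = state * mult + carry
    return out & 0xffffffff, out >> 32

state, carry, mult = 0x12345678, 1, 4294963023  # example values only
for _ in range(3):
    state, carry = mwc_next(state, carry, mult)
    print state * 2.0 ** -32   # same scaling as the '0f2F800000' constant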
class MWCRNGTest(PTXTest):
name = "MWC RNG sum-of-threads"
rounds = 5000
entry_name = 'MWC_RNG_test'
entry_params = ''
def deps(self):
return [MWCRNG]
@ptx_func
def module_setup(self):
mem.global_.u64('mwc_rng_test_sums', ctx.threads)
@ptx_func
def entry(self):
reg.u64('sum addl')
reg.u32('addend')
op.mov.u64(sum, 0)
with block('Sum next %d random numbers' % self.rounds):
reg.u32('loopct')
reg.pred('p')
op.mov.u32(loopct, self.rounds)
label('loopstart')
mwc.next_b32(addend)
op.cvt.u64.u32(addl, addend)
op.add.u64(sum, sum, addl)
op.sub.u32(loopct, loopct, 1)
op.setp.gt.u32(p, loopct, 0)
op.bra.uni(loopstart, ifp=p)
with block('Store sum and state'):
reg.u32('adr offset')
std.get_gtid(offset)
op.mov.u32(adr, mwc_rng_test_sums)
op.mad.lo.u32(adr, offset, 8, adr)
op.st.global_.u64(addr(adr), sum)
def call_setup(self, ctx):
# Get current multipliers and seeds from the device, keeping them on
# the instance so call_teardown can check the device's results
multdp, multl = ctx.mod.get_global('mwc_rng_mults')
self.mults = cuda.from_device(multdp, ctx.threads, np.uint32)
statedp, statel = ctx.mod.get_global('mwc_rng_state')
self.fullstates = cuda.from_device(statedp, ctx.threads, np.uint64)
sums = np.zeros(ctx.threads, np.uint64)
print "Running %d states forward %d rounds" % (len(self.mults), self.rounds)
ctime = time.time()
for i in range(self.rounds):
states = self.fullstates & 0xffffffff
carries = self.fullstates >> 32
self.fullstates = self.mults * states + carries
sums = sums + (self.fullstates & 0xffffffff)
self.sums = sums
ctime = time.time() - ctime
print "Done on host, took %g seconds" % ctime
def call_teardown(self, ctx):
statedp, statel = ctx.mod.get_global('mwc_rng_state')
dfullstates = cuda.from_device(statedp, ctx.threads, np.uint64)
if not (dfullstates == self.fullstates).all():
print "State discrepancy"
print dfullstates
print self.fullstates
raise PTXTestFailure("MWC RNG state discrepancy")
sumdp, suml = ctx.mod.get_global('mwc_rng_test_sums')
dsums = cuda.from_device(sumdp, ctx.threads, np.uint64)
if not (dsums == self.sums).all():
print "Sum discrepancy"
print dsums
print self.sums
raise PTXTestFailure("MWC RNG sum discrepancy")
class CPDataStream(DataStream):
"""DataStream which stores the control points."""
shortname = 'cp'

cuburn/ptx.py (new file, 1160 lines)

File diff suppressed because it is too large.

cuburn/render.py (new file, 223 lines)
@@ -0,0 +1,223 @@
import math
from ctypes import *
from cStringIO import StringIO
import numpy as np
from fr0stlib import pyflam3
from fr0stlib.pyflam3._flam3 import *
from fr0stlib.pyflam3.constants import *
from cuburn.cuda import LaunchContext
from cuburn.device_code import *
Point = lambda x, y: np.array([x, y], dtype=np.double)
class Genome(pyflam3.Genome):
pass
class _Frame(pyflam3.Frame):
"""
ctypes flam3_frame object used for genome interpolation and
spatial filter creation
"""
def __init__(self, genomes, *args, **kwargs):
pyflam3.Frame.__init__(self, *args, **kwargs)
self.genomes = (BaseGenome * len(genomes))()
for i in range(len(genomes)):
memmove(byref(self.genomes[i]), byref(genomes[i]),
sizeof(BaseGenome))
self.ngenomes = len(genomes)
# TODO: do this here?
self.pixel_aspect_ratio = float(genomes[0].height) / genomes[0].width
def interpolate(self, time, stagger=0, cp=None):
cp = cp or BaseGenome()
flam3_interpolate(self.genomes, self.ngenomes, time,
stagger, byref(cp))
return cp
class Frame(object):
"""
Handler for a single frame of a rendered genome.
"""
def __init__(self, _frame, time):
self._frame = _frame
self.center_cp = self._frame.interpolate(time)
def upload_data(self, ctx, filters, time):
"""
Prepare and upload the data needed to render this frame to the device.
"""
center = self.center_cp
ncps = center.nbatches * center.ntemporal_samples
if ncps < ctx.ctas:
raise NotImplementedError(
"Distribution of a CP across multiple CTAs not yet done")
# TODO: isn't this leaking ctypes xforms all over the place?
stream = StringIO()
cp_list = []
for batch_idx in range(center.nbatches):
for time_idx in range(center.ntemporal_samples):
idx = batch_idx * center.ntemporal_samples + time_idx
cp_time = time + filters.temporal_deltas[idx]
cp = self._frame.interpolate(cp_time)
cp_list.append(cp)
cp.camera = Camera(self._frame, cp, filters)
cp.nsamples = (cp.camera.sample_density *
center.width * center.height) / ncps
print "Expected writes:", (
cp.camera.sample_density * center.width * center.height)
min_time = min(filters.temporal_deltas)
max_time = max(filters.temporal_deltas)
for i, cp in enumerate(cp_list):
cp.norm_time = (filters.temporal_deltas[i] - min_time) / (
max_time - min_time)
CPDataStream.pack_into(ctx, stream, frame=self, cp=cp, cp_idx=i)
PaletteLookup.upload_palette(ctx, self, cp_list)
stream.seek(0)
IterThread.upload_cp_stream(ctx, stream.read(), ncps)
class Animation(object):
"""
Control structure for rendering a series of frames.
Each animation will dynamically generate a kernel that includes only the
code necessary to render the genomes provided. The process of generating
and uploading the kernel takes a small but finite amount of time. In
general, the kernel generated for all genomes resulting from interpolating
between two control points will have identical performance, so it is
wasteful to create more than one animation for any interpolated sequence.
However, genome sequences interpolated from three or more control points
with different features enabled will carry the code needed to render all
of those genomes in every frame, which can hurt performance.
In other words, it's best to use exactly one Animation for each
interpolated sequence between one or two genomes.
"""
def __init__(self, genomes):
# _frame is the ctypes frame object used only for interpolation
self._frame = _Frame(genomes)
# Use the same set of filters throughout the anim, a la flam3
self.filters = Filters(self._frame, genomes[0])
self.features = Features(genomes, self.filters)
self.ctx = None
def compile(self):
"""
Create a PTX kernel optimized for this animation, compile it, and
attach it to a LaunchContext with a thread distribution optimized for
the active device.
"""
# TODO: user-configurable test control
self.ctx = LaunchContext([IterThread], block=(256,1,1), grid=(54,1),
tests=True)
# TODO: user-configurable verbosity control
self.ctx.compile(verbose=3, anim=self, features=self.features)
# TODO: automatic optimization of block parameters
def render_frame(self, time=0):
# TODO: support more nuanced frame control than just 'time'
# TODO: reuse more information between frames
# TODO: allow animation-long override of certain parameters (size, etc)
frame = Frame(self._frame, time)
frame.upload_data(self.ctx, self.filters, time)
IterThread.call(self.ctx)
return HistScatter.get_bins(self.ctx, self.features)
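A sketch of how these pieces fit together, assuming a list of pyflam3 genomes is already in hand (load_genomes below is a hypothetical helper, not part of this commit):

from cuburn.render import Animation

genomes = load_genomes('flame.xml')   # hypothetical loader via fr0stlib
anim = Animation(genomes)
anim.compile()                        # generate and build the PTX kernel
bins = anim.render_frame(time=0)      # float32 bins, (height, stride, 4)
print bins.shape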
class Filters(object):
def __init__(self, frame, cp):
# Use one oversample per filter set, even over multiple timesteps
self.oversample = frame.genomes[0].spatial_oversample
# Ugh. I'd really like to replace this mess
spa_filt_ptr = POINTER(c_double)()
spa_width = flam3_create_spatial_filter(byref(frame),
flam3_field_both,
byref(spa_filt_ptr))
if spa_width < 0:
raise EnvironmentError("flam3 call failed")
self.spatial = np.asarray([[spa_filt_ptr[y*spa_width+x] for x in
range(spa_width)] for y in range(spa_width)], dtype=np.double)
self.spatial_width = spa_width
flam3_free(spa_filt_ptr)
tmp_filt_ptr = POINTER(c_double)()
tmp_deltas_ptr = POINTER(c_double)()
steps = cp.nbatches * cp.ntemporal_samples
self.temporal_sum = flam3_create_temporal_filter(
steps,
cp.temporal_filter_type,
cp.temporal_filter_exp,
cp.temporal_filter_width,
byref(tmp_filt_ptr),
byref(tmp_deltas_ptr))
self.temporal = np.asarray([tmp_filt_ptr[i] for i in range(steps)],
dtype=np.double)
flam3_free(tmp_filt_ptr)
self.temporal_deltas = np.asarray(
[tmp_deltas_ptr[i] for i in range(steps)], dtype=np.double)
flam3_free(tmp_deltas_ptr)
# TODO: density estimation
self.gutter = (spa_width - self.oversample) / 2
class Features(object):
"""
Determine features and constants required to render a particular set of
genomes. The values of this class are fixed before compilation begins.
"""
# Constant; number of rounds spent fusing points on first CP of a frame
num_fuse_samples = 25
def __init__(self, genomes, flt):
any = lambda l: bool(filter(None, map(l, genomes)))
self.max_ntemporal_samples = max(
[cp.nbatches * cp.ntemporal_samples for cp in genomes])
self.camera_rotation = any(lambda cp: cp.rotate)
self.non_box_temporal_filter = genomes[0].temporal_filter_type
self.palette_mode = genomes[0].palette_mode and "linear" or "nearest"
# Histogram (and log-density copy) width and height
self.hist_width = flt.oversample * genomes[0].width + 2 * flt.gutter
self.hist_height = flt.oversample * genomes[0].height + 2 * flt.gutter
# Histogram stride, for better filtering. This code assumes the
# 128-byte L1 cache line width of Fermi devices, and a 16-byte
# histogram bucket size. TODO: detect these things programmatically,
# particularly the histogram bucket size, which may be split soon
self.hist_stride = 8 * int(math.ceil(self.hist_width / 8.0))
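Concretely, the stride rounds each bucket row up to a multiple of 8 buckets, since 8 buckets times 16 bytes is one 128-byte Fermi L1 line. A quick standalone check:

import math
# hist_width -> hist_stride, padded to an 8-bucket (128-byte) boundary
for w in (640, 641, 648):
    print w, 8 * int(math.ceil(w / 8.0))   # strides: 640, 648, 648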
class Camera(object):
"""Viewport and exposure."""
def __init__(self, frame, cp, filters):
# Calculate the conversion matrix between the IFS space (xform
# coordinates) and the sampling lattice (bucket addresses)
# TODO: test this code (against compute_camera?)
scale = 2.0 ** cp.zoom
self.sample_density = cp.sample_density * scale * scale
center = Point(cp._center[0], cp._center[1])
size = Point(cp.width, cp.height)
# pix per unit, where 'unit' is '1.0' in IFS space
self.ppu = Point(
cp.pixels_per_unit * scale / frame.pixel_aspect_ratio,
cp.pixels_per_unit * scale)
# extra shifts applied due to gutter
gutter = filters.gutter / (cp.spatial_oversample * self.ppu)
cornerLL = center - (size / (2 * self.ppu))
self.lower_bounds = cornerLL - gutter
self.upper_bounds = cornerLL + (size / self.ppu) + gutter
self.norm_scale = 1.0 / (self.upper_bounds - self.lower_bounds)
self.norm_offset = -self.norm_scale * self.lower_bounds
self.idx_scale = size * self.norm_scale
self.idx_offset = size * self.norm_offset
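As a sanity check of the normalization math, norm_scale and norm_offset should map lower_bounds to 0 and upper_bounds to 1 componentwise. A standalone sketch with made-up bounds:

import numpy as np
# Made-up IFS-space bounds; the identities below hold for any bounds.
lower = np.array([-2.0, -1.5])
upper = np.array([ 2.0,  1.5])
norm_scale = 1.0 / (upper - lower)
norm_offset = -norm_scale * lower
print lower * norm_scale + norm_offset   # [ 0.  0.]
print upper * norm_scale + norm_offset   # [ 1.  1.]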