From e79df46c66183caaef4914d64668c0d68fb85133 Mon Sep 17 00:00:00 2001
From: Steven Robertson <steven@strobe.cc>
Date: Sat, 11 Jun 2011 15:59:10 -0400
Subject: [PATCH] Refactor API

--HG--
rename : cuburn/code/filter.py => cuburn/code/filtering.py
---
 cuburn/_pyflam3_hacks.py                |   2 +
 cuburn/code/{filter.py => filtering.py} |  12 +-
 cuburn/code/iter.py                     | 104 +-------
 cuburn/code/util.py                     |  18 ++
 cuburn/render.py                        | 339 ++++++++++++++++++++++--
 main.py                                 |  29 +-
 6 files changed, 362 insertions(+), 142 deletions(-)
 rename cuburn/code/{filter.py => filtering.py} (95%)

diff --git a/cuburn/_pyflam3_hacks.py b/cuburn/_pyflam3_hacks.py
index 8656e95..d98a474 100644
--- a/cuburn/_pyflam3_hacks.py
+++ b/cuburn/_pyflam3_hacks.py
@@ -14,6 +14,8 @@ from ctypes import *
 from fr0stlib.pyflam3 import constants
 from fr0stlib.pyflam3._flam3 import *
 
+from cuburn import render
+
 flam3_nvariations = constants.flam3_nvariations = 99
 
 BaseXForm._fields_ = [('var', c_double * flam3_nvariations)
diff --git a/cuburn/code/filter.py b/cuburn/code/filtering.py
similarity index 95%
rename from cuburn/code/filter.py
rename to cuburn/code/filtering.py
index a41674d..bd1ae73 100644
--- a/cuburn/code/filter.py
+++ b/cuburn/code/filtering.py
@@ -223,24 +223,22 @@ void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf,
 
 """)
 
-    def invoke(self, mod, abufd, obufd, dbufd):
+    def invoke(self, mod, abufd, obufd, dbufd, stream=None):
         # TODO: add no-est version
         # TODO: come up with a general way to average these parameters
 
         k1 = self.cp.brightness * 268 / 256
         area = self.features.acc_width * self.features.acc_height / self.cp.ppu ** 2
         k2 = 1 / (area * self.cp.adj_density)
-        print k1, k2, area
 
         if self.cp.estimator == 0:
             fun = mod.get_function("logscale")
             t = fun(abufd, obufd, np.float32(k1), np.float32(k2),
                     block=(self.features.acc_width, 1, 1),
-                    grid=(self.features.acc_height, 1), time_kernel=True)
+                    grid=(self.features.acc_height, 1), stream=stream)
         else:
             fun = mod.get_function("density_est")
-            t = fun(abufd, obufd, dbufd, np.float32(k1), np.float32(k2),
-                    block=(32, 32, 1), grid=(self.features.acc_width/32, 1),
-                    time_kernel=True)
-            print "Density estimation: %g" % t
+            fun(abufd, obufd, dbufd, np.float32(k1), np.float32(k2),
+                block=(32, 32, 1), grid=(self.features.acc_width/32, 1),
+                stream=stream)
 
diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py
index 4e96c81..6ebab15 100644
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@@ -2,20 +2,13 @@
 The main iteration loop.
 """
 
-from ctypes import byref, memset, sizeof
-
-import pycuda.driver as cuda
-from pycuda.driver import In, Out, InOut
-from pycuda.compiler import SourceModule
-import numpy as np
-from scipy import ndimage
-
-from fr0stlib.pyflam3 import flam3_interpolate
-from cuburn.code import mwc, variations, filter
+from cuburn.code import mwc, variations
 from cuburn.code.util import *
-from cuburn.render import Genome
 
 class IterCode(HunkOCode):
+    # The number of threads per block
+    NTHREADS = 512
+
     def __init__(self, features):
         self.features = features
         self.packer = DataPacker('iter_info')
@@ -69,14 +62,14 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
     iter_info *info_glob = &(infos[blockIdx.x]);
 
     // load info to shared memory cooperatively
-    for (int i = threadIdx.y * 32 + threadIdx.x;
+    for (int i = threadIdx.y * blockDim.x + threadIdx.x;
          i * 4 < sizeof(iter_info); i += blockDim.x * blockDim.y)
         reinterpret_cast<float*>(&info)[i] =
             reinterpret_cast<float*>(info_glob)[i];
 
     int consec_bad = -{{features.fuse}};
-    // TODO: make nsteps adjustable via genome
-    int nsamps = {{packer.get('cp.width * cp.height / 512000. * cp.adj_density')}};
+    // TODO: remove '512' constant
+    int nsamps = {{packer.get('cp.width * cp.height / (cp.ntemporal_samples * 512.) * cp.adj_density')}};
 
     float x, y, color;
     x = mwc_next_11(&rctx);
@@ -157,86 +150,3 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
                 packer = self.packer.view('info'),
                 **globals())
 
-def render(features, cps):
-    # TODO: make this adjustable via genome
-    nsteps = 1000
-    abuf = np.zeros((features.acc_height, features.acc_stride, 4), dtype=np.float32)
-    dbuf = np.zeros((features.acc_height, features.acc_stride), dtype=np.float32)
-    seeds = mwc.MWC.make_seeds(512 * nsteps)
-
-    iter = IterCode(features)
-    de = filter.DensityEst(features, cps[0])
-    code = assemble_code(BaseCode, mwc.MWC, iter.packer, iter,
-                         filter.ColorClip, de)
-
-    for lno, line in enumerate(code.split('\n')):
-        print '%3d %s' % (lno, line)
-    mod = SourceModule(code,
-            options=['-use_fast_math', '-maxrregcount', '32'])
-
-    cps_as_array = (Genome * len(cps))()
-    for i, cp in enumerate(cps):
-        cps_as_array[i] = cp
-
-    infos = []
-    pal = np.empty((16, 256, 4), dtype=np.uint8)
-
-    # TODO: move this into a common function
-    if len(cps) > 1:
-        cp = Genome()
-        memset(byref(cp), 0, sizeof(cp))
-
-        sampAt = [int(i/15.*(nsteps-1)) for i in range(16)]
-        for n in range(nsteps):
-            flam3_interpolate(cps_as_array, 2, float(n)/nsteps - 0.5,
-                              0, byref(cp))
-            cp._init()
-            if n in sampAt:
-                pidx = sampAt.index(n)
-                for i, e in enumerate(cp.palette.entries):
-                    pal[pidx][i] = np.uint8(np.array(e.color) * 255.0)
-            infos.append(iter.packer.pack(cp=cp, cp_step_frac=float(n)/nsteps))
-    else:
-        for i, e in enumerate(cps[0].palette.entries):
-            pal[0][i] = np.uint8(np.array(e.color) * 255.0)
-        pal[1:] = pal[0]
-        infos.append(iter.packer.pack(cp=cps[0], cp_step_frac=0))
-        infos *= nsteps
-
-    infos = np.concatenate(infos)
-
-    dpal = cuda.make_multichannel_2d_array(pal, 'C')
-    tref = mod.get_texref('palTex')
-    tref.set_array(dpal)
-    tref.set_format(cuda.array_format.UNSIGNED_INT8, 4)
-    tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
-    tref.set_filter_mode(cuda.filter_mode.LINEAR)
-
-    abufd = cuda.to_device(abuf)
-    dbufd = cuda.to_device(dbuf)
-
-    fun = mod.get_function("iter")
-    fun.set_cache_config(cuda.func_cache.PREFER_L1)
-    t = fun(InOut(seeds), InOut(infos), abufd, dbufd,
-        block=(32,16,1), grid=(nsteps,1), time_kernel=True)
-    print "Completed render in %g seconds" % t
-
-    f = np.float32
-
-    npix = features.acc_width * features.acc_height
-
-    # TODO: just allocate
-    obufd = cuda.to_device(abuf)
-    dbuf = cuda.from_device_like(dbufd, dbuf)
-    dbuf = ndimage.filters.gaussian_filter(dbuf, 0.6)
-    dbufd = cuda.to_device(dbuf)
-    de.invoke(mod, abufd, obufd, dbufd)
-
-    fun = mod.get_function("colorclip")
-    t = fun(obufd, f(1 / cp.gamma), f(cp.vibrancy), f(cp.highlight_power),
-        block=(256,1,1), grid=(npix/256,1), time_kernel=True)
-    print "Completed color filtering in %g seconds" % t
-
-    abuf = cuda.from_device_like(obufd, abuf)
-    return abuf, dbuf
-
diff --git a/cuburn/code/util.py b/cuburn/code/util.py
index ebba41d..9c71873 100644
--- a/cuburn/code/util.py
+++ b/cuburn/code/util.py
@@ -66,8 +66,26 @@ int trunca(float f) {
     asm("cvt.rni.s32.f32    %0,     %1;" : "=r"(ret) : "f"(f));
     return ret;
 }
+
+__global__
+void zero_dptr(float* dptr, int size) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < size) {
+        dptr[i] = 0.0f;
+    }
+}
 """
 
+    @staticmethod
+    def zero_dptr(mod, dptr, size, stream=None):
+        """
+        Launch the ``zero_dptr`` kernel from ``mod`` to clear ``size`` 4-byte
+        words starting at ``dptr``; the launch can be embedded in ``stream``.
+        """
+        kernel = mod.get_function("zero_dptr")
+        kernel(dptr, np.int32(size), block=(1024, 1, 1),
+               grid=(size/1024+1, 1), stream=stream)
+
 class DataPackerView(object):
     """
     View of a data packer. Intended to be initialized using DataPacker.view().
diff --git a/cuburn/render.py b/cuburn/render.py
index 9267f14..54e4ed3 100644
--- a/cuburn/render.py
+++ b/cuburn/render.py
@@ -1,44 +1,57 @@
 import sys
 import math
 import re
+from itertools import cycle, repeat, chain, izip
 from ctypes import *
 from cStringIO import StringIO
 import numpy as np
+from scipy import ndimage
 
 from fr0stlib import pyflam3
 from fr0stlib.pyflam3._flam3 import *
 from fr0stlib.pyflam3.constants import *
 
+import pycuda.compiler
+import pycuda.driver as cuda
+
 from cuburn import affine
-from cuburn.variations import Variations
+from cuburn.code import util, mwc, iter, filtering
 
-class Genome(pyflam3.Genome):
-    @classmethod
-    def from_string(cls, *args, **kwargs):
-        gnms = super(Genome, cls).from_string(*args, **kwargs)
-        for g in gnms: g._init()
-        return gnms
+def _chunk(l, cs):
+    """
+    Generate consecutive slices of ``l``, each holding at most ``cs`` items.
+    """
+    for start in range(0, len(l), cs):
+        yield l[start:start + cs]
 
-    def _init(self):
+class Genome(object):
+    """
+    Normalizes and precalculates some properties of a Genome. Assumes that
+    Genome argument passed in will not change.
+    """
+    # Fix the ctypes ugliness since switching to __getattribute__ in 2.7.
+    # There are more elegant ways to do this, but I can't be bothered.
+    def __getattr__(self, name):
+        return getattr(self.cp, name)
+
+    def __init__(self, ctypes_genome):
+        self.cp = ctypes_genome
         self.xforms = [self.xform[i] for i in range(self.num_xforms)]
         dens = np.array([x.density for i, x in enumerate(self.xforms)
                          if i != self.final_xform_index])
         dens /= np.sum(dens)
         self.norm_density = [np.sum(dens[:i+1]) for i in range(len(dens))]
+        self.camera_transform = self.calc_camera_transform()
 
     scale = property(lambda cp: 2.0 ** cp.zoom)
     adj_density = property(lambda cp: cp.sample_density * (cp.scale ** 2))
     ppu = property(lambda cp: cp.pixels_per_unit * cp.scale)
 
-    @property
-    def camera_transform(cp):
+    def calc_camera_transform(cp):
         """
         An affine matrix which will transform IFS coordinates to image width
         and height. Assumes that width and height are constant.
         """
-        # TODO: when reading as a property during packing, this may be
-        # calculated 6 times instead of 1
-        # TODO: also requires knowing gutter width
         g = Features.gutter
         return ( affine.translate(0.5 * cp.width + g, 0.5 * cp.height + g)
                * affine.scale(cp.ppu, cp.ppu)
@@ -65,13 +78,294 @@ class Animation(object):
     In other words, it's best to use exactly one Animation for each
     interpolated sequence between one or two genomes.
     """
-    def __init__(self, genomes, ngenomes = None):
-        self.features = Features(genomes)
+    def __init__(self, ctypes_genome_array):
+        self._g_arr = ctypes_genome_array
+        self.genomes = map(Genome, ctypes_genome_array)
+        self.features = Features(self.genomes)
+        self._iter = self._de = self.src = self.cubin = self.mod = None
 
-    def compile(self):
-        pass
-    def render_frame(self, time=0):
-        pass
+    def compile(self, keep=False,
+                cmp_options=('-use_fast_math', '-maxrregcount', '32')):
+        """
+        Compile a kernel capable of rendering every frame in this animation.
+        The compiled kernel is stored in the ``cubin`` property; the source is
+        stored as ``src`` and also returned. ``keep`` and ``cmp_options`` are
+        passed to ``pycuda.compiler.compile`` as ``keep`` and ``options``.
+
+        This operation is idempotent, and has no side effects outside of
+        setting properties on this instance (unless there's a compiler error,
+        which is a bug); it should therefore be threadsafe as well.
+        It is, however, rather slow.
+        """
+        self._iter = iter.IterCode(self.features)
+        self._de = filtering.DensityEst(self.features, self.genomes[0])
+        # TODO: make choice of filtering explicit
+        # TODO: autoload dependent modules?
+        self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer,
+                                      self._iter, filtering.ColorClip, self._de)
+        self.cubin = pycuda.compiler.compile(self.src, keep=keep,
+                                             options=list(cmp_options))
+        return self.src
+
+    def copy(self):
+        """
+        Make a shallow copy of this animation whose ``mod`` is reset to None,
+        so that it holds no reference to the current CUDA context. The copy
+        can be loaded in another CUDA context without recompiling, allowing
+        rendering across multiple devices - but managing that is up to you.
+        """
+        from copy import copy as shallow_copy
+        clone = shallow_copy(self)
+        clone.mod = None
+        return clone
+
+    def load(self, jit_options=None):
+        """
+        Replace the currently loaded CUDA module in the active CUDA context
+        with the compiled code's module, compiling first if needed. The module
+        is kept as ``mod``, so rendering should henceforth only be called from
+        this thread and context. ``jit_options=None`` means no JIT options.
+        """
+        if self.cubin is None:
+            self.compile()
+        self.mod = cuda.module_from_buffer(self.cubin, jit_options or [])
+
+    def render_frames(self, times=None):
+        """
+        Render a frame at each center time in ``times``. Returns a generator
+        object which will yield one NumPy array for each rendered image.
+
+        This method produces a considerable amount of side effects, and should
+        not be used lightly. Things may go poorly for you if this method is not
+        allowed to run until completion (by exhausting all items in the
+        generator object).
+
+        A performance note: while any ready tasks will be scheduled on the GPU
+        before yielding a result, spending a lot of time before returning
+        control to this function can allow the GPU to become idle. It's best
+        to hand the resulting array to another thread after grabbing it from
+        the renderer for handling.
+
+        ``times`` is a sequence of center times at which to render, or ``None``
+        to render one frame for each genome used to create the animation.
+        """
+        # Don't see this changing, but empirical tests could prove me wrong
+        NRENDERERS = 2
+        # TODO: under a slightly modified sequencing, certain buffers can be
+        # shared (though this may be unimportant if a good AA technique which
+        # doesn't require full SS can be found)
+        rdrs = [_AnimRenderer(self) for i in range(NRENDERERS)]
+
+        # Pair each time with an alternating renderer, followed by one None
+        # per renderer so that all pending tasks are flushed at the end
+        if times is None:
+            times = [cp.time for cp in self.genomes]
+        exttimes = chain(times, repeat(None, NRENDERERS))
+        for rdr, time in izip(cycle(rdrs), exttimes):
+            if rdr.wait():
+                yield rdr.get_result()
+            if time is not None:
+                rdr.render(time)
+
+    def _interp(self, time, cp):
+        # Interpolate the source genomes at ``time``, passing ``cp`` as output
+        flam3_interpolate(self._g_arr, len(self._g_arr), time, 0, byref(cp))
+
+
+class _AnimRenderer(object):
+    """Renders frames of one Animation, one at a time, on its own stream."""
+    # Large launches lock the display for a considerable period and may be
+    # killed due to a device timeout; small launches are harder to load-balance
+    # on the GPU and incur overhead. This empirical value is multiplied by the
+    # number of SMs on the device to determine how many blocks should be in
+    # each launch. Extremely high quality, high resolution renders may still
+    # encounter a device timeout, and no workaround is in place for that yet.
+    SM_FACTOR = 8
+
+    # Currently, palette interpolation is done independently of animation
+    # interpolation, so that the process is not biased and so we only need to
+    # mess about with one texture per renderer. This many steps will always be
+    # used, no matter the number of time steps.
+    PAL_HEIGHT = 16
+
+    def __init__(self, anim):
+        self.anim = anim
+        self.pending = False
+        self.stream = cuda.Stream()
+
+        self._nsms = cuda.Context.get_device().multiprocessor_count
+        self.cps_per_block = self._nsms * self.SM_FACTOR
+        self.ncps = anim.features.max_cps
+        self.nblocks = int(math.ceil(self.ncps / float(self.cps_per_block)))
+
+        # These are stored to avoid leaks, not to be stateful in method calls
+        # TODO: ensure proper cleanup is done
+        self._dst_cp = pyflam3.Genome()
+        memset(byref(self._dst_cp), 0, sizeof(self._dst_cp))
+        self._cen_cp = pyflam3.Genome()
+        memset(byref(self._cen_cp), 0, sizeof(self._cen_cp))
+
+        self.nbins = anim.features.acc_height * anim.features.acc_stride
+        self.d_den = cuda.mem_alloc(4 * self.nbins)
+        self.d_accum = cuda.mem_alloc(16 * self.nbins)
+        self.d_out = cuda.mem_alloc(16 * self.nbins)
+        self.d_infos = cuda.mem_alloc(anim._iter.packer.align * self.ncps)
+        # Defer allocation until first needed
+        self.d_seeds = [None] * self.nblocks
+
+    def render(self, cen_time):
+        """Render the frame centered at ``cen_time``; see wait()/get_result()."""
+        assert not self.pending, "Tried to render with results pending!"
+        self.pending = True
+        a = self.anim
+
+        cen_cp = self._cen_cp
+        a._interp(cen_time, cen_cp)
+        palette = self._interp_colors(cen_time, cen_cp)
+
+        util.BaseCode.zero_dptr(a.mod, self.d_den, self.nbins, self.stream)
+        util.BaseCode.zero_dptr(a.mod, self.d_accum, 4 * self.nbins,
+                                self.stream)
+
+        # ------------------------------------------------------------
+        # TODO WARNING TODO WARNING TODO WARNING TODO WARNING TODO
+        # This will replace the palette while it's in use by the other
+        # rendering function. Need to pass palette texref in function
+        # invocation.
+        # ------------------------------------------------------------
+        dpal = cuda.make_multichannel_2d_array(palette, 'C')
+        tref = a.mod.get_texref('palTex')
+        tref.set_array(dpal)
+        tref.set_format(cuda.array_format.UNSIGNED_INT8, 4)
+        tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
+        tref.set_filter_mode(cuda.filter_mode.LINEAR)
+
+        cp = self._dst_cp
+        packer = a._iter.packer
+
+        iter_fun = a.mod.get_function("iter")
+        iter_fun.set_cache_config(cuda.func_cache.PREFER_L1)
+
+        # Must be accumulated over all CPs
+        gam, vib, hipow = 0, 0, 0
+
+        # This is gross, but there are a lot of fiddly corner cases with any
+        # index-based iteration scheme.
+        times = list(enumerate(self._mk_dts(cen_time, cen_cp, self.ncps)))
+        for b, block_times in enumerate(_chunk(times, self.cps_per_block)):
+            infos = []
+            if len(a.genomes) > 1:
+                for n, t in block_times:
+                    a._interp(t, cp)
+                    frac = float(n) / cen_cp.ntemporal_samples
+                    info = packer.pack(cp=Genome(cp), cp_step_frac=frac)
+                    infos.append(info)
+                    gam += cp.gamma
+                    vib += cp.vibrancy
+                    hipow += cp.highlight_power
+            else:
+                # Can't interpolate normally; just pack copies
+                # TODO: this still packs the genome 20 times or so instead of
+                # once
+                packed = packer.pack(cp=a.genomes[0], cp_step_frac=0)
+                infos = [packed] * len(block_times)
+                gam += a.genomes[0].gamma * len(block_times)
+                vib += a.genomes[0].vibrancy * len(block_times)
+                hipow += a.genomes[0].highlight_power * len(block_times)
+
+            infos = np.concatenate(infos)
+            offset = b * packer.align * self.cps_per_block
+            # TODO: portable across 32/64-bit arches?
+            d_info_off = int(self.d_infos) + offset
+            cuda.memcpy_htod(d_info_off, infos)
+
+            if not self.d_seeds[b]:
+                seeds = mwc.MWC.make_seeds(iter.IterCode.NTHREADS *
+                                           self.cps_per_block)
+                self.d_seeds[b] = cuda.to_device(seeds)
+
+            # TODO: get block config from IterCode
+            # TODO: print timing information
+            iter_fun(self.d_seeds[b], np.uint64(d_info_off), self.d_accum,
+                     self.d_den, block=(32, 16, 1),
+                     grid=(len(block_times), 1), stream=self.stream)
+
+        # MAJOR TODO: for now, we kill almost all parallelism by forcing the
+        # stream here. Later, once we've decided on a density-buffer prefilter,
+        # we will move it to the GPU, allowing it to be embedded in the stream
+        # and letting the remaining code be asynchronous.
+        self.stream.synchronize()
+        dbuf_dim = (a.features.acc_height, a.features.acc_stride)
+        dbuf = cuda.from_device(self.d_den, dbuf_dim, np.float32)
+        dbuf = ndimage.filters.gaussian_filter(dbuf, 0.6)
+        cuda.memcpy_htod(self.d_den, dbuf)
+
+        util.BaseCode.zero_dptr(a.mod, self.d_out, 4 * self.nbins,
+                                self.stream)
+        self.stream.synchronize()
+        a._de.invoke(a.mod, self.d_accum, self.d_out, self.d_den,
+                     self.stream)
+        self.stream.synchronize()
+
+        n = np.float32(self.ncps)
+        gam = np.float32(n / gam)
+        vib = np.float32(vib / n)
+        hipow = np.float32(hipow / n)
+
+        # TODO: get block size from colorclip class? It actually does not
+        # depend on that being the case
+        color_fun = a.mod.get_function("colorclip")
+        color_fun(self.d_out, gam, vib, hipow, block=(256, 1, 1),
+                  grid=(self.nbins / 256, 1), stream=self.stream)
+
+    def _interp_colors(self, cen_time, cen_cp):
+        """Build the (PAL_HEIGHT, 256, 4) uint8 palette array for a frame."""
+        # TODO: any visible difference between uint8 and richer formats?
+        pal = np.empty((self.PAL_HEIGHT, 256, 4), dtype=np.uint8)
+        a = self.anim
+
+        if len(a.genomes) > 1:
+            # The typical case; applying real motion blur
+            cp = self._dst_cp
+            times = self._mk_dts(cen_time, cen_cp, self.PAL_HEIGHT)
+            for n, t in enumerate(times):
+                a._interp(t, cp)
+                for i, e in enumerate(cp.palette.entries):
+                    pal[n][i] = np.uint8(np.array(e.color) * 255.0)
+        else:
+            # Cannot call any interp functions on a single genome; rather than
+            # have alternate code-paths, just copy the same colors everywhere
+            for i, e in enumerate(a.genomes[0].palette.entries):
+                # TODO: This triggers a RuntimeWarning
+                pal[0][i] = np.uint8(np.array(e.color) * 255.0)
+            pal[1:] = pal[0]
+        return pal
+
+    def wait(self):
+        """Finish any pending render; return True if there was one."""
+        if self.pending:
+            self.stream.synchronize()
+            self.pending = False
+            return True
+        return False
+
+    def get_result(self):
+        # Fetch the output buffer and crop it to (height, width, 4), dropping
+        # the gutter and any padding columns added to reach ``acc_stride``
+        f = self.anim.features
+        obuf_dim = (f.acc_height, f.acc_stride, 4)
+        out = cuda.from_device(self.d_out, obuf_dim, np.float32)
+        g = f.gutter
+        return out[g:g + f.height, g:g + f.width].copy()
+
+    @staticmethod
+    def _mk_dts(cen_time, cen_cp, ncps):
+        # ``ncps`` absolute times evenly spanning the temporal filter window
+        # centered on ``cen_time``; a lone sample sits at the center itself
+        if ncps < 2:
+            return [cen_time] * ncps
+        w = cen_cp.temporal_filter_width
+        return [cen_time + w * (t / (ncps - 1.0) - 0.5) for t in range(ncps)]
 
 class Features(object):
     """
@@ -93,7 +387,8 @@ class Features(object):
     palette_height = 16
 
     # Maximum width of DE and other spatial filters, and thus in turn the
-    # amount of padding applied
+    # amount of padding applied. Note that, for now, this must not be changed!
+    # The filtering code makes deep assumptions about this value.
     gutter = 16
 
     def __init__(self, genomes):
@@ -116,11 +411,13 @@ class Features(object):
         else:
             self.final_xform_index = None
 
+        self.max_cps = max([cp.ntemporal_samples for cp in genomes])
+
         self.width = genomes[0].width
         self.height = genomes[0].height
         self.acc_width = genomes[0].width + 2 * self.gutter
         self.acc_height = genomes[0].height + 2 * self.gutter
-        self.acc_stride = genomes[0].width + 2 * self.gutter
+        self.acc_stride = 32 * int(math.ceil(self.acc_width / 32.))
 
 class XFormFeatures(object):
     def __init__(self, xforms, xform_id):
diff --git a/main.py b/main.py
index bf1ce85..731d944 100644
--- a/main.py
+++ b/main.py
@@ -22,13 +22,10 @@ import scipy
 import pyglet
 import pycuda.autoinit
 
-from fr0stlib.pyflam3 import *
-from fr0stlib.pyflam3._flam3 import *
-
 import cuburn._pyflam3_hacks
+from fr0stlib import pyflam3
 from cuburn.render import *
 from cuburn.code.mwc import MWCTest
-from cuburn.code.iter import render, membench
 
 # Required on my system; CUDA doesn't yet work with GCC 4.5
 os.environ['PATH'] = ('/usr/x86_64-pc-linux-gnu/gcc-bin/4.4.5:'
@@ -37,24 +34,22 @@ os.environ['PATH'] = ('/usr/x86_64-pc-linux-gnu/gcc-bin/4.4.5:'
 def main(args):
     if '-t' in args:
         MWCTest.test_mwc()
-        membench()
-
 
     with open(args[1]) as fp:
-        genomes = Genome.from_string(fp.read())
+        genome_ptr, ngenomes = pyflam3.Genome.from_string(fp.read())
+        genomes = cast(genome_ptr, POINTER(pyflam3.Genome*ngenomes)).contents
     anim = Animation(genomes)
-    accum, den = render(anim.features, genomes)
-    accum = np.delete(accum, np.s_[:16], axis=0)
-    accum = np.delete(accum, np.s_[:16], axis=1)
-    accum = np.delete(accum, np.s_[-16:], axis=0)
-    accum = np.delete(accum, np.s_[-16:], axis=1)
+    anim.compile()
+    anim.load()
+    for n, out in enumerate(anim.render_frames()):
+        noalpha = np.delete(out, 3, axis=2)
+        scipy.misc.imsave('rendered_%03d.png' % n, noalpha)
+        scipy.misc.imsave('rendered_%03d.jpg' % n, noalpha)
 
-    noalpha = np.delete(accum, 3, axis=2)
-    scipy.misc.imsave('rendered.png', noalpha)
-    scipy.misc.imsave('rendered.jpg', noalpha)
+    return
 
-    if '-g' not in args:
-        return
+    #if '-g' not in args:
+    #    return
 
     window = pyglet.window.Window(anim.features.width, anim.features.height)
     imgbuf = (np.minimum(accum * 255, 255)).astype(np.uint8)