From b43481e374ee5b7a5937c91ed631d5ffa5bff17e Mon Sep 17 00:00:00 2001
From: Steven Robertson <steven@strobe.cc>
Date: Thu, 15 Dec 2011 11:11:05 -0500
Subject: [PATCH] New genome format to support flockutil

---
 cuburn/code/filtering.py |  88 ++++++-------
 cuburn/code/interp.py    |   7 +-
 cuburn/code/iter.py      |  36 +++---
 cuburn/genome.py         | 179 +++++++++++---------------
 cuburn/render.py         | 268 ++++++++++++++++++++++++---------------
 5 files changed, 303 insertions(+), 275 deletions(-)

diff --git a/cuburn/code/filtering.py b/cuburn/code/filtering.py
index fdc1f86..905bc59 100644
--- a/cuburn/code/filtering.py
+++ b/cuburn/code/filtering.py
@@ -1,5 +1,7 @@
 
 import numpy as np
+from numpy import float32 as f32, int32 as i32
+
 import pycuda.compiler
 from pycuda.gpuarray import vec
 
@@ -10,8 +12,8 @@ _CODE = '''
 
 __global__
 void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
-               float linrange, float lingam, float3 bkgd, int fbsize,
-               int alpha_output_channel) {
+               float linrange, float lingam, float3 bkgd,
+               int fbsize, int blend_background_color) {
     int i = threadIdx.x + blockDim.x * (blockIdx.x + gridDim.x * blockIdx.y);
     if (i >= fbsize) return;
 
@@ -30,8 +32,17 @@ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
         alpha = (1.0f - frac) * pix.w * lingam + frac * alpha;
     }
 
-    float ls = vibrancy * alpha / pix.w;
+    if (!blend_background_color) {
+        float ls = alpha / pix.w;
+        pix.x *= ls;
+        pix.y *= ls;
+        pix.z *= ls;
+        pix.w = alpha;
+        pixbuf[i] = pix;
+        return;
+    }
 
+    float ls = vibrancy * alpha / pix.w;
     alpha = fminf(1.0f, fmaxf(0.0f, alpha));
 
     float maxc = fmaxf(pix.x, fmaxf(pix.y, pix.z));
@@ -64,29 +75,21 @@ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
     pix.y += (1.0f - vibrancy) * powf(opix.y, gamma);
     pix.z += (1.0f - vibrancy) * powf(opix.z, gamma);
 
-    if (alpha_output_channel) {
-        float one_alpha = 1.0f / alpha;
-        pix.x *= one_alpha;
-        pix.y *= one_alpha;
-        pix.z *= one_alpha;
-    } else {
-        pix.x += (1.0f - alpha) * bkgd.x;
-        pix.y += (1.0f - alpha) * bkgd.y;
-        pix.z += (1.0f - alpha) * bkgd.z;
-    }
-    pix.w = alpha;
+    pix.x += (1.0f - alpha) * bkgd.x;
+    pix.y += (1.0f - alpha) * bkgd.y;
+    pix.z += (1.0f - alpha) * bkgd.z;
 
-    // Clamp values. I think this is superfluous, but I'm not certain.
     pix.x = fminf(1.0f, pix.x);
     pix.y = fminf(1.0f, pix.y);
     pix.z = fminf(1.0f, pix.z);
+    pix.w = alpha;
 
     pixbuf[i] = pix;
 }
 
 
 #define W 15        // Filter width (regardless of standard deviation chosen)
-#define W2 7       // Half of filter width, rounded down
+#define W2 7        // Half of filter width, rounded down
 #define FW 46       // Width of local result storage (NW+W2+W2)
 #define FW2 (FW*FW)
 
@@ -288,52 +291,45 @@ class Filtering(object):
     def __init__(self):
         self.init_mod()
 
-    def de(self, ddst, dsrc, info, start, stop, stream=None):
-        # TODO: use integration to obtain parameter values
-        t = (start + stop) / 2
-        cp = info.genome
-
-        k1 = np.float32(cp.color.brightness(t) * 268 / 256)
+    def de(self, ddst, dsrc, gnm, dim, tc, stream=None):
+        k1 = f32(gnm.color.brightness(tc) * 268 / 256)
         # Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
         # s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
-        area = info.height / (cp.camera.scale(t) ** 2 * info.width)
-        k2 = np.float32(1 / (area * info.density))
+        area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w)
+        k2 = f32(1 / (area * gnm.spp(tc)))
 
-        if cp.de.radius == 0:
-            nbins = info.acc_height * info.acc_stride
+        if gnm.de.radius == 0:
+            nbins = dim.ah * dim.astride
             fun = self.mod.get_function("logscale")
             t = fun(dsrc, ddst, k1, k2,
                     block=(512, 1, 1), grid=(nbins/512, 1), stream=stream)
         else:
-            scale_coeff = np.float32(-(1 + cp.de.radius(t)) ** -2.0)
-            est_curve = np.float32(2 * cp.de.curve(t))
+            scale_coeff = f32(-(1 + gnm.de.radius(tc)) ** -2.0)
+            est_curve = f32(2 * gnm.de.curve(tc))
             # TODO: experiment with this
-            edge_clamp = np.float32(2.0)
+            edge_clamp = f32(1.2)
             fun = self.mod.get_function("density_est")
             fun(dsrc, ddst, scale_coeff, est_curve, edge_clamp, k1, k2,
-                np.int32(info.acc_height), np.int32(info.acc_stride),
-                block=(32, 32, 1), grid=(info.acc_width/32, 1), stream=stream)
+                i32(dim.ah), i32(dim.astride), block=(32, 32, 1),
+                grid=(dim.aw/32, 1), stream=stream)
 
-    def colorclip(self, dbuf, info, start, stop, stream=None):
-        f32 = np.float32
-        t = (start + stop) / 2
-        cp = info.genome
-        nbins = info.acc_height * info.acc_stride
+    def colorclip(self, dbuf, gnm, dim, tc, blend, stream=None):
+        nbins = dim.ah * dim.astride
 
         # TODO: implement integration over cubic splines?
-        gam = f32(1 / cp.color.gamma(t))
-        vib = f32(cp.color.vibrancy(t))
-        hipow = f32(cp.color.highlight_power(t))
-        lin = f32(cp.color.gamma_threshold(t))
+        gam = f32(1 / gnm.color.gamma(tc))
+        vib = f32(gnm.color.vibrancy(tc))
+        hipow = f32(gnm.color.highlight_power(tc))
+        lin = f32(gnm.color.gamma_threshold(tc))
         lingam = f32(lin ** (gam-1.0) if lin > 0 else 0)
         bkgd = vec.make_float3(
-                cp.color.background.r(t),
-                cp.color.background.g(t),
-                cp.color.background.b(t))
+                gnm.color.background.r(tc),
+                gnm.color.background.g(tc),
+                gnm.color.background.b(tc))
 
         color_fun = self.mod.get_function("colorclip")
         blocks = int(np.ceil(np.sqrt(nbins / 256)))
-        color_fun(dbuf, gam, vib, hipow, lin, lingam, bkgd, np.int32(nbins),
-                  np.int32(0),
-                  block=(256, 1, 1), grid=(blocks, blocks), stream=stream)
+        color_fun(dbuf, gam, vib, hipow, lin, lingam, bkgd, i32(nbins),
+                  i32(blend), block=(256, 1, 1), grid=(blocks, blocks),
+                  stream=stream)
 
diff --git a/cuburn/code/interp.py b/cuburn/code/interp.py
index 5b4961f..d363ac1 100644
--- a/cuburn/code/interp.py
+++ b/cuburn/code/interp.py
@@ -226,8 +226,11 @@ class GenomePacker(HunkOCode):
     _defs = Template(r"""
 
 __global__
-void interp_{{tname}}({{tname}}* out, float *times, float *knots,
-        float tstart, float tstep, mwc_st *rctxes, int maxid) {
+void interp_{{tname}}(
+        {{tname}}* out, mwc_st *rctxes,
+        const float *times, const float *knots,
+        float tstart, float tstep, int maxid
+) {
     int id = gtid();
     if (id >= maxid) return;
     out = &out[id];
diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py
index 30e0e02..549e835 100644
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@@ -53,7 +53,7 @@ def precalc_chaos(pcp, std_xforms):
 
     """).substitute(locals()))
 
-def precalc_camera(info, pcam):
+def precalc_camera(pcam):
     pre_cam = pcam._precalc()
 
     # Maxima code to check my logic:
@@ -68,7 +68,7 @@ def precalc_camera(info, pcam):
         float rot = {{pre_cam.rotation}} * M_PI / 180.0f;
         float rotsin = sin(rot), rotcos = cos(rot);
         float cenx = {{pre_cam.center.x}}, ceny = {{pre_cam.center.y}};
-        float scale = {{pre_cam.scale}} * {{info.width}};
+        float scale = {{pre_cam.scale}} * acc_size.width;
 
         float ditherwidth = {{pre_cam.dither_width}} * 0.33f;
         float u0 = mwc_next_01(rctx);
@@ -81,12 +81,12 @@ def precalc_camera(info, pcam):
         {{pre_cam._set('xx')}} = scale * rotcos;
         {{pre_cam._set('xy')}} = scale * -rotsin;
         {{pre_cam._set('xo')}} = scale * (rotsin * ceny - rotcos * cenx)
-                              + {{0.5 * (info.width + info.gutter + 1)}} + ditherx;
+                               + 0.5f * acc_size.awidth + ditherx;
 
         {{pre_cam._set('yx')}} = scale * rotsin;
         {{pre_cam._set('yy')}} = scale * rotcos;
         {{pre_cam._set('yo')}} = scale * -(rotsin * cenx + rotcos * ceny)
-                              + {{0.5 * (info.height + info.gutter + 1)}} + dithery;
+                               + 0.5f * acc_size.aheight + dithery;
 
     """).substitute(locals()))
 
@@ -113,13 +113,12 @@ class IterCode(HunkOCode):
     # The number of threads per block
     NTHREADS = 256
 
-    def __init__(self, info):
-        self.info = info
+    def __init__(self, info, genome):
         self.packer = interp.GenomePacker('iter_params')
-        self.pcp = self.packer.view('params', self.info.genome, 'cp')
+        self.pcp = self.packer.view('params', genome, 'cp')
 
-        iterbody = self._iterbody()
-        bodies = [self._xfbody(i,x) for i,x in sorted(info.genome.xforms.items())]
+        iterbody = self._iterbody(info, genome)
+        bodies = [self._xfbody(i,x) for i,x in sorted(genome.xforms.items())]
         bodies.append(iterbody)
         self.defs = '\n'.join(bodies)
 
@@ -132,7 +131,9 @@ __device__ int rb_head, rb_tail, rb_size;
 typedef struct {
     uint32_t width;
     uint32_t height;
-    uint32_t stride;
+    uint32_t awidth;
+    uint32_t aheight;
+    uint32_t astride;
 } acc_size_t;
 __constant__ acc_size_t acc_size;
 
@@ -174,7 +175,7 @@ void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
         g.update(locals())
         return tmpl.substitute(g)
 
-    def _iterbody(self):
+    def _iterbody(self, info, genome):
         tmpl = Template(r'''
 
 __global__ void reset_rb(int size) {
@@ -331,7 +332,7 @@ void iter(
 
         float cx, cy, cc;
 
-        {{precalc_camera(info, pcp.camera)}}
+        {{precalc_camera(pcp.camera)}}
 
 {{if 'final' in cp.xforms}}
         {{apply_affine('fx', 'fy', 'cx', 'cy', pcp.camera)}}
@@ -343,14 +344,14 @@ void iter(
 
         uint32_t ix = trunca(cx), iy = trunca(cy);
 
-        if (ix >= acc_size.width || iy >= acc_size.height) {
+        if (ix >= acc_size.awidth || iy >= acc_size.aheight) {
 {{if info.acc_mode == 'deferred'}}
             *log = 0xffffffff;
 {{endif}}
             continue;
         }
 
-        uint32_t i = iy * acc_size.stride + ix;
+        uint32_t i = iy * acc_size.astride + ix;
 {{if info.acc_mode == 'atomic'}}
         asm volatile ({{crep("""
 {
@@ -626,12 +627,11 @@ oflow_write_end:
 {{endif}}
 ''', 'iter_kern')
         return tmpl.substitute(
-                info = self.info,
-                cp = self.info.genome,
+                info = info,
+                cp = genome,
                 pcp = self.pcp,
                 NTHREADS = self.NTHREADS,
                 NWARPS = self.NTHREADS / 32,
-                std_xforms = [n for n in sorted(self.info.genome.xforms)
-                              if n != 'final'],
+                std_xforms = [n for n in sorted(genome.xforms) if n != 'final'],
                 **globals())
 
diff --git a/cuburn/genome.py b/cuburn/genome.py
index 8386bf7..2d9e1de 100644
--- a/cuburn/genome.py
+++ b/cuburn/genome.py
@@ -79,115 +79,86 @@ class SplEval(object):
             return self.knots[1][0]
         return list(self.knots.T.flat)
 
-    @classmethod
-    def wrap(cls, obj):
-        """
-        Given a dict 'obj' representing, for instance, a Genome object, walk
-        through the object recursively and in-place, turning any number or
-        list of numbers into an SplEval.
-        """
-        for k, v in obj.items():
-            if (isinstance(v, (float, int)) or
-                    (isinstance(v, list) and isinstance(v[1], (float, int)))):
-                obj[k] = cls(v)
-            elif isinstance(v, dict):
-                cls.wrap(v)
-
-class RenderInfo(object):
-    """
-    Determine features and constants required to render a particular set of
-    genomes. The values of this class are fixed before compilation begins.
-    """
-    # Number of iterations to iterate without write after generating a new
-    # point. This number is currently fixed pretty deeply in the set of magic
-    # constants which govern buffer sizes; changing the value here won't
-    # actually change the code on the device to do something different.
-    fuse = 256
-
-    # Height of the texture pallete which gets uploaded to the GPU (assuming
-    # that palette-from-texture is enabled). For most genomes, this doesn't
-    # need to be very large at all. However, since only an easily-cached
-    # fraction of this will be accessed per SM, larger values shouldn't hurt
-    # performance too much. When using deferred accumulation, increasing this
-    # value increases the number of uniquely-dithered samples, which is nice.
-    # Power-of-two, please.
-    palette_height = 64
-
-    # Maximum width of DE and other spatial filters, and thus in turn the
-    # amount of padding applied. Note that, for now, this must not be changed!
-    # The filtering code makes deep assumptions about this value.
-    gutter = 15
-
-    # TODO: for now, we always throw away the alpha channel before writing.
-    # All code is in place to not do this, we just need to find a way to expose
-    # this preference via the API (or push alpha blending entirely on the client,
-    # which I'm not opposed to)
-    alpha_output_channel = False
-
-    # There are three settings for this somewhat ersatz paramater. 'global'
-    # uses unsynchronized global writes to accumulate sample points, 'atomic'
-    # uses atomic global writes, and 'deferred' stores color and position in a
-    # sample log, sorts the log by position, and uses shared memory to
-    # perform the accumulation. Deferred has the accuracy of 'atomic' and
-    # the speed of 'global' (it's actually faster!), but packs color and
-    # position into a single 32-bit int for now, which limits resolution to
-    # 1080p when xform opacity is respected, so the other two modes will hang
-    # around until that can be extended to be memory-limited again.
-    acc_mode = 'atomic'
-
-    # TODO: fix this
-    chaos_used = False
-
-    def __init__(self, db, **kwargs):
-        self.db = db
-        # Copy all args into this object's namespace
-        self.__dict__.update(kwargs)
-
-        self.acc_width = self.width + 2 * self.gutter
-        self.acc_height = self.height + 2 * self.gutter
-        self.acc_stride = 32 * int(np.ceil(self.acc_width / 32.))
-        self.density = self.quality
-
-        # Deref genome
-        self.genome = self.db.genomes[self.genome]
-
-        for k, v in self.db.palettes.items():
-            pal = np.fromstring(base64.b64decode(v), np.uint8)
-            pal = np.reshape(pal, (256, 3))
-            pal_a = np.ones((256, 4), np.float32)
-            pal_a[:,:3] = pal / 255.0
-            self.db.palettes[k] = pal_a
+class Palette(object):
+    """Wafer-thin wrapper around palettes. For the future!"""
+    def __init__(self, datastr, fmt='rgb8'):
+        if fmt != 'rgb8':
+            raise NotImplementedError
+        if len(datastr) != 768:
+            raise ValueError("Unsupported palette width")
+        self.width = 256
+        pal = np.reshape(np.fromstring(datastr, np.uint8), (256, 3))
+        self.data = np.ones((256, 4), np.float32)
+        self.data[:,:3] = pal / 255.0
 
 class _AttrDict(dict):
     def __getattr__(self, name):
         return self[name]
 
-def load_info(contents):
-    result = json.loads(contents, object_hook=_AttrDict)
-    SplEval.wrap(result.genomes)
+    @classmethod
+    def _wrap(cls, dct):
+        for k, v in dct.items():
+            if (isinstance(v, (float, int)) or
+                    (isinstance(v, list) and isinstance(v[1], (float, int)))):
+                dct[k] = SplEval(v)
+            elif isinstance(v, dict):
+                dct[k] = cls._wrap(cls(v))
+        return dct
 
-    # A Job object will have more details or something
-    result = RenderInfo(result, **result.renders.values()[0])
-    return result
+class Genome(_AttrDict):
+    # For now, we base the Genome class on an _AttrDict, letting its structure
+    # be defined implicitly by the way it is used in device code. More formal
+    # unpacking will happen soon.
+    def __init__(self, gnm, base_den):
+        super(Genome, self).__init__(gnm)
+        for k, v in self.items():
+            v = _AttrDict(v)
+            if k not in ('info', 'time'):
+                _AttrDict._wrap(v)
+            self[k] = v
+        # TODO: this is a hack, figure out how to solve it more elegantly
+        self.spp = SplEval(self.camera.density.knotlist)
+        self.spp.knots[1] *= base_den
+        # TODO: decide how to handle palettes. For now, it's the caller's
+        # responsibility to replace this list with actual palettes.
+        pal = self.color.palette
+        if isinstance(pal, basestring):
+            self.color.palette = [(0.0, pal), (1.0, pal)]
+        elif isinstance(pal, list):
+            self.color.palette = zip(pal[::2], pal[1::2])
 
-class HacketyGenome(object):
-    """
-    Holdover class to postpone a very deep refactoring as long as possible.
-    Converts property accesses into interpolations over predetermined times.
-    """
-    def __init__(self, referent, times):
-        # Times can be singular
-        self.referent, self.times = referent, times
-    def __getattr__(self, name):
-        r = getattr(self.referent, str(name))
-        if isinstance(r, _AttrDict):
-            return HacketyGenome(r, self.times)
-        elif isinstance(r, SplEval):
-            return r(self.times)
-        return r
-    __getitem__ = __getattr__
+        # TODO: caller also needs to call set_timing()
+        self.adj_frame_width = None
+        self.canonical_right = (not self.get('link') or not self.link == 'self'
+                                or not self.link.get('right'))
 
-if __name__ == "__main__":
-    import sys
-    import pprint
-    pprint.pprint(read_genome(sys.stdin))
+    def set_timing(self, base_dur, fps, offset=0.0, err_spread=True):
+        """
+        Set frame timing. Must be called at least once prior to rendering.
+        """
+        # TODO: test!
+        dur = self.time.duration
+        if isinstance(dur, basestring):
+            clock = float(dur[:-1]) + offset
+        else:
+            clock = dur * base_dur + offset
+        if self.canonical_right:
+            nframes = int(np.floor(clock * fps))
+        else:
+            nframes = int(np.ceil(clock * fps))
+        err = (clock - nframes / fps) / clock
+
+        fw = self.time.frame_width
+        if not isinstance(fw, list):
+            fw = [0, fw, 1, fw]
+        fw = [float(f[:-1]) * fps if isinstance(f, basestring)
+              else float(f) / (clock * fps) for f in fw]
+        self.adj_frame_width = SplEval(fw)
+
+        times = np.linspace(offset, 1 - err, nframes + 1)
+        # Move each time to a center time, and discard the last value
+        times = times[:-1] + 0.5 * (times[1] - times[0])
+        if err_spread:
+            epts = np.linspace(-2*np.pi, 2*np.pi, nframes)
+            times = times + 0.5 * err * (np.tanh(epts) + 1)
+        return err, times
diff --git a/cuburn/render.py b/cuburn/render.py
index 977a73a..8b4f2bb 100644
--- a/cuburn/render.py
+++ b/cuburn/render.py
@@ -4,11 +4,11 @@ import re
 import time as timemod
 import tempfile
 from collections import namedtuple
-from itertools import cycle, repeat, chain, izip
+from itertools import cycle, repeat, chain, izip, imap, ifilter
 from ctypes import *
 from cStringIO import StringIO
 import numpy as np
-from numpy import int32 as i32, uint64 as u64
+from numpy import float32 as f32, int32 as i32, uint32 as u32, uint64 as u64
 from scipy import ndimage
 
 from fr0stlib import pyflam3
@@ -24,41 +24,55 @@ from cuburn import affine
 from cuburn.code import util, mwc, iter, filtering, sort
 
 RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time')
+Dimensions = namedtuple('Dimensions', 'w h aw ah astride')
 
 def _sync_stream(dst, src):
     dst.wait_for_event(cuda.Event(cuda.event_flags.DISABLE_TIMING).record(src))
 
+def argset(obj, **kwargs):
+    for k, v in kwargs.items():
+        setattr(obj, k, v)
+    return obj
+
 class Renderer(object):
     """
     Control structure for rendering a series of frames.
-
-    Each animation will dynamically generate a kernel that includes only the
-    code necessary to render the genomes provided. The process of generating
-    and uploading the kernel takes a small but finite amount of time. In
-    general, the kernel generated for all genomes resulting from interpolating
-    between two control points will have identical performance, so it is
-    wasteful to create more than one animation for any interpolated sequence.
-
-    However, genome sequences interpolated from three or more control points
-    with different features enabled will have the code needed to render all
-    genomes enabled for every frame. Doing this can hurt performance.
-
-    In other words, it's best to use exactly one Animation for each
-    interpolated sequence between one or two genomes.
     """
 
+    # Number of iterations to iterate without write after generating a new
+    # point. This number is currently fixed pretty deeply in the set of magic
+    # constants which govern buffer sizes; changing the value here won't
+    # actually change the code on the device to do something different.
+    fuse = 256
+
+    # The palette texture/surface covers the color coordinate from [0,1] with
+    # (for now, a fixed 256) equidistant horizontal samples, and spans the
+    # temporal range of the frame linearly with this many rows. Increasing
+    # this value increases the number of uniquely-dithered samples when using
+    # pre-dithered surfaces.
+    palette_height = 64
+
+    # Maximum width of DE and other spatial filters, and thus in turn the
+    # amount of padding applied. Note that, for now, this must not be changed!
+    # The filtering code makes deep assumptions about this value.
+    gutter = 15
+
+    # Accumulation mode. Leave it at 'atomic' for now.
+    acc_mode = 'atomic'
+
+    # TODO
+    chaos_used = False
+
     cmp_options = ('-use_fast_math', '-maxrregcount', '42')
     keep = False
 
-    def __init__(self, info):
-        self.info = info
+    def __init__(self):
         self._iter = self.src = self.cubin = self.mod = None
-        self.packed_genome = None
 
         # Ensure class options don't get contaminated on an instance
         self.cmp_options = list(self.cmp_options)
 
-    def compile(self, keep=None, cmp_options=None, jit_options=[]):
+    def compile(self, genome, keep=None, cmp_options=None):
         """
         Compile a kernel capable of rendering every frame in this animation.
         The resulting compiled kernel is stored in the ``cubin`` property;
@@ -73,7 +87,7 @@ class Renderer(object):
         keep = self.keep if keep is None else keep
         cmp_options = self.cmp_options if cmp_options is None else cmp_options
 
-        self._iter = iter.IterCode(self.info)
+        self._iter = iter.IterCode(self, genome)
         self._iter.packer.finalize()
         self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer,
                                       self._iter)
@@ -82,41 +96,81 @@ class Renderer(object):
         self.cubin = pycuda.compiler.compile(
                 self.src, keep=keep, options=cmp_options,
                 cache_dir=False if keep else None)
+
+    def load(self, genome, jit_options=[]):
+        if not self.cubin:
+            self.compile(genome)
         self.mod = cuda.module_from_buffer(self.cubin, jit_options)
         with open('/tmp/iter_kern.cubin', 'wb') as fp:
             fp.write(self.cubin)
         return self.src
 
-    def render(self, times):
+    def render(self, genome, times, width, height, blend=True):
         """
-        Render a flame for each genome in the iterable value 'genomes'.
-        Returns a RenderedImage object with the rendered buffer in the
-        requested format (3D RGBA ndarray only for now).
+        Render a frame for each timestamp in the iterable value ``times``. This
+        function returns a generator that will yield a RenderedImage object
+        containing a shared reference to the output buffer for each specified
+        frame.
 
-        This method produces a considerable amount of side effects, and should
-        not be used lightly. Things may go poorly for you if this method is not
-        allowed to run until completion (by exhausting all items in the
-        generator object).
+        The returned buffer is page-locked host memory. Between the time a
+        buffer is yielded and the time the next frame's results are requested,
+        the buffer will not be modified. Thereafter, however, it will be
+        overwritten by an asynchronous DMA operation coming from the CUDA
+        device. If you hang on to it for longer than one frame, copy it.
 
-        ``times`` is a sequence of (idx, start, stop) times, where index is
-        the logical frame number (though it can be any value) and 'start' and
-        'stop' together define the time range to be rendered for each frame.
+        ``genome`` is the genome to be rendered. Successive calls to the
+        `render()` method on one ``Renderer`` object must use genomes which
+        produce identical compiled code, and this will not be verified by the
+        renderer. In practice, this means you can alter genome parameter
+        values, but the full set of keys must remain identical between runs on
+        the same renderer.
+
+        ``times`` is a list of (idx, cen_time) tuples, where ``idx`` is passed
+        unmodified in the RenderedImage return value and ``cen_time`` is the
+        central time of the current frame in spline-time units. (Any
+        clock-time or frame-time units in the genome should be preconverted.)
+
+        If ``blend`` is False, the output buffer will contain unclipped,
+        premultiplied RGBA data, without vibrancy, highlight power, or the
+        alpha elbow applied.
         """
-        if times == []:
+        r = self.render_gen(genome, width, height, blend=blend)
+        next(r)
+        return ifilter(None, imap(r.send, chain(times, [None])))
+
+    def render_gen(self, genome, width, height, blend=True):
+        """
+        Render frames. This method is wrapped by the ``render()`` method; see
+        its docstring for warnings and details.
+
+        Instead of passing frame times as an iterable, they are passed
+        individually via the ``generator.send()`` method. There is an
+        internal pipeline latency of one frame, so the first call to the
+        ``send()`` method will return None, the second call will return the
+        first frame's result, and so on. To retrieve the last frame in a
+        sequence, send ``None``.
+
+        Direct use of this method is useful for implementing render servers.
+        """
+
+        last_idx = None
+        next_frame = yield
+        if next_frame is None:
             return
 
+        if not self.mod:
+            self.load(genome)
+
         filt = filtering.Filtering()
 
         reset_rb_fun = self.mod.get_function("reset_rb")
         packer_fun = self.mod.get_function("interp_iter_params")
         iter_fun = self.mod.get_function("iter")
 
-        info = self.info
-
         # The synchronization model is messy. See helpers/task_model.svg.
         iter_stream = cuda.Stream()
         filt_stream = cuda.Stream()
-        if info.acc_mode == 'deferred':
+        if self.acc_mode == 'deferred':
             write_stream = cuda.Stream()
             write_fun = self.mod.get_function("write_shmem")
         else:
@@ -128,19 +182,30 @@ class Renderer(object):
         event_a = cuda.Event().record(filt_stream)
         event_b = None
 
-        nbins = info.acc_height * info.acc_stride
+        awidth = width + 2 * self.gutter
+        aheight = height + 2 * self.gutter
+        astride = 32 * int(np.ceil(awidth / 32.))
+        dim = Dimensions(width, height, awidth, aheight, astride)
+        d_acc_size = self.mod.get_global('acc_size')[0]
+        cuda.memcpy_htod_async(d_acc_size, u32(list(dim)), write_stream)
+
+        nbins = awidth * aheight
         # Extra padding in accum helps with write_shmem overruns
         d_accum = cuda.mem_alloc(16 * nbins + (1<<16))
         d_out = cuda.mem_alloc(16 * nbins)
-        if info.acc_mode == 'atomic':
+        if self.acc_mode == 'atomic':
             d_atom = cuda.mem_alloc(8 * nbins)
             flush_fun = self.mod.get_function("flush_atom")
 
-        acc_size = np.array([info.acc_width, info.acc_height, info.acc_stride])
-        d_acc_size = self.mod.get_global('acc_size')[0]
-        cuda.memcpy_htod_async(d_acc_size, np.uint32(acc_size), write_stream)
+        obuf_copy = argset(cuda.Memcpy2D(),
+            src_y=self.gutter, src_x_in_bytes=16*self.gutter,
+            src_pitch=16*astride, dst_pitch=16*width,
+            width_in_bytes=16*width, height=height)
+        obuf_copy.set_src_device(d_out)
+        h_out_a = cuda.pagelocked_empty((height, width, 4), f32)
+        h_out_b = cuda.pagelocked_empty((height, width, 4), f32)
 
-        if info.acc_mode == 'deferred':
+        if self.acc_mode == 'deferred':
             # Having a fixed, power-of-two log size makes things much easier
             log_size = 64 << 20
             d_log = cuda.mem_alloc(log_size * 4)
@@ -153,9 +218,8 @@ class Renderer(object):
         # Calculate 'nslots', the number of simultaneous running threads that
         # can be active on the GPU during iteration (and thus the number of
         # slots for loading and storing RNG and point context that will be
-        # prepared on the device), 'rb_size' (the number of blocks in
-        # 'nslots'), and determine a number of temporal samples
-        # likely to load-balance effectively
+        # prepared on the device), and derive 'rb_size', the number of blocks in
+        # 'nslots'.
         iter_threads_per_block = 256
         dev_data = pycuda.tools.DeviceData()
         occupancy = pycuda.tools.OccupancyRecord(
@@ -169,14 +233,16 @@ class Renderer(object):
         reset_rb_fun(np.int32(rb_size), block=(1,1,1))
 
         d_points = cuda.mem_alloc(nslots * 16)
-        # We may add extra seeds to simplify palette dithering.
-        seeds = mwc.MWC.make_seeds(max(nslots, 256 * info.palette_height))
+        # This statement may add extra seeds to simplify palette dithering.
+        seeds = mwc.MWC.make_seeds(max(nslots, 256 * self.palette_height))
         d_seeds = cuda.to_device(seeds)
 
         # We used to auto-calculate this to a multiple of the number of SMs on
         # the device, but since we now use shorter launches and, to a certain
         # extent, allow simultaneous occupancy, that's not as important. The
-        # 1024 is a magic constant, though: FUSE
+        # 1024 is a magic constant to ensure reasonable and power-of-two log
+        # size for deferred: 256MB / (4B * FUSE * NTHREADS). Enhancements to
+        # the sort engine are needed to make this more flexible.
         ntemporal_samples = 1024
         genome_times, genome_knots = self._iter.packer.pack()
         d_genome_times = cuda.to_device(genome_times)
@@ -184,37 +250,31 @@ class Renderer(object):
         info_size = 4 * len(self._iter.packer) * ntemporal_samples
         d_infos = cuda.mem_alloc(info_size)
 
-        pals = info.genome.color.palette
+        pals = genome.color.palette
         if isinstance(pals, basestring):
             pals = [0.0, pals, 1.0, pals]
-        palint_times = np.empty(len(genome_times[0]), np.float32)
+        palint_times = np.empty(len(genome_times[0]), f32)
         palint_times.fill(100.0)
-        palint_times[:len(pals)/2] = pals[::2]
+        palint_times[:len(pals)] = [p[0] for p in pals]
         d_palint_times = cuda.to_device(palint_times)
         d_palint_vals = cuda.to_device(
-                np.concatenate(map(info.db.palettes.get, pals[1::2])))
+                np.concatenate([p[1].data for p in pals]))
 
-        if info.acc_mode in ('deferred', 'atomic'):
+        if self.acc_mode in ('deferred', 'atomic'):
             palette_fun = self.mod.get_function("interp_palette_hsv_flat")
-            dsc = cuda.ArrayDescriptor3D()
-            dsc.height = info.palette_height
-            dsc.width = 256
-            dsc.depth = 0
-            dsc.format = cuda.array_format.SIGNED_INT32
-            dsc.num_channels = 2
-            dsc.flags = cuda.array3d_flags.SURFACE_LDST
+            dsc = argset(cuda.ArrayDescriptor3D(), height=self.palette_height,
+                    width=256, depth=0, format=cuda.array_format.SIGNED_INT32,
+                    num_channels=2, flags=cuda.array3d_flags.SURFACE_LDST)
             palarray = cuda.Array(dsc)
 
             tref = self.mod.get_surfref('flatpal')
             tref.set_array(palarray, 0)
         else:
             palette_fun = self.mod.get_function("interp_palette_hsv")
-            dsc = cuda.ArrayDescriptor()
-            dsc.height = info.palette_height
-            dsc.width = 256
-            dsc.format = cuda.array_format.UNSIGNED_INT8
-            dsc.num_channels = 4
-            d_palmem = cuda.mem_alloc(256 * info.palette_height * 4)
+            dsc = argset(cuda.ArrayDescriptor(), height=self.palette_height,
+                    width=256, format=cuda.array_format.UNSIGNED_INT8,
+                    num_channels=4)
+            d_palmem = cuda.mem_alloc(256 * self.palette_height * 4)
 
             tref = self.mod.get_texref('palTex')
             tref.set_address_2d(d_palmem, dsc, 1024)
@@ -222,49 +282,46 @@ class Renderer(object):
             tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
             tref.set_filter_mode(cuda.filter_mode.LINEAR)
 
-        h_out_a = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4),
-                                        np.float32)
-        h_out_b = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4),
-                                        np.float32)
-        last_idx = None
+        while next_frame is not None:
+            # tc, td, ts, te: central, delta, start, end times
+            idx, tc = next_frame
+            td = genome.adj_frame_width(tc)
+            ts, te = tc - 0.5 * td, tc + 0.5 * td
 
-        for idx, start, stop in times:
-            twidth = np.float32((stop-start) / info.palette_height)
-            if info.acc_mode in ('deferred', 'atomic'):
-                palette_fun(d_seeds, d_palint_times, d_palint_vals,
-                            np.float32(start), twidth,
-                            block=(256,1,1), grid=(info.palette_height,1),
-                            stream=write_stream)
+            if self.acc_mode in ('deferred', 'atomic'):
+                # In this mode, the palette writes to a surface reference, but
+                # requires dithering, so we pass it the seeds instead
+                arg0 = d_seeds
             else:
-                palette_fun(d_palmem, d_palint_times, d_palint_vals,
-                            np.float32(start), twidth,
-                            block=(256,1,1), grid=(info.palette_height,1),
-                            stream=write_stream)
+                arg0 = d_palmem
+            palette_fun(arg0, d_palint_times, d_palint_vals,
+                        f32(ts), f32(td / self.palette_height),
+                        block=(256,1,1), grid=(self.palette_height,1),
+                        stream=write_stream)
 
-            width = np.float32((stop-start) / ntemporal_samples)
-            packer_fun(d_infos, d_genome_times, d_genome_knots,
-                       np.float32(start), width, d_seeds,
-                       np.int32(ntemporal_samples), block=(256,1,1),
+            packer_fun(d_infos, d_seeds, d_genome_times, d_genome_knots,
+                       f32(ts), f32(td / ntemporal_samples),
+                       i32(ntemporal_samples), block=(256,1,1),
                        grid=(int(np.ceil(ntemporal_samples/256.)),1),
                        stream=iter_stream)
 
             # Reset points so that they will be FUSEd
             util.BaseCode.fill_dptr(self.mod, d_points, 4 * nslots,
-                                    iter_stream, np.float32(np.nan))
+                                    iter_stream, f32(np.nan))
 
             # Get interpolated control points for debugging
             #iter_stream.synchronize()
             #d_temp = cuda.from_device(d_infos,
-                    #(ntemporal_samples, len(self._iter.packer)), np.float32)
+                    #(ntemporal_samples, len(self._iter.packer)), f32)
             #for i, n in zip(d_temp[5], self._iter.packer.packed):
                 #print '%60s %g' % ('_'.join(n), i)
 
             util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, write_stream)
-            if info.acc_mode == 'atomic':
+            if self.acc_mode == 'atomic':
                 util.BaseCode.fill_dptr(self.mod, d_atom, 2 * nbins, write_stream)
-            nrounds = ( (info.density * info.width * info.height)
-                      / (ntemporal_samples * 256 * 256) ) + 1
-            if info.acc_mode == 'deferred':
+            nrounds = int( (genome.spp(tc) * width * height)
+                         / (ntemporal_samples * 256 * 256) ) + 1
+            if self.acc_mode == 'deferred':
                 for i in range(nrounds):
                     iter_fun(np.uint64(d_log), d_seeds, d_points, d_infos,
                              block=(32, self._iter.NTHREADS/32, 1),
@@ -272,18 +329,17 @@ class Renderer(object):
                     _sync_stream(write_stream, iter_stream)
                     sorter.sort(d_log_sorted, d_log, log_size, 3, True,
                                 stream=write_stream)
-                    #print cuda.from_device(sorter.dglobal, (256,), np.uint32)
                     _sync_stream(iter_stream, write_stream)
                     write_fun(d_accum, d_log_sorted, sorter.dglobal, i32(nbins),
                               block=(1024, 1, 1), grid=(nwriteblocks, 1),
                               stream=write_stream)
             else:
                 args = [u64(d_accum), d_seeds, d_points, d_infos]
-                if info.acc_mode == 'atomic':
+                if self.acc_mode == 'atomic':
                     args.append(u64(d_atom))
                 iter_fun(*args, block=(32, self._iter.NTHREADS/32, 1),
                          grid=(ntemporal_samples, nrounds), stream=iter_stream)
-                if info.acc_mode == 'atomic':
+                if self.acc_mode == 'atomic':
                     nblocks = int(np.ceil(np.sqrt(nbins/float(512))))
                     flush_fun(u64(d_accum), u64(d_atom), i32(nbins),
                               block=(512, 1, 1), grid=(nblocks, nblocks),
@@ -291,27 +347,29 @@ class Renderer(object):
 
             util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
             _sync_stream(filt_stream, write_stream)
-            filt.de(d_out, d_accum, info, start, stop, filt_stream)
+            filt.de(d_out, d_accum, genome, dim, tc, stream=filt_stream)
             _sync_stream(write_stream, filt_stream)
-            filt.colorclip(d_out, info, start, stop, filt_stream)
-            cuda.memcpy_dtoh_async(h_out_a, d_out, filt_stream)
+            filt.colorclip(d_out, genome, dim, tc, blend, stream=filt_stream)
+            obuf_copy.set_dst_host(h_out_a)
+            obuf_copy(filt_stream)
 
             if event_b:
                 while not event_a.query():
                     timemod.sleep(0.01)
                 gpu_time = event_a.time_since(event_b)
-                yield RenderedImage(self._trim(h_out_b), last_idx, gpu_time)
+                result = RenderedImage(h_out_b, last_idx, gpu_time)
+            else:
+                result = None
+            last_idx = idx
 
             event_a, event_b = cuda.Event().record(filt_stream), event_a
             h_out_a, h_out_b = h_out_b, h_out_a
-            last_idx = idx
+
+            # TODO: add ability to flush a frame without breaking the pipe
+            next_frame = yield result
 
         while not event_a.query():
             timemod.sleep(0.001)
         gpu_time = event_a.time_since(event_b)
-        yield RenderedImage(self._trim(h_out_b), last_idx, gpu_time)
-
-    def _trim(self, result):
-        g = self.info.gutter
-        return result[g:-g,g:g+self.info.width].copy()
+        yield RenderedImage(h_out_b, last_idx, gpu_time)