New genome format to support flockutil

2026-03-18 07:50:49 -04:00 · 2011-12-15 11:11:05 -05:00
parent 12655b8611
commit b43481e374
5 changed files with 303 additions and 275 deletions
--- a/cuburn/code/filtering.py
+++ b/cuburn/code/filtering.py
@ -1,5 +1,7 @@
 import numpy as np
 from numpy import float32 as f32, int32 as i32
 import pycuda.compiler
 from pycuda.gpuarray import vec
@ -10,8 +12,8 @@ _CODE = '''
 __global__
 void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
-               float linrange, float lingam, float3 bkgd, int fbsize,
+               float linrange, float lingam, float3 bkgd,
-               int alpha_output_channel) {
+               int fbsize, int blend_background_color) {
    int i = threadIdx.x + blockDim.x * (blockIdx.x + gridDim.x * blockIdx.y);
    if (i >= fbsize) return;
@ -30,8 +32,17 @@ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
        alpha = (1.0f - frac) * pix.w * lingam + frac * alpha;
    }
-    float ls = vibrancy * alpha / pix.w;
+    if (!blend_background_color) {
        float ls = alpha / pix.w;
        pix.x *= ls;
        pix.y *= ls;
        pix.z *= ls;
        pix.w = alpha;
        pixbuf[i] = pix;
        return;
    }
    float ls = vibrancy * alpha / pix.w;
    alpha = fminf(1.0f, fmaxf(0.0f, alpha));
    float maxc = fmaxf(pix.x, fmaxf(pix.y, pix.z));
@ -64,29 +75,21 @@ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
    pix.y += (1.0f - vibrancy) * powf(opix.y, gamma);
    pix.z += (1.0f - vibrancy) * powf(opix.z, gamma);
-    if (alpha_output_channel) {
+    pix.x += (1.0f - alpha) * bkgd.x;
-        float one_alpha = 1.0f / alpha;
+    pix.y += (1.0f - alpha) * bkgd.y;
-        pix.x *= one_alpha;
+    pix.z += (1.0f - alpha) * bkgd.z;
        pix.y *= one_alpha;
        pix.z *= one_alpha;
    } else {
        pix.x += (1.0f - alpha) * bkgd.x;
        pix.y += (1.0f - alpha) * bkgd.y;
        pix.z += (1.0f - alpha) * bkgd.z;
    }
    pix.w = alpha;
    // Clamp values. I think this is superfluous, but I'm not certain.
    pix.x = fminf(1.0f, pix.x);
    pix.y = fminf(1.0f, pix.y);
    pix.z = fminf(1.0f, pix.z);
    pix.w = alpha;
    pixbuf[i] = pix;
 }
 #define W 15        // Filter width (regardless of standard deviation chosen)
-#define W2 7       // Half of filter width, rounded down
+#define W2 7        // Half of filter width, rounded down
 #define FW 46       // Width of local result storage (NW+W2+W2)
 #define FW2 (FW*FW)
@ -288,52 +291,45 @@ class Filtering(object):
    def __init__(self):
        self.init_mod()
-    def de(self, ddst, dsrc, info, start, stop, stream=None):
+    def de(self, ddst, dsrc, gnm, dim, tc, stream=None):
-        # TODO: use integration to obtain parameter values
+        k1 = f32(gnm.color.brightness(tc) * 268 / 256)
        t = (start + stop) / 2
        cp = info.genome
        k1 = np.float32(cp.color.brightness(t) * 268 / 256)
        # Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
        # s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
-        area = info.height / (cp.camera.scale(t) ** 2 * info.width)
+        area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w)
-        k2 = np.float32(1 / (area * info.density))
+        k2 = f32(1 / (area * gnm.spp(tc)))
-        if cp.de.radius == 0:
+        if gnm.de.radius == 0:
-            nbins = info.acc_height * info.acc_stride
+            nbins = dim.ah * dim.astride
            fun = self.mod.get_function("logscale")
            t = fun(dsrc, ddst, k1, k2,
                    block=(512, 1, 1), grid=(nbins/512, 1), stream=stream)
        else:
-            scale_coeff = np.float32(-(1 + cp.de.radius(t)) ** -2.0)
+            scale_coeff = f32(-(1 + gnm.de.radius(tc)) ** -2.0)
-            est_curve = np.float32(2 * cp.de.curve(t))
+            est_curve = f32(2 * gnm.de.curve(tc))
            # TODO: experiment with this
-            edge_clamp = np.float32(2.0)
+            edge_clamp = f32(1.2)
            fun = self.mod.get_function("density_est")
            fun(dsrc, ddst, scale_coeff, est_curve, edge_clamp, k1, k2,
-                np.int32(info.acc_height), np.int32(info.acc_stride),
+                i32(dim.ah), i32(dim.astride), block=(32, 32, 1),
-                block=(32, 32, 1), grid=(info.acc_width/32, 1), stream=stream)
+                grid=(dim.aw/32, 1), stream=stream)
-    def colorclip(self, dbuf, info, start, stop, stream=None):
+    def colorclip(self, dbuf, gnm, dim, tc, blend, stream=None):
-        f32 = np.float32
+        nbins = dim.ah * dim.astride
        t = (start + stop) / 2
        cp = info.genome
        nbins = info.acc_height * info.acc_stride
        # TODO: implement integration over cubic splines?
-        gam = f32(1 / cp.color.gamma(t))
+        gam = f32(1 / gnm.color.gamma(tc))
-        vib = f32(cp.color.vibrancy(t))
+        vib = f32(gnm.color.vibrancy(tc))
-        hipow = f32(cp.color.highlight_power(t))
+        hipow = f32(gnm.color.highlight_power(tc))
-        lin = f32(cp.color.gamma_threshold(t))
+        lin = f32(gnm.color.gamma_threshold(tc))
        lingam = f32(lin ** (gam-1.0) if lin > 0 else 0)
        bkgd = vec.make_float3(
-                cp.color.background.r(t),
+                gnm.color.background.r(tc),
-                cp.color.background.g(t),
+                gnm.color.background.g(tc),
-                cp.color.background.b(t))
+                gnm.color.background.b(tc))
        color_fun = self.mod.get_function("colorclip")
        blocks = int(np.ceil(np.sqrt(nbins / 256)))
-        color_fun(dbuf, gam, vib, hipow, lin, lingam, bkgd, np.int32(nbins),
+        color_fun(dbuf, gam, vib, hipow, lin, lingam, bkgd, i32(nbins),
-                  np.int32(0),
+                  i32(blend), block=(256, 1, 1), grid=(blocks, blocks),
-                  block=(256, 1, 1), grid=(blocks, blocks), stream=stream)
+                  stream=stream)
--- a/cuburn/code/interp.py
+++ b/cuburn/code/interp.py
@ -226,8 +226,11 @@ class GenomePacker(HunkOCode):
    _defs = Template(r"""
 __global__
-void interp_{{tname}}({{tname}}* out, float *times, float *knots,
+void interp_{{tname}}(
-        float tstart, float tstep, mwc_st *rctxes, int maxid) {
+        {{tname}}* out, mwc_st *rctxes,
        const float *times, const float *knots,
        float tstart, float tstep, int maxid
 ) {
    int id = gtid();
    if (id >= maxid) return;
    out = &out[id];
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@ -53,7 +53,7 @@ def precalc_chaos(pcp, std_xforms):
    """).substitute(locals()))
-def precalc_camera(info, pcam):
+def precalc_camera(pcam):
    pre_cam = pcam._precalc()
    # Maxima code to check my logic:
@ -68,7 +68,7 @@ def precalc_camera(info, pcam):
        float rot = {{pre_cam.rotation}} * M_PI / 180.0f;
        float rotsin = sin(rot), rotcos = cos(rot);
        float cenx = {{pre_cam.center.x}}, ceny = {{pre_cam.center.y}};
-        float scale = {{pre_cam.scale}} * {{info.width}};
+        float scale = {{pre_cam.scale}} * acc_size.width;
        float ditherwidth = {{pre_cam.dither_width}} * 0.33f;
        float u0 = mwc_next_01(rctx);
@ -81,12 +81,12 @@ def precalc_camera(info, pcam):
        {{pre_cam._set('xx')}} = scale * rotcos;
        {{pre_cam._set('xy')}} = scale * -rotsin;
        {{pre_cam._set('xo')}} = scale * (rotsin * ceny - rotcos * cenx)
-                              + {{0.5 * (info.width + info.gutter + 1)}} + ditherx;
+                               + 0.5f * acc_size.awidth + ditherx;
        {{pre_cam._set('yx')}} = scale * rotsin;
        {{pre_cam._set('yy')}} = scale * rotcos;
        {{pre_cam._set('yo')}} = scale * -(rotsin * cenx + rotcos * ceny)
-                              + {{0.5 * (info.height + info.gutter + 1)}} + dithery;
+                               + 0.5f * acc_size.aheight + dithery;
    """).substitute(locals()))
@ -113,13 +113,12 @@ class IterCode(HunkOCode):
    # The number of threads per block
    NTHREADS = 256
-    def __init__(self, info):
+    def __init__(self, info, genome):
        self.info = info
        self.packer = interp.GenomePacker('iter_params')
-        self.pcp = self.packer.view('params', self.info.genome, 'cp')
+        self.pcp = self.packer.view('params', genome, 'cp')
-        iterbody = self._iterbody()
+        iterbody = self._iterbody(info, genome)
-        bodies = [self._xfbody(i,x) for i,x in sorted(info.genome.xforms.items())]
+        bodies = [self._xfbody(i,x) for i,x in sorted(genome.xforms.items())]
        bodies.append(iterbody)
        self.defs = '\n'.join(bodies)
@ -132,7 +131,9 @@ __device__ int rb_head, rb_tail, rb_size;
 typedef struct {
    uint32_t width;
    uint32_t height;
-    uint32_t stride;
+    uint32_t awidth;
    uint32_t aheight;
    uint32_t astride;
 } acc_size_t;
 __constant__ acc_size_t acc_size;
@ -174,7 +175,7 @@ void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
        g.update(locals())
        return tmpl.substitute(g)
-    def _iterbody(self):
+    def _iterbody(self, info, genome):
        tmpl = Template(r'''
 __global__ void reset_rb(int size) {
@ -331,7 +332,7 @@ void iter(
        float cx, cy, cc;
-        {{precalc_camera(info, pcp.camera)}}
+        {{precalc_camera(pcp.camera)}}
 {{if 'final' in cp.xforms}}
        {{apply_affine('fx', 'fy', 'cx', 'cy', pcp.camera)}}
@ -343,14 +344,14 @@ void iter(
        uint32_t ix = trunca(cx), iy = trunca(cy);
-        if (ix >= acc_size.width || iy >= acc_size.height) {
+        if (ix >= acc_size.awidth || iy >= acc_size.aheight) {
 {{if info.acc_mode == 'deferred'}}
            *log = 0xffffffff;
 {{endif}}
            continue;
        }
-        uint32_t i = iy * acc_size.stride + ix;
+        uint32_t i = iy * acc_size.astride + ix;
 {{if info.acc_mode == 'atomic'}}
        asm volatile ({{crep("""
 {
@ -626,12 +627,11 @@ oflow_write_end:
 {{endif}}
 ''', 'iter_kern')
        return tmpl.substitute(
-                info = self.info,
+                info = info,
-                cp = self.info.genome,
+                cp = genome,
                pcp = self.pcp,
                NTHREADS = self.NTHREADS,
                NWARPS = self.NTHREADS / 32,
-                std_xforms = [n for n in sorted(self.info.genome.xforms)
+                std_xforms = [n for n in sorted(genome.xforms) if n != 'final'],
                              if n != 'final'],
                **globals())
--- a/cuburn/genome.py
+++ b/cuburn/genome.py
@ -79,115 +79,86 @@ class SplEval(object):
            return self.knots[1][0]
        return list(self.knots.T.flat)
-    @classmethod
+class Palette(object):
-    def wrap(cls, obj):
+    """Wafer-thin wrapper around palettes. For the future!"""
-        """
+    def __init__(self, datastr, fmt='rgb8'):
-        Given a dict 'obj' representing, for instance, a Genome object, walk
+        if fmt != 'rgb8':
-        through the object recursively and in-place, turning any number or
+            raise NotImplementedError
-        list of numbers into an SplEval.
+        if len(datastr) != 768:
-        """
+            raise ValueError("Unsupported palette width")
-        for k, v in obj.items():
+        self.width = 256
-            if (isinstance(v, (float, int)) or
+        pal = np.reshape(np.fromstring(datastr, np.uint8), (256, 3))
-                    (isinstance(v, list) and isinstance(v[1], (float, int)))):
+        self.data = np.ones((256, 4), np.float32)
-                obj[k] = cls(v)
+        self.data[:,:3] = pal / 255.0
            elif isinstance(v, dict):
                cls.wrap(v)
 class RenderInfo(object):
    """
    Determine features and constants required to render a particular set of
    genomes. The values of this class are fixed before compilation begins.
    """
    # Number of iterations to iterate without write after generating a new
    # point. This number is currently fixed pretty deeply in the set of magic
    # constants which govern buffer sizes; changing the value here won't
    # actually change the code on the device to do something different.
    fuse = 256
    # Height of the texture pallete which gets uploaded to the GPU (assuming
    # that palette-from-texture is enabled). For most genomes, this doesn't
    # need to be very large at all. However, since only an easily-cached
    # fraction of this will be accessed per SM, larger values shouldn't hurt
    # performance too much. When using deferred accumulation, increasing this
    # value increases the number of uniquely-dithered samples, which is nice.
    # Power-of-two, please.
    palette_height = 64
    # Maximum width of DE and other spatial filters, and thus in turn the
    # amount of padding applied. Note that, for now, this must not be changed!
    # The filtering code makes deep assumptions about this value.
    gutter = 15
    # TODO: for now, we always throw away the alpha channel before writing.
    # All code is in place to not do this, we just need to find a way to expose
    # this preference via the API (or push alpha blending entirely on the client,
    # which I'm not opposed to)
    alpha_output_channel = False
    # There are three settings for this somewhat ersatz paramater. 'global'
    # uses unsynchronized global writes to accumulate sample points, 'atomic'
    # uses atomic global writes, and 'deferred' stores color and position in a
    # sample log, sorts the log by position, and uses shared memory to
    # perform the accumulation. Deferred has the accuracy of 'atomic' and
    # the speed of 'global' (it's actually faster!), but packs color and
    # position into a single 32-bit int for now, which limits resolution to
    # 1080p when xform opacity is respected, so the other two modes will hang
    # around until that can be extended to be memory-limited again.
    acc_mode = 'atomic'
    # TODO: fix this
    chaos_used = False
    def __init__(self, db, **kwargs):
        self.db = db
        # Copy all args into this object's namespace
        self.__dict__.update(kwargs)
        self.acc_width = self.width + 2 * self.gutter
        self.acc_height = self.height + 2 * self.gutter
        self.acc_stride = 32 * int(np.ceil(self.acc_width / 32.))
        self.density = self.quality
        # Deref genome
        self.genome = self.db.genomes[self.genome]
        for k, v in self.db.palettes.items():
            pal = np.fromstring(base64.b64decode(v), np.uint8)
            pal = np.reshape(pal, (256, 3))
            pal_a = np.ones((256, 4), np.float32)
            pal_a[:,:3] = pal / 255.0
            self.db.palettes[k] = pal_a
 class _AttrDict(dict):
    def __getattr__(self, name):
        return self[name]
-def load_info(contents):
+    @classmethod
-    result = json.loads(contents, object_hook=_AttrDict)
+    def _wrap(cls, dct):
-    SplEval.wrap(result.genomes)
+        for k, v in dct.items():
            if (isinstance(v, (float, int)) or
                    (isinstance(v, list) and isinstance(v[1], (float, int)))):
                dct[k] = SplEval(v)
            elif isinstance(v, dict):
                dct[k] = cls._wrap(cls(v))
        return dct
-    # A Job object will have more details or something
+class Genome(_AttrDict):
-    result = RenderInfo(result, **result.renders.values()[0])
+    # For now, we base the Genome class on an _AttrDict, letting its structure
-    return result
+    # be defined implicitly by the way it is used in device code. More formal
    # unpacking will happen soon.
    def __init__(self, gnm, base_den):
        super(Genome, self).__init__(gnm)
        for k, v in self.items():
            v = _AttrDict(v)
            if k not in ('info', 'time'):
                _AttrDict._wrap(v)
            self[k] = v
        # TODO: this is a hack, figure out how to solve it more elegantly
        self.spp = SplEval(self.camera.density.knotlist)
        self.spp.knots[1] *= base_den
        # TODO: decide how to handle palettes. For now, it's the caller's
        # responsibility to replace this list with actual palettes.
        pal = self.color.palette
        if isinstance(pal, basestring):
            self.color.palette = [(0.0, pal), (1.0, pal)]
        elif isinstance(pal, list):
            self.color.palette = zip(pal[::2], pal[1::2])
-class HacketyGenome(object):
+        # TODO: caller also needs to call set_timing()
-    """
+        self.adj_frame_width = None
-    Holdover class to postpone a very deep refactoring as long as possible.
+        self.canonical_right = (not self.get('link') or not self.link == 'self'
-    Converts property accesses into interpolations over predetermined times.
+                                or not self.link.get('right'))
    """
    def __init__(self, referent, times):
        # Times can be singular
        self.referent, self.times = referent, times
    def __getattr__(self, name):
        r = getattr(self.referent, str(name))
        if isinstance(r, _AttrDict):
            return HacketyGenome(r, self.times)
        elif isinstance(r, SplEval):
            return r(self.times)
        return r
    __getitem__ = __getattr__
-if __name__ == "__main__":
+    def set_timing(self, base_dur, fps, offset=0.0, err_spread=True):
-    import sys
+        """
-    import pprint
+        Set frame timing. Must be called at least once prior to rendering.
-    pprint.pprint(read_genome(sys.stdin))
+        """
        # TODO: test!
        dur = self.time.duration
        if isinstance(dur, basestring):
            clock = float(dur[:-1]) + offset
        else:
            clock = dur * base_dur + offset
        if self.canonical_right:
            nframes = int(np.floor(clock * fps))
        else:
            nframes = int(np.ceil(clock * fps))
        err = (clock - nframes / fps) / clock
        fw = self.time.frame_width
        if not isinstance(fw, list):
            fw = [0, fw, 1, fw]
        fw = [float(f[:-1]) * fps if isinstance(f, basestring)
              else float(f) / (clock * fps) for f in fw]
        self.adj_frame_width = SplEval(fw)
        times = np.linspace(offset, 1 - err, nframes + 1)
        # Move each time to a center time, and discard the last value
        times = times[:-1] + 0.5 * (times[1] - times[0])
        if err_spread:
            epts = np.linspace(-2*np.pi, 2*np.pi, nframes)
            times = times + 0.5 * err * (np.tanh(epts) + 1)
        return err, times
--- a/cuburn/render.py
+++ b/cuburn/render.py
@ -4,11 +4,11 @@ import re
 import time as timemod
 import tempfile
 from collections import namedtuple
-from itertools import cycle, repeat, chain, izip
+from itertools import cycle, repeat, chain, izip, imap, ifilter
 from ctypes import *
 from cStringIO import StringIO
 import numpy as np
-from numpy import int32 as i32, uint64 as u64
+from numpy import float32 as f32, int32 as i32, uint32 as u32, uint64 as u64
 from scipy import ndimage
 from fr0stlib import pyflam3
@ -24,41 +24,55 @@ from cuburn import affine
 from cuburn.code import util, mwc, iter, filtering, sort
 RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time')
 Dimensions = namedtuple('Dimensions', 'w h aw ah astride')
 def _sync_stream(dst, src):
    dst.wait_for_event(cuda.Event(cuda.event_flags.DISABLE_TIMING).record(src))
 def argset(obj, **kwargs):
    for k, v in kwargs.items():
        setattr(obj, k, v)
    return obj
 class Renderer(object):
    """
    Control structure for rendering a series of frames.
    Each animation will dynamically generate a kernel that includes only the
    code necessary to render the genomes provided. The process of generating
    and uploading the kernel takes a small but finite amount of time. In
    general, the kernel generated for all genomes resulting from interpolating
    between two control points will have identical performance, so it is
    wasteful to create more than one animation for any interpolated sequence.
    However, genome sequences interpolated from three or more control points
    with different features enabled will have the code needed to render all
    genomes enabled for every frame. Doing this can hurt performance.
    In other words, it's best to use exactly one Animation for each
    interpolated sequence between one or two genomes.
    """
    # Number of iterations to iterate without write after generating a new
    # point. This number is currently fixed pretty deeply in the set of magic
    # constants which govern buffer sizes; changing the value here won't
    # actually change the code on the device to do something different.
    fuse = 256
    # The palette texture/surface covers the color coordinate from [0,1] with
    # (for now, a fixed 256) equidistant horizontal samples, and spans the
    # temporal range of the frame linearly with this many rows. Increasing
    # this value increases the number of uniquely-dithered samples when using
    # pre-dithered surfaces.
    palette_height = 64
    # Maximum width of DE and other spatial filters, and thus in turn the
    # amount of padding applied. Note that, for now, this must not be changed!
    # The filtering code makes deep assumptions about this value.
    gutter = 15
    # Accumulation mode. Leave it at 'atomic' for now.
    acc_mode = 'atomic'
    # TODO
    chaos_used = False
    cmp_options = ('-use_fast_math', '-maxrregcount', '42')
    keep = False
-    def __init__(self, info):
+    def __init__(self):
        self.info = info
        self._iter = self.src = self.cubin = self.mod = None
        self.packed_genome = None
        # Ensure class options don't get contaminated on an instance
        self.cmp_options = list(self.cmp_options)
-    def compile(self, keep=None, cmp_options=None, jit_options=[]):
+    def compile(self, genome, keep=None, cmp_options=None):
        """
        Compile a kernel capable of rendering every frame in this animation.
        The resulting compiled kernel is stored in the ``cubin`` property;
@ -73,7 +87,7 @@ class Renderer(object):
        keep = self.keep if keep is None else keep
        cmp_options = self.cmp_options if cmp_options is None else cmp_options
-        self._iter = iter.IterCode(self.info)
+        self._iter = iter.IterCode(self, genome)
        self._iter.packer.finalize()
        self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer,
                                      self._iter)
@ -82,41 +96,81 @@ class Renderer(object):
        self.cubin = pycuda.compiler.compile(
                self.src, keep=keep, options=cmp_options,
                cache_dir=False if keep else None)
    def load(self, genome, jit_options=[]):
        if not self.cubin:
            self.compile(genome)
        self.mod = cuda.module_from_buffer(self.cubin, jit_options)
        with open('/tmp/iter_kern.cubin', 'wb') as fp:
            fp.write(self.cubin)
        return self.src
-    def render(self, times):
+    def render(self, genome, times, width, height, blend=True):
        """
-        Render a flame for each genome in the iterable value 'genomes'.
+        Render a frame for each timestamp in the iterable value ``times``. This
-        Returns a RenderedImage object with the rendered buffer in the
+        function returns a generator that will yield a RenderedImage object
-        requested format (3D RGBA ndarray only for now).
+        containing a shared reference to the output buffer for each specified
        frame.
-        This method produces a considerable amount of side effects, and should
+        The returned buffer is page-locked host memory. Between the time a
-        not be used lightly. Things may go poorly for you if this method is not
+        buffer is yielded and the time the next frame's results are requested,
-        allowed to run until completion (by exhausting all items in the
+        the buffer will not be modified. Thereafter, however, it will be
-        generator object).
+        overwritten by an asynchronous DMA operation coming from the CUDA
        device. If you hang on to it for longer than one frame, copy it.
-        ``times`` is a sequence of (idx, start, stop) times, where index is
+        ``genome`` is the genome to be rendered. Successive calls to the
-        the logical frame number (though it can be any value) and 'start' and
+        `render()` method on one ``Renderer`` object must use genomes which
-        'stop' together define the time range to be rendered for each frame.
+        produce identical compiled code, and this will not be verified by the
        renderer. In practice, this means you can alter genome parameter
        values, but the full set of keys must remain identical between runs on
        the same renderer.
        ``times`` is a list of (idx, cen_time) tuples, where ``idx`` is passed
        unmodified in the RenderedImage return value and ``cen_time`` is the
        central time of the current frame in spline-time units. (Any
        clock-time or frame-time units in the genome should be preconverted.)
        If ``blend`` is False, the output buffer will contain unclipped,
        premultiplied RGBA data, without vibrancy, highlight power, or the
        alpha elbow applied.
        """
-        if times == []:
+        r = self.render_gen(genome, width, height, blend=blend)
        next(r)
        return ifilter(None, imap(r.send, chain(times, [None])))
    def render_gen(self, genome, width, height, blend=True):
        """
        Render frames. This method is wrapped by the ``render()`` method; see
        its docstring for warnings and details.
        Instead of passing frame times as an iterable, they are passed
        individually via the ``generator.send()`` method. There is an
        internal pipeline latency of one frame, so the first call to the
        ``send()`` method will return None, the second call will return the
        first frame's result, and so on. To retrieve the last frame in a
        sequence, send ``None``.
        Direct use of this method is useful for implementing render servers.
        """
        last_idx = None
        next_frame = yield
        if next_frame is None:
            return
        if not self.mod:
            self.load(genome)
        filt = filtering.Filtering()
        reset_rb_fun = self.mod.get_function("reset_rb")
        packer_fun = self.mod.get_function("interp_iter_params")
        iter_fun = self.mod.get_function("iter")
        info = self.info
        # The synchronization model is messy. See helpers/task_model.svg.
        iter_stream = cuda.Stream()
        filt_stream = cuda.Stream()
-        if info.acc_mode == 'deferred':
+        if self.acc_mode == 'deferred':
            write_stream = cuda.Stream()
            write_fun = self.mod.get_function("write_shmem")
        else:
@ -128,19 +182,30 @@ class Renderer(object):
        event_a = cuda.Event().record(filt_stream)
        event_b = None
-        nbins = info.acc_height * info.acc_stride
+        awidth = width + 2 * self.gutter
        aheight = height + 2 * self.gutter
        astride = 32 * int(np.ceil(awidth / 32.))
        dim = Dimensions(width, height, awidth, aheight, astride)
        d_acc_size = self.mod.get_global('acc_size')[0]
        cuda.memcpy_htod_async(d_acc_size, u32(list(dim)), write_stream)
        nbins = awidth * aheight
        # Extra padding in accum helps with write_shmem overruns
        d_accum = cuda.mem_alloc(16 * nbins + (1<<16))
        d_out = cuda.mem_alloc(16 * nbins)
-        if info.acc_mode == 'atomic':
+        if self.acc_mode == 'atomic':
            d_atom = cuda.mem_alloc(8 * nbins)
            flush_fun = self.mod.get_function("flush_atom")
-        acc_size = np.array([info.acc_width, info.acc_height, info.acc_stride])
+        obuf_copy = argset(cuda.Memcpy2D(),
-        d_acc_size = self.mod.get_global('acc_size')[0]
+            src_y=self.gutter, src_x_in_bytes=16*self.gutter,
-        cuda.memcpy_htod_async(d_acc_size, np.uint32(acc_size), write_stream)
+            src_pitch=16*astride, dst_pitch=16*width,
            width_in_bytes=16*width, height=height)
        obuf_copy.set_src_device(d_out)
        h_out_a = cuda.pagelocked_empty((height, width, 4), f32)
        h_out_b = cuda.pagelocked_empty((height, width, 4), f32)
-        if info.acc_mode == 'deferred':
+        if self.acc_mode == 'deferred':
            # Having a fixed, power-of-two log size makes things much easier
            log_size = 64 << 20
            d_log = cuda.mem_alloc(log_size * 4)
@ -153,9 +218,8 @@ class Renderer(object):
        # Calculate 'nslots', the number of simultaneous running threads that
        # can be active on the GPU during iteration (and thus the number of
        # slots for loading and storing RNG and point context that will be
-        # prepared on the device), 'rb_size' (the number of blocks in
+        # prepared on the device), and derive 'rb_size', the number of blocks in
-        # 'nslots'), and determine a number of temporal samples
+        # 'nslots'.
        # likely to load-balance effectively
        iter_threads_per_block = 256
        dev_data = pycuda.tools.DeviceData()
        occupancy = pycuda.tools.OccupancyRecord(
@ -169,14 +233,16 @@ class Renderer(object):
        reset_rb_fun(np.int32(rb_size), block=(1,1,1))
        d_points = cuda.mem_alloc(nslots * 16)
-        # We may add extra seeds to simplify palette dithering.
+        # This statement may add extra seeds to simplify palette dithering.
-        seeds = mwc.MWC.make_seeds(max(nslots, 256 * info.palette_height))
+        seeds = mwc.MWC.make_seeds(max(nslots, 256 * self.palette_height))
        d_seeds = cuda.to_device(seeds)
        # We used to auto-calculate this to a multiple of the number of SMs on
        # the device, but since we now use shorter launches and, to a certain
        # extent, allow simultaneous occupancy, that's not as important. The
-        # 1024 is a magic constant, though: FUSE
+        # 1024 is a magic constant to ensure reasonable and power-of-two log
        # size for deferred: 256MB / (4B * FUSE * NTHREADS). Enhancements to
        # the sort engine are needed to make this more flexible.
        ntemporal_samples = 1024
        genome_times, genome_knots = self._iter.packer.pack()
        d_genome_times = cuda.to_device(genome_times)
@ -184,37 +250,31 @@ class Renderer(object):
        info_size = 4 * len(self._iter.packer) * ntemporal_samples
        d_infos = cuda.mem_alloc(info_size)
-        pals = info.genome.color.palette
+        pals = genome.color.palette
        if isinstance(pals, basestring):
            pals = [0.0, pals, 1.0, pals]
-        palint_times = np.empty(len(genome_times[0]), np.float32)
+        palint_times = np.empty(len(genome_times[0]), f32)
        palint_times.fill(100.0)
-        palint_times[:len(pals)/2] = pals[::2]
+        palint_times[:len(pals)] = [p[0] for p in pals]
        d_palint_times = cuda.to_device(palint_times)
        d_palint_vals = cuda.to_device(
-                np.concatenate(map(info.db.palettes.get, pals[1::2])))
+                np.concatenate([p[1].data for p in pals]))
-        if info.acc_mode in ('deferred', 'atomic'):
+        if self.acc_mode in ('deferred', 'atomic'):
            palette_fun = self.mod.get_function("interp_palette_hsv_flat")
-            dsc = cuda.ArrayDescriptor3D()
+            dsc = argset(cuda.ArrayDescriptor3D(), height=self.palette_height,
-            dsc.height = info.palette_height
+                    width=256, depth=0, format=cuda.array_format.SIGNED_INT32,
-            dsc.width = 256
+                    num_channels=2, flags=cuda.array3d_flags.SURFACE_LDST)
            dsc.depth = 0
            dsc.format = cuda.array_format.SIGNED_INT32
            dsc.num_channels = 2
            dsc.flags = cuda.array3d_flags.SURFACE_LDST
            palarray = cuda.Array(dsc)
            tref = self.mod.get_surfref('flatpal')
            tref.set_array(palarray, 0)
        else:
            palette_fun = self.mod.get_function("interp_palette_hsv")
-            dsc = cuda.ArrayDescriptor()
+            dsc = argset(cuda.ArrayDescriptor(), height=self.palette_height,
-            dsc.height = info.palette_height
+                    width=256, format=cuda.array_format.UNSIGNED_INT8,
-            dsc.width = 256
+                    num_channels=4)
-            dsc.format = cuda.array_format.UNSIGNED_INT8
+            d_palmem = cuda.mem_alloc(256 * self.palette_height * 4)
            dsc.num_channels = 4
            d_palmem = cuda.mem_alloc(256 * info.palette_height * 4)
            tref = self.mod.get_texref('palTex')
            tref.set_address_2d(d_palmem, dsc, 1024)
@ -222,49 +282,46 @@ class Renderer(object):
            tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
            tref.set_filter_mode(cuda.filter_mode.LINEAR)
-        h_out_a = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4),
+        while next_frame is not None:
-                                        np.float32)
+            # tc, td, ts, te: central, delta, start, end times
-        h_out_b = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4),
+            idx, tc = next_frame
-                                        np.float32)
+            td = genome.adj_frame_width(tc)
-        last_idx = None
+            ts, te = tc - 0.5 * td, tc + 0.5 * td
-        for idx, start, stop in times:
+            if self.acc_mode in ('deferred', 'atomic'):
-            twidth = np.float32((stop-start) / info.palette_height)
+                # In this mode, the palette writes to a surface reference, but
-            if info.acc_mode in ('deferred', 'atomic'):
+                # requires dithering, so we pass it the seeds instead
-                palette_fun(d_seeds, d_palint_times, d_palint_vals,
+                arg0 = d_seeds
                            np.float32(start), twidth,
                            block=(256,1,1), grid=(info.palette_height,1),
                            stream=write_stream)
            else:
-                palette_fun(d_palmem, d_palint_times, d_palint_vals,
+                arg0 = d_palmem
-                            np.float32(start), twidth,
+            palette_fun(arg0, d_palint_times, d_palint_vals,
-                            block=(256,1,1), grid=(info.palette_height,1),
+                        f32(ts), f32(td / self.palette_height),
-                            stream=write_stream)
+                        block=(256,1,1), grid=(self.palette_height,1),
                        stream=write_stream)
-            width = np.float32((stop-start) / ntemporal_samples)
+            packer_fun(d_infos, d_seeds, d_genome_times, d_genome_knots,
-            packer_fun(d_infos, d_genome_times, d_genome_knots,
+                       f32(ts), f32(td / ntemporal_samples),
-                       np.float32(start), width, d_seeds,
+                       i32(ntemporal_samples), block=(256,1,1),
                       np.int32(ntemporal_samples), block=(256,1,1),
                       grid=(int(np.ceil(ntemporal_samples/256.)),1),
                       stream=iter_stream)
            # Reset points so that they will be FUSEd
            util.BaseCode.fill_dptr(self.mod, d_points, 4 * nslots,
-                                    iter_stream, np.float32(np.nan))
+                                    iter_stream, f32(np.nan))
            # Get interpolated control points for debugging
            #iter_stream.synchronize()
            #d_temp = cuda.from_device(d_infos,
-                    #(ntemporal_samples, len(self._iter.packer)), np.float32)
+                    #(ntemporal_samples, len(self._iter.packer)), f32)
            #for i, n in zip(d_temp[5], self._iter.packer.packed):
                #print '%60s %g' % ('_'.join(n), i)
            util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, write_stream)
-            if info.acc_mode == 'atomic':
+            if self.acc_mode == 'atomic':
                util.BaseCode.fill_dptr(self.mod, d_atom, 2 * nbins, write_stream)
-            nrounds = ( (info.density * info.width * info.height)
+            nrounds = int( (genome.spp(tc) * width * height)
-                      / (ntemporal_samples * 256 * 256) ) + 1
+                         / (ntemporal_samples * 256 * 256) ) + 1
-            if info.acc_mode == 'deferred':
+            if self.acc_mode == 'deferred':
                for i in range(nrounds):
                    iter_fun(np.uint64(d_log), d_seeds, d_points, d_infos,
                             block=(32, self._iter.NTHREADS/32, 1),
@ -272,18 +329,17 @@ class Renderer(object):
                    _sync_stream(write_stream, iter_stream)
                    sorter.sort(d_log_sorted, d_log, log_size, 3, True,
                                stream=write_stream)
                    #print cuda.from_device(sorter.dglobal, (256,), np.uint32)
                    _sync_stream(iter_stream, write_stream)
                    write_fun(d_accum, d_log_sorted, sorter.dglobal, i32(nbins),
                              block=(1024, 1, 1), grid=(nwriteblocks, 1),
                              stream=write_stream)
            else:
                args = [u64(d_accum), d_seeds, d_points, d_infos]
-                if info.acc_mode == 'atomic':
+                if self.acc_mode == 'atomic':
                    args.append(u64(d_atom))
                iter_fun(*args, block=(32, self._iter.NTHREADS/32, 1),
                         grid=(ntemporal_samples, nrounds), stream=iter_stream)
-                if info.acc_mode == 'atomic':
+                if self.acc_mode == 'atomic':
                    nblocks = int(np.ceil(np.sqrt(nbins/float(512))))
                    flush_fun(u64(d_accum), u64(d_atom), i32(nbins),
                              block=(512, 1, 1), grid=(nblocks, nblocks),
@ -291,27 +347,29 @@ class Renderer(object):
            util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
            _sync_stream(filt_stream, write_stream)
-            filt.de(d_out, d_accum, info, start, stop, filt_stream)
+            filt.de(d_out, d_accum, genome, dim, tc, stream=filt_stream)
            _sync_stream(write_stream, filt_stream)
-            filt.colorclip(d_out, info, start, stop, filt_stream)
+            filt.colorclip(d_out, genome, dim, tc, blend, stream=filt_stream)
-            cuda.memcpy_dtoh_async(h_out_a, d_out, filt_stream)
+            obuf_copy.set_dst_host(h_out_a)
            obuf_copy(filt_stream)
            if event_b:
                while not event_a.query():
                    timemod.sleep(0.01)
                gpu_time = event_a.time_since(event_b)
-                yield RenderedImage(self._trim(h_out_b), last_idx, gpu_time)
+                result = RenderedImage(h_out_b, last_idx, gpu_time)
            else:
                result = None
            last_idx = idx
            event_a, event_b = cuda.Event().record(filt_stream), event_a
            h_out_a, h_out_b = h_out_b, h_out_a
-            last_idx = idx
+
            # TODO: add ability to flush a frame without breaking the pipe
            next_frame = yield result
        while not event_a.query():
            timemod.sleep(0.001)
        gpu_time = event_a.time_since(event_b)
-        yield RenderedImage(self._trim(h_out_b), last_idx, gpu_time)
+        yield RenderedImage(h_out_b, last_idx, gpu_time)
    def _trim(self, result):
        g = self.info.gutter
        return result[g:-g,g:g+self.info.width].copy()