mirror of https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 11:40:04 -05:00

commit 60a45c9a20 (parent 4cfd328f85)
Sweeping refactor. More bugs undoubtedly remain.
@@ -1,9 +0,0 @@
"""
Contains the PTX fragments which will drive the device, and helper functions
to combine those fragments.
"""

import util
import mwc
import iter
cuburn/code/color.py (new file, 77 lines)
@@ -0,0 +1,77 @@
import numpy as np

from util import devlib

# The JPEG YUV full-range matrix, without bias into the positive regime.
# This assumes input color space is CIERGB D65, encoded with gamma 2.2.
# Note that some interpolated colors may exceed the sRGB and YUV gamuts.
YUV_MATRIX = np.matrix([[ 0.299,     0.587,     0.114],
                        [-0.168736, -0.331264,  0.5],
                        [ 0.5,      -0.418688, -0.081312]])

yuvlib = devlib(decls='''
__device__ float3 rgb2yuv(float3 rgb);
__device__ float3 yuv2rgb(float3 yuv);
''', defs=r'''
/* This conversion uses the JPEG full-range standard. Note that UV have range
 * [-0.5, 0.5], so consider biasing the results. */
__device__ float3 rgb2yuv(float3 rgb) {
    return make_float3(
         0.299f    * rgb.x + 0.587f    * rgb.y + 0.114f    * rgb.z,
        -0.168736f * rgb.x - 0.331264f * rgb.y + 0.5f      * rgb.z,
         0.5f      * rgb.x - 0.418688f * rgb.y - 0.081312f * rgb.z);
}

__device__ float3 yuv2rgb(float3 yuv) {
    return make_float3(
        yuv.x + 1.402f * yuv.z,
        yuv.x - 0.34414f * yuv.y - 0.71414f * yuv.z,
        yuv.x + 1.772f * yuv.y);
}
''')

hsvlib = devlib(decls='''
__device__ float3 rgb2hsv(float3 rgb);
__device__ float3 hsv2rgb(float3 hsv);
''', defs=r'''
__device__ float3 rgb2hsv(float3 rgb) {
    float M = fmaxf(fmaxf(rgb.x, rgb.y), rgb.z);
    float m = fminf(fminf(rgb.x, rgb.y), rgb.z);
    float C = M - m;

    float s = M > 0.0f ? C / M : 0.0f;

    float h = 0.0f;
    if (s != 0.0f) {
        C = 1.0f / C;
        float rc = (M - rgb.x) * C;
        float gc = (M - rgb.y) * C;
        float bc = (M - rgb.z) * C;

        if      (rgb.x == M) h = bc - gc;
        else if (rgb.y == M) h = 2 + rc - bc;
        else                 h = 4 + gc - rc;

        if (h < 0) h += 6.0f;
    }
    return make_float3(h, s, M);
}

__device__ float3 hsv2rgb(float3 hsv) {
    float whole = floorf(hsv.x);
    float frac = hsv.x - whole;
    float val = hsv.z;
    float min = val * (1 - hsv.y);
    float mid = val * (1 - (hsv.y * frac));
    float alt = val * (1 - (hsv.y * (1 - frac)));

    float3 out;
    if      (whole == 0.0f) { out.x = val; out.y = alt; out.z = min; }
    else if (whole == 1.0f) { out.x = mid; out.y = val; out.z = min; }
    else if (whole == 2.0f) { out.x = min; out.y = val; out.z = alt; }
    else if (whole == 3.0f) { out.x = min; out.y = mid; out.z = val; }
    else if (whole == 4.0f) { out.x = alt; out.y = min; out.z = val; }
    else                    { out.x = val; out.y = min; out.z = mid; }
    return out;
}
''')
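For reference, a quick host-side check (not part of this commit) that the matrix above and the coefficients inlined in the kernels describe the same transform:

import numpy as np

YUV = np.array([[ 0.299,     0.587,     0.114],
                [-0.168736, -0.331264,  0.5],
                [ 0.5,      -0.418688, -0.081312]])

rgb = np.array([0.25, 0.5, 0.75])
yuv = np.dot(YUV, rgb)                    # rgb2yuv, same constants
back = np.dot(np.linalg.inv(YUV), yuv)    # exact inverse
assert np.allclose(back, rgb)
# The kernel's yuv2rgb hard-codes the rounded inverse coefficients
# (1.402, 0.34414, 0.71414, 1.772), so on the device the round trip
# agrees only to roughly 1e-4.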
@@ -1,13 +1,7 @@
import numpy as np
from numpy import float32 as f32, int32 as i32
from util import devlib
from color import yuvlib

import pycuda.driver as cuda
import pycuda.compiler
from pycuda.gpuarray import vec

from cuburn.code.util import *

_CODE = r'''
texshearlib = devlib(defs=r'''
// Filter directions specified in degrees, using image/texture addressing
// [(0,0) is upper left corner, 90 degrees is down].

@@ -22,7 +16,9 @@ __constant__ float2 addressing_patterns[16] = {
    { 1.0f, -0.333333f}, { 0.333333f,  1.0f}, // 14, 15: -15, 75
};

// Mon dieu! A C++ feature?
// Mon dieu! A C++ feature? Gotta close the "extern C" added by the compiler.
}

template <typename T> __device__ T
tex_shear(texture<T, cudaTextureType2D> ref, int pattern,
          float x, float y, float radius) {
@@ -39,100 +35,14 @@ tex_shear(texture<T, cudaTextureType2D> ref, int pattern,
}

extern "C" {
''')
__global__
void colorclip(float4 *pixbuf, float gamma, float vibrance, float highpow,
               float linrange, float lingam, float3 bkgd,
               int fbsize, int blend_background_color) {
    int i = threadIdx.x + blockDim.x * (blockIdx.x + gridDim.x * blockIdx.y);
    if (i >= fbsize) return;

    float4 pix = pixbuf[i];

    if (pix.w <= 0) {
        pixbuf[i] = make_float4(bkgd.x, bkgd.y, bkgd.z, 0.0f);
        return;
    }
    pix.y -= 0.5f * pix.w;
    pix.z -= 0.5f * pix.w;
    float3 tmp = yuv2rgb(make_float3(pix.x, pix.y, pix.z));
    pix.x = tmp.x;
    pix.y = tmp.y;
    pix.z = tmp.z;

    pix.x = fmaxf(0.0f, pix.x);
    pix.y = fmaxf(0.0f, pix.y);
    pix.z = fmaxf(0.0f, pix.z);

    float4 opix = pix;

    float alpha = powf(pix.w, gamma);
    if (pix.w < linrange) {
        float frac = pix.w / linrange;
        alpha = (1.0f - frac) * pix.w * lingam + frac * alpha;
    }

    if (!blend_background_color) {
        float ls = alpha / pix.w;
        pix.x *= ls;
        pix.y *= ls;
        pix.z *= ls;
        pix.w = alpha;
        pixbuf[i] = pix;
        return;
    }

    float ls = vibrance * alpha / pix.w;
    alpha = fminf(1.0f, fmaxf(0.0f, alpha));

    float maxc = fmaxf(pix.x, fmaxf(pix.y, pix.z));
    float maxa = maxc * ls;
    float newls = 1.0f / maxc;

    if (maxa > 1.0f && highpow >= 0.0f) {
        float lsratio = powf(newls / ls, highpow);
        pix.x *= newls;
        pix.y *= newls;
        pix.z *= newls;

        // Reduce saturation (according to the HSV model) by proportionally
        // increasing the values of the other colors.
        pix.x = maxc - (maxc - pix.x) * lsratio;
        pix.y = maxc - (maxc - pix.y) * lsratio;
        pix.z = maxc - (maxc - pix.z) * lsratio;
    } else {
        float adjhlp = -highpow;
        if (adjhlp > 1.0f || maxa <= 1.0f) adjhlp = 1.0f;
        if (maxc > 0.0f) {
            float adj = ((1.0f - adjhlp) * newls + adjhlp * ls);
            pix.x *= adj;
            pix.y *= adj;
            pix.z *= adj;
        }
    }

    pix.x += (1.0f - vibrance) * powf(opix.x, gamma);
    pix.y += (1.0f - vibrance) * powf(opix.y, gamma);
    pix.z += (1.0f - vibrance) * powf(opix.z, gamma);

    pix.x += (1.0f - alpha) * bkgd.x;
    pix.y += (1.0f - alpha) * bkgd.y;
    pix.z += (1.0f - alpha) * bkgd.z;

    pix.x = fminf(1.0f, pix.x);
    pix.y = fminf(1.0f, pix.y);
    pix.z = fminf(1.0f, pix.z);
    pix.w = alpha;

    pixbuf[i] = pix;
}

__global__
void logscale(float4 *outbuf, const float4 *pixbuf, float k1, float k2) {
logscalelib = devlib(defs=r'''
__global__ void
logscale(float4 *outbuf, const float4 *pixbuf, float k1, float k2) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    float4 pix = pixbuf[i];

    // float ls = fmaxf(0, k1 * logf(1.0f + pix.w * pix.w * k2 / (1 + pix.w)) / pix.w);
    float ls = fmaxf(0, k1 * logf(1.0f + pix.w * k2) / pix.w);
    pix.x *= ls;
    pix.y *= ls;
@@ -141,10 +51,12 @@ void logscale(float4 *outbuf, const float4 *pixbuf, float k1, float k2) {

    outbuf[i] = pix;
}
''')
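The commented-out line preserves an alternative curve; the active one is the usual flame log-density mapping. A host-side sketch of the scale factor's behavior (illustrative only):

from math import log

def logscale_ls(w, k1, k2):
    # per-bin scale factor for the accumulated color; w is the density
    # (hit count) accumulated in that bin
    if w <= 0:
        return 0.0
    return max(0.0, k1 * log(1.0 + w * k2) / w)

# With k1 = k2 = 1: densities 1, 10, 100, 1000 scale by roughly
# 0.693, 0.240, 0.046, 0.007, compressing the attractor's huge dynamic
# range into something displayable.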
fmabuflib = devlib(defs=r'''
// Element-wise computation of ``dst[i]=dst[i]+src[i]*scale``.
__global__
void fma_buf(float4 *dst, const float4 *src, int astride, float scale) {
__global__ void
fma_buf(float4 *dst, const float4 *src, int astride, float scale) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int i = y * astride + x;
@@ -155,23 +67,16 @@ void fma_buf(float4 *dst, const float4 *src, int astride, float scale) {
    d.w += s.w * scale;
    dst[i] = d;
}
''')

texture<float4, cudaTextureType2D> bilateral_src;
denblurlib = devlib(decls='''
texture<float, cudaTextureType2D> blur_src;

__global__ void logscale_den(float *dst, float k2) {
    int xi = blockIdx.x * blockDim.x + threadIdx.x;
    int yi = blockIdx.y * blockDim.y + threadIdx.y;
    float4 pix = tex2D(bilateral_src, xi, yi);
    float out = logf(1.0f + pix.w * k2);
    dst[yi * (blockDim.x * gridDim.x) + xi] = out;
}

__constant__ float gauss_coefs[9] = {
    0.00443305f, 0.05400558f, 0.24203623f, 0.39905028f,
    0.24203623f, 0.05400558f, 0.00443305f
};
''', defs=r'''
// Apply a Gaussian-esque blur to the density channel of the texture in
// ``bilateral_src`` in the horizontal direction, and write it to ``dst``, a
// one-channel buffer.
@@ -203,7 +108,11 @@ __global__ void den_blur_1c(float *dst, int pattern, int upsample) {
            * gauss_coefs[i];
    dst[yi * (blockDim.x * gridDim.x) + xi] = den;
}
''')
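The seven coefficients form a normalized Gaussian window (they sum to 1), so one horizontal pass and one vertical pass give a cheap separable 2-D blur. A host-side sketch of the same convolution (NumPy only, not part of this commit):

import numpy as np

gauss = np.float32([0.00443305, 0.05400558, 0.24203623, 0.39905028,
                    0.24203623, 0.05400558, 0.00443305])
assert abs(gauss.sum() - 1.0) < 1e-6

def blur_1d(den, axis):
    # same math as den_blur, minus the texture addressing and shear;
    # np.roll wraps at the edges, loosely matching WRAP address mode
    out = np.zeros_like(den)
    for i, c in enumerate(gauss):
        out += c * np.roll(den, i - 3, axis=axis)
    return out

den = np.random.rand(64, 64).astype(np.float32)
blurred = blur_1d(blur_1d(den, 0), 1)   # two 1-D passes = separable 2-D blur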
bilaterallib = devlib(deps=[logscalelib, texshearlib, denblurlib], decls='''
texture<float4, cudaTextureType2D> bilateral_src;
''', defs=r'''
/* sstd: spatial standard deviation (Gaussian filter)
 * cstd: color standard deviation (Gaussian on the range [0, 1], where 1
 *       represents an "opposite" color).
@@ -215,9 +124,8 @@ __global__ void den_blur_1c(float *dst, int pattern, int upsample) {
 * gspeed: Speed of (exp2f) Gompertz distribution governing how much to
 *         tighten gradients. Zero and negative values OK.
 */
__global__
void bilateral(float4 *dst, int pattern, int radius,
__global__ void
bilateral(float4 *dst, int pattern, int radius,
          float sstd, float cstd, float dstd, float dpow, float gspeed)
{
    int xi = blockIdx.x * blockDim.x + threadIdx.x;
@@ -317,124 +225,83 @@ void bilateral(float4 *dst, int pattern, int radius,
    const int astride = blockDim.x * gridDim.x;
    dst[yi * astride + xi] = out;
}
''')
} // end extern "C"
'''
colorcliplib = devlib(deps=[yuvlib], defs=r'''
__global__ void
colorclip(float4 *pixbuf, float gamma, float vibrance, float highpow,
          float linrange, float lingam, float3 bkgd, int fbsize)
{
    int i = threadIdx.x + blockDim.x * (blockIdx.x + gridDim.x * blockIdx.y);
    if (i >= fbsize) return;

class Filtering(HunkOCode):
    mod = None
    defs = _CODE
    float4 pix = pixbuf[i];

    @classmethod
    def init_mod(cls):
        if cls.mod is None:
            cls.mod = pycuda.compiler.SourceModule(assemble_code(BaseCode, cls),
                    options=['-use_fast_math', '-maxrregcount', '32'],
                    no_extern_c=True)
    if (pix.w <= 0) {
        pixbuf[i] = make_float4(bkgd.x, bkgd.y, bkgd.z, 0.0f);
        return;
    }
    pix.y -= 0.5f * pix.w;
    pix.z -= 0.5f * pix.w;
    float3 tmp = yuv2rgb(make_float3(pix.x, pix.y, pix.z));
    pix.x = tmp.x;
    pix.y = tmp.y;
    pix.z = tmp.z;

    def __init__(self):
        self.init_mod()
    pix.x = fmaxf(0.0f, pix.x);
    pix.y = fmaxf(0.0f, pix.y);
    pix.z = fmaxf(0.0f, pix.z);

    def de(self, ddst, dsrc, dscratch, gnm, dim, tc, nxf, stream=None):
    float4 opix = pix;

        # Helper variables and functions to keep it clean
        sb = 16 * dim.astride
        bs = sb * dim.ah
        bl, gr = (32, 8, 1), (dim.astride / 32, dim.ah / 8)
    float alpha = powf(pix.w, gamma);
    if (pix.w < linrange) {
        float frac = pix.w / linrange;
        alpha = (1.0f - frac) * pix.w * lingam + frac * alpha;
    }

        def launch(f, *args, **kwargs):
            f(*args, block=bl, grid=gr, stream=stream, **kwargs)
        mkdsc = lambda c: argset(cuda.ArrayDescriptor(), height=dim.ah,
                                 width=dim.astride, num_channels=c,
                                 format=cuda.array_format.FLOAT)
        def mktref(n):
            tref = self.mod.get_texref(n)
            tref.set_filter_mode(cuda.filter_mode.POINT)
            tref.set_address_mode(0, cuda.address_mode.WRAP)
            tref.set_address_mode(1, cuda.address_mode.WRAP)
            return tref
    float ls = vibrance * alpha / pix.w;
    alpha = fminf(1.0f, fmaxf(0.0f, alpha));

        dsc = mkdsc(4)
        tref = mktref('bilateral_src')
        grad_dsc = mkdsc(1)
        grad_tref = mktref('blur_src')
    float maxc = fmaxf(pix.x, fmaxf(pix.y, pix.z));
    float maxa = maxc * ls;
    float newls = 1.0f / maxc;

        bilateral, logscale_den, den_blur, den_blur_1c, fma_buf = map(
            self.mod.get_function,
            'bilateral logscale_den den_blur den_blur_1c fma_buf'.split())
    if (maxa > 1.0f && highpow >= 0.0f) {
        float lsratio = powf(newls / ls, highpow);
        pix.x *= newls;
        pix.y *= newls;
        pix.z *= newls;

        # Number of different shear patterns to use when filtering. Must be
        # even, since we depend on buffer bouncing (but I don't think that it's
        # a requirement for the filter itself to get decent results).
        DIRECTIONS = 8
        // Reduce saturation (according to the HSV model) by proportionally
        // increasing the values of the other colors.
        pix.x = maxc - (maxc - pix.x) * lsratio;
        pix.y = maxc - (maxc - pix.y) * lsratio;
        pix.z = maxc - (maxc - pix.z) * lsratio;
    } else {
        float adjhlp = -highpow;
        if (adjhlp > 1.0f || maxa <= 1.0f) adjhlp = 1.0f;
        if (maxc > 0.0f) {
            float adj = ((1.0f - adjhlp) * newls + adjhlp * ls);
            pix.x *= adj;
            pix.y *= adj;
            pix.z *= adj;
        }
    }

        def do_bilateral(bsrc, bdst, pattern, r=15, sstd=6, cstd=0.05,
                         dstd=1.5, dpow=0.8, gspeed=4.0):
            # Scale spatial parameter so that a "pixel" is equivalent to an
            # actual pixel at 1080p
            sstd *= dim.w / 1920.
    pix.x += (1.0f - vibrance) * powf(opix.x, gamma);
    pix.y += (1.0f - vibrance) * powf(opix.y, gamma);
    pix.z += (1.0f - vibrance) * powf(opix.z, gamma);

            tref.set_address_2d(bsrc, dsc, sb)
    pix.x += (1.0f - alpha) * bkgd.x;
    pix.y += (1.0f - alpha) * bkgd.y;
    pix.z += (1.0f - alpha) * bkgd.z;

            # Blur density two octaves along sampling vector, ultimately
            # storing in `dscratch`
            launch(den_blur, np.intp(bdst), i32(pattern), i32(0),
                   texrefs=[tref])
            grad_tref.set_address_2d(bdst, grad_dsc, sb / 4)
            launch(den_blur_1c, dscratch, i32(pattern), i32(1),
                   texrefs=[grad_tref])
            grad_tref.set_address_2d(dscratch, grad_dsc, sb / 4)
    pix.x = fminf(1.0f, pix.x);
    pix.y = fminf(1.0f, pix.y);
    pix.z = fminf(1.0f, pix.z);
    pix.w = alpha;

            launch(bilateral, np.intp(bdst), i32(pattern), i32(r),
                   f32(sstd), f32(cstd), f32(dstd), f32(dpow), f32(gspeed),
                   texrefs=[tref, grad_tref])

        def do_bilateral_range(bsrc, bdst, npats, *args, **kwargs):
            for i in range(npats):
                do_bilateral(bsrc, bdst, i, *args, **kwargs)
                bdst, bsrc = bsrc, bdst
            if npats % 2:
                cuda.memcpy_dtod_async(bdst, bsrc, bs, stream)

        # Filter the first xform, using `ddst` as an intermediate buffer.
        # End result is the filtered copy in `dsrc`.
        do_bilateral_range(dsrc, ddst, DIRECTIONS)

        # Filter the remaining xforms, using `ddst` as an intermediate
        # buffer, then add the result to `dsrc` (at the zero'th xform).
        for x in range(1, nxf):
            src = int(dsrc) + x * bs
            do_bilateral_range(src, ddst, DIRECTIONS)
            launch(fma_buf, dsrc, np.intp(src), i32(dim.astride), f32(1))

        # Log-scale the accumulated buffer in `dsrc`.
        k1 = f32(gnm.color.brightness(tc) * 268 / 256)
        # Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
        # s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
        area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w)
        k2 = f32(1.0 / (area * gnm.spp(tc)))

        nbins = dim.ah * dim.astride
        logscale = self.mod.get_function("logscale")
        t = logscale(ddst, dsrc, k1, k2,
                     block=(512, 1, 1), grid=(nbins/512, 1), stream=stream)

    def colorclip(self, dbuf, gnm, dim, tc, blend, stream=None):
        nbins = dim.ah * dim.astride

        # TODO: implement integration over cubic splines?
        gam = f32(1 / gnm.color.gamma(tc))
        vib = f32(gnm.color.vibrance(tc))
        hipow = f32(gnm.color.highlight_power(tc))
        lin = f32(gnm.color.gamma_threshold(tc))
        lingam = f32(lin ** (gam-1.0) if lin > 0 else 0)
        bkgd = vec.make_float3(
            gnm.color.background.r(tc),
            gnm.color.background.g(tc),
            gnm.color.background.b(tc))

        color_fun = self.mod.get_function("colorclip")
        blocks = int(np.ceil(np.sqrt(nbins / 256)))
        color_fun(dbuf, gam, vib, hipow, lin, lingam, bkgd, i32(nbins),
                  i32(blend), block=(256, 1, 1), grid=(blocks, blocks),
                  stream=stream)
    pixbuf[i] = pix;
}
''')
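After the refactor, a host module no longer subclasses HunkOCode; it assembles the devlibs it needs and compiles them directly. A minimal sketch of driving colorcliplib (not part of this commit; 'filt' is an assumed module name for this file, and the parameter values are placeholders):

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.gpuarray import vec

from util import assemble_code, compile, launch
from filt import colorcliplib

mod = cuda.module_from_buffer(compile('colorclip', assemble_code(colorcliplib)))

nbins = 1024 * 1024
dbuf = cuda.mem_alloc(16 * nbins)                 # one float4 per bin
bkgd = vec.make_float3(0.0, 0.0, 0.0)
blocks = int(np.ceil(np.sqrt(nbins / 256.)))
# args: pixbuf, gamma, vibrance, highpow, linrange, lingam, bkgd, fbsize
launch('colorclip', mod, None, 256, (blocks, blocks),
       dbuf, np.float32(1.0 / 2.2), np.float32(1.0), np.float32(1.0),
       np.float32(0.01), np.float32(0.1), bkgd, np.int32(nbins))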
@@ -2,8 +2,10 @@ from collections import OrderedDict
from itertools import cycle
import numpy as np

import tempita
from util import HunkOCode, Template, BaseCode, assemble_code
import util
from util import Template, assemble_code, devlib, binsearchlib, ringbuflib
from color import yuvlib
from mwc import mwclib

class GenomePackerName(str):
    """Class to indicate that a property is precalculated on the device"""
@@ -115,17 +117,10 @@ class GenomePackerPrecalc(GenomePackerView):
    def _code(self, code):
        self.packer.precalc_code.append(code)

class GenomePacker(HunkOCode):
class GenomePacker(object):
    """
    Packs a genome for use in iteration.
    """

    # 2^search_rounds is the maximum number of control points, including
    # endpoints, that can be used in a single genome. It should be okay to
    # change this number without touching other code, but 32 samples fits
    # nicely on a single cache line.
    search_rounds = 5

    def __init__(self, tname):
        """
        Create a new DataPacker.
@@ -152,6 +147,7 @@ class GenomePacker(HunkOCode):

        self.packed = None
        self.genome = None
        self.search_rounds = util.DEFAULT_SEARCH_ROUNDS

    def __len__(self):
        assert self._len is not None, 'len() called before finalize()'
@@ -189,42 +185,41 @@ class GenomePacker(HunkOCode):

        self._len = len(self.packed)

        self.decls = self._decls.substitute(
            packed=self.packed, tname=self.tname,
            search_rounds=self.search_rounds)
        self.defs = self._defs.substitute(
        decls = self._decls.substitute(packed=self.packed, tname=self.tname)
        defs = self._defs.substitute(
            packed_direct=self.packed_direct, tname=self.tname,
            precalc_code=self.precalc_code,
            search_rounds=self.search_rounds)

        return devlib(deps=[catmullromlib], decls=decls, defs=defs)

    def pack(self):
    def pack(self, pool=None):
        """
        Return a packed copy of the genome ready for uploading to the GPU as a
        3D NDArray, with the first element being the times and the second
        being the values.
        Return a packed copy of the genome ready for uploading to the GPU,
        as two float32 NDArrays for the knot times and values.
        """
        width = 1 << self.search_rounds
        out = np.empty((2, len(self.genome), width), dtype=np.float32)
        # Ensure that unused values at the top are always big (must be >2.0)
        out[0].fill(1e9)
        if pool:
            times = pool.allocate((len(self.genome), width), 'f4')
            knots = pool.allocate((len(self.genome), width), 'f4')
        else:
            times, knots = np.empty((2, len(self.genome), width), 'f4')
        times.fill(1e9)

        for idx, gname in enumerate(self.genome):
            attr = self.ns[gname[0]]
            for g in gname[1:]:
                attr = getattr(attr, g)
            out[0][idx][:len(attr.knots[0])] = attr.knots[0]
            out[1][idx][:len(attr.knots[1])] = attr.knots[1]
        return out
            times[idx,:len(attr.knots[0])] = attr.knots[0]
            knots[idx,:len(attr.knots[1])] = attr.knots[1]
        return times, knots

    _defs = Template(r"""

__global__
void interp_{{tname}}(
__global__ void interp_{{tname}}(
        {{tname}}* out,
        const float *times, const float *knots,
        float tstart, float tstep, int maxid
) {
        float tstart, float tstep, int maxid)
{
    int id = gtid();
    if (id >= maxid) return;
    out = &out[id];
@@ -249,7 +244,6 @@ void interp_{{tname}}(
    }
    {{endfor}}
}

""")

    _decls = Template(r"""
@@ -259,22 +253,13 @@ typedef struct {
{{endfor}}
} {{tname}};

/* Search through the fixed-size list 'hay' to find the rightmost index which
 * contains a value strictly smaller than the input 'needle'. The crazy
 * bitwise search is just for my own amusement.
 */
__device__
int bitwise_binsearch(const float *hay, float needle) {
    int lo = 0;

    // TODO: improve efficiency on 64-bit arches
    {{for i in range(search_rounds-1, -1, -1)}}
    if (needle > hay[lo + {{1 << i}}])
        lo += {{1 << i}};
    {{endfor}}
    return lo;
}
""")

catmullromlib = devlib(deps=[binsearchlib], decls=r'''
__device__ __noinline__
float catmull_rom(const float *times, const float *knots, float t);
''', defs=r'''
__device__ __noinline__
float catmull_rom(const float *times, const float *knots, float t) {
    int idx = bitwise_binsearch(times, t);
@@ -303,67 +288,14 @@ float catmull_rom(const float *times, const float *knots, float t) {
           + m2 * (   ttt -        tt)
           + k2 * (-2.0f*ttt + 3.0f*tt);
}
''')
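The kernel's full body is elided in this hunk; for orientation, a host-side reference of the same spline family (an assumed tangent convention, consistent with the visible m2/k2 terms above, which use the standard cubic Hermite basis with tt = t^2 and ttt = t^3):

import numpy as np

def catmull_rom_ref(times, knots, t):
    # Nonuniform Catmull-Rom evaluation; requires one knot on each side
    # of the active segment, as GenomePacker guarantees.
    i = np.searchsorted(times, t) - 1     # plays the role of bitwise_binsearch
    t0, t1, t2, t3 = times[i-1:i+3]
    k0, k1, k2, k3 = knots[i-1:i+3]
    tt = (t - t1) / (t2 - t1)             # normalized position in [t1, t2]
    # finite-difference tangents, scaled to the segment width
    m1 = (t2 - t1) * (k2 - k0) / (t2 - t0)
    m2 = (t2 - t1) * (k3 - k1) / (t3 - t1)
    tt2, tt3 = tt * tt, tt * tt * tt
    return (k1 * ( 2.0 * tt3 - 3.0 * tt2 + 1.0)
          + m1 * (       tt3 - 2.0 * tt2 + tt)
          + m2 * (       tt3 -       tt2)
          + k2 * (-2.0 * tt3 + 3.0 * tt2))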
__global__
void test_cr(const float *times, const float *knots, const float *t, float *r) {
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    r[i] = catmull_rom(times, knots, t[i]);
}
""")

class Palette(HunkOCode):
    # The JPEG YUV full-range matrix, without bias into the positive regime.
    # This assumes input color space is CIERGB D65, encoded with gamma 2.2.
    # Note that some interpolated colors may exceed the sRGB and YUV gamuts.
    YUV = np.matrix([[ 0.299,     0.587,     0.114],
                     [-0.168736, -0.331264,  0.5],
                     [ 0.5,      -0.418688, -0.081312]])

    def __init__(self, interp_mode):
        assert interp_mode in self.modes
        self.mode = interp_mode
        self.defs = self._defs.substitute(mode=interp_mode)

    def prepare(self, palettes):
        """
        Produce palettes suitable for uploading to the device. Returns an
        array of palettes in the same size and shape as the input.

        This function will never modify its argument, but may return it
        unmodified for certain interpolation modes.
        """
        if self.mode == 'yuvpolar':
            ys, uvrs, uvts, alphas = zip(*map(self.rgbtoyuvpolar, palettes))
            # Center all medians as closely to 0 as possible
            means = np.mean(uvts, axis=1)
            newmeans = (means + np.pi) % (2 * np.pi) - np.pi
            uvts = (newmeans - means).reshape((-1, 1)) + uvts
            zipped = zip(ys, uvrs, uvts, alphas)
            return np.array(zipped, dtype='f4').transpose((0, 2, 1))
        return palettes

    @classmethod
    def rgbtoyuvpolar(cls, pal):
        # TODO: premultiply alpha or some nonsense like that?
        y, u, v = np.array(cls.YUV * pal.T[:3])
        uvr = np.hypot(u, v)
        uvt = np.unwrap(np.arctan2(v, u))
        return y, uvr, uvt, pal.T[3]

    @classmethod
    def yuvpolartorgb(cls, y, uvr, uvt, a):
        u = uvr * np.cos(uvt)
        v = uvr * np.sin(uvt)
        r, g, b = np.array(cls.YUV.I * np.array([y, u, v]))
        # Ensure Fortran order so that the memory gets laid out correctly
        return np.array([r, g, b, a], order='F').T

    modes = ['hsv', 'yuv', 'yuvpolar']
    decls = "surface<void, cudaSurfaceType2D> flatpal;\n"

    _defs = Template(r"""
__device__
float4 interp_color(const float *times, const float4 *sources, float time) {
palintlib = devlib(deps=[binsearchlib, ringbuflib, yuvlib, mwclib], decls='''
surface<void, cudaSurfaceType2D> flatpal;
''', defs=r'''
__device__ float4
interp_color(const float *times, const float4 *sources, float time)
{
    int idx = fmaxf(bitwise_binsearch(times, time) + 1, 1);
    float lf = (times[idx] - time) / (times[idx] - times[idx-1]);
    float rf = 1.0f - lf;
@@ -375,42 +307,11 @@ float4 interp_color(const float *times, const float4 *sources, float time) {
    float3 l3 = make_float3(left.x, left.y, left.z);
    float3 r3 = make_float3(right.x, right.y, right.z);

{{if mode == 'hsv'}}
    float3 lhsv = rgb2hsv(l3);
    float3 rhsv = rgb2hsv(r3);

    if (fabs(lhsv.x - rhsv.x) > 3.0f)
        if (lhsv.x < rhsv.x)
            lhsv.x += 6.0f;
        else
            rhsv.x += 6.0f;

    float3 hsv;
    hsv.x = lhsv.x * lf + rhsv.x * rf;
    hsv.y = lhsv.y * lf + rhsv.y * rf;
    hsv.z = lhsv.z * lf + rhsv.z * rf;

    if (hsv.x > 6.0f)
        hsv.x -= 6.0f;
    if (hsv.x < 0.0f)
        hsv.x += 6.0f;

    yuv = rgb2yuv(hsv2rgb(hsv));
{{elif mode.startswith('yuv')}}
    {{if mode == 'yuv'}}
    float3 lyuv = rgb2yuv(l3);
    float3 ryuv = rgb2yuv(r3);
    yuv.x = lyuv.x * lf + ryuv.x * rf;
    yuv.y = lyuv.y * lf + ryuv.y * rf;
    yuv.z = lyuv.z * lf + ryuv.z * rf;
    {{elif mode == 'yuvpolar'}}
    yuv.x = l3.x * lf + r3.x * rf;
    float radius = l3.y * lf + r3.y * rf;
    float angle = l3.z * lf + r3.z * rf;
    yuv.y = radius * cosf(angle);
    yuv.z = radius * sinf(angle);
    {{endif}}
{{endif}}

    yuv.y += 0.5f;
    yuv.z += 0.5f;
@@ -418,28 +319,13 @@ float4 interp_color(const float *times, const float4 *sources, float time) {
    return make_float4(yuv.x, yuv.y, yuv.z, left.w * lf + right.w * rf);
}

__global__
void interp_palette(uchar4 *outs,
__global__ void interp_palette_flat(
        ringbuf *rb, mwc_st *rctxs,
        const float *times, const float4 *sources,
        float tstart, float tstep) {
    float time = tstart + blockIdx.x * tstep;
    float4 yuva = interp_color(times, sources, time);

    uchar4 out;
    out.x = yuva.x * 255.0f;
    out.y = yuva.y * 255.0f;
    out.z = yuva.z * 255.0f;
    out.w = yuva.w * 255.0f;
    outs[blockDim.x * blockIdx.x + threadIdx.x] = out;
}

__global__
void interp_palette_flat(mwc_st *rctxs,
        const float *times, const float4 *sources,
        float tstart, float tstep) {

        float tstart, float tstep)
{
    mwc_st rctx = rctxs[rb_incr(rb->head, threadIdx.x)];
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    mwc_st rctx = rctxs[gid];

    float time = tstart + blockIdx.x * tstep;
    float4 yuva = interp_color(times, sources, time);
@@ -447,17 +333,30 @@ void interp_palette_flat(mwc_st *rctxs,
    // TODO: pack Y at full precision, UV at quarter
    uint2 out;

    uint32_t y = min(255, (uint32_t) (yuva.x * 255.0f + 0.49f * mwc_next_11(rctx)));
    uint32_t u = min(255, (uint32_t) (yuva.y * 255.0f + 0.49f * mwc_next_11(rctx)));
    uint32_t v = min(255, (uint32_t) (yuva.z * 255.0f + 0.49f * mwc_next_11(rctx)));
    uint32_t y = yuva.x * 255.0f + 0.49f * mwc_next_11(rctx);
    uint32_t u = yuva.y * 255.0f + 0.49f * mwc_next_11(rctx);
    uint32_t v = yuva.z * 255.0f + 0.49f * mwc_next_11(rctx);
    y = min(255, y);
    u = min(255, u);
    v = min(255, v);
    out.y = (1 << 22) | (y << 4);
    out.x = (u << 18) | v;

    surf2Dwrite(out, flatpal, 8 * threadIdx.x, blockIdx.x);
    rctxs[gid] = rctx;
    rctxs[rb_incr(rb->tail, threadIdx.x)] = rctx;
}
""")
''')
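interp_palette_flat packs each dithered palette entry into a uint2 before writing it to the flatpal surface; the layout leaves headroom above each 8-bit channel so entries can later be accumulated in place (see write_shmem in iter.py). A plain-Python view of the packing:

def pack_flatpal(y, u, v):
    # y, u, v are the dithered values clamped to [0, 255] above
    hi = (1 << 22) | (y << 4)   # density bias in the top bits; Y starts at
                                # bit 4 with 18 bits of accumulation headroom
    lo = (u << 18) | v          # U starts at bit 18, V at bit 0, each with
                                # headroom; U's carries spill into hi's low bits
    return hi, lo               # stored as out.y and out.x respectively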
testcrlib = devlib(defs=r'''
__global__ void
test_cr(const float *times, const float *knots, const float *t, float *r) {
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    r[i] = catmull_rom(times, knots, t[i]);
}
''')

if __name__ == "__main__":
    # Test spline evaluation. This code will probably drift pretty often.
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule
    import pycuda.autoinit
@@ -2,15 +2,16 @@
The main iteration loop.
"""

from cuburn.code import mwc, variations, interp
from cuburn.code.util import *
import variations
import interp
from util import Template, devlib, ringbuflib
from mwc import mwclib

def precalc_densities(pcp, std_xforms):
    # This pattern recurs a few times for precalc segments. Unfortunately,
    # namespace stuff means it's not easy to functionalize this boilerplate
    pre_cp = pcp._precalc()
    pre_cp._code(Template(r"""

        float sum = 0.0f;

        {{for n in std_xforms}}
@@ -30,7 +31,6 @@ def precalc_densities(pcp, std_xforms):
def precalc_chaos(pcp, std_xforms):
    pre_cp = pcp._precalc()
    pre_cp._code(Template("""

        float sum, rsum;

        {{for p in std_xforms}}
@@ -50,7 +50,6 @@ def precalc_chaos(pcp, std_xforms):
        {{endfor}}

        {{endfor}}

    """, name='precalc_chaos').substitute(locals()))

def precalc_camera(pcam):
@@ -64,7 +63,6 @@ def precalc_camera(pcam):
    # . matrix([X],[Y],[1]);

    pre_cam._code(Template(r"""

        float rot = {{pre_cam.rotation}} * M_PI / 180.0f;
        float rotsin = sin(rot), rotcos = cos(rot);
        float cenx = {{pre_cam.center.x}}, ceny = {{pre_cam.center.y}};
@@ -79,13 +77,11 @@ def precalc_camera(pcam):
        {{pre_cam._set('yy')}} = scale * rotcos;
        {{pre_cam._set('yo')}} = scale * -(rotsin * cenx + rotcos * ceny)
                               + 0.5f * acc_size.aheight;

    """).substitute(locals()))
    """, 'precalc_camera').substitute(locals()))
def precalc_xf_affine(px):
    pre = px._precalc()
    pre._code(Template(r"""

        float pri = {{pre.angle}} * M_PI / 180.0f;
        float spr = {{pre.spread}} * M_PI / 180.0f;

@@ -98,27 +94,22 @@ def precalc_xf_affine(px):
        {{pre._set('yy')}} = magy * sin(pri+spr);
        {{pre._set('xo')}} = {{pre.offset.x}};
        {{pre._set('yo')}} = -{{pre.offset.y}};
    """, 'precalc_xf_affine').substitute(locals()))

    """).substitute(locals()))
def apply_affine(x, y, xo, yo, packer):
    return Template("""
    {{xo}} = {{packer.xx}} * {{x}} + {{packer.xy}} * {{y}} + {{packer.xo}};
    {{yo}} = {{packer.yx}} * {{x}} + {{packer.yy}} * {{y}} + {{packer.yo}};
    """, 'apply_affine').substitute(locals())
class IterCode(HunkOCode):
    # The number of threads per block
    NTHREADS = 256
# The number of threads per block used in the iteration function. Don't change
# it lightly; the code may depend on it in unusual ways.
NTHREADS = 256

    def __init__(self, info, genome):
        self.packer = interp.GenomePacker('iter_params')
        self.pcp = self.packer.view('params', genome, 'cp')

        iterbody = self._iterbody(info, genome)
        bodies = [self._xfbody(i,x) for i,x in sorted(genome.xforms.items())]
        bodies.append(iterbody)
        self.defs = '\n'.join(bodies)

    decls = """
iter_decls = """
// Note: for normalized lookups, uchar4 actually returns floats
texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> palTex;
__shared__ iter_params params;
__device__ int rb_head, rb_tail, rb_size;

typedef struct {
    uint32_t width;
@@ -128,12 +119,9 @@ typedef struct {
    uint32_t astride;
} acc_size_t;
__constant__ acc_size_t acc_size;

"""

    def _xfbody(self, xfid, xform):
        px = self.pcp.xforms[xfid]
        tmpl = Template(r"""
iter_xf_body_code = r"""
__device__
void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
    float tx, ty;
@@ -162,44 +150,30 @@ void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
    float csp = {{px.color_speed}};
    color = color * (1.0f - csp) + {{px.color}} * csp;
};
""", 'apply_xf_'+xfid)
"""

def iter_xf_body(pcp, xfid, xform):
    px = pcp.xforms[xfid]
    tmpl = Template(iter_xf_body_code, 'apply_xf_'+xfid)

    g = dict(globals())
    g.update(locals())
    return tmpl.substitute(g)

    def _iterbody(self, info, genome):
        tmpl = Template(r'''

__global__ void reset_rb(int size) {
    rb_head = rb_tail = 0;
    rb_size = size;
}

__global__
void iter(
        uint64_t out_ptr,
        mwc_st *msts,
        float4 *points,
        const iter_params *all_params
{{if info.acc_mode == 'atomic'}}
        , uint64_t atom_ptr
{{endif}}
) {
    const iter_params *global_params = &(all_params[blockIdx.x]);

iter_body_code = r'''
__global__ void
iter(uint64_t out_ptr, uint64_t atom_ptr,
     ringbuf *rb, mwc_st *msts, float4 *points,
     const iter_params *all_params)
{
    // load params to shared memory cooperatively
    const iter_params *global_params = &(all_params[blockIdx.x]);
    for (int i = threadIdx.y * blockDim.x + threadIdx.x;
         i < (sizeof(iter_params) / 4); i += blockDim.x * blockDim.y)
        reinterpret_cast<float*>(&params)[i] =
            reinterpret_cast<const float*>(global_params)[i];

    __shared__ int rb_idx;
    if (threadIdx.y == 1 && threadIdx.x == 1)
        rb_idx = 32 * blockDim.y * (atomicAdd(&rb_head, 1) % rb_size);

    __syncthreads();
    int this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;
    int this_rb_idx = rb_incr(rb->head, blockDim.x * threadIdx.y + threadIdx.x);
    mwc_st rctx = msts[this_rb_idx];

    {{precalc_camera(pcp.camera)}}
@@ -209,22 +183,15 @@ void iter(
        {{pcp.camera.yo}} += ditherwidth * mwc_next_11(rctx);
    }

{{if info.acc_mode == 'global'}}
    __shared__ float time_frac;
    time_frac = blockIdx.x / (float) gridDim.x;
{{else}}
{{if info.acc_mode == 'atomic'}}
    // TODO: spare the register, reuse at call site?
    int time = blockIdx.x >> 4;
{{endif}}
    float color_dither = 0.49f * mwc_next_11(rctx);
{{endif}}

    // TODO: 4th channel unused. Kill or use for something helpful
    float4 old_point = points[this_rb_idx];
    float x = old_point.x, y = old_point.y, color = old_point.z;

{{if not info.chaos_used}}
{{if not chaos_used}}
    // Shared memory size can be reduced by a factor of four using a slower
    // 4-stage reduce, but on Fermi hardware shmem use isn't a bottleneck
    __shared__ float swap[{{4*NTHREADS}}];
@@ -250,7 +217,7 @@ void iter(
    // TODO: link up with FUSE, etc
    for (int round = 0; round < 256; round++) {

{{if info.chaos_used}}
{{if chaos_used}}

        {{precalc_chaos(pcp, std_xforms)}}

@@ -315,16 +282,7 @@ void iter(

{{endif}}

{{if info.acc_mode == 'deferred'}}
        int tid = threadIdx.y * 32 + threadIdx.x;
        int offset = 4 * (256 * (256 * blockIdx.x + round) + tid);
        int *log = reinterpret_cast<int*>(out_ptr + offset);
{{endif}}

        if (fuse) {
{{if info.acc_mode == 'deferred'}}
            *log = 0xffffffff;
{{endif}}
            continue;
        }

@@ -346,15 +304,11 @@ void iter(
        uint32_t ix = trunca(cx), iy = trunca(cy);

        if (ix >= acc_size.astride || iy >= acc_size.aheight) {
{{if info.acc_mode == 'deferred'}}
            *log = 0xffffffff;
{{endif}}
            continue;
        }

        uint32_t ibase = (last_xf_used % {{info.max_nxf}}) * acc_size.aheight;
        uint32_t i = (ibase + iy) * acc_size.astride + ix;
{{if info.acc_mode == 'atomic'}}
        uint32_t i = iy * acc_size.astride + ix;

        asm volatile ({{crep("""
{
    // To prevent overflow, we need to flush each pixel before the density
@@ -432,38 +386,45 @@ oflow_end:
        """)}} :: "f"(cc), "f"(color_dither), "r"(time), "r"(i),
        "l"(atom_ptr), "f"(cosel[threadIdx.y + {{NWARPS}}]),
        "l"(out_ptr));
{{endif}}

{{if info.acc_mode == 'global'}}
        float4 outcol = tex2D(palTex, cc, time_frac);
        float4 *accbuf = reinterpret_cast<float4*>(out_ptr + (16*i));
        float4 pix = *accbuf;
        pix.x += outcol.x;
        pix.y += outcol.y;
        pix.z += outcol.z;
        pix.w += 1.0f;
        *accbuf = pix;
{{elif info.acc_mode == 'deferred'}}
        // 'color' gets the top 8 bits. TODO: add dithering via precalc.
        uint32_t icolor = fminf(1.0f, cc) * 255.0f + color_dither;
        asm("bfi.b32 %0, %1, %0, 24, 8;" : "+r"(i) : "r"(icolor));
        *log = i;
{{endif}}
    }

    if (threadIdx.x == 0 && threadIdx.y == 0)
        rb_idx = 32 * blockDim.y * (atomicAdd(&rb_tail, 1) % rb_size);
    __syncthreads();
    this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;

    this_rb_idx = rb_incr(rb->tail, blockDim.x * threadIdx.y + threadIdx.x);
    points[this_rb_idx] = make_float4(x, y, color, 0.0f);
    msts[this_rb_idx] = rctx;
    return;
}
'''
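The kernel consumes an RNG state slot derived from rb->head on entry and retires it at a slot derived from rb->tail on exit, so states migrate between blocks across launches instead of being bound to one thread. ringbuflib itself is outside this diff; a rough CPU model of the intended slot rotation (an assumption; the device implementation differs in detail):

class RingBuf(object):
    # rough model: each block bumps a counter once, and its threads take
    # consecutive slots under the returned base
    def __init__(self, nslots, width):
        self.head = self.tail = 0
        self.nslots, self.width = nslots, width

    def incr(self, which):
        base = getattr(self, which) % (self.nslots // self.width)
        setattr(self, which, getattr(self, which) + 1)
        return base * self.width

rb = RingBuf(nslots=8192, width=256)
read_base = rb.incr('head')    # this block's states come from here...
write_base = rb.incr('tail')   # ...and are retired here for a later block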
{{if info.acc_mode == 'atomic'}}
__global__
void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
def iter_body(cp, pcp):
    # For legacy reasons, 'cp' is used here instead of 'genome'.
    tmpl = Template(iter_body_code, 'iter_body')
    NWARPS = NTHREADS / 32
    std_xforms = [n for n in sorted(cp.xforms) if n != 'final']

    # TODO: detect this properly and use it
    chaos_used = False

    vars = globals()
    vars.update(locals())
    return tmpl.substitute(vars)

def mkiterlib(genome):
    packer = interp.GenomePacker('iter_params')
    pcp = packer.view('params', genome, 'cp')

    iterbody = iter_body(genome, pcp)
    bodies = [iter_xf_body(pcp, i, x) for i, x in sorted(genome.xforms.items())]
    bodies.append(iterbody)
    packer_lib = packer.finalize()

    lib = devlib(deps=[packer_lib, mwclib, ringbuflib],
                 # We grab the surf decl from palintlib as well
                 decls=iter_decls + interp.palintlib.decls,
                 defs='\n'.join(bodies))
    return packer, lib

flushatomlib = devlib(defs=Template(r'''
__global__ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
    int i = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    if (i >= nbins) return;
    asm volatile ({{crep("""
@@ -499,168 +460,4 @@ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
    }
    """)}} :: "r"(i), "l"(atom_ptr), "l"(out_ptr));
}

{{endif}}

{{if info.acc_mode == 'deferred'}}

// Block size, shared accumulation bits, shared accumulation width.
#define BS 1024
#define SHAB 12
#define SHAW (1<<SHAB)

// Read the point log, accumulate in shared memory, and write the results.
// This kernel is to be launched with one block for every 4,096 addresses to
// be processed, and will handle those addresses.
//
// log_bounds is an array mapping radix values to the first index in the log
// with that radix position. For performance reasons in other parts of the
// code, the radix may actually include bits within the lower SHAB part of the
// address, or it might not cover the first few bits after the SHAB part;
// log_bounds_shift covers that. glob_addr_bits specifies the number of bits
// above SHAB which are address bits.

__global__ void
__launch_bounds__(BS, 1)
write_shmem(
        float4 *acc,
        const uint32_t *log,
        const uint32_t *log_bounds,
        uint32_t nbins
) {
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

    // These two accumulators, used in write_shmem, hold {density, red} and
    // {green, blue} values as packed u16 pairs. The fixed size represents
    // 4,096 pixels in the accumulator.
    __shared__ uint32_t s_acc[SHAW*2];

    int idx = tid;
    for (int i = 0; i < (SHAW * 2 / BS); i++) {
        s_acc[idx] = 0;
        idx += BS;
    }
    __syncthreads();

    // Shut the compiler up
    idx = s_acc[0];

    // log_bounds[] holds inclusive prefix sums, so that log_bounds[0] is the
    // largest index with radix 0, and so on.
    int lb_idx_hi = bid & 0xff;
    int lb_idx_lo = lb_idx_hi - 1;

    int idx_lo;
    if (lb_idx_lo > 0) idx_lo = log_bounds[lb_idx_lo] & ~(BS - 1);
    else               idx_lo = 0;
    int idx_hi = (log_bounds[lb_idx_hi] & ~(BS - 1)) + BS;

    float rnrounds = {{'%d.0f' % info.palette_height}} / (idx_hi - idx_lo);
    float time = tid * rnrounds;
    float time_step = BS * rnrounds;

    int magic = ((blockIdx.x & 0xff) << 3) + ((blockIdx.x & 0xf00) << 12);
    int magic_mask = 0xf007f8;

    for (int i = idx_lo + tid; i < idx_hi; i += BS) {
        int entry = log[i];

        asm volatile ({{crep("""
{
    .reg .pred q;
    .reg .u32 shoff, color, time, d, r, g, b, hi, lo, hiw, low, tmp;
    .reg .u64 ptr;
    .reg .f32 rf, gf, bf, df, rg, gg, dg, bg;

    // TODO: opacity
    and.b32 tmp, %0, %4;
    setp.eq.u32 q, tmp, %3;
    @!q bra before_sync;

    and.b32 shoff, %0, 0xff800;
    shr.b32 shoff, shoff, 5;
    bfi.b32 shoff, %0, shoff, 3, 3;

    bfe.u32 color, %0, 24, 8;
    shl.b32 color, color, 3;
    cvt.rni.u32.f32 time, %1;

    suld.b.2d.v2.b32.clamp {lo, hi}, [flatpal, {color, time}];
    ld.shared.v2.u32 {low, hiw}, [shoff];
    add.cc.u32 lo, lo, low;
    addc.u32 hi, hi, hiw;
    setp.hs.u32 q, hi, (1023 << 22);
    @q bra oflow_sync;
    st.shared.v2.u32 [shoff], {hi, lo};
before_sync:
    bar.sync 0;
    bra oflow_write_end;
oflow_sync:
    st.shared.v2.u32 [shoff], {0, 0};

    // TODO: opacity
    bfi.b32 shoff, %0, 0, 4, 24;
    cvt.u64.u32 ptr, shoff;
    add.u64 ptr, ptr, %2;
    ld.global.v4.f32 {dg,bg,gg,rg}, [ptr];
    bar.sync 0;

    bfe.u32 r, hi, 4, 18;
    bfe.u32 g, lo, 18, 14;
    bfi.b32 g, hi, g, 14, 4;
    and.b32 b, lo, ((1<<18)-1);
    cvt.rn.f32.u32 rf, r;
    cvt.rn.f32.u32 gf, g;
    cvt.rn.f32.u32 bf, b;
    fma.rn.ftz.f32 rf, rf, (1.0/255.0), rg;
    fma.rn.ftz.f32 gf, gf, (1.0/255.0), gg;
    fma.rn.ftz.f32 bf, bf, (1.0/255.0), bg;
    add.f32 df, df, dg;
    st.global.v4.f32 [ptr], {df,bf,gf,rf};

oflow_write_end:
}
    """)}} :: "r"(entry), "f"(time), "l"(acc), "r"(magic), "r"(magic_mask));
        time += time_step;
    }

    __syncthreads();

    idx = tid;
    int glo_idx = magic | (((idx << 8) | idx) & 0xff807);

    for (int i = 0; i < (SHAW / BS) && glo_idx < nbins; i++) {
        int d, r, g, b;
        float4 pix = acc[glo_idx];
        asm({{crep("""
{
    .reg .u32 hi, lo;
    ld.shared.v2.u32 {lo, hi}, [%4];
    shr.u32 %0, hi, 22;
    bfe.u32 %1, hi, 4, 18;
    bfe.u32 %2, lo, 18, 14;
    bfi.b32 %2, hi, %2, 14, 4;
    and.b32 %3, lo, ((1<<18)-1);
}
    """)}} : "=r"(d), "=r"(r), "=r"(g), "=r"(b) : "r"(idx*8));
        pix.x += r / 255.0f;
        pix.y += g / 255.0f;
        pix.z += b / 255.0f;
        pix.w += d;
        acc[glo_idx] = pix;
        idx += BS;
        glo_idx += (BS << 8);
    }
}
{{endif}}
''', 'iter_kern')
        return tmpl.substitute(
            info = info,
            cp = genome,
            pcp = self.pcp,
            NTHREADS = self.NTHREADS,
            NWARPS = self.NTHREADS / 32,
            std_xforms = [n for n in sorted(genome.xforms) if n != 'final'],
            **globals())
''', 'flush_atom').substitute())
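The bfe/bfi fields in write_shmem define the packed accumulator word pair: 10 bits of density and 18 bits of red in the high word, green split across the two words, and blue in the low 18 bits. The same unpacking in plain Python (for reference only):

def unpack_acc(hi, lo):
    # field layout per the PTX in write_shmem
    d = hi >> 22                         # 10-bit density
    r = (hi >> 4) & ((1 << 18) - 1)      # 18-bit red
    g = ((hi & 0xf) << 14) | (lo >> 18)  # 18-bit green, split across words
    b = lo & ((1 << 18) - 1)             # 18-bit blue
    return d, r / 255.0, g / 255.0, b / 255.0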
@@ -6,18 +6,53 @@ import os
import warnings
import numpy as np

from util import *
from util import devlib, assemble_code

class MWC(HunkOCode):
    decls = """
# Keeping this live in the module isn't necessary, but loading the mults
# can be surprisingly slow.
mults = None

def load_mults():
    pfpath = os.path.join(os.path.dirname(__file__), 'primes.bin')
    if os.path.isfile(pfpath):
        with open(pfpath) as fp:
            return np.frombuffer(fp.read(), dtype='<u4')

    warnings.warn('primes.bin not found, trying to download it')
    import bz2, urllib2
    ufp = urllib2.urlopen('http://aduro.strobe.cc/primes.diff.bin.bz2')
    diffs = np.frombuffer(bz2.decompress(ufp.read()), dtype='<u2')
    mults = np.cumsum(-np.array(diffs, dtype='<u4'), dtype='<u4')
    with open(pfpath, 'wb') as fp:
        fp.write(mults)
    return mults

def make_seeds(nthreads, host_seed=None):
    global mults
    if mults is None:
        mults = load_mults()
    if host_seed:
        rand = np.random.RandomState(host_seed)
    else:
        rand = np.random

    # Create the seed structures. TODO: check that struct is 4-byte aligned
    seeds = np.empty((3, nthreads), dtype=np.uint32, order='F')
    seeds[0][:] = mults[:nthreads]

    # Excludes 0xffffffff for 32-bit compatibility with laziness
    seeds[1] = rand.randint(1, 0x7fffffff, size=nthreads)
    seeds[2] = rand.randint(1, 0x7fffffff, size=nthreads)

    return seeds

mwclib = devlib(decls=r'''
typedef struct {
    uint32_t mul;
    uint32_t state;
    uint32_t carry;
} mwc_st;
"""

defs = r"""
''', defs=r'''
__device__ uint32_t mwc_next(mwc_st &st) {
    asm("{\n\t"
        ".reg .u32 tmp;\n\t"
@@ -40,46 +75,9 @@ __device__ float mwc_next_11(mwc_st &st) {
        : "=f"(ret) : "r"(val));
    return ret;
}
"""
''')
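mwc_next implements a multiply-with-carry step on the {mul, state, carry} struct above (the asm body is elided in this hunk). One MWC step in Python, a standard formulation assumed to match the kernel:

def mwc_next_py(st):
    # 32x32 -> 64-bit multiply-accumulate per output word
    t = st['mul'] * st['state'] + st['carry']
    st['state'] = t & 0xffffffff   # low word: the RNG output
    st['carry'] = t >> 32          # high word: carried forward
    return st['state']

# Real multipliers come from primes.bin via make_seeds; this value is
# illustrative only.
st = dict(mul=0xfffefa35, state=1, carry=1)
samples = [mwc_next_py(st) for _ in range(4)]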
    @staticmethod
    def load_mults():
        pfpath = os.path.join(os.path.dirname(__file__), 'primes.bin')
        if os.path.isfile(pfpath):
            with open(pfpath) as fp:
                return np.frombuffer(fp.read(), dtype='<u4')

        warnings.warn('primes.bin not found, trying to download it')
        import bz2, urllib2
        ufp = urllib2.urlopen('http://aduro.strobe.cc/primes.diff.bin.bz2')
        diffs = np.frombuffer(bz2.decompress(ufp.read()), dtype='<u2')
        mults = np.cumsum(-np.array(diffs, dtype='<u4'), dtype='<u4')
        with open(pfpath, 'wb') as fp:
            fp.write(mults)
        return mults

    mults = None
    @classmethod
    def make_seeds(cls, nthreads, host_seed=None):
        if cls.mults is None:
            cls.mults = cls.load_mults()
        if host_seed:
            rand = np.random.RandomState(host_seed)
        else:
            rand = np.random

        # Create the seed structures. TODO: check that struct is 4-byte aligned
        seeds = np.empty((3, nthreads), dtype=np.uint32, order='F')
        seeds[0][:] = cls.mults[:nthreads]

        # Excludes 0xffffffff for 32-bit compatibility with laziness
        seeds[1] = rand.randint(1, 0x7fffffff, size=nthreads)
        seeds[2] = rand.randint(1, 0x7fffffff, size=nthreads)

        return seeds

class MWCTest(HunkOCode):
    defs = """
mwctestlib = devlib(deps=[mwclib], defs="""
__global__ void test_mwc(mwc_st *msts, uint64_t *sums, float nrounds) {
    mwc_st rctx = msts[gtid()];
    uint64_t sum = 0;
@@ -87,19 +85,18 @@ __global__ void test_mwc(mwc_st *msts, uint64_t *sums, float nrounds) {
    sums[gtid()] = sum;
    msts[gtid()] = rctx;
}
"""
""")

@classmethod
def test_mwc(cls, rounds=5000, nblocks=64, blockwidth=512):
def test_mwc(rounds=5000, nblocks=64, blockwidth=512):
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule
    import time

    nthreads = blockwidth * nblocks
    seeds = MWC.make_seeds(nthreads, host_seed = 5)
    seeds = make_seeds(nthreads, host_seed=42)
    dseeds = cuda.to_device(seeds)

    mod = SourceModule(assemble_code(BaseCode, MWC, cls))
    mod = SourceModule(assemble_code(mwctestlib))

    for trial in range(2):
        print "Trial %d, on CPU: " % trial,
@@ -133,5 +130,4 @@ __global__ void test_mwc(mwc_st *msts, uint64_t *sums, float nrounds) {

if __name__ == "__main__":
    import pycuda.autoinit
    MWCTest.test_mwc()

    test_mwc()

cuburn/code/output.py (new file, 34 lines)
@@ -0,0 +1,34 @@
from util import devlib, ringbuflib
from mwc import mwclib

f32tou8lib = devlib(deps=[ringbuflib, mwclib], defs=r'''
// Perform a conversion from float32 values to uint8 ones, applying
// pixel- and channel-independent dithering to reduce suprathreshold banding
// artifacts. Clamps values larger than 1.0f.
// TODO: move to a separate module?
// TODO: less inefficient mwc_st handling?
__global__ void f32_to_u8(
        ringbuf *rb, mwc_st *rctxs, uchar4 *dst, const float4 *src,
        int gutter, int dstride, int sstride, int height)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x > dstride || y > height) return;
    int isrc = sstride * (y + gutter) + x + gutter;

    int tid = blockDim.x * threadIdx.y + threadIdx.x;
    mwc_st rctx = rctxs[rb_incr(rb->head, tid)];

    float4 in = src[isrc];
    uchar4 out = make_uchar4(
        fminf(1.0f, in.x) * 255.0f + 0.49f * mwc_next_11(rctx),
        fminf(1.0f, in.y) * 255.0f + 0.49f * mwc_next_11(rctx),
        fminf(1.0f, in.z) * 255.0f + 0.49f * mwc_next_11(rctx),
        fminf(1.0f, in.w) * 255.0f + 0.49f * mwc_next_11(rctx)
    );

    int idst = dstride * y + x;
    dst[idst] = out;
    rctxs[rb_incr(rb->tail, tid)] = rctx;
}
''')
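The 0.49f * mwc_next_11 term injects just under half a code of uniform noise before the float-to-int conversion truncates, so a value can never shift by more than one output code, but neighboring pixels quantize independently. A host-side check of the same quantizer (illustrative):

import random

def dither_u8(v, rng=random):
    # same arithmetic as f32_to_u8: clamp, scale by 255, add up to
    # +/-0.49 of noise, then truncate like the C cast does
    return int(min(1.0, v) * 255.0 + 0.49 * rng.uniform(-1.0, 1.0))

# A value near a code boundary no longer snaps to one side: 127.9/255
# lands on 127 or 128 depending on the noise, so a smooth ramp becomes a
# noisy mix of adjacent codes instead of a hard band edge.
vals = set(dither_u8(127.9 / 255.0) for _ in range(1000))
assert vals <= set([127, 128])   # both occur in practice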
@@ -1,55 +1,119 @@
"""
Provides tools and miscellaneous functions for building device code.
"""
import os
import tempfile
from collections import namedtuple

import pycuda.driver as cuda
import pycuda.compiler
import numpy as np
import tempita

def crep(s):
    """Escape for PTX assembly"""
    if isinstance(s, unicode):
        s = s.encode('utf-8')
    return '"%s"' % s.encode("string_escape")

def argset(obj, **kwargs):
    """
    Allow an object with many properties to be set using one call.

    >>> x = argset(X(), a=1, b=2)
    >>> x.a
    1
    """
    for k, v in kwargs.items():
        setattr(obj, k, v)
    return obj

def launch(name, mod, stream, block, grid, *args, **kwargs):
    """
    Oft-used kernel launch helper. Provides a nice boost in readability of
    densely-packed asynchronous code launches.
    """
    fun = mod.get_function(name)
    if isinstance(block, (int, np.number)):
        block = (int(block), 1, 1)
    if isinstance(grid, (int, np.number)):
        grid = (int(grid), 1)
    fun(*args, block=block, grid=grid, stream=stream, **kwargs)

def crep(s):
    """Multiline literal escape for inline PTX assembly."""
    if isinstance(s, unicode):
        s = s.encode('utf-8')
    return '"%s"' % s.encode("string_escape")

class Template(tempita.Template):
    """
    A Tempita template with extra stuff in the default namespace.
    """
    default_namespace = tempita.Template.default_namespace.copy()
Template.default_namespace.update({'np': np, 'crep': crep})

class HunkOCode(object):
    """An apparently passive container for device code."""
    # Use property objects to make these dynamic
    headers = ''
    decls = ''
    defs = ''
# Passive container for device code.
DevLib = namedtuple('DevLib', 'deps headers decls defs')

def assemble_code(*sections):
    return ''.join([''.join([getattr(sect, kind) for sect in sections])
                    for kind in ['headers', 'decls', 'defs']])
def devlib(deps=(), headers='', decls='', defs=''):
    """Create a library of device code."""
    # This exists because namedtuple doesn't support default args
    return DevLib(deps, headers, decls, defs)

def apply_affine(x, y, xo, yo, packer):
    return Template("""
    {{xo}} = {{packer.xx}} * {{x}} + {{packer.xy}} * {{y}} + {{packer.xo}};
    {{yo}} = {{packer.yx}} * {{x}} + {{packer.yy}} * {{y}} + {{packer.yo}};
    """).substitute(locals())
def assemble_code(*libs):
    seen = set()
    out = []
    def go(lib):
        map(go, lib.deps)
        code = lib[1:]
        if code not in seen:
            seen.add(code)
            out.append(code)
    go(stdlib)
    map(go, libs)
    return ''.join(sum(zip(*out), ()))
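assemble_code walks the dependency graph depth-first, dedups repeated libs, and then regroups sections so that every header precedes every decl and every decl precedes every def; that is what the zip(*out) does. A tiny demonstration with stand-in libs:

from util import devlib, assemble_code

a = devlib(headers='H1 ', decls='D1 ', defs='F1 ')
b = devlib(deps=[a], headers='H2 ', decls='D2 ', defs='F2 ')
print assemble_code(b)
# stdlib's sections come first, followed by: H1 H2 D1 D2 F1 F2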
|
||||
|
||||
class BaseCode(HunkOCode):
|
||||
headers = """
|
||||
DEFAULT_CMP_OPTIONS = ('-use_fast_math', '-maxrregcount', '42')
|
||||
DEFAULT_SAVE_KERNEL = True
|
||||
def compile(name, src, opts=DEFAULT_CMP_OPTIONS, save=DEFAULT_SAVE_KERNEL):
|
||||
"""
|
||||
Compile a module. Returns a copy of the source (for inspection or
|
||||
display) and the compiled cubin.
|
||||
"""
|
||||
dir = tempfile.gettempdir()
|
||||
if save:
|
||||
with open(os.path.join(dir, name + '_kern.cu'), 'w') as fp:
|
||||
fp.write(src)
|
||||
cubin = pycuda.compiler.compile(src, options=list(opts))
|
||||
if save:
|
||||
with open(os.path.join(dir, name + '_kern.cubin'), 'w') as fp:
|
||||
fp.write(cubin)
|
||||
return cubin
|
||||
|
||||
class ClsMod(object):
|
||||
"""
|
||||
Convenience class or mixin that automatically compiles and loads a module
|
||||
once per class, saving potentially expensive code regeneration. Only use
|
||||
if your class does not employ run-time code generation.
|
||||
"""
|
||||
mod = None
|
||||
# Supply your own DevLib on this property
|
||||
lib = None
|
||||
|
||||
def __init__(self):
|
||||
super(ClsMod, self).__init__()
|
||||
self.load()
|
||||
|
||||
@classmethod
|
||||
def load(cls, name=None):
|
||||
if cls.mod is None:
|
||||
if name is None:
|
||||
name = cls.__name__.lower()
|
||||
cubin = compile(name, assemble_code(cls.lib))
|
||||
cls.mod = cuda.module_from_buffer(cubin)
|
||||
|
||||
# This lib is included with *every* assembled module. It contains no global
|
||||
# functions, so it shouldn't slow down compilation time too much.
|
||||
stdlib = devlib(headers="""
|
||||
#include<cuda.h>
|
||||
#include<stdint.h>
|
||||
#include<stdio.h>
|
||||
"""
|
||||
|
||||
decls = """
|
||||
float3 rgb2hsv(float3 rgb);
|
||||
float3 hsv2rgb(float3 hsv);
|
||||
"""
|
||||
|
||||
defs = Template(r"""
|
||||
""", decls=r"""
|
||||
#undef M_E
|
||||
#undef M_LOG2E
|
||||
#undef M_LOG10E
|
||||
@ -84,33 +148,79 @@ float3 hsv2rgb(float3 hsv);
|
||||
#define bfe_decl(d, s, o, w) \
    int d; \
    bfe(d, s, o, w)

// TODO: use launch parameter preconfig to eliminate unnecessary parts
__device__
uint32_t gtid() {
""", defs=r'''
__device__ uint32_t gtid() {
    return threadIdx.x + blockDim.x *
            (threadIdx.y + blockDim.y *
                (threadIdx.z + blockDim.z *
                    (blockIdx.x + (gridDim.x * blockIdx.y))));
}

__device__
uint32_t trunca(float f) {
__device__ uint32_t trunca(float f) {
    // truncate as used in address calculations. note the use of a signed
    // conversion is intentional here (simplifies image bounds checking).
    uint32_t ret;
    asm("cvt.rni.s32.f32 %0, %1;" : "=r"(ret) : "f"(f));
    return ret;
}
''')

__global__
void fill_dptr(uint32_t* dptr, int size, uint32_t value) {
def mkbinsearchlib(rounds):
    """
    Search through the fixed-size list 'hay' to find the rightmost index which
    contains a value strictly smaller than the input 'needle'. The list must
    be exactly '2^rounds' long, although padding at the top with very large
    numbers or even +inf effectively shortens it.
    """
    # TODO: this doesn't optimize well on a 64-bit arch, not that it's a
    # performance-critical chunk of code or anything
    src = Template(r'''
__device__ int bitwise_binsearch(const float *hay, float needle) {
    int lo = 0;

    // TODO: improve efficiency on 64-bit arches
    {{for i in range(search_rounds-1, -1, -1)}}
    if (needle > hay[lo + {{1 << i}}])
        lo += {{1 << i}};
    {{endfor}}
    return lo;
}
''', 'bitwise_binsearch')
    return devlib(defs=src.substitute(search_rounds=rounds))

# 2^search_rounds is the maximum number of knots allowed in a single spline.
# This includes the four required knots, so a 5-round search supports 28
# interior knots in the domain (0, 1). 2^5 fits nicely on a single cache line.
DEFAULT_SEARCH_ROUNDS = 5
binsearchlib = mkbinsearchlib(DEFAULT_SEARCH_ROUNDS)

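# A host-side reference for the unrolled device search above (a sketch, not
# part of this commit): each round conditionally adds a power of two, homing
# in on the rightmost entry strictly smaller than the needle.
def _binsearch_ref(hay, needle, rounds=DEFAULT_SEARCH_ROUNDS):
    lo = 0
    for i in range(rounds - 1, -1, -1):
        if needle > hay[lo + (1 << i)]:
            lo += 1 << i
    return lo
# e.g. _binsearch_ref([0., 1/3., 2/3., 1.] + [float('inf')] * 28, 0.5) == 1
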
filldptrlib = devlib(defs=r'''
__global__ void
fill_dptr(uint32_t* dptr, int size, uint32_t value) {
    int i = (gridDim.x * blockIdx.y + blockIdx.x) * blockDim.x + threadIdx.x;
    if (i < size) {
        dptr[i] = value;
    }
}
''')
def fill_dptr(mod, dptr, size, stream=None, value=np.uint32(0)):
    """
    A memory zeroer which can be embedded in a stream, unlike the various
    memset routines. Size is the number of 4-byte words in the pointer;
    value is the word to fill it with. If value is not an np.uint32, it
    will be coerced to a buffer and the first four bytes taken.
    """
    if not isinstance(value, np.uint32):
        if isinstance(value, int):
            value = np.uint32(value)
        else:
            value = np.frombuffer(buffer(value), np.uint32)[0]
    blocks = int(np.ceil(np.sqrt(size / 1024.)))
    launch('fill_dptr', mod, stream, (1024, 1, 1), (blocks, blocks),
           dptr, np.int32(size), value)

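# Usage sketch (assumed calling convention, mirroring calls made later in
# this commit): clear a 4-float-per-bin accumulator, or poison a point
# buffer with NaN so the iteration kernel knows to regenerate each point.
#
#   fill_dptr(mod, d_front, 4 * nbins, stream)
#   fill_dptr(mod, d_points, len_d_points / 4, stream, f32(np.nan))
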
writehalflib = devlib(defs=r'''
/* read_half and write_half decode and encode, respectively, two
 * floating-point values from a 32-bit value (typed as a 'float' for
 * convenience but not really). The values are packed into u16s as a
@@ -122,8 +232,8 @@ void fill_dptr(uint32_t* dptr, int size, uint32_t value) {
 * approach when the alpha channel is present.
 */

__device__
void read_half(float &x, float &y, float xy, float den) {
__device__ void
read_half(float &x, float &y, float xy, float den) {
    asm("\n\t{"
        "\n\t   .reg .u16 x, y;"
        "\n\t   .reg .f32 rc;"
@@ -137,8 +247,8 @@ void read_half(float &x, float &y, float xy, float den) {
        : "=f"(x), "=f"(y) : "f"(xy), "f"(den));
}

__device__
void write_half(float &xy, float x, float y, float den) {
__device__ void
write_half(float &xy, float x, float y, float den) {
    asm("\n\t{"
        "\n\t   .reg .u16 x, y;"
        "\n\t   .reg .f32 rc, xf, yf;"
@@ -152,87 +262,41 @@ void write_half(float &xy, float x, float y, float den) {
        "\n\t}"
        : "=f"(xy) : "f"(x), "f"(y), "f"(den));
}
''')

/* This conversion uses the JPEG full-range standard. Note that UV have range
 * [-0.5, 0.5], so consider biasing the results. */
__device__
float3 rgb2yuv(float3 rgb) {
    return make_float3(
        0.299f * rgb.x + 0.587f * rgb.y + 0.114f * rgb.z,
        -0.168736f * rgb.x - 0.331264f * rgb.y + 0.5f * rgb.z,
        0.5f * rgb.x - 0.418688f * rgb.y - 0.081312f * rgb.z);
}

__device__
float3 yuv2rgb(float3 yuv) {
    return make_float3(
        yuv.x + 1.402f * yuv.z,
        yuv.x - 0.34414f * yuv.y - 0.71414f * yuv.z,
        yuv.x + 1.772f * yuv.y);
}

__device__
float3 rgb2hsv(float3 rgb) {
    float M = fmaxf(fmaxf(rgb.x, rgb.y), rgb.z);
    float m = fminf(fminf(rgb.x, rgb.y), rgb.z);
    float C = M - m;

    float s = M > 0.0f ? C / M : 0.0f;

    float h = 0.0f;
    if (s != 0.0f) {
        C = 1.0f / C;
        float rc = (M - rgb.x) * C;
        float gc = (M - rgb.y) * C;
        float bc = (M - rgb.z) * C;

        if      (rgb.x == M) h = bc - gc;
        else if (rgb.y == M) h = 2 + rc - bc;
        else                 h = 4 + gc - rc;

        if (h < 0) h += 6.0f;
    }
    return make_float3(h, s, M);
}

__device__
float3 hsv2rgb(float3 hsv) {

    float whole = floorf(hsv.x);
    float frac = hsv.x - whole;
    float val = hsv.z;
    float min = val * (1 - hsv.y);
    float mid = val * (1 - (hsv.y * frac));
    float alt = val * (1 - (hsv.y * (1 - frac)));

    float3 out;
    if      (whole == 0.0f) { out.x = val; out.y = alt; out.z = min; }
    else if (whole == 1.0f) { out.x = mid; out.y = val; out.z = min; }
    else if (whole == 2.0f) { out.x = min; out.y = val; out.z = alt; }
    else if (whole == 3.0f) { out.x = min; out.y = mid; out.z = val; }
    else if (whole == 4.0f) { out.x = alt; out.y = min; out.z = val; }
    else                    { out.x = val; out.y = min; out.z = mid; }
    return out;
}
""").substitute()

    @staticmethod
    def fill_dptr(mod, dptr, size, stream=None, value=np.uint32(0)):
def mkringbuflib(rb_size):
    """
        A memory zeroer which can be embedded in a stream, unlike the various
        memset routines. Size is the number of 4-byte words in the pointer;
        value is the word to fill it with. If value is not an np.uint32, it
        will be coerced to a buffer and the first four bytes taken.
    A ringbuffer for access to shared resources.

    Some components, such as the MWC contexts, are expensive to generate, and
    have no affinity to a particular block. Rather than maintain a separate
    copy of each of these objects for every thread block in a launch, we want
    to keep only enough copies of these resources around to service every
    thread block that could possibly be active simultaneously on one card,
    which is often considerably smaller. This library provides a simple
    ringbuffer type and an increment function, used in a couple of places to
    implement this kind of sharing.
    """
        fill = mod.get_function("fill_dptr")
        if not isinstance(value, np.uint32):
            if isinstance(value, int):
                value = np.uint32(value)
            else:
                value = np.frombuffer(buffer(value), np.uint32)[0]
        blocks = int(np.ceil(np.sqrt(size / 1024 + 1)))
        fill(dptr, np.int32(size), value, stream=stream,
             block=(1024, 1, 1), grid=(blocks, blocks, 1))

    return devlib(headers="#define RB_SIZE_MASK %d" % (rb_size - 1), decls='''
typedef struct {
    int head;
    int tail;
} ringbuf;
''', defs=r'''
__shared__ int rb_idx;
__device__ int rb_incr(int &rb_base, int tidx) {
    if (threadIdx.y == 1 && threadIdx.x == 1)
        rb_idx = 256 * (atomicAdd(&rb_base, 1) & RB_SIZE_MASK);
    __syncthreads();
    return rb_idx + tidx;
}
''')

# For now, the number of entries is fixed to a value known to work on all
# Fermi cards. Autodetection, or perhaps just a global limit increase, will be
# done when I get my hands on a Kepler device. The fixed size assumes blocks
# of 256 threads, although even at that size there are pathological cases that
# could break the assumption.
DEFAULT_RB_SIZE = 64
ringbuflib = mkringbuflib(DEFAULT_RB_SIZE)
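
# A host-side sketch (illustration only, not from this commit) of the slot
# scheme rb_incr implements on the device: each launched block atomically
# claims the next of rb_size slots, so at most rb_size * 256 persistent
# contexts (seeds, points) ever need to exist, regardless of grid size.
class _RingbufSim(object):
    def __init__(self, rb_size=DEFAULT_RB_SIZE):
        self.head, self.mask = 0, rb_size - 1
    def incr(self, tidx):
        base, self.head = self.head, self.head + 1
        return 256 * (base & self.mask) + tidx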
105  cuburn/filters.py  Normal file
@@ -0,0 +1,105 @@
import numpy as np
from numpy import float32 as f32, int32 as i32

import pycuda.driver as cuda
import pycuda.compiler
from pycuda.gpuarray import vec

import code.filters
from code.util import ClsMod, argset, launch

class Filter(object):
    def apply(self, fb, gnm, dim, tc, stream=None):
        """
        Queue the application of this filter. When the stream finishes
        executing the last item enqueued by this method, the result must be
        live in the buffer that ``fb.d_front`` points to as of this
        function's return.
        """
        raise NotImplementedError()

class Bilateral(Filter, ClsMod):
    lib = code.filters.bilaterallib
    def __init__(self, directions=8, r=15, sstd=6, cstd=0.05,
                 dstd=1.5, dpow=0.8, gspeed=4.0):
        # TODO: expose these parameters on the genome, or at least on the
        # profile, and set them by a less ugly mechanism
        for n in 'directions r sstd cstd dstd dpow gspeed'.split():
            setattr(self, n, locals()[n])
        super(Bilateral, self).__init__()

    def apply(self, fb, gnm, dim, tc, stream=None):
        # Helper variables and functions to keep it clean
        sb = 16 * dim.astride
        bs = sb * dim.ah
        bl, gr = (32, 8, 1), (dim.astride / 32, dim.ah / 8)

        mkdsc = lambda c: argset(cuda.ArrayDescriptor(), height=dim.ah,
                                 width=dim.astride, num_channels=c,
                                 format=cuda.array_format.FLOAT)
        def mktref(n):
            tref = self.mod.get_texref(n)
            tref.set_filter_mode(cuda.filter_mode.POINT)
            tref.set_address_mode(0, cuda.address_mode.WRAP)
            tref.set_address_mode(1, cuda.address_mode.WRAP)
            return tref

        dsc = mkdsc(4)
        tref = mktref('bilateral_src')
        grad_dsc = mkdsc(1)
        grad_tref = mktref('blur_src')

        for pattern in range(self.directions):
            # Scale spatial parameter so that a "pixel" is equivalent to an
            # actual pixel at 1080p
            sstd = self.sstd * dim.w / 1920.

            tref.set_address_2d(fb.d_front, dsc, sb)

            # Blur density two octaves along sampling vector, ultimately
            # storing in the side buffer
            launch('den_blur', self.mod, stream, bl, gr,
                   fb.d_back, i32(pattern), i32(0), texrefs=[tref])
            grad_tref.set_address_2d(fb.d_back, grad_dsc, sb / 4)
            launch('den_blur_1c', self.mod, stream, bl, gr,
                   fb.d_side, i32(pattern), i32(1), texrefs=[grad_tref])
            grad_tref.set_address_2d(fb.d_side, grad_dsc, sb / 4)

            launch('bilateral', self.mod, stream, bl, gr,
                   fb.d_back, i32(pattern), i32(self.r),
                   f32(sstd), f32(self.cstd), f32(self.dstd),
                   f32(self.dpow), f32(self.gspeed),
                   texrefs=[tref, grad_tref])
            fb.flip()

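# Usage sketch (hypothetical tuning, not part of this commit): the filter
# chain on a Renderer is just a list, so a differently-tuned instance can be
# swapped in, e.g. fewer directions for speed or a larger radius for
# smoother density estimation.
#
#   filts = [Bilateral(directions=4, r=9), Logscale(), ColorClip()]
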
class Logscale(Filter, ClsMod):
    lib = code.filters.logscalelib
    def apply(self, fb, gnm, dim, tc, stream=None):
        """Log-scale in place."""
        k1 = f32(gnm.color.brightness(tc) * 268 / 256)
        # The old definition of area was (w*h/(s*s)). Since the new scale
        # 'ns' is now s/w, the new definition is (w*h/(ns*ns*w*w)), which
        # simplifies to (h/(ns*ns*w)).
        area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w)
        k2 = f32(1.0 / (area * gnm.spp(tc)))
        nbins = dim.ah * dim.astride
        launch('logscale', self.mod, stream, 256, nbins/256,
               fb.d_front, fb.d_front, k1, k2)

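# A quick check of the area algebra in Logscale.apply above (illustrative
# numbers, sketch only): with w, h in pixels and the new scale ns = s / w,
#   w * h / s**2  ==  w * h / (ns * w)**2  ==  h / (ns**2 * w)
# e.g. w, h, s = 1920., 1080., 3840. gives ns = 2.0 and both sides 0.140625.
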
class ColorClip(Filter, ClsMod):
    lib = code.filters.colorcliplib
    def apply(self, fb, gnm, dim, tc, stream=None):
        # TODO: implement integration over cubic splines?
        gam = f32(1 / gnm.color.gamma(tc))
        vib = f32(gnm.color.vibrance(tc))
        hipow = f32(gnm.color.highlight_power(tc))
        lin = f32(gnm.color.gamma_threshold(tc))
        lingam = f32(lin ** (gam-1.0) if lin > 0 else 0)
        bkgd = vec.make_float3(
            gnm.color.background.r(tc),
            gnm.color.background.g(tc),
            gnm.color.background.b(tc))

        nbins = dim.ah * dim.astride
        blocks = int(np.ceil(np.sqrt(nbins / 256.)))
        launch('colorclip', self.mod, stream, 256, (blocks, blocks),
               fb.d_front, gam, vib, hipow, lin, lingam, bkgd, i32(nbins))
49  cuburn/output.py  Normal file
@@ -0,0 +1,49 @@
import numpy as np
from numpy import float32 as f32, int32 as i32

import pycuda.driver as cuda

from code.util import ClsMod, launch
from code.output import f32tou8lib

import scipy.misc

if not hasattr(scipy.misc, 'toimage'):
    raise ImportError("Could not find scipy.misc.toimage. "
                      "Are scipy and PIL installed?")

class Output(object):
    def convert(self, fb, gnm, dim, stream=None):
        """
        Convert a filtered buffer to whatever output format is needed by the
        writer.
        """
        raise NotImplementedError()

    def copy(self, fb, dim, pool, stream=None):
        """
        Schedule a copy from the device buffer to host memory, returning the
        target buffer.
        """
        raise NotImplementedError()

class PILOutput(Output, ClsMod):
    lib = f32tou8lib

    def convert(self, fb, gnm, dim, stream=None):
        launch('f32_to_u8', self.mod, stream,
               (32, 8, 1), (int(np.ceil(dim.w/32.)), int(np.ceil(dim.h/8.))),
               fb.d_rb, fb.d_seeds, fb.d_back, fb.d_front,
               i32(fb.gutter), i32(dim.w), i32(dim.astride), i32(dim.h))

    def copy(self, fb, dim, pool, stream=None):
        h_out = pool.allocate((dim.h, dim.w, 4), 'u1')
        cuda.memcpy_dtoh_async(h_out, fb.d_back, stream)
        return h_out

    @staticmethod
    def save(buf, name, type=None, quality=98):
        if type == 'jpeg' or (type is None and name.endswith('.jpg')):
            buf = buf[:,:,:3]
        img = scipy.misc.toimage(buf, cmin=0, cmax=1)
        img.save(name, type, quality=quality)
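
# Usage sketch (stream and pool names assumed, mirroring RenderManager later
# in this commit): convert in-stream, queue the async readback, and only
# touch h_out once the stream (or an event recorded after the copy) is done.
#
#   out = PILOutput()
#   out.convert(fb, gnm, dim, stream)         # u8 RGBA lands in fb.d_back
#   h_out = out.copy(fb, dim, pool, stream)   # page-locked host target
#   stream.synchronize()
#   PILOutput.save(h_out, 'frame_00001.jpg')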
683  cuburn/render.py
@@ -1,383 +1,372 @@
"""
|
||||
Resources and tools to perform rendering.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
import time as timemod
|
||||
import time
|
||||
import tempfile
|
||||
from collections import namedtuple
|
||||
from itertools import cycle, repeat, chain, izip, imap, ifilter
|
||||
from ctypes import *
|
||||
from cStringIO import StringIO
|
||||
import numpy as np
|
||||
from numpy import float32 as f32, int32 as i32, uint32 as u32, uint64 as u64
|
||||
from scipy import ndimage
|
||||
|
||||
import pycuda.compiler
|
||||
import pycuda.driver as cuda
|
||||
import pycuda.tools
|
||||
|
||||
import cuburn.genome
|
||||
from cuburn import affine
|
||||
from cuburn.code import util, mwc, iter, interp, filtering, sort
|
||||
import filters
|
||||
import output
|
||||
from code import util, mwc, iter, interp, sort
|
||||
from code.util import ClsMod, devlib, filldptrlib, assemble_code, launch
|
||||
|
||||
RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time')
|
||||
Dimensions = namedtuple('Dimensions', 'w h aw ah astride')
|
||||
|
||||
def _sync_stream(dst, src):
    dst.wait_for_event(cuda.Event(cuda.event_flags.DISABLE_TIMING).record(src))

class Framebuffers(object):
    """
    The largest memory allocations, and a stream to serialize their use.

class Renderer(object):
    ``d_front`` and ``d_back`` are separate buffers, each large enough to hold
    four float32 components per pixel (including any gutter pixels added for
    alignment or padding). ``d_side`` is another buffer large enough to hold
    two float32 components per pixel.

    Every user of this set of buffers may use and overwrite the buffers in
    any way, as long as the output for the next stage winds up in the front
    buffer. The front and back buffers can be exchanged by the ``flip()``
    method (which simply exchanges the pointers); while no similar method
    exists for the side buffer, you're free to do the same by taking local
    copies of the references and exchanging them yourself.

    There's one spot in the stream interleaving where the behavior is
    different: the ``Output.convert`` call must store its output to the back
    buffer, which will remain untouched until the dtoh copy of the converted
    buffer is finished. This happens while the ``iter`` kernel of the next
    frame writes to the front and side buffers, which means in practice that
    there's essentially no waiting on any buffers.

    If an output module decides to get all krazy and actually replaces the
    references to the buffers on this object - to, say, implement a temporally
    aware tone-mapping or denoising pass - that's probably okay, but just make
    sure everything ends up where the next stage expects it.
    """
    Control structure for rendering a series of frames.

    # Minimum extension of accumulation buffer beyond output region. Used to
    # alleviate edge effects during filtering. Actual gutter may be larger to
    # accommodate alignment requirements; when it is, that extension will be
    # applied to the lower-right corner of the buffer. This is asymmetrical,
    # but simplifies trimming logic when it's time for that.
    gutter = 10

    @classmethod
    def calc_dim(cls, width, height):
        """
        Given a width and height, return a valid set of dimensions which
        include at least enough gutter to exceed the minimum, and where
        (acc_width % 32) == 0 and (acc_height % 8) == 0.
        """
        awidth = width + 2 * cls.gutter
        aheight = 8 * int(np.ceil((height + 2 * cls.gutter) / 8.))
        astride = 32 * int(np.ceil(awidth / 32.))
        return Dimensions(width, height, awidth, aheight, astride)

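    # A worked example of the padding math above (values computed by hand):
    # at 1080p with the minimum gutter of 10,
    #   awidth  = 1920 + 20             = 1940
    #   aheight = 8 * ceil(1100 / 8.)   = 1104
    #   astride = 32 * ceil(1940 / 32.) = 1952
    # so calc_dim(1920, 1080) returns
    # Dimensions(w=1920, h=1080, aw=1940, ah=1104, astride=1952).
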
    def __init__(self):
        self.stream = cuda.Stream()
        self._clear()

        # These resources rely on the slots/ringbuffer mechanism for sharing,
        # and so can be shared across any number of launches, genomes, and
        # render kernels. Notably, seeds are self-synchronizing, so they're
        # not attached to either stream object.
        self.d_rb = cuda.to_device(np.array([0, 0], dtype=u32))
        seeds = mwc.make_seeds(util.DEFAULT_RB_SIZE * 256)
        self.d_seeds = cuda.to_device(seeds)
        self._len_d_points = util.DEFAULT_RB_SIZE * 256 * 16
        self.d_points = cuda.mem_alloc(self._len_d_points)

    def _clear(self):
        self.nbins = self.d_front = self.d_back = self.d_side = None

    def free(self, stream=None):
        if stream is not None:
            stream.synchronize()
        else:
            # (Assumed: the original line was cut off here; a full context
            # sync is the natural fallback when no stream is given.)
            cuda.Context.synchronize()
        for p in (self.d_front, self.d_back, self.d_side):
            if p is not None:
                p.free()
        self._clear()

    def alloc(self, dim, stream=None):
        """
        Ensure that this object's framebuffers are large enough to handle the
        given dimensions, allocating new ones if not.

        If ``stream`` is not None and a reallocation is necessary, the stream
        will be synchronized before the old buffers are deallocated.
        """
        nbins = dim.ah * dim.astride
        if self.nbins >= nbins: return
        if self.nbins is not None: self.free()
        try:
            self.d_front = cuda.mem_alloc(16 * nbins)
            self.d_back = cuda.mem_alloc(16 * nbins)
            self.d_side = cuda.mem_alloc(8 * nbins)
            self.nbins = nbins
        except cuda.MemoryError, e:
            # If a frame that's too large sneaks by the task distributor, we
            # don't want to kill the server, but we also don't want to leave
            # it stuck without any free memory to complete the next alloc.
            # TODO: measure free mem and only take tasks that fit (but that
            # should be done elsewhere)
            self.free(stream)
            raise e

    def set_dim(self, width, height, stream=None):
        """
        Compute padded dimensions for the given width and height, ensure that
        the buffers are large enough (reallocating if not), and return the
        calculated dimensions.

        Note that the returned dimensions are always the same for a given
        width, height, and minimum gutter, even if the underlying buffers are
        larger due to a previous allocation.
        """
        dim = self.calc_dim(width, height)
        self.alloc(dim, stream)
        return dim

    def flip(self):
        """Flip the front and back buffers."""
        self.d_front, self.d_back = self.d_back, self.d_front

class DevSrc(object):
    """
    The buffers which represent a genome on-device, in the formats needed to
    serve as a source for interpolating temporal samples.
    """

    # Maximum number of knots per parameter. This also covers the maximum
    # number of palettes allowed.
    max_knots = 1 << util.DEFAULT_SEARCH_ROUNDS

    # Maximum number of parameters per genome. This number is exceedingly
    # high, and should never even come close to being hit.
    max_params = 1024

    def __init__(self):
        self.d_times = cuda.mem_alloc(4 * self.max_knots * self.max_params)
        self.d_knots = cuda.mem_alloc(4 * self.max_knots * self.max_params)
        self.d_ptimes = cuda.mem_alloc(4 * self.max_knots)
        self.d_pals = cuda.mem_alloc(4 * 4 * 256 * self.max_knots)

class DevInfo(object):
    """
    The buffers which hold temporal samples on-device, as used by iter.
    """

    # The palette texture/surface covers the color coordinate from [0,1] with
    # equidistant horizontal samples, and spans the temporal range of the
    # frame linearly with this many rows. Increasing these values increases
    # the number of uniquely-dithered samples when using pre-dithered
    # surfaces, as is done in 'atomic' accumulation.
    palette_width = 256     # TODO: make this setting be respected
    palette_height = 64

    # This used to be determined automagically, but simultaneous occupancy
    # and a much smaller block size simplify this.
    ntemporal_samples = 1024

    # Number of iterations to run without writing after generating a new
    # point. This number is currently fixed pretty deeply in the set of magic
    # constants which govern buffer sizes; changing the value here won't
    # actually change the code on the device to do something different.
    # It's here just for documentation purposes.
    fuse = 256

    # The palette texture/surface covers the color coordinate from [0,1] with
    # (for now, a fixed 256) equidistant horizontal samples, and spans the
    # temporal range of the frame linearly with this many rows. Increasing
    # this value increases the number of uniquely-dithered samples when using
    # pre-dithered surfaces.
    palette_height = 64

    # Palette color interpolation mode (see code.interp.Palette)
    palette_interp_mode = 'yuv'

    # Maximum width of DE and other spatial filters, and thus in turn the
    # amount of padding applied. Note that, for now, this must not be changed!
    # The filtering code makes deep assumptions about this value.
    gutter = 10

    # Accumulation mode. Leave it at 'atomic' for now.
    acc_mode = 'atomic'

    # At most this many separate buffers for xforms will be allocated, after
    # which further xforms will wrap to the first when writing. Currently it
    # is compiled in, so power-of-two and no runtime maximization. The current
    # value of 16 fits into a 1GB card at 1080p.
    max_nxf = 1

    # TODO
    chaos_used = False

    cmp_options = ('-use_fast_math', '-maxrregcount', '42')
    keep = False

    def __init__(self):
        self._iter = self.pal = self.src = self.cubin = self.mod = None

        # Ensure class options don't get contaminated on an instance
        self.cmp_options = list(self.cmp_options)

    def compile(self, genome, keep=None, cmp_options=None):
        """
        Compile a kernel capable of rendering every frame in this animation.
        The resulting compiled kernel is stored in the ``cubin`` property;
        the source is available as ``src``, and is also returned for
        inspection and display.

        This operation is idempotent, and has no side effects outside of
        setting properties on this instance (unless there's a compiler error,
        which is a bug); it should therefore be threadsafe as well.
        It is, however, rather slow.
        """
        keep = self.keep if keep is None else keep
        cmp_options = self.cmp_options if cmp_options is None else cmp_options

        self._iter = iter.IterCode(self, genome)
        self._iter.packer.finalize()
        self.pal = interp.Palette(self.palette_interp_mode)
        self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer,
                                      self.pal, self._iter)
        with open(os.path.join(tempfile.gettempdir(), 'kernel.cu'), 'w') as fp:
            fp.write(self.src)
        self.cubin = pycuda.compiler.compile(
            self.src, keep=keep, options=cmp_options,
            cache_dir=False if keep else None)

    def load(self, genome, jit_options=[]):
        if not self.cubin:
            self.compile(genome)
        self.mod = cuda.module_from_buffer(self.cubin, jit_options)
        with open('/tmp/iter_kern.cubin', 'wb') as fp:
            fp.write(self.cubin)
        return self.src

    def render(self, genome, times, width, height, blend=True):
        """
        Render a frame for each timestamp in the iterable value ``times``.
        This function returns a generator that will yield a RenderedImage
        object containing a shared reference to the output buffer for each
        specified frame.

        The returned buffer is page-locked host memory. Between the time a
        buffer is yielded and the time the next frame's results are requested,
        the buffer will not be modified. Thereafter, however, it will be
        overwritten by an asynchronous DMA operation coming from the CUDA
        device. If you hang on to it for longer than one frame, copy it.

        ``genome`` is the genome to be rendered. Successive calls to the
        ``render()`` method on one ``Renderer`` object must use genomes which
        produce identical compiled code, and this will not be verified by the
        renderer. In practice, this means you can alter genome parameter
        values, but the full set of keys must remain identical between runs on
        the same renderer.

        ``times`` is a list of (idx, cen_time) tuples, where ``idx`` is passed
        unmodified in the RenderedImage return value and ``cen_time`` is the
        central time of the current frame in spline-time units. (Any
        clock-time or frame-time units in the genome should be preconverted.)

        If ``blend`` is False, the output buffer will contain unclipped,
        premultiplied RGBA data, without vibrancy, highlight power, or the
        alpha elbow applied.
        """
        r = self.render_gen(genome, width, height, blend=blend)
        next(r)
        return ifilter(None, imap(r.send, chain(times, [None])))

    def render_gen(self, genome, width, height, blend=True):
        """
        Render frames. This method is wrapped by the ``render()`` method; see
        its docstring for warnings and details.

        Instead of passing frame times as an iterable, they are passed
        individually via the ``generator.send()`` method. There is an
        internal pipeline latency of one frame, so the first call to the
        ``send()`` method will return None, the second call will return the
        first frame's result, and so on. To retrieve the last frame in a
        sequence, send ``None``.

        Direct use of this method is useful for implementing render servers.
        """
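        # A sketch of the send() pipeline described above (frame indices and
        # times here are illustrative): each send() returns the previous
        # frame's result, and a final send(None) drains the last one.
        #
        #   gen = renderer.render_gen(genome, 1920, 1080)
        #   next(gen)
        #   assert gen.send((0, 0.5)) is None    # frame 0 queued, none ready
        #   img0 = gen.send((1, 1.5))            # frame 1 queued, frame 0 out
        #   img1 = gen.send(None)                # pipeline drained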
        last_idx = None
        next_frame = yield
        if next_frame is None:
            return

        if not self.mod:
            self.load(genome)

        filt = filtering.Filtering()

        reset_rb_fun = self.mod.get_function("reset_rb")
        packer_fun = self.mod.get_function("interp_iter_params")
        iter_fun = self.mod.get_function("iter")

        # The synchronization model is messy. See helpers/task_model.svg.
        iter_stream = cuda.Stream()
        filt_stream = cuda.Stream()
        if self.acc_mode == 'deferred':
            write_stream = cuda.Stream()
            write_fun = self.mod.get_function("write_shmem")
        else:
            write_stream = iter_stream

        # These events fire when the corresponding buffer is available for
        # reading on the host (i.e. the copy is done). On the first pass, 'a'
        # will be ignored, and subsequently moved to 'b'.
        event_a = cuda.Event().record(filt_stream)
        event_b = None

        awidth = width + 2 * self.gutter
        aheight = 32 * int(np.ceil((height + 2 * self.gutter) / 32.))
        astride = 32 * int(np.ceil(awidth / 32.))
        dim = Dimensions(width, height, awidth, aheight, astride)
        d_acc_size = self.mod.get_global('acc_size')[0]
        cuda.memcpy_htod_async(d_acc_size, u32(list(dim)), write_stream)

        nbins = astride * aheight
        nxf = len(filter(lambda g: g != 'final', genome.xforms))
        nxf = min(nxf, self.max_nxf)
        d_accum = cuda.mem_alloc(16 * nbins * nxf)
        d_out = cuda.mem_alloc(16 * nbins)
        if self.acc_mode == 'atomic':
            d_atom = cuda.mem_alloc(8 * nbins * nxf)
            flush_fun = self.mod.get_function("flush_atom")
        else:
            # d_atom is also used as a scratch buffer during filtering, so we
            # need it at least this size
            d_atom = cuda.mem_alloc(4 * nbins)

        obuf_copy = util.argset(cuda.Memcpy2D(),
            src_y=self.gutter, src_x_in_bytes=16*self.gutter,
            src_pitch=16*astride, dst_pitch=16*width,
            width_in_bytes=16*width, height=height)
        obuf_copy.set_src_device(d_out)
        h_out_a = cuda.pagelocked_empty((height, width, 4), f32)
        h_out_b = cuda.pagelocked_empty((height, width, 4), f32)

        if self.acc_mode == 'deferred':
            # Having a fixed, power-of-two log size makes things much easier
            log_size = 64 << 20
            d_log = cuda.mem_alloc(log_size * 4)
            d_log_sorted = cuda.mem_alloc(log_size * 4)
            sorter = sort.Sorter(log_size)
            # We need to cover each unique tag - address bits 20-23 - with one
            # write block per sort bin. Or something like that.
            nwriteblocks = int(np.ceil(nbins / float(1<<20))) * 256

        # Calculate 'nslots', the number of simultaneous running threads that
        # can be active on the GPU during iteration (and thus the number of
        # slots for loading and storing RNG and point context that will be
        # prepared on the device), and derive 'rb_size', the number of blocks
        # in 'nslots'.
        iter_threads_per_block = 256
        dev_data = pycuda.tools.DeviceData()
        occupancy = pycuda.tools.OccupancyRecord(
            dev_data, iter_threads_per_block,
            iter_fun.shared_size_bytes, iter_fun.num_regs)
        nsms = cuda.Context.get_device().multiprocessor_count
        rb_size = occupancy.warps_per_mp * nsms / (iter_threads_per_block / 32)
        nslots = iter_threads_per_block * rb_size

        # Reset the ringbuffer info for the slots
        reset_rb_fun(np.int32(rb_size), block=(1,1,1))

        d_points = cuda.mem_alloc(nslots * 16)
        # This statement may add extra seeds to simplify palette dithering.
        seeds = mwc.MWC.make_seeds(max(nslots, 256 * self.palette_height))
        d_seeds = cuda.to_device(seeds)

        # We used to auto-calculate this to a multiple of the number of SMs on
        # the device, but since we now use shorter launches and, to a certain
        # extent, allow simultaneous occupancy, that's not as important. The
        # 1024 is a magic constant to ensure reasonable and power-of-two log
        # size for deferred: 256MB / (4B * FUSE * NTHREADS). Enhancements to
        # the sort engine are needed to make this more flexible.
        ntemporal_samples = 1024
        genome_times, genome_knots = self._iter.packer.pack()
        d_genome_times = cuda.to_device(genome_times)
        d_genome_knots = cuda.to_device(genome_knots)
        info_size = 4 * len(self._iter.packer) * ntemporal_samples
        d_infos = cuda.mem_alloc(info_size)

        ptimes, pidxs = zip(*genome.palette_times)
        palint_times = np.empty(len(genome_times[0]), f32)
        palint_times.fill(1e10)
        palint_times[:len(ptimes)] = ptimes
        d_palint_times = cuda.to_device(palint_times)
        pvals = self.pal.prepare([genome.decoded_palettes[i] for i in pidxs])
        d_palint_vals = cuda.to_device(np.concatenate(pvals))

        if self.acc_mode in ('deferred', 'atomic'):
            palette_fun = self.mod.get_function("interp_palette_flat")
            dsc = util.argset(cuda.ArrayDescriptor3D(),
                              height=self.palette_height, width=256, depth=0,
        self.d_params = cuda.mem_alloc(
            self.ntemporal_samples * DevSrc.max_params * 4)
        self.palette_surf_dsc = util.argset(cuda.ArrayDescriptor3D(),
            height=self.palette_height, width=self.palette_width, depth=0,
            format=cuda.array_format.SIGNED_INT32,
            num_channels=2, flags=cuda.array3d_flags.SURFACE_LDST)
            palarray = cuda.Array(dsc)
        self.d_pal_array = cuda.Array(self.palette_surf_dsc)

class Renderer(object):
    def __init__(self, gnm):
        self.packer, self.lib = iter.mkiterlib(gnm)
        cubin = util.compile('iter', assemble_code(self.lib))
        self.mod = cuda.module_from_buffer(cubin)

        # TODO: make these customizable
        self.filts = [ filters.Bilateral()
                     , filters.Logscale()
                     , filters.ColorClip() ]
        self.out = output.PILOutput()

class RenderManager(ClsMod):
    lib = devlib(deps=[interp.palintlib, filldptrlib, iter.flushatomlib])

    def __init__(self):
        super(RenderManager, self).__init__()
        self.pool = pycuda.tools.PageLockedMemoryPool()

        self.fb = Framebuffers()
        self.src_a, self.src_b = DevSrc(), DevSrc()
        self.info_a, self.info_b = DevInfo(), DevInfo()
        self.stream_a, self.stream_b = cuda.Stream(), cuda.Stream()
        self.filt_evt = self.copy_evt = None

    def _copy(self, rdr, gnm):
        """
        Queue a copy of a host genome into a set of device interpolation
        source buffers.

        Note that for now, this is broken! It ignores ``gnm``, and only packs
        the genome that was used when creating the renderer.
        """
        times, knots = rdr.packer.pack(self.pool)
        cuda.memcpy_htod_async(self.src_a.d_times, times, self.stream_a)
        cuda.memcpy_htod_async(self.src_a.d_knots, knots, self.stream_a)

        ptimes, pidxs = zip(*gnm.palette_times)
        palettes = self.pool.allocate((len(ptimes), 256, 4), f32)
        palettes[:] = [gnm.decoded_palettes[i] for i in pidxs]
        palette_times = self.pool.allocate((self.src_a.max_knots,), f32)
        palette_times.fill(1e9)
        palette_times[:len(ptimes)] = ptimes
        cuda.memcpy_htod_async(self.src_a.d_pals, palettes, self.stream_a)
        cuda.memcpy_htod_async(self.src_a.d_ptimes, palette_times,
                               self.stream_a)

        # TODO: use bilerp tex as src for palette interp

    def _interp(self, rdr, gnm, dim, ts, td):
        d_acc_size = rdr.mod.get_global('acc_size')[0]
        p_dim = self.pool.allocate((len(dim),), u32)
        p_dim[:] = dim
        cuda.memcpy_htod_async(d_acc_size, p_dim, self.stream_a)

            tref = self.mod.get_surfref('flatpal')
            tref.set_array(palarray, 0)
        else:
            palette_fun = self.mod.get_function("interp_palette")
            dsc = util.argset(cuda.ArrayDescriptor(),
                              height=self.palette_height, width=256,
                              format=cuda.array_format.UNSIGNED_INT8,
                              num_channels=4)
            d_palmem = cuda.mem_alloc(256 * self.palette_height * 4)
        tref.set_array(self.info_a.d_pal_array, 0)
        launch('interp_palette_flat', self.mod, self.stream_a,
               256, self.info_a.palette_height,
               self.fb.d_rb, self.fb.d_seeds,
               self.src_a.d_ptimes, self.src_a.d_pals,
               f32(ts), f32(td / self.info_a.palette_height))

            tref = self.mod.get_texref('palTex')
            tref.set_address_2d(d_palmem, dsc, 1024)
            tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
            tref.set_filter_mode(cuda.filter_mode.LINEAR)
        nts = self.info_a.ntemporal_samples
        launch('interp_iter_params', rdr.mod, self.stream_a,
               256, np.ceil(nts / 256.),
               self.info_a.d_params, self.src_a.d_times, self.src_a.d_knots,
               f32(ts), f32(td / nts), i32(nts))

        while next_frame is not None:
            # tc, td, ts, te: central, delta, start, end times
            idx, tc = next_frame
            td = genome.adj_frame_width(tc)

    def _print_interp_knots(self, rdr, tsidx=5):
        infos = cuda.from_device_like(self.info_a.d_params,
                                      (tsidx + 1, self.info_a.max_params), f32)
        for i, n in zip(infos[-1], rdr.packer.packed):
            print '%60s %g' % ('_'.join(n), i)

    def _iter(self, rdr, gnm, dim, tc):
        tref = rdr.mod.get_surfref('flatpal')
        tref.set_array(self.info_a.d_pal_array, 0)

        nbins = dim.ah * dim.astride
        fill = lambda b, s, v=i32(0): util.fill_dptr(
            self.mod, b, s, stream=self.stream_a, value=v)
        fill(self.fb.d_front, 4 * nbins)
        fill(self.fb.d_side, 2 * nbins)
        fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan))

        nts = self.info_a.ntemporal_samples
        nsamps = (gnm.spp(tc) * dim.w * dim.h)
        nrounds = int(nsamps / (nts * 256. * 256)) + 1
        launch('iter', rdr.mod, self.stream_a, (32, 8, 1), (nts, nrounds),
               self.fb.d_front, self.fb.d_side,
               self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
               self.info_a.d_params)

        nblocks = int(np.ceil(np.sqrt(dim.ah*dim.astride/256.)))
        launch('flush_atom', self.mod, self.stream_a,
               256, (nblocks, nblocks),
               u64(self.fb.d_front), u64(self.fb.d_side), i32(nbins))

    def queue_frame(self, rdr, gnm, tc, w, h, copy=True):
        """
        Queue one frame for rendering.

        ``rdr`` is a compiled Renderer module. The caller must ensure that
        the module is compatible with the genome data provided.

        ``gnm`` is the genome to be rendered.

        ``tc`` is the center time at which to render.

        ``w``, ``h`` are the width and height of the desired output in px.

        If ``copy`` is False, the genome data will not be recopied for each
        new genome. This function must be called with ``copy=True`` the first
        time a new genome is used, and may be called in that manner
        subsequently without harm. I suspect the performance impact is low,
        so leave ``copy`` set to True every time for now.

        The return value is a 2-tuple ``(evt, h_out)``, where ``evt`` is a
        CUDA event and ``h_out`` is the return value of the output module's
        ``copy`` function. In the typical case, ``h_out`` will be a host
        allocation containing data in an appropriate format for the output
        module's file writer, and ``evt`` indicates when the asynchronous
        DMA copy which will populate ``h_out`` is complete. This can vary
        depending on the output module in use, though.
        """
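        # A usage sketch of the (evt, h_out) contract above (illustrative
        # only; RenderManager.render() below is the real consumer):
        #
        #   evt, h_out = mgr.queue_frame(rdr, gnm, 0.5, 1920, 1080)
        #   while not evt.query():
        #       time.sleep(0.01)                 # h_out now safe to read
        #   output.PILOutput.save(h_out, 'frame_00000.jpg')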
        # Note: we synchronize on the previous stream if buffers need to be
        # reallocated, which implicitly also syncs the current stream.
        dim = self.fb.set_dim(w, h, self.stream_b)

        td = gnm.adj_frame_width(tc)
        ts, te = tc - 0.5 * td, tc + 0.5 * td

            if self.acc_mode in ('deferred', 'atomic'):
                # In this mode, the palette writes to a surface reference, but
                # requires dithering, so we pass it the seeds instead
                arg0 = d_seeds
            else:
                arg0 = d_palmem
            palette_fun(arg0, d_palint_times, d_palint_vals,
                        f32(ts), f32(td / self.palette_height),
                        block=(256,1,1), grid=(self.palette_height,1),
                        stream=write_stream)
        # The stream interleaving here is nontrivial.
        # TODO: update diagram and link to it here
        if copy:
            self.src_a, self.src_b = self.src_b, self.src_a
            self._copy(rdr, gnm)
        self._interp(rdr, gnm, dim, ts, td)
        if self.filt_evt:
            self.stream_a.wait_for_event(self.filt_evt)
        self._iter(rdr, gnm, dim, tc)
        if self.copy_evt:
            self.stream_a.wait_for_event(self.copy_evt)
        for filt in rdr.filts:
            filt.apply(self.fb, gnm, dim, tc, self.stream_a)
        rdr.out.convert(self.fb, gnm, dim, self.stream_a)
        self.filt_evt = cuda.Event().record(self.stream_a)
        h_out = rdr.out.copy(self.fb, dim, self.pool, self.stream_a)
        self.copy_evt = cuda.Event().record(self.stream_a)

            packer_fun(d_infos, d_genome_times, d_genome_knots,
                       f32(ts), f32(td / ntemporal_samples),
                       i32(ntemporal_samples), block=(256,1,1),
                       grid=(int(np.ceil(ntemporal_samples/256.)),1),
                       stream=iter_stream)

            # Reset points so that they will be FUSEd
            util.BaseCode.fill_dptr(self.mod, d_points, 4 * nslots,
                                    iter_stream, f32(np.nan))

            # Get interpolated control points for debugging
            #iter_stream.synchronize()
            #d_temp = cuda.from_device(d_infos,
            #        (ntemporal_samples, len(self._iter.packer)), f32)
            #for i, n in zip(d_temp[5], self._iter.packer.packed):
            #    print '%60s %g' % ('_'.join(n), i)

            util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins * nxf,
                                    write_stream)
            if self.acc_mode == 'atomic':
                util.BaseCode.fill_dptr(self.mod, d_atom, 2 * nbins * nxf,
                                        write_stream)
            nrounds = int( (genome.spp(tc) * width * height)
                           / (ntemporal_samples * 256 * 256) ) + 1
            if self.acc_mode == 'deferred':
                for i in range(nrounds):
                    iter_fun(np.uint64(d_log), d_seeds, d_points, d_infos,
                             block=(32, self._iter.NTHREADS/32, 1),
                             grid=(ntemporal_samples, 1), stream=iter_stream)
                    _sync_stream(write_stream, iter_stream)
                    sorter.sort(d_log_sorted, d_log, log_size, 3, True,
                                stream=write_stream)
                    _sync_stream(iter_stream, write_stream)
                    write_fun(d_accum, d_log_sorted, sorter.dglobal, i32(nbins),
                              block=(1024, 1, 1), grid=(nwriteblocks, 1),
                              stream=write_stream)
            else:
                args = [u64(d_accum), d_seeds, d_points, d_infos]
                if self.acc_mode == 'atomic':
                    args.append(u64(d_atom))
                iter_fun(*args, block=(32, self._iter.NTHREADS/32, 1),
                         grid=(ntemporal_samples, nrounds), stream=iter_stream)
                if self.acc_mode == 'atomic':
                    nblocks = int(np.ceil(np.sqrt(nbins*nxf/float(512))))
                    flush_fun(u64(d_accum), u64(d_atom), i32(nbins*nxf),
                              block=(512, 1, 1), grid=(nblocks, nblocks),
                              stream=iter_stream)

            util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
            _sync_stream(filt_stream, write_stream)
            filt.de(d_out, d_accum, d_atom, genome, dim, tc, nxf,
                    stream=filt_stream)
            _sync_stream(write_stream, filt_stream)
            filt.colorclip(d_out, genome, dim, tc, blend, stream=filt_stream)
            obuf_copy.set_dst_host(h_out_a)
            obuf_copy(filt_stream)

            if event_b:
                while not event_a.query():
                    timemod.sleep(0.01)
                gpu_time = event_a.time_since(event_b)
                result = RenderedImage(h_out_b, last_idx, gpu_time)
            else:
                result = None
            last_idx = idx

            event_a, event_b = cuda.Event().record(filt_stream), event_a
            h_out_a, h_out_b = h_out_b, h_out_a

            # TODO: add ability to flush a frame without breaking the pipe
            next_frame = yield result

        while not event_a.query():
            timemod.sleep(0.001)
        gpu_time = event_a.time_since(event_b)
        yield RenderedImage(h_out_b, last_idx, gpu_time)
        self.info_a, self.info_b = self.info_b, self.info_a
        self.stream_a, self.stream_b = self.stream_b, self.stream_a
        return self.copy_evt, h_out

    def render(self, gnm, times, w, h):
        """
        A port of the old rendering function, retained for backwards
        compatibility. Some of this will be pulled into as-yet-undecided
        methods for more DRY.
        """
        rdr = Renderer(gnm)
        last_evt = cuda.Event().record(self.stream_a)
        last_idx = None
        def wait(): # Times like these where you wish for a macro
            while not last_evt.query():
                time.sleep(0.01)
            gpu_time = last_evt.time_since(two_evts_ago)
            # Return the buffer paired with last_evt, not the frame that was
            # just queued (whose copy may still be in flight).
            return RenderedImage(last_buf, last_idx, gpu_time)
        for idx, tc in times:
            evt, h_buf = self.queue_frame(rdr, gnm, tc, w, h, last_idx is None)
            if last_idx:
                yield wait()
            two_evts_ago, last_evt = last_evt, evt
            last_buf, last_idx = h_buf, idx
        if last_idx:
            yield wait()
37  main.py
@@ -3,8 +3,8 @@
# cuburn, one of a surprisingly large number of ports of the fractal flame
# algorithm to NVIDIA GPUs.
#
# This one is copyright 2010-2011, Steven Robertson <steven@strobe.cc>
# and Eric Reckase <e.reckase@gmail.com>.
# This one is copyright 2010-2012, Steven Robertson <steven@strobe.cc>
# and Erik Reckase <e.reckase@gmail.com>.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 or later
@@ -20,11 +20,9 @@ from subprocess import Popen
from itertools import ifilter

import numpy as np
import Image
import scipy
import pycuda.driver as cuda

from cuburn import genome, render
from cuburn import genome, render, filters, output

profiles = {
    '1080p': dict(fps=24, width=1920, height=1080, quality=3000, skip=0),
@@ -33,11 +31,10 @@ profiles = {
    'preview': dict(fps=24, width=640, height=360, quality=800, skip=1)
}

def save(rframe):
    noalpha = rframe.buf[:,:,:3]
    img = scipy.misc.toimage(noalpha, cmin=0, cmax=1)
    img.save(rframe.idx, quality=98)
    print rframe.idx, rframe.gpu_time
def save(out):
    # Temporary! TODO: fix this
    output.PILOutput.save(out.buf, out.idx)
    print out.idx, out.gpu_time

def main(args, prof):
    import pycuda.autoinit
@@ -53,9 +50,7 @@ def main(args, prof):
    gnm = genome.Genome(gnm)
    err, times = gnm.set_profile(prof)

    anim = render.Renderer()
    anim.compile(gnm, keep=args.keep)
    anim.load(gnm)
    rmgr = render.RenderManager()

    basename = os.path.basename(args.flame.name).rsplit('.', 1)[0] + '_'
    if args.flame.name == '-':
@@ -76,9 +71,13 @@ def main(args, prof):
        if not os.path.isfile(f[0]) or m > os.path.getmtime(f[0]))

    w, h = prof['width'], prof['height']
    gen = anim.render(gnm, frames, w, h)
    gen = rmgr.render(gnm, frames, w, h)

    if not args.gfx:
        for out in gen:
            save(out)
        return

    if args.gfx:
        import pyglet
        window = pyglet.window.Window(w, h, vsync=False)
        image = pyglet.image.CheckerImagePattern().create_image(w, h)
@@ -126,11 +125,7 @@ def main(args, prof):
    pyglet.clock.set_fps_limit(30)
    pyglet.clock.schedule_interval(poll, 1/30.)
    pyglet.app.run()
    else:
        for out in gen:
            save(out)
    if args.sync:
        cuda.Context.synchronize()

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Render fractal flames.')
@@ -150,8 +145,6 @@ if __name__ == "__main__":
    parser.add_argument('--pause', action='store_true',
        help="Don't close the preview window after rendering is finished")

    parser.add_argument('--keep', action='store_true', dest='keep',
        help='Keep compilation directory (disables kernel caching)')
    parser.add_argument('--sync', action='store_true', dest='sync',
        help='Use synchronous launches whenever possible')

66  worker.py
@@ -17,6 +17,8 @@ import redis

from cuburn import render, genome

import pycuda.driver as cuda

pycuda = None

# The default maximum number of waiting jobs. Also used to determine when a
@@ -50,20 +52,6 @@ def get_temperature():
        return out[idx+1:idx+3]
    return ''

def push_frame(r, out):
    if out is None:
        return
    sid, sidx, ftag = out.idx
    # TODO: gotta put this in a module somewhere and make it adjustable
    noalpha = out.buf[:,:,:3]
    img = scipy.misc.toimage(noalpha, cmin=0, cmax=1)
    buf = StringIO()
    img.save(buf, 'jpeg', quality=98)
    buf.seek(0)
    head = ' '.join([sidx, str(out.gpu_time), ftag])
    r.rpush(sid + ':queue', head + '\0' + buf.read())
    print 'Pushed frame: %s' % head

def work(server):
    global pycuda
    import pycuda.autoinit
@@ -80,7 +68,11 @@ def work(server):
        r.expire(wid, 180)
        last_ping = time.time()

    last_pid, last_gid, riter = None, None, None
    idx = evt = buf = None
    last_idx = last_buf = last_evt = two_evts_ago = None
    last_pid = last_gid = rdr = None

    mgr = render.RenderManager()

    while True:
        task = r.blpop('renderpool:' + rev + ':queue', 10)
@@ -90,28 +82,46 @@ def work(server):
            r.expire(wid, 180)
            last_ping = now

        # last_evt will be populated during normal queue operation (when evt
        # contains the most recent event), as well as when the render queue is
        # flushing due to not receiving a new task before the timeout.
        if last_idx is not None:
            while not last_evt.query():
                # This delay could probably be a lot higher with zero impact
                # on throughput for Fermi cards
                time.sleep(0.05)

            sid, sidx, ftag = last_idx
            obuf = StringIO()
            # Save the buffer paired with last_evt/last_idx; the most recent
            # frame's buffer may still be mid-copy at this point.
            rdr.out.save(last_buf, obuf, 'jpeg')
            obuf.seek(0)
            gpu_time = last_evt.time_since(two_evts_ago)
            head = ' '.join([sidx, str(gpu_time), ftag])
            r.rpush(sid + ':queue', head + '\0' + obuf.read())
            print 'Pushed frame: %s' % head

            two_evts_ago, last_evt = last_evt, evt
            last_idx, last_buf = idx, buf

        if not task:
            if riter:
                push_frame(r, riter.send(None))
                riter = None
            idx = evt = buf = None
            continue

        sid, sidx, pid, gid, ftime, ftag = task[1].split(' ', 5)
        if pid != last_pid or gid != last_gid or not riter:
        if pid != last_pid or gid != last_gid or not rdr:
            gnm = genome.Genome(json.loads(r.get(gid)))
            prof = json.loads(r.get(pid))
            gnm.set_profile(prof)
            renderer = render.Renderer()
            renderer.load(gnm)
            rdr = render.Renderer(gnm)

            if riter:
                push_frame(r, riter.send(None))
            if last_evt is None:
                # Create a dummy event for timing
                last_evt = cuda.Event().record(mgr.stream_a)

            riter = renderer.render_gen(gnm, prof['width'], prof['height'])
            next(riter)
            last_pid, last_gid = pid, gid

        push_frame(r, riter.send(((sid, sidx, ftag), float(ftime))))
        copy = last_idx is None
        w, h = prof['width'], prof['height']
        evt, buf = mgr.queue_frame(rdr, gnm, float(ftime), w, h, copy)
        idx = sid, sidx, ftag

def iter_genomes(prof, gpaths, pname='540p'):
    """