Sweeping refactor. More bugs undoubtedly remain.

2025-06-30 13:26:27 -04:00 · 2012-02-14 07:40:58 -08:00
parent 4cfd328f85
commit 60a45c9a20
13 changed files with 1170 additions and 1299 deletions
--- a/cuburn/code/init.py
+++ b/cuburn/code/init.py
@ -1,9 +0,0 @@
 """
 Contains the PTX fragments which will drive the device, and helper functions
 to combine those fragments.
 """
 import util
 import mwc
 import iter
--- a/cuburn/code/color.py
+++ b/cuburn/code/color.py
@ -0,0 +1,77 @@
 import numpy as np
 from util import devlib
 # The JPEG YUV full-range matrix, without bias into the positve regime.
 # This assumes input color space is CIERGB D65, encoded with gamma 2.2.
 # Note that some interpolated colors may exceed the sRGB and YUV gamuts.
 YUV_MATRIX = np.matrix([[ 0.299,        0.587,      0.114],
                        [-0.168736,    -0.331264,   0.5],
                        [ 0.5,         -0.418688,  -0.081312]])
 yuvlib = devlib(defs='''
 __device__ float3 rgb2yuv(float3 rgb);
 __device__ float3 yuv2rgb(float3 yuv);
 ''', decls=r'''
 /* This conversion uses the JPEG full-range standard. Note that UV have range
 * [-0.5, 0.5], so consider biasing the results. */
 __device__ float3 rgb2yuv(float3 rgb) {
    return make_float3(
        0.299f      * rgb.x + 0.587f    * rgb.y + 0.114f    * rgb.z,
        -0.168736f  * rgb.x - 0.331264f * rgb.y + 0.5f      * rgb.z,
        0.5f        * rgb.x - 0.418688f * rgb.y - 0.081312f * rgb.z);
 }
 __device__ float3 yuv2rgb(float3 yuv) {
    return make_float3(
        yuv.x                    + 1.402f   * yuv.z,
        yuv.x - 0.34414f * yuv.y - 0.71414f * yuv.z,
        yuv.x + 1.772f   * yuv.y);
 }
 ''')
 hsvlib = devlib(decls='''
 __device__ float3 rgb2hsv(float3 rgb);
 __device__ float3 hsv2rgb(float3 hsv);
 ''', defs=r'''
 __device__ float3 rgb2hsv(float3 rgb) {
    float M = fmaxf(fmaxf(rgb.x, rgb.y), rgb.z);
    float m = fminf(fminf(rgb.x, rgb.y), rgb.z);
    float C = M - m;
    float s = M > 0.0f ? C / M : 0.0f;
    float h = 0.0f;
    if (s != 0.0f) {
        C = 1.0f / C;
        float rc = (M - rgb.x) * C;
        float gc = (M - rgb.y) * C;
        float bc = (M - rgb.z) * C;
        if      (rgb.x == M)  h = bc - gc;
        else if (rgb.y == M)  h = 2 + rc - bc;
        else                  h = 4 + gc - rc;
        if (h < 0) h += 6.0f;
    }
    return make_float3(h, s, M);
 }
 __device__ float3 hsv2rgb(float3 hsv) {
    float whole = floorf(hsv.x);
    float frac = hsv.x - whole;
    float val = hsv.z;
    float min = val * (1 - hsv.y);
    float mid = val * (1 - (hsv.y * frac));
    float alt = val * (1 - (hsv.y * (1 - frac)));
    float3 out;
         if (whole == 0.0f) { out.x = val; out.y = alt; out.z = min; }
    else if (whole == 1.0f) { out.x = mid; out.y = val; out.z = min; }
    else if (whole == 2.0f) { out.x = min; out.y = val; out.z = alt; }
    else if (whole == 3.0f) { out.x = min; out.y = mid; out.z = val; }
    else if (whole == 4.0f) { out.x = alt; out.y = min; out.z = val; }
    else                    { out.x = val; out.y = min; out.z = mid; }
    return out;
 }
 ''')
--- a/cuburn/code/filtering.py
+++ b/cuburn/code/filtering.py
@ -1,13 +1,7 @@
-import numpy as np
+from util import devlib
-from numpy import float32 as f32, int32 as i32
+from color import yuvlib
-import pycuda.driver as cuda
+texshearlib = devlib(defs=r'''
 import pycuda.compiler
 from pycuda.gpuarray import vec
 from cuburn.code.util import *
 _CODE = r'''
 // Filter directions specified in degrees, using image/texture addressing
 // [(0,0) is upper left corner, 90 degrees is down].
@ -22,7 +16,9 @@ __constant__ float2 addressing_patterns[16] = {
    { 1.0f, -0.333333f},   { 0.333333f,  1.0f}, // 14, 15: -15,    75
 };
-// Mon dieu! A C++ feature?
+// Mon dieu! A C++ feature? Gotta to close the "extern C" added by the compiler.
 }
 template <typename T> __device__ T
 tex_shear(texture<T, cudaTextureType2D> ref, int pattern,
          float x, float y, float radius) {
@ -39,100 +35,14 @@ tex_shear(texture<T, cudaTextureType2D> ref, int pattern,
 }
 extern "C" {
 ''')
-__global__
+logscalelib = devlib(defs=r'''
-void colorclip(float4 *pixbuf, float gamma, float vibrance, float highpow,
+__global__ void
-               float linrange, float lingam, float3 bkgd,
+logscale(float4 *outbuf, const float4 *pixbuf, float k1, float k2) {
               int fbsize, int blend_background_color) {
    int i = threadIdx.x + blockDim.x * (blockIdx.x + gridDim.x * blockIdx.y);
    if (i >= fbsize) return;
    float4 pix = pixbuf[i];
    if (pix.w <= 0) {
        pixbuf[i] = make_float4(bkgd.x, bkgd.y, bkgd.z, 0.0f);
        return;
    }
    pix.y -= 0.5f * pix.w;
    pix.z -= 0.5f * pix.w;
    float3 tmp = yuv2rgb(make_float3(pix.x, pix.y, pix.z));
    pix.x = tmp.x;
    pix.y = tmp.y;
    pix.z = tmp.z;
    pix.x = fmaxf(0.0f, pix.x);
    pix.y = fmaxf(0.0f, pix.y);
    pix.z = fmaxf(0.0f, pix.z);
    float4 opix = pix;
    float alpha = powf(pix.w, gamma);
    if (pix.w < linrange) {
        float frac = pix.w / linrange;
        alpha = (1.0f - frac) * pix.w * lingam + frac * alpha;
    }
    if (!blend_background_color) {
        float ls = alpha / pix.w;
        pix.x *= ls;
        pix.y *= ls;
        pix.z *= ls;
        pix.w = alpha;
        pixbuf[i] = pix;
        return;
    }
    float ls = vibrance * alpha / pix.w;
    alpha = fminf(1.0f, fmaxf(0.0f, alpha));
    float maxc = fmaxf(pix.x, fmaxf(pix.y, pix.z));
    float maxa = maxc * ls;
    float newls = 1.0f / maxc;
    if (maxa > 1.0f && highpow >= 0.0f) {
        float lsratio = powf(newls / ls, highpow);
        pix.x *= newls;
        pix.y *= newls;
        pix.z *= newls;
        // Reduce saturation (according to the HSV model) by proportionally
        // increasing the values of the other colors.
        pix.x = maxc - (maxc - pix.x) * lsratio;
        pix.y = maxc - (maxc - pix.y) * lsratio;
        pix.z = maxc - (maxc - pix.z) * lsratio;
    } else {
        float adjhlp = -highpow;
        if (adjhlp > 1.0f || maxa <= 1.0f) adjhlp = 1.0f;
        if (maxc > 0.0f) {
            float adj = ((1.0f - adjhlp) * newls + adjhlp * ls);
            pix.x *= adj;
            pix.y *= adj;
            pix.z *= adj;
        }
    }
    pix.x += (1.0f - vibrance) * powf(opix.x, gamma);
    pix.y += (1.0f - vibrance) * powf(opix.y, gamma);
    pix.z += (1.0f - vibrance) * powf(opix.z, gamma);
    pix.x += (1.0f - alpha) * bkgd.x;
    pix.y += (1.0f - alpha) * bkgd.y;
    pix.z += (1.0f - alpha) * bkgd.z;
    pix.x = fminf(1.0f, pix.x);
    pix.y = fminf(1.0f, pix.y);
    pix.z = fminf(1.0f, pix.z);
    pix.w = alpha;
    pixbuf[i] = pix;
 }
 __global__
 void logscale(float4 *outbuf, const float4 *pixbuf, float k1, float k2) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    float4 pix = pixbuf[i];
    // float ls = fmaxf(0, k1 * logf(1.0f + pix.w * pix.w * k2 / (1 + pix.w)) / pix.w);
    float ls = fmaxf(0, k1 * logf(1.0f + pix.w * k2) / pix.w);
    pix.x *= ls;
    pix.y *= ls;
@ -141,10 +51,12 @@ void logscale(float4 *outbuf, const float4 *pixbuf, float k1, float k2) {
    outbuf[i] = pix;
 }
 ''')
 fmabuflib = devlib(defs=r'''
 // Element-wise computation of ``dst[i]=dst[i]+src[i]*scale``.
-__global__
+__global__ void
-void fma_buf(float4 *dst, const float4 *src, int astride, float scale) {
+fma_buf(float4 *dst, const float4 *src, int astride, float scale) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int i = y * astride + x;
@ -155,23 +67,16 @@ void fma_buf(float4 *dst, const float4 *src, int astride, float scale) {
    d.w += s.w * scale;
    dst[i] = d;
 }
 ''')
-texture<float4, cudaTextureType2D> bilateral_src;
+denblurlib = devlib(decls='''
 texture<float,  cudaTextureType2D> blur_src;
 __global__ void logscale_den(float *dst, float k2) {
    int xi = blockIdx.x * blockDim.x + threadIdx.x;
    int yi = blockIdx.y * blockDim.y + threadIdx.y;
    float4 pix = tex2D(bilateral_src, xi, yi);
    float out = logf(1.0f + pix.w * k2);
    dst[yi * (blockDim.x * gridDim.x) + xi] = out;
 }
 __constant__ float gauss_coefs[9] = {
    0.00443305f,  0.05400558f,  0.24203623f,  0.39905028f,
    0.24203623f,  0.05400558f,  0.00443305f
 };
-
+''', defs=r'''
 // Apply a Gaussian-esque blur to the density channel of the texture in
 // ``bilateral_src`` in the horizontal direction, and write it to ``dst``, a
 // one-channel buffer.
@ -203,7 +108,11 @@ __global__ void den_blur_1c(float *dst, int pattern, int upsample) {
             * gauss_coefs[i];
    dst[yi * (blockDim.x * gridDim.x) + xi] = den;
 }
 ''')
 bilaterallib = devlib(deps=[logscalelib, texshearlib, denblurlib], decls='''
 texture<float4, cudaTextureType2D> bilateral_src;
 ''', defs=r'''
 /* sstd:    spatial standard deviation (Gaussian filter)
 * cstd:    color standard deviation (Gaussian on the range [0, 1], where 1
 *          represents an "opposite" color).
@ -215,9 +124,8 @@ __global__ void den_blur_1c(float *dst, int pattern, int upsample) {
 * gspeed:  Speed of (exp2f) Gompertz distribution governing how much to
 *          tighten gradients. Zero and negative values OK.
 */
-
+__global__ void
-__global__
+bilateral(float4 *dst, int pattern, int radius,
 void bilateral(float4 *dst, int pattern, int radius,
          float sstd, float cstd, float dstd, float dpow, float gspeed)
 {
    int xi = blockIdx.x * blockDim.x + threadIdx.x;
@ -317,124 +225,83 @@ void bilateral(float4 *dst, int pattern, int radius,
    const int astride = blockDim.x * gridDim.x;
    dst[yi * astride + xi] = out;
 }
 ''')
-} // end extern "C"
+colorcliplib = devlib(deps=[yuvlib], defs=r'''
-'''
+__global__ void
 colorclip(float4 *pixbuf, float gamma, float vibrance, float highpow,
          float linrange, float lingam, float3 bkgd, int fbsize)
 {
    int i = threadIdx.x + blockDim.x * (blockIdx.x + gridDim.x * blockIdx.y);
    if (i >= fbsize) return;
-class Filtering(HunkOCode):
+    float4 pix = pixbuf[i];
    mod = None
    defs = _CODE
-    @classmethod
+    if (pix.w <= 0) {
-    def init_mod(cls):
+        pixbuf[i] = make_float4(bkgd.x, bkgd.y, bkgd.z, 0.0f);
-        if cls.mod is None:
+        return;
-            cls.mod = pycuda.compiler.SourceModule(assemble_code(BaseCode, cls),
+    }
-                    options=['-use_fast_math', '-maxrregcount', '32'],
+    pix.y -= 0.5f * pix.w;
-                    no_extern_c=True)
+    pix.z -= 0.5f * pix.w;
    float3 tmp = yuv2rgb(make_float3(pix.x, pix.y, pix.z));
    pix.x = tmp.x;
    pix.y = tmp.y;
    pix.z = tmp.z;
-    def __init__(self):
+    pix.x = fmaxf(0.0f, pix.x);
-        self.init_mod()
+    pix.y = fmaxf(0.0f, pix.y);
    pix.z = fmaxf(0.0f, pix.z);
-    def de(self, ddst, dsrc, dscratch, gnm, dim, tc, nxf, stream=None):
+    float4 opix = pix;
-        # Helper variables and functions to keep it clean
+    float alpha = powf(pix.w, gamma);
-        sb = 16 * dim.astride
+    if (pix.w < linrange) {
-        bs = sb * dim.ah
+        float frac = pix.w / linrange;
-        bl, gr = (32, 8, 1), (dim.astride / 32, dim.ah / 8)
+        alpha = (1.0f - frac) * pix.w * lingam + frac * alpha;
    }
-        def launch(f, *args, **kwargs):
+    float ls = vibrance * alpha / pix.w;
-            f(*args, block=bl, grid=gr, stream=stream, **kwargs)
+    alpha = fminf(1.0f, fmaxf(0.0f, alpha));
        mkdsc = lambda c: argset(cuda.ArrayDescriptor(), height=dim.ah,
                                 width=dim.astride, num_channels=c,
                                 format=cuda.array_format.FLOAT)
        def mktref(n):
            tref = self.mod.get_texref(n)
            tref.set_filter_mode(cuda.filter_mode.POINT)
            tref.set_address_mode(0, cuda.address_mode.WRAP)
            tref.set_address_mode(1, cuda.address_mode.WRAP)
            return tref
-        dsc = mkdsc(4)
+    float maxc = fmaxf(pix.x, fmaxf(pix.y, pix.z));
-        tref = mktref('bilateral_src')
+    float maxa = maxc * ls;
-        grad_dsc = mkdsc(1)
+    float newls = 1.0f / maxc;
        grad_tref = mktref('blur_src')
-        bilateral, logscale_den, den_blur, den_blur_1c, fma_buf = map(
+    if (maxa > 1.0f && highpow >= 0.0f) {
-                self.mod.get_function,
+        float lsratio = powf(newls / ls, highpow);
-                'bilateral logscale_den den_blur den_blur_1c fma_buf'.split())
+        pix.x *= newls;
        pix.y *= newls;
        pix.z *= newls;
-        # Number of different shear patterns to use when filtering. Must be
+        // Reduce saturation (according to the HSV model) by proportionally
-        # even, since we depend on buffer bouncing (but I don't think that it's
+        // increasing the values of the other colors.
-        # a requirement for the filter itself to get decent results).
+        pix.x = maxc - (maxc - pix.x) * lsratio;
-        DIRECTIONS = 8
+        pix.y = maxc - (maxc - pix.y) * lsratio;
        pix.z = maxc - (maxc - pix.z) * lsratio;
    } else {
        float adjhlp = -highpow;
        if (adjhlp > 1.0f || maxa <= 1.0f) adjhlp = 1.0f;
        if (maxc > 0.0f) {
            float adj = ((1.0f - adjhlp) * newls + adjhlp * ls);
            pix.x *= adj;
            pix.y *= adj;
            pix.z *= adj;
        }
    }
-        def do_bilateral(bsrc, bdst, pattern, r=15, sstd=6, cstd=0.05,
+    pix.x += (1.0f - vibrance) * powf(opix.x, gamma);
-                         dstd=1.5, dpow=0.8, gspeed=4.0):
+    pix.y += (1.0f - vibrance) * powf(opix.y, gamma);
-            # Scale spatial parameter so that a "pixel" is equivalent to an
+    pix.z += (1.0f - vibrance) * powf(opix.z, gamma);
            # actual pixel at 1080p
            sstd *= dim.w / 1920.
-            tref.set_address_2d(bsrc, dsc, sb)
+    pix.x += (1.0f - alpha) * bkgd.x;
    pix.y += (1.0f - alpha) * bkgd.y;
    pix.z += (1.0f - alpha) * bkgd.z;
-            # Blur density two octaves along sampling vector, ultimately
+    pix.x = fminf(1.0f, pix.x);
-            # storing in `dscratch`
+    pix.y = fminf(1.0f, pix.y);
-            launch(den_blur, np.intp(bdst), i32(pattern), i32(0),
+    pix.z = fminf(1.0f, pix.z);
-                   texrefs=[tref])
+    pix.w = alpha;
            grad_tref.set_address_2d(bdst, grad_dsc, sb / 4)
            launch(den_blur_1c, dscratch, i32(pattern), i32(1),
                   texrefs=[grad_tref])
            grad_tref.set_address_2d(dscratch, grad_dsc, sb / 4)
-            launch(bilateral, np.intp(bdst), i32(pattern), i32(r),
+    pixbuf[i] = pix;
-                   f32(sstd), f32(cstd), f32(dstd), f32(dpow), f32(gspeed),
+}
-                   texrefs=[tref, grad_tref])
+''')
        def do_bilateral_range(bsrc, bdst, npats, *args, **kwargs):
            for i in range(npats):
                do_bilateral(bsrc, bdst, i, *args, **kwargs)
                bdst, bsrc = bsrc, bdst
            if npats % 2:
                cuda.memcpy_dtod_async(bdst, bsrc, bs, stream)
        # Filter the first xform, using `ddst` as an intermediate buffer.
        # End result is the filtered copy in `dsrc`.
        do_bilateral_range(dsrc, ddst, DIRECTIONS)
        # Filter the remaining xforms, using `ddst` as an intermediate
        # buffer, then add the result to `dsrc` (at the zero'th xform).
        for x in range(1, nxf):
            src = int(dsrc) + x * bs
            do_bilateral_range(src, ddst, DIRECTIONS)
            launch(fma_buf, dsrc, np.intp(src), i32(dim.astride), f32(1))
        # Log-scale the accumulated buffer in `dsrc`.
        k1 = f32(gnm.color.brightness(tc) * 268 / 256)
        # Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
        # s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
        area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w)
        k2 = f32(1.0 / (area * gnm.spp(tc)))
        nbins = dim.ah * dim.astride
        logscale = self.mod.get_function("logscale")
        t = logscale(ddst, dsrc, k1, k2,
                block=(512, 1, 1), grid=(nbins/512, 1), stream=stream)
    def colorclip(self, dbuf, gnm, dim, tc, blend, stream=None):
        nbins = dim.ah * dim.astride
        # TODO: implement integration over cubic splines?
        gam = f32(1 / gnm.color.gamma(tc))
        vib = f32(gnm.color.vibrance(tc))
        hipow = f32(gnm.color.highlight_power(tc))
        lin = f32(gnm.color.gamma_threshold(tc))
        lingam = f32(lin ** (gam-1.0) if lin > 0 else 0)
        bkgd = vec.make_float3(
                gnm.color.background.r(tc),
                gnm.color.background.g(tc),
                gnm.color.background.b(tc))
        color_fun = self.mod.get_function("colorclip")
        blocks = int(np.ceil(np.sqrt(nbins / 256)))
        color_fun(dbuf, gam, vib, hipow, lin, lingam, bkgd, i32(nbins),
                  i32(blend), block=(256, 1, 1), grid=(blocks, blocks),
                  stream=stream)
--- a/cuburn/code/interp.py
+++ b/cuburn/code/interp.py
@ -2,8 +2,10 @@ from collections import OrderedDict
 from itertools import cycle
 import numpy as np
-import tempita
+import util
-from util import HunkOCode, Template, BaseCode, assemble_code
+from util import Template, assemble_code, devlib, binsearchlib, ringbuflib
 from color import yuvlib
 from mwc import mwclib
 class GenomePackerName(str):
    """Class to indicate that a property is precalculated on the device"""
@ -115,17 +117,10 @@ class GenomePackerPrecalc(GenomePackerView):
    def _code(self, code):
        self.packer.precalc_code.append(code)
-class GenomePacker(HunkOCode):
+class GenomePacker(object):
    """
    Packs a genome for use in iteration.
    """
    # 2^search_rounds is the maximum number of control points, including
    # endpoints, that can be used in a single genome. It should be okay to
    # change this number without touching other code, but 32 samples fits
    # nicely on a single cache line.
    search_rounds = 5
    def __init__(self, tname):
        """
        Create a new DataPacker.
@ -152,6 +147,7 @@ class GenomePacker(HunkOCode):
        self.packed = None
        self.genome = None
        self.search_rounds = util.DEFAULT_SEARCH_ROUNDS
    def __len__(self):
        assert self._len is not None, 'len() called before finalize()'
@ -189,42 +185,41 @@ class GenomePacker(HunkOCode):
        self._len = len(self.packed)
-        self.decls = self._decls.substitute(
+        decls = self._decls.substitute(packed=self.packed, tname=self.tname)
-                packed=self.packed, tname=self.tname,
+        defs = self._defs.substitute(
                search_rounds=self.search_rounds)
        self.defs = self._defs.substitute(
                packed_direct=self.packed_direct, tname=self.tname,
                precalc_code=self.precalc_code,
                search_rounds=self.search_rounds)
        return devlib(deps=[catmullromlib], decls=decls, defs=defs)
-    def pack(self):
+    def pack(self, pool=None):
        """
-        Return a packed copy of the genome ready for uploading to the GPU as a
+        Return a packed copy of the genome ready for uploading to the GPU,
-        3D NDArray, with the first element being the times and the second
+        as two float32 NDArrays for the knot times and values.
        being the values.
        """
        width = 1 << self.search_rounds
-        out = np.empty((2, len(self.genome), width), dtype=np.float32)
+        if pool:
-        # Ensure that unused values at the top are always big (must be >2.0)
+            times = pool.allocate((len(self.genome), width), 'f4')
-        out[0].fill(1e9)
+            knots = pool.allocate((len(self.genome), width), 'f4')
        else:
            times, knots = np.empty((2, len(self.genome), width), 'f4')
        times.fill(1e9)
        for idx, gname in enumerate(self.genome):
            attr = self.ns[gname[0]]
            for g in gname[1:]:
                attr = getattr(attr, g)
-            out[0][idx][:len(attr.knots[0])] = attr.knots[0]
+            times[idx,:len(attr.knots[0])] = attr.knots[0]
-            out[1][idx][:len(attr.knots[1])] = attr.knots[1]
+            knots[idx,:len(attr.knots[1])] = attr.knots[1]
-        return out
+        return times, knots
    _defs = Template(r"""
-
+__global__ void interp_{{tname}}(
 __global__
 void interp_{{tname}}(
        {{tname}}* out,
        const float *times, const float *knots,
-        float tstart, float tstep, int maxid
+        float tstart, float tstep, int maxid)
-) {
+{
    int id = gtid();
    if (id >= maxid) return;
    out = &out[id];
@ -249,7 +244,6 @@ void interp_{{tname}}(
    }
    {{endfor}}
 }
 """)
    _decls = Template(r"""
@ -259,22 +253,13 @@ typedef struct {
 {{endfor}}
 } {{tname}};
 /* Search through the fixed-size list 'hay' to find the rightmost index which
 * contains a value strictly smaller than the input 'needle'. The crazy
 * bitwise search is just for my own amusement.
 */
 __device__
 int bitwise_binsearch(const float *hay, float needle) {
    int lo = 0;
-    // TODO: improve efficiency on 64-bit arches
+""")
    {{for i in range(search_rounds-1, -1, -1)}}
    if (needle > hay[lo + {{1 << i}}])
        lo += {{1 << i}};
    {{endfor}}
    return lo;
 }
 catmullromlib = devlib(deps=[binsearchlib], decls=r'''
 __device__ __noinline__
 float catmull_rom(const float *times, const float *knots, float t);
 ''', defs=r'''
 __device__ __noinline__
 float catmull_rom(const float *times, const float *knots, float t) {
    int idx = bitwise_binsearch(times, t);
@ -303,67 +288,14 @@ float catmull_rom(const float *times, const float *knots, float t) {
         + m2 * (      ttt -      tt)
         + k2 * (-2.0f*ttt + 3.0f*tt);
 }
 ''')
-__global__
+palintlib = devlib(deps=[binsearchlib, ringbuflib, yuvlib, mwclib], decls='''
-void test_cr(const float *times, const float *knots, const float *t, float *r) {
+surface<void, cudaSurfaceType2D> flatpal;
-    int i = threadIdx.x + blockDim.x * blockIdx.x;
+''', defs=r'''
-    r[i] = catmull_rom(times, knots, t[i]);
+__device__ float4
-}
+interp_color(const float *times, const float4 *sources, float time)
-""")
+{
 class Palette(HunkOCode):
    # The JPEG YUV full-range matrix, without bias into the positve regime.
    # This assumes input color space is CIERGB D65, encoded with gamma 2.2.
    # Note that some interpolated colors may exceed the sRGB and YUV gamuts.
    YUV = np.matrix([[ 0.299,        0.587,      0.114],
                     [-0.168736,    -0.331264,   0.5],
                     [ 0.5,         -0.418688,  -0.081312]])
    def __init__(self, interp_mode):
        assert interp_mode in self.modes
        self.mode = interp_mode
        self.defs = self._defs.substitute(mode=interp_mode)
    def prepare(self, palettes):
        """
        Produce palettes suitable for uploading to the device. Returns an
        array of palettes in the same size and shape as the input.
        This function will never modify its argument, but may return it
        unmodified for certain interpolation modes.
        """
        if self.mode == 'yuvpolar':
            ys, uvrs, uvts, alphas = zip(*map(self.rgbtoyuvpolar, palettes))
            # Center all medians as closely to 0 as possible
            means = np.mean(uvts, axis=1)
            newmeans = (means + np.pi) % (2 * np.pi) - np.pi
            uvts = (newmeans - means).reshape((-1, 1)) + uvts
            zipped = zip(ys, uvrs, uvts, alphas)
            return np.array(zipped, dtype='f4').transpose((0, 2, 1))
        return palettes
    @classmethod
    def rgbtoyuvpolar(cls, pal):
        # TODO: premultiply alpha or some nonsense like that?
        y, u, v = np.array(cls.YUV * pal.T[:3])
        uvr = np.hypot(u, v)
        uvt = np.unwrap(np.arctan2(v, u))
        return y, uvr, uvt, pal.T[3]
    @classmethod
    def yuvpolartorgb(cls, y, uvr, uvt, a):
        u = uvr * np.cos(uvt)
        v = uvr * np.sin(uvt)
        r, g, b = np.array(cls.YUV.I * np.array([y, u, v]))
        # Ensure Fortran order so that the memory gets laid out correctly
        return np.array([r, g, b, a], order='F').T
    modes = ['hsv', 'yuv', 'yuvpolar']
    decls = "surface<void, cudaSurfaceType2D> flatpal;\n"
    _defs = Template(r"""
 __device__
 float4 interp_color(const float *times, const float4 *sources, float time) {
    int idx = fmaxf(bitwise_binsearch(times, time) + 1, 1);
    float lf = (times[idx] - time) / (times[idx] - times[idx-1]);
    float rf = 1.0f - lf;
@ -375,42 +307,11 @@ float4 interp_color(const float *times, const float4 *sources, float time) {
    float3 l3 = make_float3(left.x, left.y, left.z);
    float3 r3 = make_float3(right.x, right.y, right.z);
 {{if mode == 'hsv'}}
    float3 lhsv = rgb2hsv(l3);
    float3 rhsv = rgb2hsv(r3);
    if (fabs(lhsv.x - rhsv.x) > 3.0f)
        if (lhsv.x < rhsv.x)
            lhsv.x += 6.0f;
        else
            rhsv.x += 6.0f;
    float3 hsv;
    hsv.x = lhsv.x * lf + rhsv.x * rf;
    hsv.y = lhsv.y * lf + rhsv.y * rf;
    hsv.z = lhsv.z * lf + rhsv.z * rf;
    if (hsv.x > 6.0f)
        hsv.x -= 6.0f;
    if (hsv.x < 0.0f)
        hsv.x += 6.0f;
    yuv = rgb2yuv(hsv2rgb(hsv));
 {{elif mode.startswith('yuv')}}
    {{if mode == 'yuv'}}
    float3 lyuv = rgb2yuv(l3);
    float3 ryuv = rgb2yuv(r3);
    yuv.x = lyuv.x * lf + ryuv.x * rf;
    yuv.y = lyuv.y * lf + ryuv.y * rf;
    yuv.z = lyuv.z * lf + ryuv.z * rf;
    {{elif mode == 'yuvpolar'}}
    yuv.x = l3.x * lf + r3.x * rf;
    float radius = l3.y * lf + r3.y * rf;
    float angle  = l3.z * lf + r3.z * rf;
    yuv.y = radius * cosf(angle);
    yuv.z = radius * sinf(angle);
    {{endif}}
 {{endif}}
    yuv.y += 0.5f;
    yuv.z += 0.5f;
@ -418,28 +319,13 @@ float4 interp_color(const float *times, const float4 *sources, float time) {
    return make_float4(yuv.x, yuv.y, yuv.z, left.w * lf + right.w * rf);
 }
-__global__
+__global__ void interp_palette_flat(
-void interp_palette(uchar4 *outs,
+        ringbuf *rb, mwc_st *rctxs,
        const float *times, const float4 *sources,
-        float tstart, float tstep) {
+        float tstart, float tstep)
-    float time = tstart + blockIdx.x * tstep;
+{
-    float4 yuva = interp_color(times, sources, time);
+    mwc_st rctx = rctxs[rb_incr(rb->head, threadIdx.x)];
    uchar4 out;
    out.x = yuva.x * 255.0f;
    out.y = yuva.y * 255.0f;
    out.z = yuva.z * 255.0f;
    out.w = yuva.w * 255.0f;
    outs[blockDim.x * blockIdx.x + threadIdx.x] = out;
 }
 __global__
 void interp_palette_flat(mwc_st *rctxs,
        const float *times, const float4 *sources,
        float tstart, float tstep) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    mwc_st rctx = rctxs[gid];
    float time = tstart + blockIdx.x * tstep;
    float4 yuva = interp_color(times, sources, time);
@ -447,17 +333,30 @@ void interp_palette_flat(mwc_st *rctxs,
    // TODO: pack Y at full precision, UV at quarter
    uint2 out;
-    uint32_t y = min(255, (uint32_t) (yuva.x * 255.0f + 0.49f * mwc_next_11(rctx)));
+    uint32_t y = yuva.x * 255.0f + 0.49f * mwc_next_11(rctx);
-    uint32_t u = min(255, (uint32_t) (yuva.y * 255.0f + 0.49f * mwc_next_11(rctx)));
+    uint32_t u = yuva.y * 255.0f + 0.49f * mwc_next_11(rctx);
-    uint32_t v = min(255, (uint32_t) (yuva.z * 255.0f + 0.49f * mwc_next_11(rctx)));
+    uint32_t v = yuva.z * 255.0f + 0.49f * mwc_next_11(rctx);
    y = min(255, y);
    u = min(255, u);
    v = min(255, v);
    out.y = (1 << 22) | (y << 4);
    out.x = (u << 18) | v;
    surf2Dwrite(out, flatpal, 8 * threadIdx.x, blockIdx.x);
-    rctxs[gid] = rctx;
+    rctxs[rb_incr(rb->tail, threadIdx.x)] = rctx;
 }
-""")
+''')
 testcrlib = devlib(defs=r'''
 __global__ void
 test_cr(const float *times, const float *knots, const float *t, float *r) {
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    r[i] = catmull_rom(times, knots, t[i]);
 }
 ''')
 if __name__ == "__main__":
    # Test spline evaluation. This code will probably drift pretty often.
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule
    import pycuda.autoinit
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@ -2,15 +2,16 @@
 The main iteration loop.
 """
-from cuburn.code import mwc, variations, interp
+import variations
-from cuburn.code.util import *
+import interp
 from util import Template, devlib, ringbuflib
 from mwc import mwclib
 def precalc_densities(pcp, std_xforms):
    # This pattern recurs a few times for precalc segments. Unfortunately,
    # namespace stuff means it's not easy to functionalize this boilerplate
    pre_cp = pcp._precalc()
    pre_cp._code(Template(r"""
        float sum = 0.0f;
        {{for n in std_xforms}}
@ -30,7 +31,6 @@ def precalc_densities(pcp, std_xforms):
 def precalc_chaos(pcp, std_xforms):
    pre_cp = pcp._precalc()
    pre_cp._code(Template("""
        float sum, rsum;
        {{for p in std_xforms}}
@ -50,7 +50,6 @@ def precalc_chaos(pcp, std_xforms):
        {{endfor}}
        {{endfor}}
    """, name='precalc_chaos').substitute(locals()))
 def precalc_camera(pcam):
@ -64,7 +63,6 @@ def precalc_camera(pcam):
    # . matrix([X],[Y],[1]);
    pre_cam._code(Template(r"""
        float rot = {{pre_cam.rotation}} * M_PI / 180.0f;
        float rotsin = sin(rot), rotcos = cos(rot);
        float cenx = {{pre_cam.center.x}}, ceny = {{pre_cam.center.y}};
@ -79,13 +77,11 @@ def precalc_camera(pcam):
        {{pre_cam._set('yy')}} = scale * rotcos;
        {{pre_cam._set('yo')}} = scale * -(rotsin * cenx + rotcos * ceny)
                               + 0.5f * acc_size.aheight;
-
+    """, 'precalc_camera').substitute(locals()))
    """).substitute(locals()))
 def precalc_xf_affine(px):
    pre = px._precalc()
    pre._code(Template(r"""
        float pri = {{pre.angle}} * M_PI / 180.0f;
        float spr = {{pre.spread}} * M_PI / 180.0f;
@ -98,27 +94,22 @@ def precalc_xf_affine(px):
        {{pre._set('yy')}} = magy * sin(pri+spr);
        {{pre._set('xo')}} = {{pre.offset.x}};
        {{pre._set('yo')}} = -{{pre.offset.y}};
    """, 'precalc_xf_affine').substitute(locals()))
-    """).substitute(locals()))
+def apply_affine(x, y, xo, yo, packer):
    return Template("""
    {{xo}} = {{packer.xx}} * {{x}} + {{packer.xy}} * {{y}} + {{packer.xo}};
    {{yo}} = {{packer.yx}} * {{x}} + {{packer.yy}} * {{y}} + {{packer.yo}};
    """, 'apply_affine').substitute(locals())
-class IterCode(HunkOCode):
+# The number of threads per block used in the iteration function. Don't change
-    # The number of threads per block
+# it lightly; the code may depend on it in unusual ways.
 NTHREADS = 256
-    def __init__(self, info, genome):
+iter_decls = """
        self.packer = interp.GenomePacker('iter_params')
        self.pcp = self.packer.view('params', genome, 'cp')
        iterbody = self._iterbody(info, genome)
        bodies = [self._xfbody(i,x) for i,x in sorted(genome.xforms.items())]
        bodies.append(iterbody)
        self.defs = '\n'.join(bodies)
    decls = """
 // Note: for normalized lookups, uchar4 actually returns floats
 texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> palTex;
 __shared__ iter_params params;
 __device__ int rb_head, rb_tail, rb_size;
 typedef struct {
    uint32_t width;
@ -128,12 +119,9 @@ typedef struct {
    uint32_t astride;
 } acc_size_t;
 __constant__ acc_size_t acc_size;
 """
-    def _xfbody(self, xfid, xform):
+iter_xf_body_code = r"""
        px = self.pcp.xforms[xfid]
        tmpl = Template(r"""
 __device__
 void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
    float tx, ty;
@ -162,44 +150,30 @@ void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
    float csp = {{px.color_speed}};
    color = color * (1.0f - csp) + {{px.color}} * csp;
 };
-""", 'apply_xf_'+xfid)
+"""
 def iter_xf_body(pcp, xfid, xform):
    px = pcp.xforms[xfid]
    tmpl = Template(iter_xf_body_code, 'apply_xf_'+xfid)
    g = dict(globals())
    g.update(locals())
    return tmpl.substitute(g)
-    def _iterbody(self, info, genome):
+iter_body_code = r'''
-        tmpl = Template(r'''
+__global__ void
-
+iter(uint64_t out_ptr, uint64_t atom_ptr,
-__global__ void reset_rb(int size) {
+     ringbuf *rb, mwc_st *msts, float4 *points,
-    rb_head = rb_tail = 0;
+     const iter_params *all_params)
-    rb_size = size;
+{
 }
 __global__
 void iter(
        uint64_t out_ptr,
        mwc_st *msts,
        float4 *points,
        const iter_params *all_params
 {{if info.acc_mode == 'atomic'}}
      , uint64_t atom_ptr
 {{endif}}
 ) {
    const iter_params *global_params = &(all_params[blockIdx.x]);
    // load params to shared memory cooperatively
    const iter_params *global_params = &(all_params[blockIdx.x]);
    for (int i = threadIdx.y * blockDim.x + threadIdx.x;
         i < (sizeof(iter_params) / 4); i += blockDim.x * blockDim.y)
        reinterpret_cast<float*>(&params)[i] =
            reinterpret_cast<const float*>(global_params)[i];
-    __shared__ int rb_idx;
+    int this_rb_idx = rb_incr(rb->head, blockDim.x * threadIdx.y + threadIdx.x);
    if (threadIdx.y == 1 && threadIdx.x == 1)
        rb_idx = 32 * blockDim.y * (atomicAdd(&rb_head, 1) % rb_size);
    __syncthreads();
    int this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;
    mwc_st rctx = msts[this_rb_idx];
    {{precalc_camera(pcp.camera)}}
@ -209,22 +183,15 @@ void iter(
        {{pcp.camera.yo}} += ditherwidth * mwc_next_11(rctx);
    }
 {{if info.acc_mode == 'global'}}
    __shared__ float time_frac;
    time_frac = blockIdx.x / (float) gridDim.x;
 {{else}}
    {{if info.acc_mode == 'atomic'}}
    // TODO: spare the register, reuse at call site?
    int time = blockIdx.x >> 4;
    {{endif}}
    float color_dither = 0.49f * mwc_next_11(rctx);
 {{endif}}
    // TODO: 4th channel unused. Kill or use for something helpful
    float4 old_point = points[this_rb_idx];
    float x = old_point.x, y = old_point.y, color = old_point.z;
-{{if not info.chaos_used}}
+{{if not chaos_used}}
    // Shared memory size can be reduced by a factor of four using a slower
    // 4-stage reduce, but on Fermi hardware shmem use isn't a bottleneck
    __shared__ float swap[{{4*NTHREADS}}];
@ -250,7 +217,7 @@ void iter(
    // TODO: link up with FUSE, etc
    for (int round = 0; round < 256; round++) {
-{{if info.chaos_used}}
+{{if chaos_used}}
        {{precalc_chaos(pcp, std_xforms)}}
@ -315,16 +282,7 @@ void iter(
 {{endif}}
 {{if info.acc_mode == 'deferred'}}
        int tid = threadIdx.y * 32 + threadIdx.x;
        int offset = 4 * (256 * (256 * blockIdx.x + round) + tid);
        int *log = reinterpret_cast<int*>(out_ptr + offset);
 {{endif}}
        if (fuse) {
 {{if info.acc_mode == 'deferred'}}
            *log = 0xffffffff;
 {{endif}}
            continue;
        }
@ -346,15 +304,11 @@ void iter(
        uint32_t ix = trunca(cx), iy = trunca(cy);
        if (ix >= acc_size.astride || iy >= acc_size.aheight) {
 {{if info.acc_mode == 'deferred'}}
            *log = 0xffffffff;
 {{endif}}
            continue;
        }
-        uint32_t ibase = (last_xf_used % {{info.max_nxf}}) * acc_size.aheight;
+        uint32_t i = iy * acc_size.astride + ix;
-        uint32_t i = (ibase + iy) * acc_size.astride + ix;
+
 {{if info.acc_mode == 'atomic'}}
        asm volatile ({{crep("""
 {
    // To prevent overflow, we need to flush each pixel before the density
@ -432,38 +386,45 @@ oflow_end:
        """)}}  ::  "f"(cc), "f"(color_dither), "r"(time), "r"(i),
                    "l"(atom_ptr), "f"(cosel[threadIdx.y + {{NWARPS}}]),
                    "l"(out_ptr));
 {{endif}}
 {{if info.acc_mode == 'global'}}
        float4 outcol = tex2D(palTex, cc, time_frac);
        float4 *accbuf = reinterpret_cast<float4*>(out_ptr + (16*i));
        float4 pix = *accbuf;
        pix.x += outcol.x;
        pix.y += outcol.y;
        pix.z += outcol.z;
        pix.w += 1.0f;
        *accbuf = pix;
 {{elif info.acc_mode == 'deferred'}}
        // 'color' gets the top 8 bits. TODO: add dithering via precalc.
        uint32_t icolor = fminf(1.0f, cc) * 255.0f + color_dither;
        asm("bfi.b32    %0, %1, %0, 24, 8;" : "+r"(i) : "r"(icolor));
        *log = i;
 {{endif}}
    }
-    if (threadIdx.x == 0 && threadIdx.y == 0)
+    this_rb_idx = rb_incr(rb->tail, blockDim.x * threadIdx.y + threadIdx.x);
        rb_idx = 32 * blockDim.y * (atomicAdd(&rb_tail, 1) % rb_size);
    __syncthreads();
    this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;
    points[this_rb_idx] = make_float4(x, y, color, 0.0f);
    msts[this_rb_idx] = rctx;
    return;
 }
 '''
-{{if info.acc_mode == 'atomic'}}
+def iter_body(cp, pcp):
-__global__
+    # For legacy reasons, 'cp' is used here instead of 'genome'.
-void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
+    tmpl = Template(iter_body_code, 'iter_body')
    NWARPS = NTHREADS / 32
    std_xforms = [n for n in sorted(cp.xforms) if n != 'final']
    # TODO: detect this properly and use it
    chaos_used = False
    vars = globals()
    vars.update(locals())
    return tmpl.substitute(vars)
 def mkiterlib(genome):
    packer = interp.GenomePacker('iter_params')
    pcp = packer.view('params', genome, 'cp')
    iterbody = iter_body(genome, pcp)
    bodies = [iter_xf_body(pcp, i, x) for i, x in sorted(genome.xforms.items())]
    bodies.append(iterbody)
    packer_lib = packer.finalize()
    lib = devlib(deps=[packer_lib, mwclib, ringbuflib],
                 # We grab the surf decl from palintlib as well
                 decls=iter_decls + interp.palintlib.decls,
                 defs='\n'.join(bodies))
    return packer, lib
 flushatomlib = devlib(defs=Template(r'''
 __global__ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
    int i = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    if (i >= nbins) return;
    asm volatile ({{crep("""
@ -499,168 +460,4 @@ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
 }
    """)}}  ::  "r"(i), "l"(atom_ptr), "l"(out_ptr));
 }
-
+''', 'flush_atom').substitute())
 {{endif}}
 {{if info.acc_mode == 'deferred'}}
 // Block size, shared accumulation bits, shared accumulation width.
 #define BS 1024
 #define SHAB 12
 #define SHAW (1<<SHAB)
 // Read the point log, accumulate in shared memory, and write the results.
 // This kernel is to be launched with one block for every 4,096 addresses to
 // be processed, and will handle those addresses.
 //
 // log_bounds is an array mapping radix values to the first index in the log
 // with that radix position. For performance reasons in other parts of the
 // code, the radix may actually include bits within the lower SHAB part of the
 // address, or it might not cover the first few bits after the SHAB part;
 // log_bounds_shift covers that. glob_addr_bits specifies the number of bits
 // above SHAB which are address bits.
 __global__ void
 __launch_bounds__(BS, 1)
 write_shmem(
        float4 *acc,
        const uint32_t *log,
        const uint32_t *log_bounds,
        uint32_t nbins
 ) {
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    // These two accumulators, used in write_shmem, hold {density, red} and
    // {green, blue} values as packed u16 pairs. The fixed size represents
    // 4,096 pixels in the accumulator.
    __shared__ uint32_t s_acc[SHAW*2];
    int idx = tid;
    for (int i = 0; i < (SHAW * 2 / BS); i++) {
        s_acc[idx] = 0;
        idx += BS;
    }
    __syncthreads();
    // Shut the compiler up
    idx = s_acc[0];
    // log_bounds[] holds inclusive prefix sums, so that log_bounds[0] is the
    // largest index with radix 0, and so on.
    int lb_idx_hi = bid & 0xff;
    int lb_idx_lo = lb_idx_hi - 1;
    int idx_lo;
    if (lb_idx_lo > 0) idx_lo = log_bounds[lb_idx_lo] & ~(BS - 1);
    else               idx_lo = 0;
    int idx_hi = (log_bounds[lb_idx_hi] & ~(BS - 1)) + BS;
    float rnrounds = {{'%d.0f' % info.palette_height}} / (idx_hi - idx_lo);
    float time = tid * rnrounds;
    float time_step = BS * rnrounds;
    int magic = ((blockIdx.x & 0xff) << 3) + ((blockIdx.x & 0xf00) << 12);
    int magic_mask = 0xf007f8;
    for (int i = idx_lo + tid; i < idx_hi; i += BS) {
        int entry = log[i];
        asm volatile ({{crep("""
 {
    .reg .pred q;
    .reg .u32 shoff, color, time, d, r, g, b, hi, lo, hiw, low, tmp;
    .reg .u64 ptr;
    .reg .f32 rf, gf, bf, df, rg, gg, dg, bg;
    // TODO: opacity
    and.b32         tmp,    %0,     %4;
    setp.eq.u32     q,      tmp,    %3;
@!q bra             before_sync;
    and.b32         shoff,  %0,     0xff800;
    shr.b32         shoff,  shoff,  5;
    bfi.b32         shoff,  %0,     shoff,  3,  3;
    bfe.u32         color,  %0,     24, 8;
    shl.b32         color,  color,  3;
    cvt.rni.u32.f32 time,   %1;
    suld.b.2d.v2.b32.clamp  {lo, hi},   [flatpal, {color, time}];
    ld.shared.v2.u32    {low, hiw},     [shoff];
    add.cc.u32          lo,     lo,     low;
    addc.u32            hi,     hi,     hiw;
    setp.hs.u32         q,      hi,     (1023 << 22);
@q  bra                 oflow_sync;
    st.shared.v2.u32    [shoff],        {hi, lo};
 before_sync:
    bar.sync            0;
    bra                 oflow_write_end;
 oflow_sync:
    st.shared.v2.u32    [shoff],    {0, 0};
    // TODO: opacity
    bfi.b32         shoff,  %0,     0,  4,  24;
    cvt.u64.u32     ptr,    shoff;
    add.u64         ptr,    ptr,    %2;
    ld.global.v4.f32    {dg,bg,gg,rg},  [ptr];
    bar.sync        0;
    bfe.u32         r,      hi,     4,      18;
    bfe.u32         g,      lo,     18,     14;
    bfi.b32         g,      hi,     g,      14,     4;
    and.b32         b,      lo,     ((1<<18)-1);
    cvt.rn.f32.u32  rf,     r;
    cvt.rn.f32.u32  gf,     g;
    cvt.rn.f32.u32  bf,     b;
    fma.rn.ftz.f32  rf,     rf,     (1.0/255.0),    rg;
    fma.rn.ftz.f32  gf,     gf,     (1.0/255.0),    gg;
    fma.rn.ftz.f32  bf,     bf,     (1.0/255.0),    bg;
    add.f32         df,     df,     dg;
    st.global.v4.f32    [ptr],  {df,bf,gf,rf};
 oflow_write_end:
 }
        """)}}  ::  "r"(entry), "f"(time), "l"(acc), "r"(magic), "r"(magic_mask));
        time += time_step;
    }
    __syncthreads();
    idx = tid;
    int glo_idx = magic | (((idx << 8) | idx) & 0xff807);
    for (int i = 0; i < (SHAW / BS) && glo_idx < nbins; i++) {
        int d, r, g, b;
        float4 pix = acc[glo_idx];
        asm({{crep("""
 {
    .reg .u32 hi, lo;
    ld.shared.v2.u32   {lo, hi},    [%4];
    shr.u32         %0,     hi,     22;
    bfe.u32         %1,     hi,     4,      18;
    bfe.u32         %2,     lo,     18,     14;
    bfi.b32         %2,     hi,     %2,     14,     4;
    and.b32         %3,     lo,     ((1<<18)-1);
 }
        """)}} : "=r"(d), "=r"(r), "=r"(g), "=r"(b) : "r"(idx*8));
        pix.x += r / 255.0f;
        pix.y += g / 255.0f;
        pix.z += b / 255.0f;
        pix.w += d;
        acc[glo_idx] = pix;
        idx += BS;
        glo_idx += (BS << 8);
    }
 }
 {{endif}}
 ''', 'iter_kern')
        return tmpl.substitute(
                info = info,
                cp = genome,
                pcp = self.pcp,
                NTHREADS = self.NTHREADS,
                NWARPS = self.NTHREADS / 32,
                std_xforms = [n for n in sorted(genome.xforms) if n != 'final'],
                **globals())
--- a/cuburn/code/mwc.py
+++ b/cuburn/code/mwc.py
@ -6,18 +6,53 @@ import os
 import warnings
 import numpy as np
-from util import *
+from util import devlib, assemble_code
-class MWC(HunkOCode):
+# Keeping this live in the module isn't necessary, but loading the mults
-    decls = """
+# can be surprisingly slow.
 mults = None
 def load_mults():
    pfpath = os.path.join(os.path.dirname(__file__), 'primes.bin')
    if os.path.isfile(pfpath):
        with open(pfpath) as fp:
            return np.frombuffer(fp.read(), dtype='<u4')
    warnings.warn('primes.bin not found, trying to download it')
    import bz2, urllib2
    ufp = urllib2.urlopen('http://aduro.strobe.cc/primes.diff.bin.bz2')
    diffs = np.frombuffer(bz2.decompress(ufp.read()), dtype='<u2')
    mults = np.cumsum(-np.array(diffs, dtype='<u4'), dtype='<u4')
    with open(pfpath, 'wb') as fp:
        fp.write(mults)
    return mults
 def make_seeds(nthreads, host_seed=None):
    global mults
    if mults is None:
        mults = load_mults()
    if host_seed:
        rand = np.random.RandomState(host_seed)
    else:
        rand = np.random
    # Create the seed structures. TODO: check that struct is 4-byte aligned
    seeds = np.empty((3, nthreads), dtype=np.uint32, order='F')
    seeds[0][:] = mults[:nthreads]
    # Excludes 0xffffffff for 32-bit compatibility with laziness
    seeds[1] = rand.randint(1, 0x7fffffff, size=nthreads)
    seeds[2] = rand.randint(1, 0x7fffffff, size=nthreads)
    return seeds
 mwclib = devlib(decls=r'''
 typedef struct {
    uint32_t    mul;
    uint32_t    state;
    uint32_t    carry;
 } mwc_st;
-"""
+''', defs=r'''
    defs = r"""
 __device__ uint32_t mwc_next(mwc_st &st) {
    asm("{\n\t"
        ".reg .u32 tmp;\n\t"
@ -40,46 +75,9 @@ __device__ float mwc_next_11(mwc_st &st) {
        : "=f"(ret) : "r"(val));
    return ret;
 }
-"""
+''')
-    @staticmethod
+mwctestlib = devlib(deps=[mwclib], defs="""
    def load_mults():
        pfpath = os.path.join(os.path.dirname(__file__), 'primes.bin')
        if os.path.isfile(pfpath):
            with open(pfpath) as fp:
                return np.frombuffer(fp.read(), dtype='<u4')
        warnings.warn('primes.bin not found, trying to download it')
        import bz2, urllib2
        ufp = urllib2.urlopen('http://aduro.strobe.cc/primes.diff.bin.bz2')
        diffs = np.frombuffer(bz2.decompress(ufp.read()), dtype='<u2')
        mults = np.cumsum(-np.array(diffs, dtype='<u4'), dtype='<u4')
        with open(pfpath, 'wb') as fp:
            fp.write(mults)
        return mults
    mults = None
    @classmethod
    def make_seeds(cls, nthreads, host_seed=None):
        if cls.mults is None:
            cls.mults = cls.load_mults()
        if host_seed:
            rand = np.random.RandomState(host_seed)
        else:
            rand = np.random
        # Create the seed structures. TODO: check that struct is 4-byte aligned
        seeds = np.empty((3, nthreads), dtype=np.uint32, order='F')
        seeds[0][:] = cls.mults[:nthreads]
        # Excludes 0xffffffff for 32-bit compatibility with laziness
        seeds[1] = rand.randint(1, 0x7fffffff, size=nthreads)
        seeds[2] = rand.randint(1, 0x7fffffff, size=nthreads)
        return seeds
 class MWCTest(HunkOCode):
    defs = """
 __global__ void test_mwc(mwc_st *msts, uint64_t *sums, float nrounds) {
    mwc_st rctx = msts[gtid()];
    uint64_t sum = 0;
@ -87,19 +85,18 @@ __global__ void test_mwc(mwc_st *msts, uint64_t *sums, float nrounds) {
    sums[gtid()] = sum;
    msts[gtid()] = rctx;
 }
-"""
+""")
-    @classmethod
+def test_mwc(rounds=5000, nblocks=64, blockwidth=512):
    def test_mwc(cls, rounds=5000, nblocks=64, blockwidth=512):
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule
    import time
    nthreads = blockwidth * nblocks
-        seeds = MWC.make_seeds(nthreads, host_seed = 5)
+    seeds = make_seeds(nthreads, host_seed=42)
    dseeds = cuda.to_device(seeds)
-        mod = SourceModule(assemble_code(BaseCode, MWC, cls))
+    mod = SourceModule(assemble_code(mwctestlib))
    for trial in range(2):
        print "Trial %d, on CPU: " % trial,
@ -133,5 +130,4 @@ __global__ void test_mwc(mwc_st *msts, uint64_t *sums, float nrounds) {
 if __name__ == "__main__":
    import pycuda.autoinit
-    MWCTest.test_mwc()
+    test_mwc()
--- a/cuburn/code/output.py
+++ b/cuburn/code/output.py
@ -0,0 +1,34 @@
 from util import devlib, ringbuflib
 from mwc import mwclib
 f32tou8lib = devlib(deps=[ringbuflib, mwclib], defs=r'''
 // Perform a conversion from float32 values to uint8 ones, applying
 // pixel- and channel-independent dithering to reduce suprathreshold banding
 // artifacts. Clamps values larger than 1.0f.
 // TODO: move to a separate module?
 // TODO: less ineffecient mwc_st handling?
 __global__ void f32_to_u8(
    ringbuf *rb, mwc_st *rctxs, uchar4 *dst, const float4 *src,
    int gutter, int dstride, int sstride, int height)
 {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x > dstride || y > height) return;
    int isrc = sstride * (y + gutter) + x + gutter;
    int tid = blockDim.x * threadIdx.y + threadIdx.x;
    mwc_st rctx = rctxs[rb_incr(rb->head, tid)];
    float4 in = src[isrc];
    uchar4 out = make_uchar4(
        fminf(1.0f, in.x) * 255.0f + 0.49f * mwc_next_11(rctx),
        fminf(1.0f, in.y) * 255.0f + 0.49f * mwc_next_11(rctx),
        fminf(1.0f, in.z) * 255.0f + 0.49f * mwc_next_11(rctx),
        fminf(1.0f, in.w) * 255.0f + 0.49f * mwc_next_11(rctx)
    );
    int idst = dstride * y + x;
    dst[idst] = out;
    rctxs[rb_incr(rb->head, tid)] = rctx;
 }
 ''')
--- a/cuburn/code/util.py
+++ b/cuburn/code/util.py
@ -1,55 +1,119 @@
 """
 Provides tools and miscellaneous functions for building device code.
 """
 import os
 import tempfile
 from collections import namedtuple
 import pycuda.driver as cuda
 import pycuda.compiler
 import numpy as np
 import tempita
 def crep(s):
    """Escape for PTX assembly"""
    if isinstance(s, unicode):
        s = s.encode('utf-8')
    return '"%s"' % s.encode("string_escape")
 def argset(obj, **kwargs):
    """
    Allow an object with many properties to be set using one call.
    >>> x = argset(X(), a=1, b=2)
    >>> x.a
    1
    """
    for k, v in kwargs.items():
        setattr(obj, k, v)
    return obj
 def launch(name, mod, stream, block, grid, *args, **kwargs):
    """
    Oft-used kernel launch helper. Provides a nice boost in readability of
    densely-packed asynchronous code launches.
    """
    fun = mod.get_function(name)
    if isinstance(block, (int, np.number)):
        block = (int(block), 1, 1)
    if isinstance(grid, (int, np.number)):
        grid = (int(grid), 1)
    fun(*args, block=block, grid=grid, stream=stream, **kwargs)
 def crep(s):
    """Multiline literal escape for inline PTX assembly."""
    if isinstance(s, unicode):
        s = s.encode('utf-8')
    return '"%s"' % s.encode("string_escape")
 class Template(tempita.Template):
    """
    A Tempita template with extra stuff in the default namespace.
    """
    default_namespace = tempita.Template.default_namespace.copy()
 Template.default_namespace.update({'np': np, 'crep': crep})
-class HunkOCode(object):
+# Passive container for device code.
-    """An apparently passive container for device code."""
+DevLib = namedtuple('DevLib', 'deps headers decls defs')
    # Use property objects to make these dynamic
    headers = ''
    decls = ''
    defs = ''
-def assemble_code(*sections):
+def devlib(deps=(), headers='', decls='', defs=''):
-    return ''.join([''.join([getattr(sect, kind) for sect in sections])
+    """Create a library of device code."""
-                    for kind in ['headers', 'decls', 'defs']])
+    # This exists because namedtuple doesn't support default args
    return DevLib(deps, headers, decls, defs)
-def apply_affine(x, y, xo, yo, packer):
+def assemble_code(*libs):
-    return Template("""
+    seen = set()
-    {{xo}} = {{packer.xx}} * {{x}} + {{packer.xy}} * {{y}} + {{packer.xo}};
+    out = []
-    {{yo}} = {{packer.yx}} * {{x}} + {{packer.yy}} * {{y}} + {{packer.yo}};
+    def go(lib):
-    """).substitute(locals())
+        map(go, lib.deps)
        code = lib[1:]
        if code not in seen:
            seen.add(code)
            out.append(code)
    go(stdlib)
    map(go, libs)
    return ''.join(sum(zip(*out), ()))
-class BaseCode(HunkOCode):
+DEFAULT_CMP_OPTIONS = ('-use_fast_math', '-maxrregcount', '42')
-    headers = """
+DEFAULT_SAVE_KERNEL = True
 def compile(name, src, opts=DEFAULT_CMP_OPTIONS, save=DEFAULT_SAVE_KERNEL):
    """
    Compile a module. Returns a copy of the source (for inspection or
    display) and the compiled cubin.
    """
    dir = tempfile.gettempdir()
    if save:
        with open(os.path.join(dir, name + '_kern.cu'), 'w') as fp:
            fp.write(src)
    cubin = pycuda.compiler.compile(src, options=list(opts))
    if save:
        with open(os.path.join(dir, name + '_kern.cubin'), 'w') as fp:
            fp.write(cubin)
    return cubin
 class ClsMod(object):
    """
    Convenience class or mixin that automatically compiles and loads a module
    once per class, saving potentially expensive code regeneration. Only use
    if your class does not employ run-time code generation.
    """
    mod = None
    # Supply your own DevLib on this property
    lib = None
    def __init__(self):
        super(ClsMod, self).__init__()
        self.load()
    @classmethod
    def load(cls, name=None):
        if cls.mod is None:
            if name is None:
                name = cls.__name__.lower()
            cubin = compile(name, assemble_code(cls.lib))
            cls.mod = cuda.module_from_buffer(cubin)
 # This lib is included with *every* assembled module. It contains no global
 # functions, so it shouldn't slow down compilation time too much.
 stdlib = devlib(headers="""
 #include<cuda.h>
 #include<stdint.h>
 #include<stdio.h>
-"""
+""", decls=r"""
    decls = """
 float3 rgb2hsv(float3 rgb);
 float3 hsv2rgb(float3 hsv);
 """
    defs = Template(r"""
 #undef M_E
 #undef M_LOG2E
 #undef M_LOG10E
@ -84,33 +148,79 @@ float3 hsv2rgb(float3 hsv);
 #define bfe_decl(d, s, o, w) \
        int d; \
        bfe(d, s, o, w)
-
+""", defs=r'''
-// TODO: use launch parameter preconfig to eliminate unnecessary parts
+__device__ uint32_t gtid() {
 __device__
 uint32_t gtid() {
    return threadIdx.x + blockDim.x *
            (threadIdx.y + blockDim.y *
                (threadIdx.z + blockDim.z *
                    (blockIdx.x + (gridDim.x * blockIdx.y))));
 }
-__device__
+__device__ uint32_t trunca(float f) {
 uint32_t trunca(float f) {
    // truncate as used in address calculations. note the use of a signed
    // conversion is intentional here (simplifies image bounds checking).
    uint32_t ret;
    asm("cvt.rni.s32.f32    %0,     %1;" : "=r"(ret) : "f"(f));
    return ret;
 }
 ''')
-__global__
+def mkbinsearchlib(rounds):
-void fill_dptr(uint32_t* dptr, int size, uint32_t value) {
+    """
    Search through the fixed-size list 'hay' to find the rightmost index which
    contains a value strictly smaller than the input 'needle'. The list must
    be exactly '2^rounds' long, although padding at the top with very large
    numbers or even +inf effectively shortens it.
    """
    # TODO: this doesn't optimize well on a 64-bit arch, not that it's a
    # performance-critical chunk of code or anything
    src = Template(r'''
 __device__ int bitwise_binsearch(const float *hay, float needle) {
    int lo = 0;
    // TODO: improve efficiency on 64-bit arches
    {{for i in range(search_rounds-1, -1, -1)}}
    if (needle > hay[lo + {{1 << i}}])
        lo += {{1 << i}};
    {{endfor}}
    return lo;
 }
 ''', 'bitwise_binsearch')
    return devlib(defs=src.substitute(search_rounds=rounds))
 # 2^search_rounds is the maximum number of knots allowed in a single spline.
 # This includes the four required knots, so a 5 round search supports 28
 # interior knots in the domain (0, 1). 2^5 fits nicely on a single cache line.
 DEFAULT_SEARCH_ROUNDS = 5
 binsearchlib = mkbinsearchlib(DEFAULT_SEARCH_ROUNDS)
 filldptrlib = devlib(defs=r'''
 __global__ void
 fill_dptr(uint32_t* dptr, int size, uint32_t value) {
    int i = (gridDim.x * blockIdx.y + blockIdx.x) * blockDim.x + threadIdx.x;
    if (i < size) {
        dptr[i] = value;
    }
 }
 ''')
 def fill_dptr(mod, dptr, size, stream=None, value=np.uint32(0)):
    """
    A memory zeroer which can be embedded in a stream, unlike the various
    memset routines. Size is the number of 4-byte words in the pointer;
    value is the word to fill it with. If value is not an np.uint32, it
    will be coerced to a buffer and the first four bytes taken.
    """
    if not isinstance(value, np.uint32):
        if isinstance(value, int):
            value = np.uint32(value)
        else:
            value = np.frombuffer(buffer(value), np.uint32)[0]
    blocks = int(np.ceil(np.sqrt(size / 1024.)))
    launch('fill_dptr', mod, stream, (1024, 1, 1), (blocks, blocks),
            dptr, np.int32(size), value)
 writehalflib = devlib(defs=r'''
 /* read_half and write_half decode and encode, respectively, two
 * floating-point values from a 32-bit value (typed as a 'float' for
 * convenience but not really). The values are packed into u16s as a
@ -122,8 +232,8 @@ void fill_dptr(uint32_t* dptr, int size, uint32_t value) {
 * approach when the alpha channel is present.
 */
-__device__
+__device__ void
-void read_half(float &x, float &y, float xy, float den) {
+read_half(float &x, float &y, float xy, float den) {
    asm("\n\t{"
        "\n\t   .reg .u16       x, y;"
        "\n\t   .reg .f32       rc;"
@ -137,8 +247,8 @@ void read_half(float &x, float &y, float xy, float den) {
        : "=f"(x), "=f"(y) : "f"(xy), "f"(den));
 }
-__device__
+__device__ void
-void write_half(float &xy, float x, float y, float den) {
+write_half(float &xy, float x, float y, float den) {
    asm("\n\t{"
        "\n\t   .reg .u16       x, y;"
        "\n\t   .reg .f32       rc, xf, yf;"
@ -152,87 +262,41 @@ void write_half(float &xy, float x, float y, float den) {
        "\n\t}"
        : "=f"(xy) : "f"(x), "f"(y), "f"(den));
 }
 ''')
-/* This conversion uses the JPEG full-range standard. Note that UV have range
+def mkringbuflib(rb_size):
 * [-0.5, 0.5], so consider biasing the results. */
 __device__
 float3 rgb2yuv(float3 rgb) {
    return make_float3(
        0.299f      * rgb.x + 0.587f    * rgb.y + 0.114f    * rgb.z,
        -0.168736f  * rgb.x - 0.331264f * rgb.y + 0.5f      * rgb.z,
        0.5f        * rgb.x - 0.418688f * rgb.y - 0.081312f * rgb.z);
 }
 __device__
 float3 yuv2rgb(float3 yuv) {
    return make_float3(
        yuv.x                    + 1.402f   * yuv.z,
        yuv.x - 0.34414f * yuv.y - 0.71414f * yuv.z,
        yuv.x + 1.772f   * yuv.y);
 }
 __device__
 float3 rgb2hsv(float3 rgb) {
    float M = fmaxf(fmaxf(rgb.x, rgb.y), rgb.z);
    float m = fminf(fminf(rgb.x, rgb.y), rgb.z);
    float C = M - m;
    float s = M > 0.0f ? C / M : 0.0f;
    float h = 0.0f;
    if (s != 0.0f) {
        C = 1.0f / C;
        float rc = (M - rgb.x) * C;
        float gc = (M - rgb.y) * C;
        float bc = (M - rgb.z) * C;
        if      (rgb.x == M)  h = bc - gc;
        else if (rgb.y == M)  h = 2 + rc - bc;
        else                  h = 4 + gc - rc;
        if (h < 0) h += 6.0f;
    }
    return make_float3(h, s, M);
 }
 __device__
 float3 hsv2rgb(float3 hsv) {
    float whole = floorf(hsv.x);
    float frac = hsv.x - whole;
    float val = hsv.z;
    float min = val * (1 - hsv.y);
    float mid = val * (1 - (hsv.y * frac));
    float alt = val * (1 - (hsv.y * (1 - frac)));
    float3 out;
         if (whole == 0.0f) { out.x = val; out.y = alt; out.z = min; }
    else if (whole == 1.0f) { out.x = mid; out.y = val; out.z = min; }
    else if (whole == 2.0f) { out.x = min; out.y = val; out.z = alt; }
    else if (whole == 3.0f) { out.x = min; out.y = mid; out.z = val; }
    else if (whole == 4.0f) { out.x = alt; out.y = min; out.z = val; }
    else                    { out.x = val; out.y = min; out.z = mid; }
    return out;
 }
 """).substitute()
    @staticmethod
    def fill_dptr(mod, dptr, size, stream=None, value=np.uint32(0)):
    """
-        A memory zeroer which can be embedded in a stream, unlike the various
+    A ringbuffer for access to shared resources.
-        memset routines. Size is the number of 4-byte words in the pointer;
+
-        value is the word to fill it with. If value is not an np.uint32, it
+    Some components, such as the MWC contexts, are expensive to generate, and
-        will be coerced to a buffer and the first four bytes taken.
+    have no affinity to a particular block. Rather than maintain a separate
    copy of each of these objects for every thread block in a launch, we want
    to only keep enough copies of these resources around to service every
    thread block that could possibly be active simultaneously on one card,
    which is often considerably smaller. This class provides a simple
    ringbuffer type and an increment function, used in a couple places to
    implement this kind of sharing.
    """
-        fill = mod.get_function("fill_dptr")
+    return devlib(headers="#define RB_SIZE_MASK %d" % (rb_size - 1), decls='''
-        if not isinstance(value, np.uint32):
+typedef struct {
-            if isinstance(value, int):
+    int head;
-                value = np.uint32(value)
+    int tail;
-            else:
+} ringbuf;
-                value = np.frombuffer(buffer(value), np.uint32)[0]
+''', defs=r'''
-        blocks = int(np.ceil(np.sqrt(size / 1024 + 1)))
+__shared__ int rb_idx;
-        fill(dptr, np.int32(size), value, stream=stream,
+__device__ int rb_incr(int &rb_base, int tidx) {
-             block=(1024, 1, 1), grid=(blocks, blocks, 1))
+    if (threadIdx.y == 1 && threadIdx.x == 1)
-
+        rb_idx = 256 * (atomicAdd(&rb_base, 1) & RB_SIZE_MASK);
    __syncthreads();
    return rb_idx + tidx;
 }
 ''')
 # For now, the number of entries is fixed to a value known to work on all
 # Fermi cards. Autodetection, or perhaps just a global limit increase, will be
 # done when I get my hands on a Kepler device. The fixed size assumes blocks
 # of 256 threads, although even at that size there are pathological cases that
 # could break the assumption.
 DEFAULT_RB_SIZE = 64
 ringbuflib = mkringbuflib(DEFAULT_RB_SIZE)
--- a/cuburn/filters.py
+++ b/cuburn/filters.py
@ -0,0 +1,105 @@
 import numpy as np
 from numpy import float32 as f32, int32 as i32
 import pycuda.driver as cuda
 import pycuda.compiler
 from pycuda.gpuarray import vec
 import code.filters
 from code.util import ClsMod, argset, launch
 class Filter(object):
    def apply(self, fb, gnm, dim, tc, stream=None):
        """
        Queue the application of this filter. When the live stream finishes
        executing the last item enqueued by this method, the result must be
        live in the buffer pointed to by ``fb.d_front`` at the return of this
        function.
        """
        raise NotImplementedError()
 class Bilateral(Filter, ClsMod):
    lib = code.filters.bilaterallib
    def __init__(self, directions=8, r=15, sstd=6, cstd=0.05,
                 dstd=1.5, dpow=0.8, gspeed=4.0):
        # TODO: expose these parameters on the genome, or at least on the
        # profile, and set them by a less ugly mechanism
        for n in 'directions r sstd cstd dstd dpow gspeed'.split():
            setattr(self, n, locals()[n])
        super(Bilateral, self).__init__()
    def apply(self, fb, gnm, dim, tc, stream=None):
        # Helper variables and functions to keep it clean
        sb = 16 * dim.astride
        bs = sb * dim.ah
        bl, gr = (32, 8, 1), (dim.astride / 32, dim.ah / 8)
        mkdsc = lambda c: argset(cuda.ArrayDescriptor(), height=dim.ah,
                                 width=dim.astride, num_channels=c,
                                 format=cuda.array_format.FLOAT)
        def mktref(n):
            tref = self.mod.get_texref(n)
            tref.set_filter_mode(cuda.filter_mode.POINT)
            tref.set_address_mode(0, cuda.address_mode.WRAP)
            tref.set_address_mode(1, cuda.address_mode.WRAP)
            return tref
        dsc = mkdsc(4)
        tref = mktref('bilateral_src')
        grad_dsc = mkdsc(1)
        grad_tref = mktref('blur_src')
        for pattern in range(self.directions):
            # Scale spatial parameter so that a "pixel" is equivalent to an
            # actual pixel at 1080p
            sstd = self.sstd * dim.w / 1920.
            tref.set_address_2d(fb.d_front, dsc, sb)
            # Blur density two octaves along sampling vector, ultimately
            # storing in the side buffer
            launch('den_blur', self.mod, stream, bl, gr,
                    fb.d_back, i32(pattern), i32(0), texrefs=[tref])
            grad_tref.set_address_2d(fb.d_back, grad_dsc, sb / 4)
            launch('den_blur_1c', self.mod, stream, bl, gr,
                    fb.d_side, i32(pattern), i32(1), texrefs=[grad_tref])
            grad_tref.set_address_2d(fb.d_side, grad_dsc, sb / 4)
            launch('bilateral', self.mod, stream, bl, gr,
                    fb.d_back, i32(pattern), i32(self.r),
                    f32(sstd), f32(self.cstd), f32(self.dstd),
                    f32(self.dpow), f32(self.gspeed),
                    texrefs=[tref, grad_tref])
            fb.flip()
 class Logscale(Filter, ClsMod):
    lib = code.filters.logscalelib
    def apply(self, fb, gnm, dim, tc, stream=None):
        """Log-scale in place."""
        k1 = f32(gnm.color.brightness(tc) * 268 / 256)
        # Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
        # s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
        area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w)
        k2 = f32(1.0 / (area * gnm.spp(tc)))
        nbins = dim.ah * dim.astride
        launch('logscale', self.mod, stream, 256, nbins/256,
                fb.d_front, fb.d_front, k1, k2)
 class ColorClip(Filter, ClsMod):
    lib = code.filters.colorcliplib
    def apply(self, fb, gnm, dim, tc, stream=None):
        # TODO: implement integration over cubic splines?
        gam = f32(1 / gnm.color.gamma(tc))
        vib = f32(gnm.color.vibrance(tc))
        hipow = f32(gnm.color.highlight_power(tc))
        lin = f32(gnm.color.gamma_threshold(tc))
        lingam = f32(lin ** (gam-1.0) if lin > 0 else 0)
        bkgd = vec.make_float3(
                gnm.color.background.r(tc),
                gnm.color.background.g(tc),
                gnm.color.background.b(tc))
        nbins = dim.ah * dim.astride
        blocks = int(np.ceil(np.sqrt(nbins / 256.)))
        launch('colorclip', self.mod, stream, 256, (blocks, blocks),
                fb.d_front, gam, vib, hipow, lin, lingam, bkgd, i32(nbins))
--- a/cuburn/output.py
+++ b/cuburn/output.py
@ -0,0 +1,49 @@
 import numpy as np
 from numpy import float32 as f32, int32 as i32
 import pycuda.driver as cuda
 from code.util import ClsMod, launch
 from code.output import f32tou8lib
 import scipy.misc
 if not hasattr(scipy.misc, 'toimage'):
    raise ImportError("Could not find scipy.misc.toimage. "
                      "Are scipy and PIL installed?")
 class Output(object):
    def convert(self, fb, gnm, dim, stream=None):
        """
        Convert a filtered buffer to whatever output format is needed by the
        writer.
        """
        raise NotImplementedError()
    def copy(self, fb, dim, pool, stream=None):
        """
        Schedule a copy from the device buffer to host memory, returning the
        target buffer.
        """
        raise NotImplementedError()
 class PILOutput(Output, ClsMod):
    lib = f32tou8lib
    def convert(self, fb, gnm, dim, stream=None):
        launch('f32_to_u8', self.mod, stream,
                (32, 8, 1), (int(np.ceil(dim.w/32.)), int(np.ceil(dim.h/8.))),
                fb.d_rb, fb.d_seeds, fb.d_back, fb.d_front,
                i32(fb.gutter), i32(dim.w), i32(dim.astride), i32(dim.h))
    def copy(self, fb, dim, pool, stream=None):
        h_out = pool.allocate((dim.h, dim.w, 4), 'u1')
        cuda.memcpy_dtoh_async(h_out, fb.d_back, stream)
        return h_out
    @staticmethod
    def save(buf, name, type=None, quality=98):
        if type == 'jpeg' or (type is None and name.endswith('.jpg')):
            buf = buf[:,:,:3]
        img = scipy.misc.toimage(buf, cmin=0, cmax=1)
        img.save(name, type, quality=quality)
--- a/cuburn/render.py
+++ b/cuburn/render.py
@ -1,383 +1,372 @@
 """
 Resources and tools to perform rendering.
 """
 import os
 import sys
 import re
-import time as timemod
+import time
 import tempfile
 from collections import namedtuple
 from itertools import cycle, repeat, chain, izip, imap, ifilter
 from ctypes import *
 from cStringIO import StringIO
 import numpy as np
 from numpy import float32 as f32, int32 as i32, uint32 as u32, uint64 as u64
 from scipy import ndimage
 import pycuda.compiler
 import pycuda.driver as cuda
 import pycuda.tools
-import cuburn.genome
+import filters
-from cuburn import affine
+import output
-from cuburn.code import util, mwc, iter, interp, filtering, sort
+from code import util, mwc, iter, interp, sort
 from code.util import ClsMod, devlib, filldptrlib, assemble_code, launch
 RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time')
 Dimensions = namedtuple('Dimensions', 'w h aw ah astride')
-def _sync_stream(dst, src):
+class Framebuffers(object):
-    dst.wait_for_event(cuda.Event(cuda.event_flags.DISABLE_TIMING).record(src))
+    """
    The largest memory allocations, and a stream to serialize their use.
-class Renderer(object):
+    ``d_front`` and ``d_back`` are separate buffers, each large enough to hold
    four float32 components per pixel (including any gutter pixels added for
    alignment or padding). ``d_side`` is another buffer large enough to hold
    two float32 components per pixel.
    Every user of this set of buffers may use and overwrite the buffers in
    any way, as long as the output for the next stage winds up in the front
    buffer. The front and back buffers can be exchanged by the ``flip()``
    method (which simply exchanges the pointers); while no similar method
    exists for the side buffer, you're free to do the same by taking local
    copies of the references and exchanging them yourself.
    There's one spot in the stream interleaving where the behavior is
    different: the ``Output.convert`` call must store its output to the back
    buffer, which will remain untouched until the dtoh copy of the converted
    buffer is finished. This happens while the ``iter`` kernel of the next
    frame writes to the front and side buffers, which means in practice that
    there's essentially no waiting on any buffers.
    If an output module decides to get all krazy and actually replaces the
    references to the buffers on this object - to, say, implement a temporally
    aware tone-mapping or denoising pass - that's probably okay, but just make
    sure it appears like it's expected to.
    """
-    Control structure for rendering a series of frames.
+
    # Minimum extension of accumulation buffer beyond output region. Used to
    # alleviate edge effects during filtering. Actual gutter may be larger to
    # accomodate alignment requirements; when it is, that extension will be
    # applied to the lower-right corner of the buffer. This is asymmetrical,
    # but simplifies trimming logic when it's time for that.
    gutter = 10
    @classmethod
    def calc_dim(cls, width, height):
        """
        Given a width and height, return a valid set of dimensions which
        include at least enough gutter to exceed the minimum, and where
        (acc_width % 32) == 0 and (acc_height % 8) == 0.
        """
        awidth = width + 2 * cls.gutter
        aheight = 8 * int(np.ceil((height + 2 * cls.gutter) / 8.))
        astride = 32 * int(np.ceil(awidth / 32.))
        return Dimensions(width, height, awidth, aheight, astride)
    def __init__(self):
        self.stream = cuda.Stream()
        self._clear()
        # These resources rely on the slots/ringbuffer mechanism for sharing,
        # and so can be shared across any number of launches, genomes, and
        # render kernels. Notably, seeds are self-synchronizing, so they're not
        # attached to either stream object.
        self.d_rb = cuda.to_device(np.array([0, 0], dtype=u32))
        seeds = mwc.make_seeds(util.DEFAULT_RB_SIZE * 256)
        self.d_seeds = cuda.to_device(seeds)
        self._len_d_points = util.DEFAULT_RB_SIZE * 256 * 16
        self.d_points = cuda.mem_alloc(self._len_d_points)
    def _clear(self):
        self.nbins = self.d_front = self.d_back = self.d_side = None
    def free(self, stream=None):
        if stream is not None:
            stream.synchronize()
        else:
            cu
        for p in (self.d_front, self.d_back, self.d_side):
            if p is not None:
                p.free()
        self._clear()
    def alloc(self, dim, stream=None):
        """
        Ensure that this object's framebuffers are large enough to handle the
        given dimensions, allocating new ones if not.
        If ``stream`` is not None and a reallocation is necessary, the stream
        will be synchronized before the old buffers are deallocated.
        """
        nbins = dim.ah * dim.astride
        if self.nbins >= nbins: return
        if self.nbins is not None: self.free()
        try:
            self.d_front = cuda.mem_alloc(16 * nbins)
            self.d_back  = cuda.mem_alloc(16 * nbins)
            self.d_side  = cuda.mem_alloc(8 * nbins)
            self.nbins = nbins
        except cuda.MemoryError, e:
            # If a frame that's too large sneaks by the task distributor, we
            # don't want to kill the server, but we also don't want to leave
            # it stuck without any free memory to complete the next alloc.
            # TODO: measure free mem and only take tasks that fit (but that
            # should be done elsewhere)
            self.free(stream)
            raise e
    def set_dim(self, width, height, stream=None):
        """
        Compute padded dimensions for given width and height, ensure that the
        buffers are large enough (and reallocate if not), and return the
        calculated dimensions.
        Note that the returned dimensions are always the same for a given
        width, height, and minimum gutter, even if the underlying buffers are
        larger due to a previous allocation.
        """
        dim = self.calc_dim(width, height)
        self.alloc(dim, stream)
        return dim
    def flip(self):
        """Flip the front and back buffers."""
        self.d_front, self.d_back = self.d_back, self.d_front
 class DevSrc(object):
    """
    The buffers which represent a genome on-device, in the formats needed to
    serve as a source for interpolating temporal samples.
    """
    # Maximum number of knots per parameter. This also covers the maximum
    # number of palettes allowed.
    max_knots = 1 << util.DEFAULT_SEARCH_ROUNDS
    # Maximum number of parameters per genome. This number is exceedingly
    # high, and should never even come close to being hit.
    max_params = 1024
    def __init__(self):
        self.d_times = cuda.mem_alloc(4 * self.max_knots * self.max_params)
        self.d_knots = cuda.mem_alloc(4 * self.max_knots * self.max_params)
        self.d_ptimes = cuda.mem_alloc(4 * self.max_knots)
        self.d_pals = cuda.mem_alloc(4 * 4 * 256 * self.max_knots)
 class DevInfo(object):
    """
    The buffers which hold temporal samples on-device, as used by iter.
    """
    # The palette texture/surface covers the color coordinate from [0,1] with
    # equidistant horizontal samples, and spans the temporal range of the
    # frame linearly with this many rows. Increasing these values increases the
    # number of uniquely-dithered samples when using pre-dithered surfaces, as
    # is done in 'atomic' accumulation.
    palette_width = 256 # TODO: make this setting be respected
    palette_height = 64
    # This used to be determined automagically, but simultaneous occupancy
    # and a much smaller block size simplifies this.
    ntemporal_samples = 1024
    # Number of iterations to iterate without write after generating a new
    # point. This number is currently fixed pretty deeply in the set of magic
    # constants which govern buffer sizes; changing the value here won't
    # actually change the code on the device to do something different.
    # It's here just for documentation purposes.
    fuse = 256
    # The palette texture/surface covers the color coordinate from [0,1] with
    # (for now, a fixed 256) equidistant horizontal samples, and spans the
    # temporal range of the frame linearly with this many rows. Increasing
    # this value increases the number of uniquely-dithered samples when using
    # pre-dithered surfaces.
    palette_height = 64
    # Palette color interpolation mode (see code.interp.Palette)
    palette_interp_mode = 'yuv'
    # Maximum width of DE and other spatial filters, and thus in turn the
    # amount of padding applied. Note that, for now, this must not be changed!
    # The filtering code makes deep assumptions about this value.
    gutter = 10
    # Accumulation mode. Leave it at 'atomic' for now.
    acc_mode = 'atomic'
    # At most this many separate buffers for xforms will be allocated, after
    # which further xforms will wrap to the first when writing. Currently it
    # is compiled in, so power-of-two and no runtime maximization. Current
    # value of 16 fits into a 1GB card at 1080p.
    max_nxf = 1
    # TODO
    chaos_used = False
    cmp_options = ('-use_fast_math', '-maxrregcount', '42')
    keep = False
    def __init__(self):
-        self._iter = self.pal = self.src = self.cubin = self.mod = None
+        self.d_params = cuda.mem_alloc(
-
+                self.ntemporal_samples * DevSrc.max_params * 4)
-        # Ensure class options don't get contaminated on an instance
+        self.palette_surf_dsc = util.argset(cuda.ArrayDescriptor3D(),
-        self.cmp_options = list(self.cmp_options)
+                height=self.palette_height, width=self.palette_width, depth=0,
    def compile(self, genome, keep=None, cmp_options=None):
        """
        Compile a kernel capable of rendering every frame in this animation.
        The resulting compiled kernel is stored in the ``cubin`` property;
        the source is available as ``src``, and is also returned for
        inspection and display.
        This operation is idempotent, and has no side effects outside of
        setting properties on this instance (unless there's a compiler error,
        which is a bug); it should therefore be threadsafe as well.
        It is, however, rather slow.
        """
        keep = self.keep if keep is None else keep
        cmp_options = self.cmp_options if cmp_options is None else cmp_options
        self._iter = iter.IterCode(self, genome)
        self._iter.packer.finalize()
        self.pal = interp.Palette(self.palette_interp_mode)
        self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer,
                                      self.pal, self._iter)
        with open(os.path.join(tempfile.gettempdir(), 'kernel.cu'), 'w') as fp:
            fp.write(self.src)
        self.cubin = pycuda.compiler.compile(
                self.src, keep=keep, options=cmp_options,
                cache_dir=False if keep else None)
    def load(self, genome, jit_options=[]):
        if not self.cubin:
            self.compile(genome)
        self.mod = cuda.module_from_buffer(self.cubin, jit_options)
        with open('/tmp/iter_kern.cubin', 'wb') as fp:
            fp.write(self.cubin)
        return self.src
    def render(self, genome, times, width, height, blend=True):
        """
        Render a frame for each timestamp in the iterable value ``times``. This
        function returns a generator that will yield a RenderedImage object
        containing a shared reference to the output buffer for each specified
        frame.
        The returned buffer is page-locked host memory. Between the time a
        buffer is yielded and the time the next frame's results are requested,
        the buffer will not be modified. Thereafter, however, it will be
        overwritten by an asynchronous DMA operation coming from the CUDA
        device. If you hang on to it for longer than one frame, copy it.
        ``genome`` is the genome to be rendered. Successive calls to the
        `render()` method on one ``Renderer`` object must use genomes which
        produce identical compiled code, and this will not be verified by the
        renderer. In practice, this means you can alter genome parameter
        values, but the full set of keys must remain identical between runs on
        the same renderer.
        ``times`` is a list of (idx, cen_time) tuples, where ``idx`` is passed
        unmodified in the RenderedImage return value and ``cen_time`` is the
        central time of the current frame in spline-time units. (Any
        clock-time or frame-time units in the genome should be preconverted.)
        If ``blend`` is False, the output buffer will contain unclipped,
        premultiplied RGBA data, without vibrancy, highlight power, or the
        alpha elbow applied.
        """
        r = self.render_gen(genome, width, height, blend=blend)
        next(r)
        return ifilter(None, imap(r.send, chain(times, [None])))
    def render_gen(self, genome, width, height, blend=True):
        """
        Render frames. This method is wrapped by the ``render()`` method; see
        its docstring for warnings and details.
        Instead of passing frame times as an iterable, they are passed
        individually via the ``generator.send()`` method. There is an
        internal pipeline latency of one frame, so the first call to the
        ``send()`` method will return None, the second call will return the
        first frame's result, and so on. To retrieve the last frame in a
        sequence, send ``None``.
        Direct use of this method is useful for implementing render servers.
        """
        last_idx = None
        next_frame = yield
        if next_frame is None:
            return
        if not self.mod:
            self.load(genome)
        filt = filtering.Filtering()
        reset_rb_fun = self.mod.get_function("reset_rb")
        packer_fun = self.mod.get_function("interp_iter_params")
        iter_fun = self.mod.get_function("iter")
        # The synchronization model is messy. See helpers/task_model.svg.
        iter_stream = cuda.Stream()
        filt_stream = cuda.Stream()
        if self.acc_mode == 'deferred':
            write_stream = cuda.Stream()
            write_fun = self.mod.get_function("write_shmem")
        else:
            write_stream = iter_stream
        # These events fire when the corresponding buffer is available for
        # reading on the host (i.e. the copy is done). On the first pass, 'a'
        # will be ignored, and subsequently moved to 'b'.
        event_a = cuda.Event().record(filt_stream)
        event_b = None
        awidth = width + 2 * self.gutter
        aheight = 32 * int(np.ceil((height + 2 * self.gutter) / 32.))
        astride = 32 * int(np.ceil(awidth / 32.))
        dim = Dimensions(width, height, awidth, aheight, astride)
        d_acc_size = self.mod.get_global('acc_size')[0]
        cuda.memcpy_htod_async(d_acc_size, u32(list(dim)), write_stream)
        nbins = astride * aheight
        nxf = len(filter(lambda g: g != 'final', genome.xforms))
        nxf = min(nxf, self.max_nxf)
        d_accum = cuda.mem_alloc(16 * nbins * nxf)
        d_out = cuda.mem_alloc(16 * nbins)
        if self.acc_mode == 'atomic':
            d_atom = cuda.mem_alloc(8 * nbins * nxf)
            flush_fun = self.mod.get_function("flush_atom")
        else:
            # d_atom is also used as a scratch buffer during filtering, so we
            # need it at least this size
            d_atom = cuda.mem_alloc(4 * nbins)
        obuf_copy = util.argset(cuda.Memcpy2D(),
            src_y=self.gutter, src_x_in_bytes=16*self.gutter,
            src_pitch=16*astride, dst_pitch=16*width,
            width_in_bytes=16*width, height=height)
        obuf_copy.set_src_device(d_out)
        h_out_a = cuda.pagelocked_empty((height, width, 4), f32)
        h_out_b = cuda.pagelocked_empty((height, width, 4), f32)
        if self.acc_mode == 'deferred':
            # Having a fixed, power-of-two log size makes things much easier
            log_size = 64 << 20
            d_log = cuda.mem_alloc(log_size * 4)
            d_log_sorted = cuda.mem_alloc(log_size * 4)
            sorter = sort.Sorter(log_size)
            # We need to cover each unique tag - address bits 20-23 - with one
            # write block per sort bin. Or somethinig like that.
            nwriteblocks = int(np.ceil(nbins / float(1<<20))) * 256
        # Calculate 'nslots', the number of simultaneous running threads that
        # can be active on the GPU during iteration (and thus the number of
        # slots for loading and storing RNG and point context that will be
        # prepared on the device), and derive 'rb_size', the number of blocks in
        # 'nslots'.
        iter_threads_per_block = 256
        dev_data = pycuda.tools.DeviceData()
        occupancy = pycuda.tools.OccupancyRecord(
                dev_data, iter_threads_per_block,
                iter_fun.shared_size_bytes, iter_fun.num_regs)
        nsms = cuda.Context.get_device().multiprocessor_count
        rb_size = occupancy.warps_per_mp * nsms / (iter_threads_per_block / 32)
        nslots = iter_threads_per_block * rb_size
        # Reset the ringbuffer info for the slots
        reset_rb_fun(np.int32(rb_size), block=(1,1,1))
        d_points = cuda.mem_alloc(nslots * 16)
        # This statement may add extra seeds to simplify palette dithering.
        seeds = mwc.MWC.make_seeds(max(nslots, 256 * self.palette_height))
        d_seeds = cuda.to_device(seeds)
        # We used to auto-calculate this to a multiple of the number of SMs on
        # the device, but since we now use shorter launches and, to a certain
        # extent, allow simultaneous occupancy, that's not as important. The
        # 1024 is a magic constant to ensure reasonable and power-of-two log
        # size for deferred: 256MB / (4B * FUSE * NTHREADS). Enhancements to
        # the sort engine are needed to make this more flexible.
        ntemporal_samples = 1024
        genome_times, genome_knots = self._iter.packer.pack()
        d_genome_times = cuda.to_device(genome_times)
        d_genome_knots = cuda.to_device(genome_knots)
        info_size = 4 * len(self._iter.packer) * ntemporal_samples
        d_infos = cuda.mem_alloc(info_size)
        ptimes, pidxs = zip(*genome.palette_times)
        palint_times = np.empty(len(genome_times[0]), f32)
        palint_times.fill(1e10)
        palint_times[:len(ptimes)] = ptimes
        d_palint_times = cuda.to_device(palint_times)
        pvals = self.pal.prepare([genome.decoded_palettes[i] for i in pidxs])
        d_palint_vals = cuda.to_device(np.concatenate(pvals))
        if self.acc_mode in ('deferred', 'atomic'):
            palette_fun = self.mod.get_function("interp_palette_flat")
            dsc = util.argset(cuda.ArrayDescriptor3D(),
                    height=self.palette_height, width=256, depth=0,
                format=cuda.array_format.SIGNED_INT32,
                num_channels=2, flags=cuda.array3d_flags.SURFACE_LDST)
-            palarray = cuda.Array(dsc)
+        self.d_pal_array = cuda.Array(self.palette_surf_dsc)
 class Renderer(object):
    def __init__(self, gnm):
        self.packer, self.lib = iter.mkiterlib(gnm)
        cubin = util.compile('iter', assemble_code(self.lib))
        self.mod = cuda.module_from_buffer(cubin)
        # TODO: make these customizable
        self.filts = [ filters.Bilateral()
                     , filters.Logscale()
                     , filters.ColorClip() ]
        self.out = output.PILOutput()
 class RenderManager(ClsMod):
    lib = devlib(deps=[interp.palintlib, filldptrlib, iter.flushatomlib])
    def __init__(self):
        super(RenderManager, self).__init__()
        self.pool = pycuda.tools.PageLockedMemoryPool()
        self.fb = Framebuffers()
        self.src_a, self.src_b = DevSrc(), DevSrc()
        self.info_a, self.info_b = DevInfo(), DevInfo()
        self.stream_a, self.stream_b = cuda.Stream(), cuda.Stream()
        self.filt_evt = self.copy_evt = None
    def _copy(self, rdr, gnm):
        """
        Queue a copy of a host genome into a set of device interpolation source
        buffers.
        Note that for now, this is broken! It ignores ``gnm``, and only packs
        the genome that was used when creating the renderer.
        """
        times, knots = rdr.packer.pack(self.pool)
        cuda.memcpy_htod_async(self.src_a.d_times, times, self.stream_a)
        cuda.memcpy_htod_async(self.src_a.d_knots, knots, self.stream_a)
        ptimes, pidxs = zip(*gnm.palette_times)
        palettes = self.pool.allocate((len(ptimes), 256, 4), f32)
        palettes[:] = [gnm.decoded_palettes[i] for i in pidxs]
        palette_times = self.pool.allocate((self.src_a.max_knots,), f32)
        palette_times.fill(1e9)
        palette_times[:len(ptimes)] = ptimes
        cuda.memcpy_htod_async(self.src_a.d_pals, palettes, self.stream_a)
        cuda.memcpy_htod_async(self.src_a.d_ptimes, palette_times,
                               self.stream_a)
        # TODO: use bilerp tex as src for palette interp
    def _interp(self, rdr, gnm, dim, ts, td):
        d_acc_size = rdr.mod.get_global('acc_size')[0]
        p_dim = self.pool.allocate((len(dim),), u32)
        p_dim[:] = dim
        cuda.memcpy_htod_async(d_acc_size, p_dim, self.stream_a)
        tref = self.mod.get_surfref('flatpal')
-            tref.set_array(palarray, 0)
+        tref.set_array(self.info_a.d_pal_array, 0)
-        else:
+        launch('interp_palette_flat', self.mod, self.stream_a,
-            palette_fun = self.mod.get_function("interp_palette")
+                256, self.info_a.palette_height,
-            dsc = util.argset(cuda.ArrayDescriptor(),
+                self.fb.d_rb, self.fb.d_seeds,
-                    height=self.palette_height, width=256,
+                self.src_a.d_ptimes, self.src_a.d_pals,
-                    format=cuda.array_format.UNSIGNED_INT8,
+                f32(ts), f32(td / self.info_a.palette_height))
                    num_channels=4)
            d_palmem = cuda.mem_alloc(256 * self.palette_height * 4)
-            tref = self.mod.get_texref('palTex')
+        nts = self.info_a.ntemporal_samples
-            tref.set_address_2d(d_palmem, dsc, 1024)
+        launch('interp_iter_params', rdr.mod, self.stream_a,
-            tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
+                256, np.ceil(nts / 256.),
-            tref.set_filter_mode(cuda.filter_mode.LINEAR)
+                self.info_a.d_params, self.src_a.d_times, self.src_a.d_knots,
                f32(ts), f32(td / nts), i32(nts))
-        while next_frame is not None:
+    def _print_interp_knots(self, rdr, tsidx=5):
-            # tc, td, ts, te: central, delta, start, end times
+        infos = cuda.from_device_like(self.info_a.d_params,
-            idx, tc = next_frame
+                (tsidx + 1, self.info_a.max_params), f32)
-            td = genome.adj_frame_width(tc)
+        for i, n in zip(infos[-1], rdr.packer.packed):
            print '%60s %g' % ('_'.join(n), i)
    def _iter(self, rdr, gnm, dim, tc):
        tref = rdr.mod.get_surfref('flatpal')
        tref.set_array(self.info_a.d_pal_array, 0)
        nbins = dim.ah * dim.astride
        fill = lambda b, s, v=i32(0): util.fill_dptr(
                self.mod, b, s, stream=self.stream_a, value=v)
        fill(self.fb.d_front,  4 * nbins)
        fill(self.fb.d_side,   2 * nbins)
        fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan))
        nts = self.info_a.ntemporal_samples
        nsamps = (gnm.spp(tc) * dim.w * dim.h)
        nrounds = int(nsamps / (nts * 256. * 256)) + 1
        launch('iter', rdr.mod, self.stream_a, (32, 8, 1), (nts, nrounds),
                self.fb.d_front, self.fb.d_side,
                self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
                self.info_a.d_params)
        nblocks = int(np.ceil(np.sqrt(dim.ah*dim.astride/256.)))
        launch('flush_atom', self.mod, self.stream_a,
                256, (nblocks, nblocks),
                u64(self.fb.d_front), u64(self.fb.d_side), i32(nbins))
    def queue_frame(self, rdr, gnm, tc, w, h, copy=True):
        """
        Queue one frame for rendering.
        ``rdr`` is a compiled Renderer module. Caller must ensure that the
        module is compatible with the genome data provided.
        ``gnm`` is the genome to be rendered.
        ``tc`` is the center time at which to render.
        ``w``, ``h`` are the width and height of the desired output in px.
        If ``copy`` is False, the genome data will not be recopied for each
        new genome. This function must be called with ``copy=True`` the first
        time a new genome is used, and may be called in that manner
        subsequently without harm. I suspect the performance impact is low, so
        leave ``copy`` to True every time for now.
        The return value is a 2-tuple ``(evt, h_out)``, where ``evt`` is a
        CUDA event and ``h_out`` is the return value of the output module's
        ``copy`` function. In the typical case, ``h_out`` will be a host
        allocation containing data in an appropriate format for the output
        module's file writer, and ``evt`` indicates when the asynchronous
        DMA copy which will populate ``h_out`` is complete. This can vary
        depending on the output module in use, though.
        """
        # Note: we synchronize on the previous stream if buffers need to be
        # reallocated, which implicitly also syncs the current stream.
        dim = self.fb.set_dim(w, h, self.stream_b)
        td = gnm.adj_frame_width(tc)
        ts, te = tc - 0.5 * td, tc + 0.5 * td
-            if self.acc_mode in ('deferred', 'atomic'):
+        # The stream interleaving here is nontrivial.
-                # In this mode, the palette writes to a surface reference, but
+        # TODO: update diagram and link to it here
-                # requires dithering, so we pass it the seeds instead
+        if copy:
-                arg0 = d_seeds
+            self.src_a, self.src_b = self.src_b, self.src_a
-            else:
+            self._copy(rdr, gnm)
-                arg0 = d_palmem
+        self._interp(rdr, gnm, dim, ts, td)
-            palette_fun(arg0, d_palint_times, d_palint_vals,
+        if self.filt_evt:
-                        f32(ts), f32(td / self.palette_height),
+            self.stream_a.wait_for_event(self.filt_evt)
-                        block=(256,1,1), grid=(self.palette_height,1),
+        self._iter(rdr, gnm, dim, tc)
-                        stream=write_stream)
+        if self.copy_evt:
            self.stream_a.wait_for_event(self.copy_evt)
        for filt in rdr.filts:
            filt.apply(self.fb, gnm, dim, tc, self.stream_a)
        rdr.out.convert(self.fb, gnm, dim, self.stream_a)
        self.filt_evt = cuda.Event().record(self.stream_a)
        h_out = rdr.out.copy(self.fb, dim, self.pool, self.stream_a)
        self.copy_evt = cuda.Event().record(self.stream_a)
-            packer_fun(d_infos, d_genome_times, d_genome_knots,
+        self.info_a, self.info_b = self.info_b, self.info_a
-                       f32(ts), f32(td / ntemporal_samples),
+        self.stream_a, self.stream_b = self.stream_b, self.stream_a
-                       i32(ntemporal_samples), block=(256,1,1),
+        return self.copy_evt, h_out
                       grid=(int(np.ceil(ntemporal_samples/256.)),1),
                       stream=iter_stream)
            # Reset points so that they will be FUSEd
            util.BaseCode.fill_dptr(self.mod, d_points, 4 * nslots,
                                    iter_stream, f32(np.nan))
            # Get interpolated control points for debugging
            #iter_stream.synchronize()
            #d_temp = cuda.from_device(d_infos,
                    #(ntemporal_samples, len(self._iter.packer)), f32)
            #for i, n in zip(d_temp[5], self._iter.packer.packed):
                #print '%60s %g' % ('_'.join(n), i)
            util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins * nxf,
                                    write_stream)
            if self.acc_mode == 'atomic':
                util.BaseCode.fill_dptr(self.mod, d_atom, 2 * nbins * nxf,
                                        write_stream)
            nrounds = int( (genome.spp(tc) * width * height)
                         / (ntemporal_samples * 256 * 256) ) + 1
            if self.acc_mode == 'deferred':
                for i in range(nrounds):
                    iter_fun(np.uint64(d_log), d_seeds, d_points, d_infos,
                             block=(32, self._iter.NTHREADS/32, 1),
                             grid=(ntemporal_samples, 1), stream=iter_stream)
                    _sync_stream(write_stream, iter_stream)
                    sorter.sort(d_log_sorted, d_log, log_size, 3, True,
                                stream=write_stream)
                    _sync_stream(iter_stream, write_stream)
                    write_fun(d_accum, d_log_sorted, sorter.dglobal, i32(nbins),
                              block=(1024, 1, 1), grid=(nwriteblocks, 1),
                              stream=write_stream)
            else:
                args = [u64(d_accum), d_seeds, d_points, d_infos]
                if self.acc_mode == 'atomic':
                    args.append(u64(d_atom))
                iter_fun(*args, block=(32, self._iter.NTHREADS/32, 1),
                         grid=(ntemporal_samples, nrounds), stream=iter_stream)
                if self.acc_mode == 'atomic':
                    nblocks = int(np.ceil(np.sqrt(nbins*nxf/float(512))))
                    flush_fun(u64(d_accum), u64(d_atom), i32(nbins*nxf),
                              block=(512, 1, 1), grid=(nblocks, nblocks),
                              stream=iter_stream)
            util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
            _sync_stream(filt_stream, write_stream)
            filt.de(d_out, d_accum, d_atom, genome, dim, tc, nxf,
                    stream=filt_stream)
            _sync_stream(write_stream, filt_stream)
            filt.colorclip(d_out, genome, dim, tc, blend, stream=filt_stream)
            obuf_copy.set_dst_host(h_out_a)
            obuf_copy(filt_stream)
            if event_b:
                while not event_a.query():
                    timemod.sleep(0.01)
                gpu_time = event_a.time_since(event_b)
                result = RenderedImage(h_out_b, last_idx, gpu_time)
            else:
                result = None
            last_idx = idx
            event_a, event_b = cuda.Event().record(filt_stream), event_a
            h_out_a, h_out_b = h_out_b, h_out_a
            # TODO: add ability to flush a frame without breaking the pipe
            next_frame = yield result
        while not event_a.query():
            timemod.sleep(0.001)
        gpu_time = event_a.time_since(event_b)
        yield RenderedImage(h_out_b, last_idx, gpu_time)
    def render(self, gnm, times, w, h):
        """
        A port of the old rendering function, retained for backwards
        compatibility. Some of this will be pulled into as-yet-undecided
        methods for more DRY.
        """
        rdr = Renderer(gnm)
        last_evt = cuda.Event().record(self.stream_a)
        last_idx = None
        def wait(): # Times like these where you wish for a macro
            while not last_evt.query():
                time.sleep(0.01)
            gpu_time = last_evt.time_since(two_evts_ago)
            return RenderedImage(h_buf, idx, gpu_time)
        for idx, tc in times:
            evt, h_buf = self.queue_frame(rdr, gnm, tc, w, h, last_idx is None)
            if last_idx:
                yield wait()
            two_evts_ago, last_evt = last_evt, evt
            last_buf, last_idx = h_buf, idx
        if last_idx:
            yield wait()
--- a/main.py
+++ b/main.py
@ -3,8 +3,8 @@
 # cuburn, one of a surprisingly large number of ports of the fractal flame
 # algorithm to NVIDIA GPUs.
 #
-# This one is copyright 2010-2011, Steven Robertson <steven@strobe.cc>
+# This one is copyright 2010-2012, Steven Robertson <steven@strobe.cc>
-# and Eric Reckase <e.reckase@gmail.com>.
+# and Erik Reckase <e.reckase@gmail.com>.
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 or later
@ -20,11 +20,9 @@ from subprocess import Popen
 from itertools import ifilter
 import numpy as np
 import Image
 import scipy
 import pycuda.driver as cuda
-from cuburn import genome, render
+from cuburn import genome, render, filters, output
 profiles = {
    '1080p': dict(fps=24, width=1920, height=1080, quality=3000, skip=0),
@ -33,11 +31,10 @@ profiles = {
    'preview': dict(fps=24, width=640, height=360, quality=800, skip=1)
 }
-def save(rframe):
+def save(out):
-    noalpha = rframe.buf[:,:,:3]
+    # Temporary! TODO: fix this
-    img = scipy.misc.toimage(noalpha, cmin=0, cmax=1)
+    output.PILOutput.save(out.buf, out.idx)
-    img.save(rframe.idx, quality=98)
+    print out.idx, out.gpu_time
    print rframe.idx, rframe.gpu_time
 def main(args, prof):
    import pycuda.autoinit
@ -53,9 +50,7 @@ def main(args, prof):
    gnm = genome.Genome(gnm)
    err, times = gnm.set_profile(prof)
-    anim = render.Renderer()
+    rmgr = render.RenderManager()
    anim.compile(gnm, keep=args.keep)
    anim.load(gnm)
    basename = os.path.basename(args.flame.name).rsplit('.', 1)[0] + '_'
    if args.flame.name == '-':
@ -76,9 +71,13 @@ def main(args, prof):
                  if not os.path.isfile(f[0]) or m > os.path.getmtime(f[0]))
    w, h = prof['width'], prof['height']
-    gen = anim.render(gnm, frames, w, h)
+    gen = rmgr.render(gnm, frames, w, h)
    if not args.gfx:
        for out in gen:
            save(out)
        return
    if args.gfx:
    import pyglet
    window = pyglet.window.Window(w, h, vsync=False)
    image = pyglet.image.CheckerImagePattern().create_image(w, h)
@ -126,11 +125,7 @@ def main(args, prof):
    pyglet.clock.set_fps_limit(30)
    pyglet.clock.schedule_interval(poll, 1/30.)
    pyglet.app.run()
-    else:
+
        for out in gen:
            save(out)
            if args.sync:
                cuda.Context.synchronize()
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Render fractal flames.')
@ -150,8 +145,6 @@ if __name__ == "__main__":
    parser.add_argument('--pause', action='store_true',
        help="Don't close the preview window after rendering is finished")
    parser.add_argument('--keep', action='store_true', dest='keep',
        help='Keep compilation directory (disables kernel caching)')
    parser.add_argument('--sync', action='store_true', dest='sync',
        help='Use synchronous launches whenever possible')
--- a/worker.py
+++ b/worker.py
@ -17,6 +17,8 @@ import redis
 from cuburn import render, genome
 import pycuda.driver as cuda
 pycuda = None
 # The default maximum number of waiting jobs. Also used to determine when a
@ -50,20 +52,6 @@ def get_temperature():
                return out[idx+1:idx+3]
    return ''
 def push_frame(r, out):
    if out is None:
        return
    sid, sidx, ftag = out.idx
    # TODO: gotta put this in a module somewhere and make it adjustable
    noalpha = out.buf[:,:,:3]
    img = scipy.misc.toimage(noalpha, cmin=0, cmax=1)
    buf = StringIO()
    img.save(buf, 'jpeg', quality=98)
    buf.seek(0)
    head = ' '.join([sidx, str(out.gpu_time), ftag])
    r.rpush(sid + ':queue', head + '\0' + buf.read())
    print 'Pushed frame: %s' % head
 def work(server):
    global pycuda
    import pycuda.autoinit
@ -80,7 +68,11 @@ def work(server):
    r.expire(wid, 180)
    last_ping = time.time()
-    last_pid, last_gid, riter = None, None, None
+    idx = evt = buf = None
    last_idx = last_buf = last_evt = two_evts_ago = None
    last_pid = last_gid = rdr = None
    mgr = render.RenderManager()
    while True:
        task = r.blpop('renderpool:' + rev + ':queue', 10)
@ -90,28 +82,46 @@ def work(server):
            r.expire(wid, 180)
            last_ping = now
        # last_evt will be populated during normal queue operation (when evt
        # contains the most recent event), as well as when the render queue is
        # flushing due to not receiving a new task before the timeout.
        if last_idx is not None:
            while not last_evt.query():
                # This delay could probably be a lot higher with zero impact
                # on throughput for Fermi cards
                time.sleep(0.05)
            sid, sidx, ftag = last_idx
            obuf = StringIO()
            rdr.out.save(buf, obuf, 'jpeg')
            obuf.seek(0)
            gpu_time = last_evt.time_since(two_evts_ago)
            head = ' '.join([sidx, str(gpu_time), ftag])
            r.rpush(sid + ':queue', head + '\0' + obuf.read())
            print 'Pushed frame: %s' % head
        two_evts_ago, last_evt = last_evt, evt
        last_idx, last_buf = idx, buf
        if not task:
-            if riter:
+            idx = evt = buf = None
                push_frame(r, riter.send(None))
                riter = None
            continue
        sid, sidx, pid, gid, ftime, ftag = task[1].split(' ', 5)
-        if pid != last_pid or gid != last_gid or not riter:
+        if pid != last_pid or gid != last_gid or not rdr:
            gnm = genome.Genome(json.loads(r.get(gid)))
            prof = json.loads(r.get(pid))
            gnm.set_profile(prof)
-            renderer = render.Renderer()
+            rdr = render.Renderer(gnm)
            renderer.load(gnm)
-            if riter:
+        if last_evt is None:
-                push_frame(r, riter.send(None))
+            # Create a dummy event for timing
            last_evt = cuda.Event().record(mgr.stream_a)
-            riter = renderer.render_gen(gnm, prof['width'], prof['height'])
+        copy = last_idx is None
-            next(riter)
+        w, h = prof['width'], prof['height']
-            last_pid, last_gid = pid, gid
+        evt, buf = mgr.queue_frame(rdr, gnm, float(ftime), w, h, copy)
-
+        idx = sid, sidx, ftag
        push_frame(r, riter.send(((sid, sidx, ftag), float(ftime))))
 def iter_genomes(prof, gpaths, pname='540p'):
    """