New genome representation, and device interp.

2025-12-23 21:39:00 -05:00 · 2011-10-25 15:44:39 -04:00
parent be31708c09
commit 8939a6343a
8 changed files with 1030 additions and 729 deletions
--- a/cuburn/code/filtering.py
+++ b/cuburn/code/filtering.py
@ -2,8 +2,8 @@
 from cuburn.code.util import *
 class ColorClip(HunkOCode):
-    def __init__(self, features):
+    def __init__(self, info):
-        self.defs = self.defs_tmpl.substitute(features=features)
+        self.defs = self.defs_tmpl.substitute(info=info)
    defs_tmpl = Template('''
 __global__
@ -63,7 +63,7 @@ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
    pix.y += (1.0f - vibrancy) * powf(opix.y, gamma);
    pix.z += (1.0f - vibrancy) * powf(opix.z, gamma);
-    {{if features.alpha_output_channel}}
+    {{if info.alpha_output_channel}}
    float 1_alpha = 1 / alpha;
    pix.x *= 1_alpha;
    pix.y *= 1_alpha;
@ -94,13 +94,13 @@ class DensityEst(HunkOCode):
    # Note, changing this does not yet have any effect, it's just informational
    MAX_WIDTH=15
-    def __init__(self, features, cp):
+    def __init__(self, info):
-        self.features, self.cp = features, cp
+        self.info = info
    headers = "#include<math_constants.h>\n"
    @property
    def defs(self):
-        return self.defs_tmpl.substitute(features=self.features, cp=self.cp)
+        return self.defs_tmpl.substitute(info=self.info)
    defs_tmpl = Template('''
 #define W 15        // Filter width (regardless of standard deviation chosen)
@ -147,9 +147,9 @@ void density_est(float4 *pixbuf, float4 *outbuf,
        de_r[i] = de_g[i] = de_b[i] = de_a[i] = 0.0f;
    __syncthreads();
-    for (int imrow = threadIdx.y + W2; imrow < {{features.acc_height}}; imrow += 32)
+    for (int imrow = threadIdx.y + W2; imrow < {{info.acc_height}}; imrow += 32)
    {
-        int idx = {{features.acc_stride}} * imrow +
+        int idx = {{info.acc_stride}} * imrow +
                + blockIdx.x * 32 + threadIdx.x + W2;
        float4 in = pixbuf[idx];
@ -249,7 +249,7 @@ void density_est(float4 *pixbuf, float4 *outbuf,
        __syncthreads();
        // TODO: could coalesce this, but what a pain
        for (int i = threadIdx.x; i < FW; i += 32) {
-            idx = {{features.acc_stride}} * imrow + blockIdx.x * 32 + i + W2;
+            idx = {{info.acc_stride}} * imrow + blockIdx.x * 32 + i + W2;
            int si = threadIdx.y * FW + i;
            float *out = reinterpret_cast<float*>(&outbuf[idx]);
            atomicAdd(out,   de_r[si]);
@ -285,12 +285,14 @@ void density_est(float4 *pixbuf, float4 *outbuf,
        # TODO: add no-est version
        # TODO: come up with a general way to average these parameters
-        k1 = np.float32(cp.brightness * 268 / 256)
+        k1 = np.float32(cp.color.brightness * 268 / 256)
-        area = self.features.width * self.features.height / cp.ppu ** 2
+        # Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
-        k2 = np.float32(1 / (area * cp.adj_density ))
+        # s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
        area = self.info.height / (cp.camera.scale ** 2 * self.info.width)
        k2 = np.float32(1 / (area * self.info.density ))
-        if self.cp.estimator == 0:
+        if cp.de.radius == 0:
-            nbins = self.features.acc_height * self.features.acc_stride
+            nbins = self.info.acc_height * self.info.acc_stride
            fun = mod.get_function("logscale")
            t = fun(abufd, obufd, k1, k2,
                    block=(512, 1, 1), grid=(nbins/512, 1), stream=stream)
@ -299,11 +301,11 @@ void density_est(float4 *pixbuf, float4 *outbuf,
            # 0.5, but the DE filters scale filter distance by the default
            # spatial support factor of 1.5, so the effective base SD is
            # (0.5/1.5)=1/3.
-            est_sd = np.float32(cp.estimator / 3.)
+            est_sd = np.float32(cp.de.radius / 3.)
-            neg_est_curve = np.float32(-cp.estimator_curve)
+            neg_est_curve = np.float32(-cp.de.curve)
-            est_min = np.float32(cp.estimator_minimum / 3.)
+            est_min = np.float32(cp.de.minimum / 3.)
            fun = mod.get_function("density_est")
            fun(abufd, obufd, est_sd, neg_est_curve, est_min, k1, k2,
-                block=(32, 32, 1), grid=(self.features.acc_width/32, 1),
+                block=(32, 32, 1), grid=(self.info.acc_width/32, 1),
                stream=stream)
--- a/cuburn/code/interp.py
+++ b/cuburn/code/interp.py
@ -0,0 +1,318 @@
 from collections import OrderedDict
 from itertools import cycle
 import numpy as np
 import tempita
 from cuburn.code.util import HunkOCode, Template
 from cuburn.genome import SplEval
 class GenomePackerName(str):
    """Class to indicate that a property is precalculated on the device"""
    pass
 class GenomePackerView(object):
    """
    Obtain accessors in generated code.
    Call ``GenomePacker.view(ptr_name, wrapped_obj, prefix)`` to generate a
    view, where ``ptr_name`` is the name of the CUDA pointer which holds the
    interpolated values, ``wrapped_obj`` is the base Genome instance which
    will be uploaded to the device for interpolating from, and ``prefix`` is
    the prefix that will be assigned to property accessors from this object.
    Accessing a property on the object synthesizes an accessor for use in your
    code and an interpolator for use in generating that code. This conversion
    is done when the property is coerced into a string by the templating
    mechanism, so you can easily nest objects by saying, for instance,
    {{pcp.camera.rotation}} from within templated code. The accessed property
    must be a SplEval object, or a precalculated value (see
    ``GenomePackerPrecalc``).
    Index operations are converted to property accesses as well, so that you
    don't have to make a mess with 'getattr' in your code: {{pcp.xforms[x]}}
    works just fine. This means, however, that no arrays can be packed
    directly; they must be converted to have string-based keys first, and
    any loops must be unrolled in your code.
    """
    def __init__(self, packer, ptr_name, wrapped, prefix=()):
        self.packer = packer
        self.ptr_name = ptr_name
        self.wrapped = wrapped
        self.prefix = prefix
    def __getattr__(self, name):
        w = getattr(self.wrapped, name)
        return type(self)(self.packer, self.ptr_name, w, self.prefix+(name,))
    # As with the Genome class, we're all-dict, no-array here
    __getitem__ = lambda s, n: getattr(s, str(n))
    def __str__(self):
        """
        Returns the packed name in a format suitable for embedding directly
        into device code.
        """
        # So evil. When the template calls __str__ to format the output, we
        # allocate things. This makes for neater embedded code, which is where
        # the real complexity lies, but it also means printf() debugging when
        # templating will screw with the allocation tables!
        if isinstance(self.wrapped, SplEval):
            self.packer._require(self.prefix)
        elif not isinstance(self.wrapped, GenomePackerName):
            raise TypeError("Tried to pack something that wasn't a spline or "
                            "a precalculated value")
        # TODO: verify namespace stomping, etc
        return '%s.%s' % (self.ptr_name, '_'.join(self.prefix))
    def _precalc(self):
        """Create a GenomePackerPrecalc object. See that class for details."""
        return GenomePackerPrecalc(self.packer, self.ptr_name,
                                   self.wrapped, self.prefix)
 class GenomePackerPrecalc(GenomePackerView):
    """
    Insert precalculated values into the packed genome.
    Create a Precalc object by calling a view object's _precalc() method. The
    returned object shares the same referent dict as the view object, but
    instead of returning view accessors for use in the body of your code,
    accessing a property synthesizes a call to an interpolation function for
    use within the precalculating function. Use this in your precalculated
    code to obtain values from a genome.
    Once you've obtained the needed parameters and performed the
    precalculation, write them to the genome object with a call to the '_set'
    method in your precalc template. This method generates a new accessor to
    which you can assign a value in your precalculation function. It also
    makes this accessor available for use on the original packer object from
    within your main function.
    Finally, call the '_code' method on the precalc object with your generated
    precalculation code to add it to the precalculation function. The code
    will be wrapped in a C block to prevent namespace leakage, so that
    multiple invocations of the precalculation code on different code blocks
    can be performed.
    Example:
        def do_precalc(px):
            pcam = px._precalc()
            pcam._code(Template('''
                {{pcam._set('prop_sin')}} = sin({{pcam.prop}});
                ''').substitute(pcam=pcam))
        def gen_code(px):
            return Template('''
                {{do_precalc(px)}}
                printf("The sin of %g is %g.", {{px.prop}}, {{px.prop_sin}});
                ''').substitute(px=px)
    """
    def __init__(self, packer, ptr_name, wrapped, prefix):
        super(GenomePackerPrecalc, self).__init__(packer, 'out', wrapped, prefix)
    def __str__(self):
        return self.packer._require_pre(self.prefix)
    def _set(self, name):
        fullname = self.prefix + (name,)
        self.packer._pre_alloc(fullname)
        # This just modifies the underlying object, because I'm too lazy right
        # now to ghost the namespace
        self.wrapped[name] = GenomePackerName('_'.join(fullname))
        return '%s->%s' % (self.ptr_name, self.wrapped[name])
    def _code(self, code):
        self.packer.precalc_code.append(code)
 class GenomePacker(HunkOCode):
    """
    Packs a genome for use in iteration.
    """
    # 2^search_rounds is the maximum number of control points, including
    # endpoints, that can be used in a single genome. It should be okay to
    # change this number without touching other code, but 32 samples fits
    # nicely on a single cache line.
    search_rounds = 5
    def __init__(self, tname):
        """
        Create a new DataPacker.
        ``tname`` is the name of the structure typedef that will be emitted
        via this object's ``decls`` property.
        """
        self.tname = tname
        # We could do this in the order that things are requested, but we want
        # to be able to treat the direct stuff as a list so this function
        # doesn't unroll any more than it has to. So we separate things into
        # direct requests, and those that need precalculation.
        # Values of OrderedDict are unused; basically, it's just OrderedSet.
        self.packed_direct = OrderedDict()
        self.genome_precalc = OrderedDict()
        self.packed_precalc = OrderedDict()
        self.precalc_code = []
        self.ns = {}
        self._len = None
        self.decls = None
        self.defs = None
        self.packed = None
        self.genome = None
    def __len__(self):
        assert self._len is not None, 'len() called before finalize()'
        return self._len
    def view(self, ptr_name, wrapped_obj, prefix):
        """Create a DataPacker view. See DataPackerView class for details."""
        self.ns[prefix] = wrapped_obj
        return GenomePackerView(self, ptr_name, wrapped_obj, (prefix,))
    def _require(self, name):
        """
        Called to indicate that the named parameter from the original genome
        must be available during interpolation.
        """
        self.packed_direct[name] = None
    def _require_pre(self, name):
        i = len(self.genome_precalc) << self.search_rounds
        self.genome_precalc[name] = None
        return 'catmull_rom(&times[%d], &knots[%d], time)' % (i, i)
    def _pre_alloc(self, name):
        self.packed_precalc[name] = None
    def finalize(self):
        """
        Create the code to render this genome.
        """
        # At the risk of packing a few things more than once, we don't
        # uniquify the overall precalc order, sparing us the need to implement
        # recursive code generation
        self.packed = self.packed_direct.keys() + self.packed_precalc.keys()
        self.genome = self.packed_direct.keys() + self.genome_precalc.keys()
        self._len = len(self.packed)
        self.decls = self._decls.substitute(
                packed=self.packed, tname=self.tname,
                search_rounds=self.search_rounds)
        self.defs = self._defs.substitute(
                packed_direct=self.packed_direct, tname=self.tname,
                precalc_code=self.precalc_code,
                search_rounds=self.search_rounds)
    def pack(self):
        """
        Return a packed copy of the genome ready for uploading to the GPU as a
        3D NDArray, with the first element being the times and the second
        being the values.
        """
        width = 1 << self.search_rounds
        out = np.empty((2, len(self.genome), width), dtype=np.float32)
        # Ensure that unused values at the top are always big (must be >2.0)
        out[0].fill(1e9)
        for idx, gname in enumerate(self.genome):
            attr = self.ns[gname[0]]
            for g in gname[1:]:
                attr = getattr(attr, g)
            if not isinstance(attr, SplEval):
                raise TypeError("%s isn't a spline" % '.'.join(gname))
            out[0][idx][:len(attr.knots[0])] = attr.knots[0]
            out[1][idx][:len(attr.knots[1])] = attr.knots[1]
        return out
    _defs = Template(r"""
 __global__
 void interp_{{tname}}({{tname}}* out, float *times, float *knots,
        float tstart, float tstep, mwc_st *rctxes) {
    int id = gtid();
    out = &out[id];
    mwc_st rctx = rctxes[id];
    float time = tstart + id * tstep;
    float *outf = reinterpret_cast<float*>(out);
    // TODO: unroll pragma?
    for (int i = 0; i < {{len(packed_direct)}}; i++) {
        int j = i << {{search_rounds}};
        outf[i] = catmull_rom(&times[j], &knots[j], time);
    }
    // Advance 'times' and 'knots' to the purely generated sections, so that
    // the pregenerated statements emitted by _require_pre are correct.
    times = &times[{{len(packed_direct)<<search_rounds}}];
    knots = &knots[{{len(packed_direct)<<search_rounds}}];
    {{for hunk in precalc_code}}
    if (1) {
    {{hunk}}
    }
    {{endfor}}
 }
 """)
    _decls = Template(r"""
 typedef struct {
 {{for name in packed}}
    float   {{'_'.join(name)}};
 {{endfor}}
 } {{tname}};
 /* Search through the fixed-size list 'hay' to find the rightmost index which
 * contains a value strictly smaller than the input 'needle'. The crazy
 * bitwise search is just for my own amusement.
 */
 __device__
 int bitwise_binsearch(float *hay, float needle) {
    int lo = 0;
    {{for i in range(search_rounds-1, -1, -1)}}
    if (needle > hay[lo | {{1 << i}}])
        lo |= {{1 << i}};
    {{endfor}}
    return lo;
 }
 __device__
 float catmull_rom(float *times, float *knots, float t) {
    int idx = bitwise_binsearch(times, t);
    // The left bias of the search means that we never have to worry about
    // overshooting unless the genome is corrupted
    idx = max(idx, 1);
    float t1 = times[idx], t2 = times[idx+1] - t1;
    float rt2 = 1.0f / t2;
    float t0 = (times[idx-1] - t1) * rt2, t3 = (times[idx+2] - t1) * rt2;
    t = (t - t1) * rt2;
    // Now t1 is effectively 0 and t2 is 1
    float k0 = knots[idx-1], k1 = knots[idx],
          k2 = knots[idx+1], k3 = knots[idx+2];
    float m1 = (k2 - k0) / (1.0f - t0),
          m2 = (k3 - k1) / (t3);
    float tt = t * t, ttt = tt * t;
    return m1 * (      ttt - 2.0f*tt + t)
         + k1 * ( 2.0f*ttt - 3.0f*tt + 1)
         + m2 * (      ttt -      tt)
         + k2 * (-2.0f*ttt + 3.0f*tt);
 }
 __global__
 void test_cr(float *times, float *knots, float *t, float *r) {
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    r[i] = catmull_rom(times, knots, t[i]);
 }
 """)
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@ -2,26 +2,132 @@
 The main iteration loop.
 """
-from cuburn.code import mwc, variations
+from cuburn.code import mwc, variations, interp
 from cuburn.code.util import *
 def precalc_densities(pcp, std_xforms):
    # This pattern recurs a few times for precalc segments. Unfortunately,
    # namespace stuff means it's not easy to functionalize this boilerplate
    pre_cp = pcp._precalc()
    pre_cp._code(Template(r"""
        float sum = 0.0f;
        {{for n in std_xforms}}
        float den_{{n}} = {{pre_cp.xforms[n].density}};
        sum += den_{{n}};
        {{endfor}}
        float rsum = 1.0f / sum;
        sum = 0.0f;
        {{for n in std_xforms[:-1]}}
        sum += den_{{n}} * rsum;
        {{pre_cp._set('den_' + n)}} = sum;
        {{endfor}}
    """).substitute(locals()))
 def precalc_chaos(pcp, std_xforms):
    pre_cp = pcp._precalc()
    pre_cp._code(Template("""
        float sum, rsum;
        {{for p in std_xforms}}
        sum = 0.0f;
        {{for n in std_xforms}}
        float den_{{p}}_{{n}} = {{pre_cp.xforms[p].chaos[n]}};
        sum += den_{{p}}_{{n}};
        {{endfor}}
        rsum = 1.0f / sum;
        sum = 0.0f;
        {{for n in std_xforms[:-1]}}
        sum += den_{{p}}_{{n}} * rsum;
        {{pre_cp._set('chaos_%s_%s' % (p, n))}} = sum;
        {{endfor}}
        {{endfor}}
    """).substitute(locals()))
 def precalc_camera(info, pcam):
    pre_cam = pcam._precalc()
    # Maxima code to check my logic:
    #   matrix([1,0,0.5*width + g],[0,1,0.5*height+g],[0,0,1])
    # . matrix([width * scale,0,0], [0,width * scale,0], [0,0,1])
    # . matrix([cosr,-sinr,0], [sinr,cosr,0], [0,0,1])
    # . matrix([1,0,-cenx],[0,1,-ceny],[0,0,1])
    # . matrix([X],[Y],[1]);
    pre_cam._code(Template(r"""
        float rot = {{pre_cam.rotation}} * M_PI / 180.0f;
        float rotsin = sin(rot), rotcos = cos(rot);
        float cenx = {{pre_cam.center.x}}, ceny = {{pre_cam.center.y}};
        float scale = {{pre_cam.scale}} * {{info.width}};
        float ditherwidth = {{pre_cam.dither_width}} * 0.33f;
        float u0 = mwc_next_01(rctx);
        float r = ditherwidth * sqrtf(-2.0f * log2f(u0) / M_LOG2E);
        float u1 = 2.0f * M_PI * mwc_next_01(rctx);
        float ditherx = r * cos(u1);
        float dithery = r * sin(u1);
        {{pre_cam._set('xx')}} = scale * rotcos;
        {{pre_cam._set('xy')}} = scale * -rotsin;
        {{pre_cam._set('xo')}} = scale * (rotsin * ceny - rotcos * cenx)
                              + {{0.5 * info.width + info.gutter}} + ditherx;
        {{pre_cam._set('yx')}} = scale * rotsin;
        {{pre_cam._set('yy')}} = scale * rotcos;
        {{pre_cam._set('yo')}} = scale * -(rotsin * cenx + rotcos * ceny)
                              + {{0.5 * info.height + info.gutter}} + dithery;
    """).substitute(locals()))
 def precalc_xf_affine(px):
    pre = px._precalc()
    pre._code(Template(r"""
        float pri = {{pre.angle}} * M_PI / 180.0f;
        float spr = {{pre.spread}} * M_PI / 180.0f;
        float magx = {{pre.magnitude.x}};
        float magy = {{pre.magnitude.y}};
        {{pre._set('xx')}} = magx * cos(pri-spr);
        {{pre._set('xy')}} = magx * sin(pri-spr);
        {{pre._set('yx')}} = magy * cos(pri+spr);
        {{pre._set('yy')}} = magy * sin(pri+spr);
        {{pre._set('xo')}} = {{pre.offset.x}};
        {{pre._set('yo')}} = {{pre.offset.y}};
    """).substitute(locals()))
 class IterCode(HunkOCode):
    # The number of threads per block
    NTHREADS = 256
-    def __init__(self, features):
+    def __init__(self, info):
-        self.features = features
+        self.info = info
-        self.packer = DataPacker('iter_info')
+        self.packer = interp.GenomePacker('iter_params')
        self.pcp = self.packer.view('params', self.info.genome, 'cp')
        iterbody = self._iterbody()
-        bodies = [self._xfbody(i,x) for i,x in enumerate(self.features.xforms)]
+        bodies = [self._xfbody(i,x) for i,x in sorted(info.genome.xforms.items())]
        bodies.append(iterbody)
        self.defs = '\n'.join(bodies)
-        self.decls += self.pix_helpers.substitute(features=features)
+        self.decls += self.pix_helpers.substitute(info=info)
    decls = """
 // Note: for normalized lookups, uchar4 actually returns floats
 texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> palTex;
-__shared__ iter_info info;
+__shared__ iter_params params;
 """
@ -29,14 +135,14 @@ __shared__ iter_info info;
 __device__
 void read_pix(float4 &pix, float &den) {
    den = pix.w;
-    {{if features.pal_has_alpha}}
+    {{if info.pal_has_alpha}}
    read_half(pix.z, pix.w, pix.z, den);
    {{endif}}
 }
 __device__
 void write_pix(float4 &pix, float den) {
-    {{if features.pal_has_alpha}}
+    {{if info.pal_has_alpha}}
    write_half(pix.z, pix.z, pix.w, den);
    {{endif}}
    pix.w = den;
@ -44,7 +150,7 @@ void write_pix(float4 &pix, float den) {
 __device__
 void update_pix(uint64_t ptr, uint32_t i, float4 c) {
-    {{if features.pal_has_alpha}}
+    {{if info.pal_has_alpha}}
    asm volatile ({{crep('''
    {
        .reg .u16       sz, sw;
@ -98,34 +204,37 @@ void update_pix(uint64_t ptr, uint32_t i, float4 c) {
 """)
    def _xfbody(self, xfid, xform):
-        px = self.packer.view('info', 'xf%d_' % xfid)
+        px = self.pcp.xforms[xfid]
-        px.sub('xf', 'cp.xforms[%d]' % xfid)
+        tmpl = Template(r"""
        tmpl = Template("""
 __device__
-void apply_xf{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
+void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
    float tx, ty;
-    {{apply_affine_flam3('ox', 'oy', 'tx', 'ty', px, 'xf.c', 'pre')}}
+    {{precalc_xf_affine(px.affine)}}
    {{apply_affine('ox', 'oy', 'tx', 'ty', px.affine)}}
    ox = 0;
    oy = 0;
-    {{for v in xform.vars}}
+    {{for name in xform.variations}}
    if (1) {
-        float w = {{px.get('xf.var[%d]' % v)}};
+    {{py:pv = px.variations[name]}}
-        {{variations.var_code[variations.var_nos[v]].substitute(locals())}}
+    float w = {{pv.weight}};
    {{variations.var_code[name].substitute(locals())}}
    }
    {{endfor}}
-    {{if xform.has_post}}
+    {{if 'post' in xform}}
    tx = ox;
    ty = oy;
-    {{apply_affine_flam3('tx', 'ty', 'ox', 'oy', px, 'xf.post', 'post')}}
+    {{precalc_xf_affine(px.post)}}
    {{apply_affine('tx', 'ty', 'ox', 'oy', px.post)}}
    {{endif}}
-    float csp = {{px.get('xf.color_speed')}};
+    {{if 'color' in xform}}
-    color = color * (1.0f - csp) + {{px.get('xf.color')}} * csp;
+    float csp = {{px.color_speed}};
    color = color * (1.0f - csp) + {{px.color}} * csp;
    {{endif}}
 };
 """)
        g = dict(globals())
@ -135,42 +244,38 @@ void apply_xf{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
    def _iterbody(self):
        tmpl = Template(r'''
 __global__
-void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) {
+void iter(uint64_t accbuf_ptr, mwc_st *msts, iter_params *all_params,
-    __shared__ int nsamps;
+          int nsamps_to_generate) {
    mwc_st rctx = msts[gtid()];
-    iter_info *info_glob = &(infos[blockIdx.x]);
+    iter_params *global_params = &(all_params[blockIdx.x]);
-    // load info to shared memory cooperatively
+    __shared__ int nsamps;
    nsamps = nsamps_to_generate;
    __shared__ float time_frac;
    time_frac = blockIdx.x / (float) gridDim.x;
    // load params to shared memory cooperatively
    for (int i = threadIdx.y * blockDim.x + threadIdx.x;
-         i * 4 < sizeof(iter_info); i += blockDim.x * blockDim.y)
+         i * 4 < sizeof(iter_params); i += blockDim.x * blockDim.y)
-        reinterpret_cast<float*>(&info)[i] =
+        reinterpret_cast<float*>(&params)[i] =
-            reinterpret_cast<float*>(info_glob)[i];
+            reinterpret_cast<float*>(global_params)[i];
-    if (threadIdx.y == 0 && threadIdx.x == 0)
+{{if info.chaos_used}}
        nsamps = {{packer.get("cp.width * cp.height / cp.ntemporal_samples * cp.adj_density")}};
    {{if features.chaos_used}}
    int last_xf_used = 0;
-    {{else}}
+{{else}}
-    // Size can be reduced by a factor of four using a slower 4-stage reduce
+    // Shared memory size can be reduced by a factor of four using a slower
    // 4-stage reduce, but on Fermi hardware shmem use isn't a bottleneck
    __shared__ float swap[{{4*NTHREADS}}];
    __shared__ float cosel[{{NWARPS}}];
    // This is normally done after the swap-sync in the main loop
    if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
        cosel[threadIdx.x] = mwc_next_01(rctx);
-    {{endif}}
+{{endif}}
    __syncthreads();
-    int consec_bad = -{{features.fuse}};
+    int consec_bad = -{{info.fuse}};
    if (threadIdx.y == 1 && threadIdx.x == 0) {
        float ditherwidth = {{packer.get("0.33 * cp.spatial_filter_radius")}};
        float u0 = mwc_next_01(rctx);
        float r = ditherwidth * sqrtf(-2.0f * log2f(u0) / M_LOG2E);
        float u1 = 2.0f * M_PI * mwc_next_01(rctx);
        info.cam_xo += r * cos(u1);
        info.cam_yo += r * sin(u1);
    }
    float x, y, color;
    x = mwc_next_11(rctx);
@ -178,37 +283,44 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) {
    color = mwc_next_01(rctx);
    while (1) {
        {{if features.chaos_used}}
        // For now, we can't use the swap buffer with chaos enabled
        float xfsel = mwc_next_01(rctx);
        {{else}}
        float xfsel = cosel[threadIdx.y];
        {{endif}}
-        {{if features.chaos_used}}
+{{if info.chaos_used}}
-        {{for density_row_idx, prior_xform_idx in enumerate(features.std_xforms)}}
+
-        {{for density_col_idx,  this_xform_idx in enumerate(features.std_xforms)}}
+        {{precalc_chaos(pcp, std_xforms)}}
-        if (last_xf_used == {{prior_xform_idx}} &&
+
-                xfsel <= {{packer.get("cp.chaos_densities[%d][%d]" % (density_row_idx, density_col_idx))}}) {
+        // For now, we don't attempt to use the swap buffer when chaos is used
-            apply_xf{{this_xform_idx}}(x, y, color, rctx);
+        float xfsel = mwc_next_01(rctx);
-            last_xf_used = {{this_xform_idx}};
+
        {{for prior_xform_idx, prior_xform_name in enumerate(std_xforms)}}
        if (last_xf_used == {{prior_xform_idx}}) {
            {{for xform_idx, xform_name in enumerate(std_xforms[:-1])}}
            if (xfsel <= {{pcp['chaos_'+prior_xform_name+'_'+xform_name]}}) {
                apply_xf_{{xform_name}}(x, y, color, rctx);
                last_xf_used = {{xform_idx}};
            } else
            {{endfor}}
            {
                apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);
                last_xf_used = {{len(std_xforms)-1}};
            }
        } else
        {{endfor}}
        {{endfor}}
        {{else}}
        {{for density_col_idx, this_xform_idx in enumerate(features.std_xforms)}}
        if (xfsel <= {{packer.get("cp.norm_density[%d]" % (density_col_idx))}}) {
            apply_xf{{this_xform_idx}}(x, y, color, rctx);
        } else
        {{endfor}}
        {{endif}}
        {
-            printf("Reached trap, aborting execution! %g (%d,%d,%d)\n",
+            printf("Something went *very* wrong.\n");
-                   xfsel, blockIdx.x, threadIdx.y, threadIdx.x);
+            asm("trap;");
            asm volatile ("trap;");
        }
-        {{if not features.chaos_used}}
+{{else}}
        {{precalc_densities(pcp, std_xforms)}}
        float xfsel = cosel[threadIdx.y];
        {{for xform_name in std_xforms[:-1]}}
        if (xfsel <= {{pcp['den_'+xform_name]}}) {
            apply_xf_{{xform_name}}(x, y, color, rctx);
        } else
        {{endfor}}
            apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);
        // Swap thread states here so that writeback skipping logic doesn't die
        int sw = (threadIdx.y * 32 + threadIdx.x * 33) & {{NTHREADS-1}};
        int sr = threadIdx.y * 32 + threadIdx.x;
@ -222,17 +334,16 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) {
        // required per loop.
        if (nsamps < 0) break;
        {{if not features.chaos_used}}
        // Similarly, we select the next xforms here.
        if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
            cosel[threadIdx.x] = mwc_next_01(rctx);
        {{endif}}
        consec_bad = swap[sr];
        x = swap[sr+{{NTHREADS}}];
        y = swap[sr+{{2*NTHREADS}}];
        color = swap[sr+{{3*NTHREADS}}];
-        {{endif}}
+
 {{endif}}
        if (consec_bad < 0) {
            consec_bad++;
@ -242,45 +353,50 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) {
        int remain = __popc(__ballot(1));
        if (threadIdx.x == 0) atomicSub(&nsamps, remain);
-        {{if features.final_xform_index}}
+{{if 'final' in cp.xforms}}
        float fx = x, fy = y, fcolor;
-        apply_xf{{features.final_xform_index}}(fx, fy, fcolor, rctx);
+        apply_xf_final(fx, fy, fcolor, rctx);
-        {{endif}}
+{{endif}}
        float cx, cy;
-        {{if features.final_xform_index}}
+
-        {{apply_affine('fx', 'fy', 'cx', 'cy', packer,
+        {{precalc_camera(info, pcp.camera)}}
-                       'cp.camera_transform', 'cam')}}
+
-        {{else}}
+{{if 'final' in cp.xforms}}
-        {{apply_affine('x', 'y', 'cx', 'cy', packer,
+        {{apply_affine('fx', 'fy', 'cx', 'cy', pcp.camera)}}
-                       'cp.camera_transform', 'cam')}}
+{{else}}
-        {{endif}}
+        {{apply_affine('x', 'y', 'cx', 'cy', pcp.camera)}}
 {{endif}}
        uint32_t ix = trunca(cx), iy = trunca(cy);
-        if (ix >= {{features.acc_width}} || iy >= {{features.acc_height}} ) {
+        if (ix >= {{info.acc_width}} || iy >= {{info.acc_height}} ) {
            consec_bad++;
-            if (consec_bad > {{features.max_oob}}) {
+            if (consec_bad > {{info.max_oob}}) {
                x = mwc_next_11(rctx);
                y = mwc_next_11(rctx);
                color = mwc_next_01(rctx);
-                consec_bad = -{{features.fuse}};
+                consec_bad = -{{info.fuse}};
            }
            continue;
        }
-        uint32_t i = iy * {{features.acc_stride}} + ix;
+        uint32_t i = iy * {{info.acc_stride}} + ix;
-        float4 outcol = tex2D(palTex, color, {{packer.get("cp_step_frac")}});
+
        float4 outcol = tex2D(palTex, color, time_frac);
        update_pix(accbuf_ptr, i, outcol);
    }
    msts[gtid()] = rctx;
 }
 ''')
        return tmpl.substitute(
-                features = self.features,
+                info = self.info,
-                packer = self.packer.view('info'),
+                cp = self.info.genome,
                pcp = self.pcp,
                NTHREADS = self.NTHREADS,
                NWARPS = self.NTHREADS / 32,
                std_xforms = [n for n in sorted(self.info.genome.xforms)
                              if n != 'final'],
                **globals())
--- a/cuburn/code/util.py
+++ b/cuburn/code/util.py
@ -24,28 +24,11 @@ def assemble_code(*sections):
    return ''.join([''.join([getattr(sect, kind) for sect in sections])
                    for kind in ['headers', 'decls', 'defs']])
-def apply_affine(x, y, xo, yo, packer, base_accessor, base_name):
+def apply_affine(x, y, xo, yo, packer):
    return Template("""
-    {{xo}} = {{packer.get(ba + '[0,0]', bn + '_xx')}} * {{x}}
+    {{xo}} = {{packer.xx}} * {{x}} + {{packer.xy}} * {{y}} + {{packer.xo}};
-           + {{packer.get(ba + '[0,1]', bn + '_xy')}} * {{y}}
+    {{yo}} = {{packer.yx}} * {{x}} + {{packer.yy}} * {{y}} + {{packer.yo}};
-           + {{packer.get(ba + '[0,2]', bn + '_xo')}};
+    """).substitute(locals())
    {{yo}} = {{packer.get(ba + '[1,0]', bn + '_yx')}} * {{x}}
           + {{packer.get(ba + '[1,1]', bn + '_yy')}} * {{y}}
           + {{packer.get(ba + '[1,2]', bn + '_yo')}};
    """).substitute(x=x, y=y, xo=xo, yo=yo, packer=packer,
                    ba=base_accessor, bn=base_name)
 def apply_affine_flam3(x, y, xo, yo, packer, base_accessor, base_name):
    """Read an affine transformation in *flam3 order* and apply it."""
    return tempita.Template("""
    {{xo}} = {{packer.get(ba + '[0][0]', bn + '_xx')}} * {{x}}
           + {{packer.get(ba + '[1][0]', bn + '_xy')}} * {{y}}
           + {{packer.get(ba + '[2][0]', bn + '_xo')}};
    {{yo}} = {{packer.get(ba + '[0][1]', bn + '_yx')}} * {{x}}
           + {{packer.get(ba + '[1][1]', bn + '_yy')}} * {{y}}
           + {{packer.get(ba + '[2][1]', bn + '_yo')}};
    """).substitute(x=x, y=y, xo=xo, yo=yo, packer=packer,
                    ba=base_accessor, bn=base_name)
 class BaseCode(HunkOCode):
    headers = """
@ -165,126 +148,4 @@ void write_half(float &xy, float x, float y, float den) {
        zero(dptr, np.int32(size), stream=stream,
             block=(1024, 1, 1), grid=(blocks, blocks, 1))
 class DataPackerView(object):
    """
    View of a data packer. Intended to be initialized using DataPacker.view().
    All views of a data packer share the same stream parameters, such as
    position and total size, but do not share other parameters, such as the
    pointer name used in emitted code lookups or the lookup context.
    """
    def __init__(self, packer, ptr, prefix, ns):
        self.packer, self.ptr, self.prefix, self.ns = packer, ptr, prefix, ns
    def get(self, accessor, name=None):
        """
        Add an access to the stream, returning the formatted load expression
        for device use. If 'name' is missing, the name components after the
        final dot in the accessor will be used. Little effort is made to
        ensure that this is valid C.
        """
        if name is None:
            name = accessor.rsplit('.', 1)[-1]
            name = name.replace('[', '_').replace(']', '')
        name = self.prefix + name
        self.packer._access(self, accessor, name)
        return '%s.%s' % (self.ptr, name)
    def sub(self, dst, src):
        """Add a substitution to the namespace."""
        self.ns.append((src, dst))
    def view(self, ptr_name, prefix=''):
        """
        As DataPacker.view(), but preserving the current set of namespace
        substitutions.
        """
        return DataPackerView(self.packer, ptr_name, prefix, list(self.ns))
    def _apply_subs(self, ns):
        for s, d in self.ns:
            ns[d] = eval(s, ns)
        return ns
 class DataPacker(HunkOCode):
    """
    Packs 32-bit float values into a dynamic data structure, and emits
    accessors to those data values from device code. Might get fancier in the
    future, but for now it's incredibly barebones.
    """
    default_namespace = {'np': np}
    def __init__(self, tname, clsize=4):
        """
        Create a new DataPacker.
        ``tname`` is the name of the structure typedef that will be emitted
        via this object's ``decls`` property.
        ``clsize`` is the size of a cache line, in bytes. The resulting
        data structure will be padded to that size.
        """
        self.tname = tname
        self.clsize = clsize
        self.packed = {}
        self.packed_order = []
    def view(self, ptr_name, prefix=''):
        """Create a DataPacker view. See DataPackerView class for details."""
        return DataPackerView(self, ptr_name, prefix, list())
    def _access(self, view, accessor, name):
        if name in self.packed:
            pview, paccessor, pcomp = self.packed[name]
            if pview == view and (accessor is None or paccessor == accessor):
                return
            raise ValueError("Same name, different accessor or view: %s" % name)
        comp_accessor = compile(accessor, '{{template}}', 'eval')
        self.packed[name] = (view, accessor, comp_accessor)
        self.packed_order.append(name)
    def __len__(self):
        return len(self.packed_order)
    @property
    def align(self):
        return (4 * len(self) + self.clsize - 1) / self.clsize * self.clsize
    def pack(self, **kwargs):
        base_ns = self.default_namespace.copy()
        base_ns.update(kwargs)
        out = np.zeros(self.align/4, dtype=np.float32)
        subbed_nses = {}
        for i, name in enumerate(self.packed_order):
            view, accessor, comp = self.packed[name]
            if view not in subbed_nses:
                subbed_nses[view] = view._apply_subs(dict(base_ns))
            try:
                val = eval(comp, subbed_nses[view])
            except Exception, e:
                print 'Error while evaluating accessor "%s"' % accessor
                raise e
            out[i] = val
        return out
    @property
    def decls(self):
        tmpl = Template("""
 typedef struct {
 {{for name, accessor in values}}
    float   {{'%-20s' % name}}; // {{accessor}}
 {{endfor}}
 {{if padding > 0}}
    // Align to fill whole cache lines
    float   padding[{{padding}}];
 {{endif}}
 } {{tname}};
 """)
        return tmpl.substitute(
                values  = [(n, self.packed[n][1]) for n in self.packed_order],
                padding = self.align / 4 - len(self),
                tname   = self.tname
            )
--- a/cuburn/code/variations.py
+++ b/cuburn/code/variations.py
@ -1,11 +1,32 @@
-import tempita
+import numpy as np
 from cuburn.code.util import Template
 var_nos = {}
 var_code = {}
 var_params = {}
-def var(num, name, code):
+def var(num, name, code, params=None):
-    var_nos[num] = name
+    var_code[name] = Template(code)
-    var_code[name] = tempita.Template(code)
+    if params is not None:
        r = {}
        for p in params.split():
            if '=' in p:
                p, default = p.split('=')
                if default == 'M_PI':
                    default = np.pi
                else:
                    default = float(default)
            else:
                default = 0.0
            r[p] = default
        var_params[name] = r
 # TODO: This is a shitty hack
 def precalc(name, code):
    def precalc_fun(pv):
        pre = pv._precalc()
        pre._code(Template(code).substitute(pre=pre))
    Template.default_namespace[name+'_precalc'] = precalc_fun
 # Variables note: all functions will have their weights as 'w',
 # input variables 'tx' and 'ty', and output 'ox' and 'oy' available
@ -119,10 +140,10 @@ var(14, 'bent', """
    """)
 var(15, 'waves', """
-    float c10 = {{px.get(None, 'pre_xy')}};
+    float c10 = {{px.affine.xy}};
-    float c11 = {{px.get(None, 'pre_yy')}};
+    float c11 = {{px.affine.yy}};
-    float dx = {{px.get(None, 'pre_xo')}};
+    float dx = {{px.affine.xo}};
-    float dy = {{px.get(None, 'pre_yo')}};
+    float dy = {{px.affine.yo}};
    float dx2 = 1.0f / (dx * dx);
    float dy2 = 1.0f / (dy * dy);
@ -140,8 +161,8 @@ var(16, 'fisheye', """
 var(17, 'popcorn', """
    float dx = tanf(3.0f*ty);
    float dy = tanf(3.0f*tx);
-    ox += w * (tx + {{px.get(None, 'pre_xo')}} * sinf(dx));
+    ox += w * (tx + {{px.affine.xo}} * sinf(dx));
-    oy += w * (ty + {{px.get(None, 'pre_yo')}} * sinf(dy));
+    oy += w * (ty + {{px.affine.yo}} * sinf(dy));
    """)
 var(18, 'exponential', """
@ -166,7 +187,7 @@ var(20, 'cosine', """
    """)
 var(21, 'rings', """
-    float dx = {{px.get(None, 'pre_xo')}} * {{px.get(None, 'pre_xo')}};
+    float dx = {{px.affine.xo}} * {{px.affine.xo}};
    float r = sqrtf(tx*tx + ty*ty);
    float a = atan2f(tx, ty);
    r = w * (fmodf(r+dx, 2.0f*dx) - dx + r * (1.0f - dx));
@ -175,9 +196,9 @@ var(21, 'rings', """
    """)
 var(22, 'fan', """
-    float dx = M_PI * ({{px.get(None, 'pre_xo')}} * {{px.get(None, 'pre_xo')}});
+    float dx = M_PI * ({{px.affine.xo}} * {{px.affine.xo}});
    float dx2 = 0.5f * dx;
-    float dy = {{px.get(None, 'pre_yo')}};
+    float dy = {{px.affine.yo}};
    float a = atan2f(tx, ty);
    a += (fmodf(a+dy, dx) > dx2) ? -dx2 : dx2;
    float r = w * sqrtf(tx*tx + ty*ty);
@ -188,24 +209,24 @@ var(22, 'fan', """
 var(23, 'blob', """
    float r = sqrtf(tx*tx + ty*ty);
    float a = atan2f(tx, ty);
-    float bdiff = 0.5f * ({{px.get('xf.blob_high - xf.blob_low','blob_diff')}});
+    float bdiff = 0.5f * ({{pv.high}} - {{pv.low}});
-    r *= w * ({{px.get('xf.blob_low')}} + bdiff * (1.0f + sinf({{px.get('xf.blob_waves')}} * a)));
+    r *= w * ({{pv.low}} + bdiff * (1.0f + sinf({{pv.waves}} * a)));
    ox += sinf(a) * r;
    oy += cosf(a) * r;
-    """)
+    """, 'low high=1 waves=1')
 var(24, 'pdj', """
-    float nx1 = cosf({{px.get('xf.pdj_b')}} * tx);
+    float nx1 = cosf({{pv.b}} * tx);
-    float nx2 = sinf({{px.get('xf.pdj_c')}} * tx);
+    float nx2 = sinf({{pv.c}} * tx);
-    float ny1 = sinf({{px.get('xf.pdj_a')}} * ty);
+    float ny1 = sinf({{pv.a}} * ty);
-    float ny2 = cosf({{px.get('xf.pdj_d')}} * ty);
+    float ny2 = cosf({{pv.d}} * ty);
    ox += w * (ny1 - nx1);
    oy += w * (nx2 - ny2);
-    """)
+    """, 'a b c d')
 var(25, 'fan2', """
-    float dy = {{px.get('xf.fan2_y')}};
+    float dy = {{pv.y}};
-    float dx = M_PI * {{px.get('xf.fan2_x')}} * {{px.get('xf.fan2_x')}};
+    float dx = M_PI * {{pv.x}} * {{pv.x}};
    float dx2 = 0.5f * dx;
    float a = atan2f(tx, ty);
    float r = w * sqrtf(tx*tx + ty*ty);
@ -217,16 +238,16 @@ var(25, 'fan2', """
    ox += r * sinf(a);
    oy += r * cosf(a);
-    """)
+    """, 'x y')
 var(26, 'rings2', """
-    float dx = {{px.get('xf.rings2_val')}} * {{px.get('xf.rings2_val')}};
+    float dx = {{pv.val}} * {{pv.val}};
    float r = sqrtf(tx*tx + ty*ty);
    float a = atan2f(tx, ty);
    r += -2.0f * dx * (int)((r+dx)/(2.0f*dx)) + r * (1.0f - dx);
    ox += w * sinf(a) * r;
    oy += w * cosf(a) * r;
-    """)
+    """, 'val')
 var(27, 'eyefish', """
    float r = 2.0f * w / (sqrtf(tx*tx + ty*ty) + 1.0f);
@ -245,16 +266,21 @@ var(29, 'cylinder', """
    oy += w * ty;
    """)
-var(30, 'perspective', """
+precalc('perspective', """
-    float pdist = {{px.get('xf.perspective_dist')}};
+        float pang = {{pre.angle}} * M_PI_2;
-    float pvsin = {{px.get('np.sin(xf.perspective_angle*np.pi/2)', 'pvsin')}};
+        float pdist = fmaxf(1e-9, {{pre.dist}});
-    float pvfcos = {{px.get(
+        {{pre._set('mdist')}} = pdist;
-        'xf.perspective_dist*np.cos(xf.perspective_angle*np.pi/2)', 'pvfcos')}};
+        {{pre._set('sin')}} = sin(pang);
        {{pre._set('cos')}} = pdist * cos(pang);
        """)
-    float t = 1.0f / (pdist - ty * pvsin);
+var(30, 'perspective', """
-    ox += w * pdist * tx * t;
+    {{perspective_precalc(pv)}}
-    oy += w * pvfcos * ty * t;
+
-    """)
+    float t = 1.0f / ({{pv.mdist}} - ty * {{pv.sin}});
    ox += w * {{pv.mdist}} * tx * t;
    oy += w * {{pv.cos}} * ty * t;
    """, 'angle dist')
 var(31, 'noise', """
    float tmpr = mwc_next_01(rctx) * 2.0f * M_PI;
@ -263,33 +289,39 @@ var(31, 'noise', """
    oy += ty * r * sinf(tmpr);
    """)
 precalc('julian',
        "{{pre._set('cn')}} = {{pre.dist}} / (2.0f * {{pre.power}});\n")
 var(32, 'julian', """
-    float power = {{px.get('xf.julian_power')}};
+    {{julina_precalc(pv)}}
    float power = {{pv.power}};
    float t_rnd = truncf(mwc_next_01(rctx) * fabsf(power));
    float a = atan2f(ty, tx);
    float tmpr = (a + 2.0f * M_PI * t_rnd) / power;
-    float cn = {{px.get('xf.julian_dist / xf.julian_power / 2', 'julian_cn')}};
+    float cn = {{pv.cn}};
    float r = w * powf(tx * tx + ty * ty, cn);
    ox += r * cosf(tmpr);
    oy += r * sinf(tmpr);
-    """)
+    """, 'power=1 dist=1')
 precalc('juliascope',
        "{{pre._set('cn')}} = {{pre.dist}} / (2.0f * {{pre.power}});\n")
 var(33, 'juliascope', """
    {{juliascope_precalc(pv)}}
    float ang = atan2f(ty, tx);
-    float power = {{px.get('xf.juliascope_power', 'juscope_power')}};
+    float power = {{pv.power}};
    float t_rnd = truncf(mwc_next_01(rctx) * fabsf(power));
    // TODO: don't draw the extra random number
    if (mwc_next(rctx) & 1) ang = -ang;
    float tmpr = (2.0f * M_PI * t_rnd + ang) / power;
-
+    float r = w * powf(tx * tx + ty * ty, {{pv.cn}});
    float cn = {{px.get('xf.juliascope_dist / xf.juliascope_power / 2',
                         'juscope_cn')}};
    float r = w * powf(tx * tx + ty * ty, cn);
    ox += r * cosf(tmpr);
    oy += r * sinf(tmpr);
-    """)
+    """, 'power=1 dist=1')
 var(34, 'blur', """
    float tmpr = mwc_next_01(rctx) * 2.0f * M_PI;
@ -300,40 +332,39 @@ var(34, 'blur', """
 var(35, 'gaussian', """
    float ang = mwc_next_01(rctx) * 2.0f * M_PI;
-    float r = w * ( mwc_next_01(rctx) + mwc_next_01(rctx)
+    float r = w * sqrtf(-2.0f * log2f(mwc_next_01(rctx)) / M_LOG2E);
                  + mwc_next_01(rctx) + mwc_next_01(rctx) - 2.0f );
    ox += r * cosf(ang);
    oy += r * sinf(ang);
    """)
 var(36, 'radial_blur', """
-    float blur_angle = {{px.get('xf.radial_blur_angle')}} * M_PI * 0.5f;
+    float blur_angle = {{pv.angle}} * M_PI * 0.5f;
    float spinvar = sinf(blur_angle);
    float zoomvar = cosf(blur_angle);
-    float r = w * ( mwc_next_01(rctx) + mwc_next_01(rctx)
+
-                   + mwc_next_01(rctx) + mwc_next_01(rctx) - 2.0f );
+    float r = w * sqrtf(-2.0f * log2f(mwc_next_01(rctx)) / M_LOG2E);
    float ra = sqrtf(tx*tx + ty*ty);
    float tmpa = atan2f(ty, tx) + spinvar * r;
    float rz = zoomvar * r - 1.0f;
    ox += ra*cosf(tmpa) + rz*tx;
    oy += ra*sinf(tmpa) + rz*ty;
-    """)
+    """, 'angle')
 var(37, 'pie', """
-    float slices = {{px.get('xf.pie_slices')}};
+    float slices = {{pv.slices}};
    float sl = truncf(mwc_next_01(rctx) * slices + 0.5f);
-    float a = {{px.get('xf.pie_rotation')}} +
+    float a = {{pv.rotation}} +
-                2.0f * M_PI * (sl + mwc_next_01(rctx) * {{px.get('xf.pie_thickness')}}) / slices;
+                2.0f * M_PI * (sl + mwc_next_01(rctx) * {{pv.thickness}}) / slices;
    float r = w * mwc_next_01(rctx);
    ox += r * cosf(a);
    oy += r * sinf(a);
-    """)
+    """, 'slices=6 rotation thickness=0.5')
 var(38, 'ngon', """
-    float power = {{px.get('xf.ngon_power')}} * 0.5f;
+    float power = {{pv.power}} * 0.5f;
-    float b = 2.0f * M_PI / {{px.get('xf.ngon_sides')}};
+    float b = 2.0f * M_PI / {{pv.sides}};
-    float corners = {{px.get('xf.ngon_corners')}};
+    float corners = {{pv.corners}};
-    float circle = {{px.get('xf.ngon_circle')}};
+    float circle = {{pv.circle}};
    float r_factor = powf(tx*tx + ty*ty, power);
    float theta = atan2f(ty, tx);
@ -343,11 +374,11 @@ var(38, 'ngon', """
    ox += w * tx * amp;
    oy += w * ty * amp;
-    """)
+    """, 'sides=5 power=3 circle=1 corners=2')
 var(39, 'curl', """
-    float c1 = {{px.get('xf.curl_c1')}};
+    float c1 = {{pv.c1}};
-    float c2 = {{px.get('xf.curl_c2')}};
+    float c2 = {{pv.c2}};
    float re = 1.0f + c1*tx + c2*(tx*tx - ty*ty);
    float im = c1*ty + 2.0f*c2*tx*ty;
@ -355,15 +386,15 @@ var(39, 'curl', """
    ox += r * (tx*re + ty*im);
    oy += r * (ty*re - tx*im);
-    """)
+    """, 'c1=1 c2')
 var(40, 'rectangles', """
-    float rx = {{px.get('xf.rectangles_x')}};
+    float rx = {{pv.x}};
-    float ry = {{px.get('xf.rectangles_y')}};
+    float ry = {{pv.y}};
    ox += w * ( (rx==0.0f) ? tx : rx * (2.0f * floorf(tx/rx) + 1.0f) - tx);
    oy += w * ( (ry==0.0f) ? ty : ry * (2.0f * floorf(ty/ry) + 1.0f) - ty);
-    """)
+    """, 'x y')
 var(41, 'arch', """
    float ang = mwc_next_01(rctx) * w * M_PI;
@ -417,9 +448,8 @@ var(48, 'cross', """
    """)
 var(49, 'disc2', """
-    float twist = {{px.get('xf.disc2_twist')}};
+    float twist = {{pv.twist}}
-    float rotpi = {{px.get('xf.disc2_rot', 'disc2_rotpi')}};
+    float rotpi = {{pv.rot}} * M_PI;
    rotpi *= M_PI;
    float sintwist = sinf(twist);
    float costwist = cosf(twist) - 1.0f;
@ -441,70 +471,71 @@ var(49, 'disc2', """
    ox += r * (sinf(t) + costwist);
    oy += r * (cosf(t) + sintwist);
-    """)
+    """, 'rot twist')
 var(50, 'super_shape', """
    float ang = atan2f(ty, tx);
-    float theta = 0.25f * ({{px.get('xf.super_shape_m')}} * ang + M_PI);
+    float theta = 0.25f * ({{pv.m}} * ang + M_PI);
    float t1 = fabsf(cosf(theta));
    float t2 = fabsf(sinf(theta));
-    t1 = powf(t1,{{px.get('xf.super_shape_n2')}});
+    t1 = powf(t1, {{pv.n2}});
-    t2 = powf(t2,{{px.get('xf.super_shape_n3')}});
+    t2 = powf(t2, {{pv.n3}});
-    float myrnd = {{px.get('xf.super_shape_rnd')}};
+    float myrnd = {{pv.rnd}};
    float d = sqrtf(tx*tx+ty*ty);
-    float r = w * ((myrnd*mwc_next_01(rctx) + (1.0f-myrnd)*d) - {{px.get('xf.super_shape_holes')}})
+    float r = w * ((myrnd*mwc_next_01(rctx) + (1.0f-myrnd)*d) - {{pv.holes}})
-                * powf(t1+t2, {{px.get('-1.0 / xf.super_shape_n1', 'super_shape_pneg')}}) / d;
+                * powf(t1+t2, -1.0f / {{pv.n1}}) / d;
    ox += r * tx;
    oy += r * ty;
-    """)
+    """, 'rnd m n1=1 n2=1 n3=1 holes')
 var(51, 'flower', """
-    float holes = {{px.get('xf.flower_holes')}};
+    float holes = {{pv.holes}};
-    float petals = {{px.get('xf.flower_petals')}};
+    float petals = {{pv.petals}};
-    float r = w * (mwc_next_01(rctx) - holes) * cosf(petals*atan2f(ty, tx)) / sqrtf(tx*tx + ty*ty);
+    float r = w * (mwc_next_01(rctx) - holes)
                * cosf(petals*atan2f(ty, tx)) / sqrtf(tx*tx + ty*ty);
    ox += r * tx;
    oy += r * ty;
-    """)
+    """, 'holes petals')
 var(52, 'conic', """
    float d = sqrtf(tx*tx + ty*ty);
    float ct = tx / d;
-    float holes = {{px.get('xf.conic_holes')}};
+    float holes = {{pv.holes}};
-    float eccen = {{px.get('xf.conic_eccentricity')}};
+    float eccen = {{pv.eccentricity}};
    float r = w * (mwc_next_01(rctx) - holes) * eccen / (1.0f + eccen*ct) / d;
    ox += r * tx;
    oy += r * ty;
-    """)
+    """, 'holes eccentricity=1')
 var(53, 'parabola', """
    float r = sqrtf(tx*tx + ty*ty);
    float sr = sinf(r);
    float cr = cosf(r);
-    ox += {{px.get('xf.parabola_height')}} * w * sr * sr * mwc_next_01(rctx);
+    ox += {{pv.height}} * w * sr * sr * mwc_next_01(rctx);
-    oy += {{px.get('xf.parabola_width')}} * w * cr * mwc_next_01(rctx);
+    oy += {{pv.width}}  * w * cr * mwc_next_01(rctx);
-    """)
+    """, 'height width')
 var(54, 'bent2', """
    float nx = 1.0f;
-    if (tx < 0.0f) nx = {{px.get('xf.bent2_x')}};
+    if (tx < 0.0f) nx = {{pv.x}};
    float ny = 1.0f;
-    if (ty < 0.0f) ny = {{px.get('xf.bent2_y')}};
+    if (ty < 0.0f) ny = {{pv.y}};
    ox += w * nx * tx;
    oy += w * ny * ty;
-    """)
+    """, 'x=1 y=1')
 var(55, 'bipolar', """
    float x2y2 = tx*tx + ty*ty;
    float t = x2y2 + 1.0f;
    float x2 = tx * 2.0f;
-    float ps = -M_PI_2 * {{px.get('xf.bipolar_shift')}};
+    float ps = -M_PI_2 * {{pv.shift}};
    float y = 0.5f * atan2f(2.0f * ty, x2y2 - 1.0f) + ps;
    if (y > M_PI_2)
@ -514,7 +545,7 @@ var(55, 'bipolar', """
    ox += w * 0.25f * M_2_PI * logf( (t+x2) / (t-x2) );
    oy += w * M_2_PI * y;
-    """)
+    """, 'shift')
 var(56, 'boarders', """
    float roundX = rintf(tx);
@ -556,7 +587,7 @@ var(57, 'butterfly', """
    """)
 var(58, 'cell', """
-    float cell_size = {{px.get('xf.cell_size')}};
+    float cell_size = {{pv.size}};
    float inv_cell_size = 1.0f/cell_size;
    /* calculate input cell */
@ -588,31 +619,35 @@ var(58, 'cell', """
    ox += w * (dx + x*cell_size);
    oy -= w * (dy + y*cell_size);
-    """)
+    """, 'size=1')
 var(59, 'cpow', """
    float a = atan2f(ty, tx);
    float lnr = 0.5f * logf(tx*tx+ty*ty);
-    float power = {{px.get('xf.cpow_power')}};
+    float power = {{pv.power}};
    float va = 2.0f * M_PI / power;
-    float vc = {{px.get('xf.cpow_r')}} / power;
+    float vc = {{pv.cpow_r}} / power;
-    float vd = {{px.get('xf.cpow_i')}} / power;
+    float vd = {{pv.cpow_i}} / power;
    float ang = vc*a + vd*lnr + va*floorf(power*mwc_next_01(rctx));
    float m = w * expf(vc * lnr - vd * a);
    ox += m * cosf(ang);
    oy += m * sinf(ang);
-    """)
+    """, 'r=1 i power=1')
 precalc('curve', '''
        float xl = {{pv.xlength}}, yl = {{pv.ylength}};
        {{pre._set('x2')}} = 1.0f / max(1e-20f, xl * xl);
        {{pre._set('y2')}} = 1.0f / max(1e-20f, yl * yl);
        ''')
 var(60, 'curve', """
-    float pc_xlen = {{px.get('xf.curve_xlength * xf.curve_xlength','pc_xlen')}};
+    {{curve_precalc()}}
-    float pc_ylen = {{px.get('xf.curve_ylength * xf.curve_ylength','pc_ylen')}};
+    float pc_xlen = {{pv.x2}}, pc_ylen = {{pv.y2}};
-    if (pc_xlen<1E-20f) pc_xlen = 1E-20f;
+    ox += w * (tx + {{pv.xamp}} * expf(-ty*ty*pc_xlen));
-    if (pc_ylen<1E-20f) pc_ylen = 1E-20f;
+    oy += w * (ty + {{pv.yamp}} * expf(-tx*tx*pc_ylen));
-
+    """, 'xamp yamp xlength=1 ylength=1')
    ox += w * (tx + {{px.get('xf.curve_xamp')}} * expf(-ty*ty/pc_xlen));
    oy += w * (ty + {{px.get('xf.curve_yamp')}} * expf(-tx*tx/pc_ylen));
    """)
 var(61, 'edisc', """
    float tmp = tx*tx + ty*ty + 1.0f;
@ -662,7 +697,7 @@ var(62, 'elliptic', """
 var(63, 'escher', """
    float a = atan2f(ty,tx);
    float lnr = 0.5f * logf(tx*tx + ty*ty);
-    float ebeta = {{px.get('xf.escher_beta')}};
+    float ebeta = {{pv.beta}};
    float seb = sinf(ebeta);
    float ceb = cosf(ebeta);
    float vc = 0.5f * (1.0f + ceb);
@ -672,7 +707,7 @@ var(63, 'escher', """
    ox += m * cosf(n);
    oy += m * sinf(n);
-    """)
+    """, 'beta')
 var(64, 'foci', """
    float expx = expf(tx) * 0.5f;
@ -685,27 +720,26 @@ var(64, 'foci', """
    """)
 var(65, 'lazysusan', """
-    float lx = {{px.get('xf.lazysusan_x')}};
+    float lx = {{pv.x}};
-    float ly = {{px.get('xf.lazysusan_y')}};
+    float ly = {{pv.y}};
    float x = tx - lx;
    float y = ty + ly;
    float r = sqrtf(x*x + y*y);
    if (r < w) {
-        float a = atan2f(y,x) + {{px.get('xf.lazysusan_spin')}} +
+        float a = atan2f(y,x) + {{pv.spin}} + {{pv.twist}} * (w - r);
                 {{px.get('xf.lazysusan_twist')}}*(w-r);
        r = w * r;
        ox += r * cosf(a) + lx;
        oy += r * sinf(a) - ly;
    } else {
-        r = w * (1.0f + {{px.get('xf.lazysusan_space')}} / r);
+        r = w * (1.0f + {{pv.space}} / r);
        ox += r * x + lx;
        oy += r * y - ly;
    }
-    """)
+    """, 'x y twist space spin')
 var(66, 'loonie', """
    float r2 = tx*tx + ty*ty;;
@ -732,8 +766,7 @@ var(67, 'pre_blur', """
    """)
 var(68, 'modulus', """
-    float mx = {{px.get('xf.modulus_x')}};
+    float mx = {{pv.x}}, my = {{pv.y}};
    float my = {{px.get('xf.modulus_y')}};
    float xr = 2.0f*mx;
    float yr = 2.0f*my;
@ -750,13 +783,13 @@ var(68, 'modulus', """
        oy += w * ( my - fmodf(my - ty, yr));
    else
        oy += w * ty;
-    """)
+    """, 'x y')
 var(69, 'oscope', """
-    float tpf = 2.0f * M_PI * {{px.get('xf.oscope_frequency')}};
+    float tpf = 2.0f * M_PI * {{pv.frequency}};
-    float amp = {{px.get('xf.oscope_amplitude')}};
+    float amp = {{pv.amplitude}};
-    float sep = {{px.get('xf.oscope_separation')}};
+    float sep = {{pv.separation}};
-    float dmp = {{px.get('xf.oscope_damping')}};
+    float dmp = {{pv.damping}};
    float t = amp * expf(-fabsf(tx)*dmp) * cosf(tpf*tx) + sep;
@ -765,7 +798,7 @@ var(69, 'oscope', """
        oy -= w*ty;
    else
        oy += w*ty;
-    """)
+    """, 'separation=1 frequency=M_PI amplitude=1 damping')
 var(70, 'polar2', """
    float p2v = w / M_PI;
@ -774,10 +807,10 @@ var(70, 'polar2', """
    """)
 var(71, 'popcorn2', """
-    float c = {{px.get('xf.popcorn2_c')}};
+    float c = {{pv.c}};
-    ox += w * (tx + {{px.get('xf.popcorn2_x')}} * sinf(tanf(ty*c)));
+    ox += w * (tx + {{pv.x}} * sinf(tanf(ty*c)));
-    oy += w * (ty + {{px.get('xf.popcorn2_y')}} * sinf(tanf(tx*c)));
+    oy += w * (ty + {{pv.y}} * sinf(tanf(tx*c)));
-    """)
+    """, 'x y c')
 var(72, 'scry', """
    /* note that scry does not multiply by weight, but as the */
@ -790,68 +823,61 @@ var(72, 'scry', """
    """)
 var(73, 'separation', """
-    float sx2 = {{px.get('xf.separation_x * xf.separation_x', 'sx2')}};
+    float sx2 = {{pv.x}} * {{pv.x}};
-    float sy2 = {{px.get('xf.separation_y * xf.separation_y', 'sy2')}};
+    float sy2 = {{pv.y}} * {{pv.y}};
    if (tx > 0.0f)
-        ox += w * (sqrtf(tx*tx + sx2) - tx*{{px.get('xf.separation_xinside')}});
+        ox += w * (sqrtf(tx*tx + sx2) - tx*{{pv.xinside}});
    else
-        ox -= w * (sqrtf(tx*tx + sx2) + tx*{{px.get('xf.separation_xinside')}});
+        ox -= w * (sqrtf(tx*tx + sx2) + tx*{{pv.xinside}});
    if (ty > 0.0f)
-        oy += w * (sqrtf(ty*ty + sy2) - ty*{{px.get('xf.separation_yinside')}});
+        oy += w * (sqrtf(ty*ty + sy2) - ty*{{pv.yinside}});
    else
-        oy -= w * (sqrtf(ty*ty + sy2) + ty*{{px.get('xf.separation_yinside')}});
+        oy -= w * (sqrtf(ty*ty + sy2) + ty*{{pv.yinside}});
-    """)
+    """, 'x xinside y yinside')
 var(74, 'split', """
-    if (cosf(tx*{{px.get('xf.split_xsize')}}*M_PI) >= 0.0f)
+    if (cosf(tx*{{pv.xsize}}*M_PI) >= 0.0f)
        oy += w*ty;
    else
        oy -= w*ty;
-    if (cosf(ty*{{px.get('xf.split_ysize')}}*M_PI) >= 0.0f)
+    if (cosf(ty*{{pv.ysize}}*M_PI) >= 0.0f)
        ox += w*tx;
    else
        ox -= w*tx;
-    """)
+    """, 'xsize ysize')
 var(75, 'splits', """
-    if (tx >= 0.0f)
+    ox += w*(tx + copysignf({{pv.x}}, tx));
-        ox += w*(tx + {{px.get('xf.splits_x')}});
+    oy += w*(ty + copysignf({{pv.y}}, ty));
-    else
+    """, 'x y')
        ox += w*(tx - {{px.get('xf.splits_x')}});
    if (ty >= 0)
        oy += w*(ty + {{px.get('xf.splits_y')}});
    else
        oy += w*(ty - {{px.get('xf.splits_y')}});
    """)
 var(76, 'stripes', """
    float roundx = floorf(tx + 0.5f);
    float offsetx = tx - roundx;
-    ox += w * (offsetx * (1.0f - {{px.get('xf.stripes_space')}}) + roundx);
+    ox += w * (offsetx * (1.0f - {{pv.space}}) + roundx);
-    oy += w * (ty + offsetx*offsetx*{{px.get('xf.stripes_warp')}});
+    oy += w * (ty + offsetx*offsetx*{{pv.warp}});
-    """)
+    """, 'space warp')
 var(77, 'wedge', """
    float r = sqrtf(tx*tx + ty*ty);
-    float a = atan2f(ty, tx) + {{px.get('xf.wedge_swirl')}} * r;
+    float a = atan2f(ty, tx) + {{pv.swirl}} * r;
-    float wc = {{px.get('xf.wedge_count')}};
+    float wc = {{pv.count}};
-    float wa = {{px.get('xf.wedge_angle')}};
+    float wa = {{pv.angle}};
    float c = floorf((wc * a + M_PI) * M_1_PI * 0.5f);
    float comp_fac = 1 - wa * wc * M_1_PI * 0.5f;
    a = a * comp_fac + c * wa;
-    r = w * (r + {{px.get('xf.wedge_hole')}});
+    r = w * (r + {{pv.hole}});
    ox += r * cosf(a);
    oy += r * sinf(a);
-    """)
+    """, 'angle hole count=1 swirl')
 var(81, 'waves2', """
-    ox += w*(tx + {{px.get('xf.waves2_scalex')}}*sinf(ty * {{px.get('xf.waves2_freqx')}}));
+    ox += w*(tx + {{pv.scalex}}*sinf(ty * {{pv.freqx}}));
-    oy += w*(ty + {{px.get('xf.waves2_scaley')}}*sinf(tx * {{px.get('xf.waves2_freqy')}}));
+    oy += w*(ty + {{pv.scaley}}*sinf(tx * {{pv.freqy}}));
-    """)
+    """, 'scalex scaley freqx freqy')
 var(82, 'exp', """
    float expe = expf(tx);
@ -935,21 +961,22 @@ var(95, 'coth', """
 var(97, 'flux', """
    float xpw = tx + w;
    float xmw = tx - w;
-    float avgr = w * (2.0f + {{px.get('xf.flux_spread')}}) * sqrtf(sqrtf(ty*ty+xpw*xpw)/sqrtf(ty*ty+xmw*xmw));
+    float avgr = w * (2.0f + {{pv.spread}})
               * sqrtf(sqrtf(ty*ty+xpw*xpw)/sqrtf(ty*ty+xmw*xmw));
    float avga = (atan2f(ty, xmw) - atan2f(ty,xpw))*0.5f;
    ox += avgr * cosf(avga);
    oy += avgr * sinf(avga);
-    """)
+    """, 'spread')
 var(98, 'mobius', """
-    float rea = {{px.get('xf.mobius_re_a')}};
+    float rea = {{pv.re_a}};
-    float ima = {{px.get('xf.mobius_im_a')}};
+    float ima = {{pv.im_a}};
-    float reb = {{px.get('xf.mobius_re_b')}};
+    float reb = {{pv.re_b}};
-    float imb = {{px.get('xf.mobius_im_b')}};
+    float imb = {{pv.im_b}};
-    float rec = {{px.get('xf.mobius_re_c')}};
+    float rec = {{pv.re_c}};
-    float imc = {{px.get('xf.mobius_im_c')}};
+    float imc = {{pv.im_c}};
-    float red = {{px.get('xf.mobius_re_d')}};
+    float red = {{pv.re_d}};
-    float imd = {{px.get('xf.mobius_im_d')}};
+    float imd = {{pv.im_d}};
    float re_u, im_u, re_v, im_v, rad_v;
@ -962,5 +989,5 @@ var(98, 'mobius', """
    ox += rad_v * (re_u*re_v + im_u*im_v);
    oy += rad_v * (im_u*re_v - re_u*im_v);
-    """)
+    """, 're_a im_a re_b im_b re_c im_c re_d im_d')
--- a/cuburn/genome.py
+++ b/cuburn/genome.py
@ -0,0 +1,151 @@
 import json
 import numpy as np
 import scipy.interpolate
 from cuburn import affine
 class SplEval(object):
    def __init__(self, knots):
        # If a single value is passed, normalize to a constant set of knots
        if isinstance(knots, (int, float)):
            knots = [-0.1, knots, 0.0, knots, 1.0, knots, 1.1, knots]
        # If stabilizing knots are missing before or after the edges of the
        # [0,1] interval, add them. TODO: choose better knots
        if knots[0] >= 0:
            knots = [-0.1, knots[1]] + knots
        if knots[-2] <= 1:
            knots.extend([1.1, knots[-1]])
        self.knots = np.zeros((2, len(knots)/2))
        self.knots.T.flat[:] = knots
    def __call__(self, itime):
        try:
            return np.asarray(map(self, itime))
        except:
            pass
        idx = np.searchsorted(self.knots[0], itime) - 2
        idx = max(0, min(len(self.knots[0]) - 4, idx))
        times = self.knots[0][idx:idx+4]
        vals = self.knots[1][idx:idx+4]
        # Normalize to [0,1]
        t = itime - times[1]
        times = times - times[1]
        t = t / times[2]
        times = times / times[2]
        return self._interp(times, vals, t)
    @staticmethod
    def _interp(times, vals, t):
        t2 = t * t
        t3 = t * t2
        m1 = (vals[2] - vals[0]) / (1.0 - times[0])
        m2 = (vals[3] - vals[1]) / times[3]
        r = ( m1 * (t3 - 2*t2 + t) + vals[1] * (2*t3 - 3*t2 + 1)
            + m2 * (t3 - t2) + vals[2] * (-2*t3 + 3*t2) )
        return r
    def __str__(self):
        return '[%g:%g]' % (self(0), self(1))
    def __repr__(self):
        return '<interp [%g:%g]>' % (self(0), self(1))
    @classmethod
    def wrap(cls, obj):
        """
        Given a dict 'obj' representing, for instance, a Genome object, walk
        through the object recursively and in-place, turning any number or
        list of numbers into an SplEval.
        """
        for k, v in obj.items():
            if (isinstance(v, (float, int)) or
                    (isinstance(v, list) and isinstance(v[1], (float, int)))):
                obj[k] = cls(v)
            elif isinstance(v, dict):
                cls.wrap(v)
 class RenderInfo(object):
    """
    Determine features and constants required to render a particular set of
    genomes. The values of this class are fixed before compilation begins.
    """
    # Constant parameters which control handling of out-of-frame samples:
    # Number of iterations to iterate without write after new point
    fuse = 10
    # Maximum consecutive out-of-bounds points before picking new point
    max_oob = 10
    # Height of the texture pallete which gets uploaded to the GPU (assuming
    # that palette-from-texture is enabled). For most genomes, this doesn't
    # need to be very large at all. However, since only an easily-cached
    # fraction of this will be accessed per SM, larger values shouldn't hurt
    # performance too much. Power-of-two, please.
    palette_height = 16
    # Maximum width of DE and other spatial filters, and thus in turn the
    # amount of padding applied. Note that, for now, this must not be changed!
    # The filtering code makes deep assumptions about this value.
    gutter = 16
    # TODO: for now, we always throw away the alpha channel before writing.
    # All code is in place to not do this, we just need to find a way to expose
    # this preference via the API (or push alpha blending entirely on the client,
    # which I'm not opposed to)
    alpha_output_channel = False
    # TODO: fix these
    chaos_used = False
    std_xforms = [0, 1, 2]
    final_xform_index = 3
    pal_has_alpha = False
    density = 2000
    def __init__(self, db, **kwargs):
        self.db = db
        # Copy all args into this object's namespace
        self.__dict__.update(kwargs)
        self.acc_width = self.width + 2 * self.gutter
        self.acc_height = self.height + 2 * self.gutter
        self.acc_stride = 32 * int(np.ceil(self.acc_width / 32.))
        # Deref genome
        self.genome = self.db.genomes[self.genome]
 class _AttrDict(dict):
    def __getattr__(self, name):
        return self[name]
 def load_info(contents):
    result = json.loads(contents, object_hook=_AttrDict)
    SplEval.wrap(result.genomes)
    # A Job object will have more details or something
    result = RenderInfo(result, **result.renders.values()[0])
    return result
 class HacketyGenome(object):
    """
    Holdover class to postpone a very deep refactoring as long as possible.
    Converts property accesses into interpolations over predetermined times.
    """
    def __init__(self, referent, times):
        # Times can be singular
        self.referent, self.times = referent, times
    def __getattr__(self, name):
        r = getattr(self.referent, str(name))
        if isinstance(r, _AttrDict):
            return HacketyGenome(r, self.times)
        elif isinstance(r, SplEval):
            return r(self.times)
        return r
    __getitem__ = __getattr__
 if __name__ == "__main__":
    import sys
    import pprint
    pprint.pprint(read_genome(sys.stdin))
--- a/cuburn/render.py
+++ b/cuburn/render.py
@ -1,12 +1,15 @@
 import os
 import sys
 import math
 import re
-import time
+import time as timemod
 import tempfile
 from itertools import cycle, repeat, chain, izip
 from ctypes import *
 from cStringIO import StringIO
 import numpy as np
 from scipy import ndimage
 import base64
 from fr0stlib import pyflam3
 from fr0stlib.pyflam3._flam3 import *
@ -17,79 +20,11 @@ import pycuda.driver as cuda
 import pycuda.tools
 from pycuda.gpuarray import vec
 import cuburn.genome
 from cuburn import affine
 from cuburn.code import util, mwc, iter, filtering
-def _chunk(l, cs):
+class Renderer(object):
    """
    Yield the contents of list ``l`` in chunks of size no more than ``cs``.
    """
    for i in range(0, len(l), cs):
        yield l[i:i+cs]
 class Genome(object):
    """
    Normalizes and precalculates some properties of a Genome. Assumes that
    Genome argument passed in will not change.
    """
    # Fix the ctypes ugliness since switching to __getattribute__ in 2.7.
    # There are more elegant ways to do this, but I can't be bothered.
    def __getattr__(self, name):
        return getattr(self.cp, name)
    def __init__(self, ctypes_genome):
        self.cp = ctypes_genome
        self.xforms = [self.xform[i] for i in range(self.num_xforms)]
        dens = np.array([x.density for i, x in enumerate(self.xforms)
                         if i != self.final_xform_index])
        num_std_xf = len(dens)
        self.chaos_densities = np.zeros( (num_std_xf,num_std_xf) )
        for r in range(num_std_xf):
            chaos_row = np.array([ctypes_genome.chaos[r][c]
                                  for c in range(num_std_xf)])
            chaos_row = chaos_row * dens
            chaos_row /= np.sum(chaos_row)
            chaos_row = np.cumsum(chaos_row)
            self.chaos_densities[r,:] = chaos_row
        dens /= np.sum(dens)
        self.norm_density = np.cumsum(dens)
        # For performance reasons, defer this calculation
        self._camera_transform = None
    scale = property(lambda cp: 2.0 ** cp.zoom)
    adj_density = property(lambda cp: cp.sample_density * (cp.scale ** 2))
    ppu = property(lambda cp: cp.pixels_per_unit * cp.scale)
    @property
    def camera_transform(self):
        """
        An affine matrix which will transform IFS coordinates to image width
        and height. Assumes that width and height are constant.
        """
        cp = self
        if self._camera_transform is not None:
            return self._camera_transform
        g = Features.gutter
        if cp.estimator:
            # The filter shifts by this amount as a side effect of it being
            # written in a confusing and sloppy manner
            # TODO: this will be weird in an animation where one endpoint has
            # a radius of 0, and the other does not
            g -= Features.gutter / 2 - 1
        self._camera_transform = (
                 affine.translate(0.5 * cp.width + g, 0.5 * cp.height + g)
               * affine.scale(cp.ppu, cp.ppu)
               * affine.translate(-cp._center[0], -cp._center[1])
               * affine.rotate(cp.rotate * 2 * np.pi / 360,
                               cp.rot_center[0],
                               cp.rot_center[1]) )
        return self._camera_transform
 class Animation(object):
    """
    Control structure for rendering a series of frames.
@ -108,27 +43,13 @@ class Animation(object):
    interpolated sequence between one or two genomes.
    """
    # Large launches lock the display for a considerable period and may be
    # killed due to a device timeout; small launches are harder to load-balance
    # on the GPU and incur overhead. This empirical value is multiplied by the
    # number of SMs on the device to determine how many blocks should be in
    # each launch. Extremely high quality, high resolution renders may still
    # encounter a device timeout, requiring the user to increase the split
    # amount. This factor is not used in async mode.
    SM_FACTOR = 8
    cmp_options = ('-use_fast_math', '-maxrregcount', '42')
    keep = False
-    def __init__(self, ctypes_genome_array):
+    def __init__(self, info):
-        self._g_arr = type(ctypes_genome_array)()
+        self.info = info
        libflam3.flam3_align(self._g_arr, ctypes_genome_array,
                             len(ctypes_genome_array))
        self.genomes = map(Genome, self._g_arr)
        self.features = Features(self.genomes)
        self._iter = self._de = self.src = self.cubin = self.mod = None
        self.packed_genome = None
        # Ensure class options don't get contaminated on an instance
        self.cmp_options = list(self.cmp_options)
@ -148,12 +69,14 @@ class Animation(object):
        keep = self.keep if keep is None else keep
        cmp_options = self.cmp_options if cmp_options is None else cmp_options
-        self._iter = iter.IterCode(self.features)
+        self._iter = iter.IterCode(self.info)
-        self._de = filtering.DensityEst(self.features, self.genomes[0])
+        self._de = filtering.DensityEst(self.info)
-        cclip = filtering.ColorClip(self.features)
+        cclip = filtering.ColorClip(self.info)
-        # TODO: make choice of filtering explicit
+        self._iter.packer.finalize()
        self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer,
                                      self._iter, cclip, self._de)
        with open(os.path.join(tempfile.gettempdir(), 'kernel.cu'), 'w') as fp:
            fp.write(self.src)
        self.cubin = pycuda.compiler.compile(
                self.src, keep=keep, options=cmp_options,
                cache_dir=False if keep else None)
@ -183,11 +106,11 @@ class Animation(object):
        self.mod = cuda.module_from_buffer(self.cubin, jit_options)
-    def render_frames(self, times=None, sync=False):
+    def render(self, times):
        """
        Render a flame for each genome in the iterable value 'genomes'.
        Returns a Python generator object which will yield a 2-tuple of
-        ``(time, buf)``, where ``time`` is the central time of the frame and
+        ``(time, buf)``, where ``time`` is the start time of the frame and
        ``buf`` is a 3D (width, height, channel) NumPy array containing
        [0,1]-valued RGBA components.
@ -196,73 +119,64 @@ class Animation(object):
        allowed to run until completion (by exhausting all items in the
        generator object).
-        A performance note: while any ready tasks will be scheduled on the GPU
+        ``times`` is a sequence of (start, stop) times defining the temporal
-        before yielding a result, spending a lot of time before returning
+        range to be rendered for each frame. This will change to be more
-        control to this function can allow the GPU to become idle. It's best
+        frame-centric in the future, allowing for interpolated temporal width.
        to hand the resulting array to another thread after grabbing it from
        the renderer for handling.
        ``times`` is a sequence of center times at which to render, or ``None``
        to render one frame for each genome used to create the animation.
        If ``sync`` is True, the CPU will sync with the GPU after every block
        of temporal samples and yield None until the frame is ready. This
        allows a single-card system to avoid having to go thirty seconds
        between window refreshes while rendering. Otherwise, tasks will be
        piled asynchronously on the card so that it is always under load.
        """
        if times == []:
            return
-        f = self.features
+        info = self.info
        times = times if times is not None else [cp.time for cp in self.genomes]
        iter_stream = cuda.Stream()
        filt_stream = cuda.Stream()
        cen_cp = pyflam3.Genome()
        dst_cp = pyflam3.Genome()
-        nbins = f.acc_height * f.acc_stride
+        nbins = info.acc_height * info.acc_stride
        d_accum = cuda.mem_alloc(16 * nbins)
        d_out = cuda.mem_alloc(16 * nbins)
        num_sm = cuda.Context.get_device().multiprocessor_count
-        if sync:
+        cps_per_block = 1024
            cps_per_block = num_sm * self.SM_FACTOR
        else:
            cps_per_block = f.max_cps
-        info_size = self._iter.packer.align * cps_per_block
+        genome_times, genome_knots = self._iter.packer.pack()
        d_genome_times = cuda.to_device(genome_times)
        d_genome_knots = cuda.to_device(genome_knots)
        info_size = 4 * len(self._iter.packer) * cps_per_block
        d_infos = cuda.mem_alloc(info_size)
-        d_palmem = cuda.mem_alloc(256 * f.palette_height * 4)
+        d_palmem = cuda.mem_alloc(256 * info.palette_height * 4)
        seeds = mwc.MWC.make_seeds(self._iter.NTHREADS * cps_per_block)
        d_seeds = cuda.to_device(seeds)
        h_infos = cuda.pagelocked_empty((info_size / 4,), np.float32)
        h_palmem = cuda.pagelocked_empty(
-                (f.palette_height, 256, 4), np.uint8)
+                (info.palette_height, 256, 4), np.uint8)
-        h_out = cuda.pagelocked_empty((f.acc_height, f.acc_stride, 4), np.float32)
+        h_out = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4),
                                      np.float32)
        filter_done_event = None
-        packer = self._iter.packer
+        packer_fun = self.mod.get_function("interp_iter_params")
        iter_fun = self.mod.get_function("iter")
        #iter_fun.set_cache_config(cuda.func_cache.PREFER_L1)
        util.BaseCode.zero_dptr(self.mod, d_accum, 4 * nbins, filt_stream)
-        last_time = times[0]
+        last_time = times[0][0]
-        for time in times:
+        # TODO: move palette stuff to separate class; do interp
-            self._interp(cen_cp, time)
+        pal = np.fromstring(base64.b64decode(info.db.palettes.values()[0]),
                            np.uint8)
        pal = np.reshape(pal, (256, 3))
        h_palmem[0,:,:3] = pal
        h_palmem[1:] = h_palmem[0]
-            h_palmem[:] = self._interp_colors(dst_cp, time,
+        for start, stop in times:
-                                              cen_cp.temporal_filter_width)
+            cen_cp = cuburn.genome.HacketyGenome(info.genome, (start+stop)/2)
            # "Interp" already done above, but...
            cuda.memcpy_htod_async(d_palmem, h_palmem, iter_stream)
            tref = self.mod.get_texref('palTex')
            array_info = cuda.ArrayDescriptor()
-            array_info.height = f.palette_height
+            array_info.height = info.palette_height
            array_info.width = 256
            array_info.array_format = cuda.array_format.UNSIGNED_INT8
            array_info.num_channels = 4
@ -272,69 +186,52 @@ class Animation(object):
            tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
            tref.set_filter_mode(cuda.filter_mode.LINEAR)
            # Must be accumulated over all CPs
            gam, vib = 0, 0
            bkgd = np.zeros(3)
-            mblur_times = enumerate( np.linspace(-0.5, 0.5, cen_cp.ntemporal_samples)
+            if filter_done_event:
-                                     * cen_cp.temporal_filter_width + time )
+                iter_stream.wait_for_event(filter_done_event)
-            for block_times in _chunk(list(mblur_times), cps_per_block):
+            width = np.float32((stop-start) / cps_per_block)
-                infos = []
+            packer_fun(d_infos, d_genome_times, d_genome_knots,
-                if len(self.genomes) > 1:
+                       np.float32(start), width, d_seeds,
-                    for n, t in block_times:
+                       block=(256,1,1), grid=(cps_per_block/256,1),
-                        self._interp(dst_cp, t)
+                       stream=iter_stream)
                        frac = float(n) / cen_cp.ntemporal_samples
                        info = packer.pack(cp=Genome(dst_cp), cp_step_frac=frac)
                        infos.append(info)
                        gam += dst_cp.gamma
                        vib += dst_cp.vibrancy
                        bkgd += np.array(dst_cp.background)
                else:
                    # Can't interpolate normally; just pack copies
                    packed = packer.pack(cp=self.genomes[0], cp_step_frac=0)
                    infos = [packed] * len(block_times)
                    gam += self.genomes[0].gamma * len(block_times)
                    vib += self.genomes[0].vibrancy * len(block_times)
                    bkgd += np.array(self.genomes[0].background) * len(block_times)
-                infos = np.concatenate(infos)
+            # Get interpolated control points for debugging
-                h_infos[:len(infos)] = infos
+            #iter_stream.synchronize()
-                cuda.memcpy_htod_async(d_infos, h_infos)
+            #d_temp = cuda.from_device(d_infos,
                    #(cps_per_block, len(self._iter.packer)), np.float32)
            #for i, n in zip(d_temp[5], self._iter.packer.packed):
                #print '%60s %g' % ('_'.join(n), i)
-                if filter_done_event:
+            nsamps = info.density * info.width * info.height / cps_per_block
-                    iter_stream.wait_for_event(filter_done_event)
+            iter_fun(np.uint64(d_accum), d_seeds, d_infos, np.int32(nsamps),
                     block=(32, self._iter.NTHREADS/32, 1),
                     grid=(cps_per_block, 1),
                     texrefs=[tref], stream=iter_stream)
-                # TODO: replace with option to split long runs shorter ones
+            if filter_done_event:
-                # for interactivity
+                while not filt_stream.is_done():
-                for i in range(1):
+                    timemod.sleep(0.01)
                    iter_fun(d_seeds, d_infos, np.uint64(d_accum),
                             block=(32, self._iter.NTHREADS/32, 1),
                             grid=(len(block_times), 1),
                             texrefs=[tref], stream=iter_stream)
                    if sync:
                        iter_stream.synchronize()
                        yield None
            if filter_done_event and not sync:
                filt_stream.synchronize()
                yield last_time, self._trim(h_out)
-                last_time = time
+                last_time = start
            util.BaseCode.zero_dptr(self.mod, d_out, 4 * nbins, filt_stream)
-            self._de.invoke(self.mod, Genome(cen_cp), d_accum, d_out, filt_stream)
+            self._de.invoke(self.mod, cen_cp, d_accum, d_out, filt_stream)
            util.BaseCode.zero_dptr(self.mod, d_accum, 4 * nbins, filt_stream)
            filter_done_event = cuda.Event().record(filt_stream)
            f32 = np.float32
-            n = f32(cen_cp.ntemporal_samples)
+            # TODO: implement integration over cubic splines?
-            gam = f32(n / gam)
+            gam = f32(1 / cen_cp.color.gamma)
-            vib = f32(vib / n)
+            vib = f32(cen_cp.color.vibrancy)
-            hipow = f32(cen_cp.highlight_power)
+            hipow = f32(cen_cp.color.highlight_power)
-            lin = f32(cen_cp.gam_lin_thresh)
+            lin = f32(cen_cp.color.gamma_threshold)
-            lingam = f32(math.pow(cen_cp.gam_lin_thresh, gam-1.0) if lin > 0 else 0)
+            lingam = f32(math.pow(lin, gam-1.0) if lin > 0 else 0)
-            bkgd = vec.make_float3(*(bkgd / n))
+            bkgd = vec.make_float3(
                    cen_cp.color.background.r,
                    cen_cp.color.background.g,
                    cen_cp.color.background.b)
            color_fun = self.mod.get_function("colorclip")
            blocks = int(np.ceil(np.sqrt(nbins / 256)))
@ -343,133 +240,10 @@ class Animation(object):
                      stream=filt_stream)
            cuda.memcpy_dtoh_async(h_out, d_out, filt_stream)
-            if sync:
+        filt_stream.synchronize()
-                filt_stream.synchronize()
+        yield start, self._trim(h_out)
                yield time, self._trim(h_out)
        if not sync:
            filt_stream.synchronize()
            yield time, self._trim(h_out)
    def _interp(self, cp, time):
        flam3_interpolate(self._g_arr, len(self._g_arr), time, 0, byref(cp))
    @staticmethod
    def _pal_to_np(cp):
        # Converting palettes by iteration has an enormous performance
        # overhead. We cheat massively and dangerously here.
        pal = cast(pointer(cp.palette), POINTER(c_double * (256 * 5)))
        val = np.frombuffer(buffer(pal.contents), count=256*5)
        return np.uint8(np.reshape(val, (256, 5))[:,1:] * 255.0)
    def _interp_colors(self, cp, time, twidth):
        # TODO: any visible difference between uint8 and richer formats?
        height = self.features.palette_height
        pal = np.empty((height, 256, 4), dtype=np.uint8)
        if len(self.genomes) > 1:
            # The typical case; applying real motion blur
            times = np.linspace(-0.5, 0.5, height) * twidth + time
            for n, t in enumerate(times):
                self._interp(cp, t)
                pal[n] = self._pal_to_np(cp)
        else:
            # Cannot call any interp functions on a single genome; rather than
            # have alternate code-paths, just copy the same colors everywhere
            pal[0] = self._pal_to_np(self.genomes[0])
            pal[1:] = pal[0]
        return pal
    def _trim(self, result):
-        g = self.features.gutter
+        g = self.info.gutter
        return result[g:-g,g:-g].copy()
 class Features(object):
    """
    Determine features and constants required to render a particular set of
    genomes. The values of this class are fixed before compilation begins.
    """
    # Constant parameters which control handling of out-of-frame samples:
    # Number of iterations to iterate without write after new point
    fuse = 10
    # Maximum consecutive out-of-bounds points before picking new point
    max_oob = 10
    max_nxforms = 12
    # Height of the texture pallete which gets uploaded to the GPU (assuming
    # that palette-from-texture is enabled). For most genomes, this doesn't
    # need to be very large at all. However, since only an easily-cached
    # fraction of this will be accessed per SM, larger values shouldn't hurt
    # performance too much. Power-of-two, please.
    palette_height = 16
    # Maximum width of DE and other spatial filters, and thus in turn the
    # amount of padding applied. Note that, for now, this must not be changed!
    # The filtering code makes deep assumptions about this value.
    gutter = 16
    # TODO: for now, we always throw away the alpha channel before writing.
    # All code is in place to not do this, we just need to find a way to expose
    # this preference via the API (or push alpha blending entirely on the client,
    # which I'm not opposed to)
    alpha_output_channel = False
    def __init__(self, genomes):
        any = lambda l: bool(filter(None, map(l, genomes)))
        self.max_ntemporal_samples = max(
                [cp.nbatches * cp.ntemporal_samples for cp in genomes])
        self.non_box_temporal_filter = genomes[0].temporal_filter_type
        self.palette_mode = genomes[0].palette_mode and "linear" or "nearest"
        assert len(set([len(cp.xforms) for cp in genomes])) == 1, ("genomes "
            "must have same number of xforms! (use flam3-genome first)")
        self.nxforms = len(genomes[0].xforms)
        self.xforms = [XFormFeatures([cp.xforms[i] for cp in genomes], i)
                       for i in range(self.nxforms)]
        if any(lambda cp: cp.final_xform_enable):
            if not all([cp.final_xform_index == genomes[0].final_xform_index
                        for cp in genomes]):
                raise ValueError("Differing final xform indexes")
            self.final_xform_index = genomes[0].final_xform_index
        else:
            self.final_xform_index = None
        alphas = np.array([c.color[3] for g in genomes
                           for c in g.palette.entries])
        self.pal_has_alpha = np.any(alphas != 1.0)
        self.max_cps = max([cp.ntemporal_samples for cp in genomes])
        self.width = genomes[0].width
        self.height = genomes[0].height
        self.acc_width = genomes[0].width + 2 * self.gutter
        self.acc_height = genomes[0].height + 2 * self.gutter
        self.acc_stride = 32 * int(math.ceil(self.acc_width / 32.))
        self.std_xforms = filter(lambda v: v != self.final_xform_index,
                                 range(self.nxforms))
        self.chaos_used = False
        for cp in genomes:
            for r in range(len(self.std_xforms)):
                for c in range(len(self.std_xforms)):
                    if cp.chaos[r][c] != 1.0:
                        self.chaos_used = True
 class XFormFeatures(object):
    def __init__(self, xforms, xform_id):
        self.id = xform_id
        any = lambda l: bool(filter(None, map(l, xforms)))
        self.has_post = any(lambda xf: not self.id_matrix(xf.post))
        self.vars = set()
        for x in xforms:
            self.vars = (
                self.vars.union(set([i for i, v in enumerate(x.var) if v])))
    @staticmethod
    def id_matrix(m):
        return (m[0][0] == 1 and m[1][0] == 0 and m[2][0] == 0 and
                m[0][1] == 0 and m[1][1] == 1 and m[2][1] == 0)
--- a/run_job.py
+++ b/run_job.py
@ -0,0 +1,52 @@
 #!/usr/bin/python
 # Temporary helper script while I update main.py
 import os
 import sys
 import time
 import argparse
 import multiprocessing
 from subprocess import Popen
 from ctypes import *
 from itertools import ifilter
 import numpy as np
 import Image
 import scipy
 import pycuda.autoinit
 import cuburn.render
 import cuburn.genome
 import pycuda.compiler
 import pycuda.driver as cuda
 import cuburn.code.interp
 np.set_printoptions(precision=5, edgeitems=20)
 real_stdout = sys.stdout
 def save(time, raw):
    noalpha = raw[:,:,:3]
    name = 'out/%05d' % time
    img = scipy.misc.toimage(noalpha, cmin=0, cmax=1)
    img.save(name+'.png')
 def main(jobfilepath):
    contents = '\n'.join([l.split('//', 1)[0] for l in open(jobfilepath)])
    # This includes the genomes and other cruft, a dedicated reader will be
    # built in time tho
    info = cuburn.genome.load_info(contents)
    times = np.linspace(0, 1, info.duration * info.fps + 1)[:3]
    renderer = cuburn.render.Renderer(info)
    renderer.compile()
    renderer.load()
    for idx, (ftime, out) in enumerate(renderer.render(zip(times, times[1:]))):
        save(idx, out)
 if __name__ == "__main__":
    main('04653_2.jsonc')