diff --git a/cuburn/code/filtering.py b/cuburn/code/filtering.py
index f05b182..59435e4 100644
--- a/cuburn/code/filtering.py
+++ b/cuburn/code/filtering.py
@@ -2,17 +2,20 @@
 from cuburn.code.util import *
 
 class ColorClip(HunkOCode):
-    defs = """
+    def __init__(self, features):
+        self.defs = self.defs_tmpl.substitute(features=features)
+
+    defs_tmpl = Template("""
 __global__
 void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
-               float linrange, float lingam) {
+               float linrange, float lingam, float3 bkgd) {
     // TODO: test if over an edge of the framebuffer - currently gutters are
     // used and up to 256 pixels are ignored, which breaks when width<256
     int i = blockDim.x * blockIdx.x + threadIdx.x;
     float4 pix = pixbuf[i];
 
     if (pix.w <= 0) {
-        pixbuf[i] = make_float4(0, 0, 0, 0);
+        pixbuf[i] = make_float4(bkgd.x, bkgd.y, bkgd.z, 0);
         return;
     }
 
@@ -57,6 +60,18 @@ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
     pix.y += (1.0f - vibrancy) * powf(opix.y, gamma);
     pix.z += (1.0f - vibrancy) * powf(opix.z, gamma);
 
+    {{if features.alpha_output_channel}}
+    float 1_alpha = 1 / alpha;
+    pix.x *= 1_alpha;
+    pix.y *= 1_alpha;
+    pix.z *= 1_alpha;
+    {{else}}
+    pix.x += (1.0f - alpha) * bkgd.x;
+    pix.y += (1.0f - alpha) * bkgd.y;
+    pix.z += (1.0f - alpha) * bkgd.z;
+    {{endif}}
+    pix.w = alpha;
+
     // Clamp values. I think this is superfluous, but I'm not certain.
     pix.x = fminf(1.0f, pix.x);
     pix.y = fminf(1.0f, pix.y);
@@ -64,7 +79,7 @@ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
 
     pixbuf[i] = pix;
 }
-"""
+""")
 
 class DensityEst(HunkOCode):
     """
diff --git a/cuburn/render.py b/cuburn/render.py
index 8d05038..508996b 100644
--- a/cuburn/render.py
+++ b/cuburn/render.py
@@ -14,6 +14,7 @@ from fr0stlib.pyflam3.constants import *
 
 import pycuda.compiler
 import pycuda.driver as cuda
+from pycuda.gpuarray import vec
 
 from cuburn import affine
 from cuburn.code import util, mwc, iter, filtering
@@ -107,10 +108,11 @@ class Animation(object):
         """
         self._iter = iter.IterCode(self.features)
         self._de = filtering.DensityEst(self.features, self.genomes[0])
+        cclip = filtering.ColorClip(self.features)
         # TODO: make choice of filtering explicit
         # TODO: autoload dependent modules?
         self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer,
-                                      self._iter, filtering.ColorClip, self._de)
+                                      self._iter, cclip, self._de)
         self.cubin = pycuda.compiler.compile(self.src, keep=keep,
                                              options=list(cmp_options))
         return self.src
@@ -256,6 +258,7 @@ class _AnimRenderer(object):
 
         # Must be accumulated over all CPs
         gam, vib = 0, 0
+        bkgd = np.zeros(3)
 
         # This is gross, but there are a lot of fiddly corner cases with any
         # index-based iteration scheme.
@@ -270,6 +273,7 @@ class _AnimRenderer(object):
                     infos.append(info)
                     gam += cp.gamma
                     vib += cp.vibrancy
+                    bkgd += np.array(cp.background)
             else:
                 # Can't interpolate normally; just pack copies
                 # TODO: this still packs the genome 20 times or so instead of
@@ -278,6 +282,7 @@ class _AnimRenderer(object):
                 infos = [packed] * len(block_times)
                 gam += a.genomes[0].gamma * len(block_times)
                 vib += a.genomes[0].vibrancy * len(block_times)
+                bkgd += np.array(a.genomes[0].background) * len(block_times)
 
             infos = np.concatenate(infos)
             offset = b * packer.align * self.cps_per_block
@@ -293,7 +298,7 @@ class _AnimRenderer(object):
             # TODO: get block config from IterCode
             # TODO: print timing information
             iter_fun(self.d_seeds[b], np.uint64(d_info_off),
-                     self.d_accum, self.d_den,
+                     self.d_accum, self.d_den, texrefs=[tref],
                      block=(32, 16, 1), grid=(len(block_times), 1),
                      stream=self.stream)
 
@@ -301,19 +306,17 @@ class _AnimRenderer(object):
         # stream here. Later, once we've decided on a density-buffer prefilter,
         # we will move it to the GPU, allowing it to be embedded in the stream
         # and letting the remaining code be asynchronous.
-        self.stream.synchronize()
-        dbuf_dim = (a.features.acc_height, a.features.acc_stride)
-        dbuf = cuda.from_device(self.d_den, dbuf_dim, np.float32)
-        dbuf = ndimage.filters.gaussian_filter(dbuf, 0.6)
-        cuda.memcpy_htod(self.d_den, dbuf)
+        #self.stream.synchronize()
+        #dbuf_dim = (a.features.acc_height, a.features.acc_stride)
+        #dbuf = cuda.from_device(self.d_den, dbuf_dim, np.float32)
+        #dbuf = ndimage.filters.gaussian_filter(dbuf, 0.6)
+        #cuda.memcpy_htod(self.d_den, dbuf)
 
         util.BaseCode.zero_dptr(a.mod, self.d_out, 4 * self.nbins,
                                 self.stream)
-        self.stream.synchronize()
         a._de.invoke(a.mod, Genome(cen_cp),
                      self.d_accum, self.d_out, self.d_den,
                      self.stream)
-        self.stream.synchronize()
 
         f = np.float32
         n = f(self.ncps)
@@ -322,12 +325,12 @@ class _AnimRenderer(object):
         hipow = f(cen_cp.highlight_power)
         lin = f(cen_cp.gam_lin_thresh)
         lingam = f(math.pow(cen_cp.gam_lin_thresh, gam-1.0) if lin > 0 else 0)
-        print gam, vib, lin, lingam, cen_cp.gamma
+        bkgd = vec.make_float3(*(bkgd / n))
 
         # TODO: get block size from colorclip class? It actually does not
         # depend on that being the case
         color_fun = a.mod.get_function("colorclip")
-        color_fun(self.d_out, gam, vib, hipow, lin, lingam,
+        color_fun(self.d_out, gam, vib, hipow, lin, lingam, bkgd,
                   block=(256, 1, 1), grid=(self.nbins / 256, 1),
                   stream=self.stream)
 
@@ -402,6 +405,12 @@ class Features(object):
     # The filtering code makes deep assumptions about this value.
     gutter = 16
 
+    # TODO: for now, we always throw away the alpha channel before writing.
+    # All code is in place to not do this, we just need to find a way to expose
+    # this preference via the API (or push alpha blending entirely on the client,
+    # which I'm not opposed to)
+    alpha_output_channel = False
+
     def __init__(self, genomes):
         any = lambda l: bool(filter(None, map(l, genomes)))
         self.max_ntemporal_samples = max(