diff --git a/cuburn/code/filters.py b/cuburn/code/filters.py
index 4e6eb3d..d96dbc7 100644
--- a/cuburn/code/filters.py
+++ b/cuburn/code/filters.py
@@ -108,6 +108,26 @@ __global__ void den_blur_1c(float *dst, int pattern, int upsample) {
 }
 ''')
 
+
+fullblurlib = devlib(deps=[denblurlib], defs=r'''
+__global__ void full_blur(float4 *dst, int pattern, int upsample) {
+    GET_IDX_2(xi, yi, gi);
+    float x = xi, y = yi;
+    // 7-tap weighted sum of tex_shear() samples; the result goes to dst[gi].
+    float4 val = make_float4(0, 0, 0, 0);
+    // i - 3 spans -3..3; presumably the tap offset, scaled by 2^upsample.
+    #pragma unroll
+    for (int i = 0; i < 7; i++) {
+        float4 pix = tex_shear(chan4_src, pattern, x, y, (i - 3) << upsample);
+        val.x += pix.x * gauss_coefs[i];  // gauss_coefs: not defined in this block
+        val.y += pix.y * gauss_coefs[i];  // same weight for every channel
+        val.z += pix.z * gauss_coefs[i];
+        val.w += pix.w * gauss_coefs[i];
+    }
+    dst[gi] = val;
+}
+''')
+
 bilaterallib = devlib(deps=[logscalelib, texshearlib, denblurlib], defs=r'''
 /* sstd:    spatial standard deviation (Gaussian filter)
  * cstd:    color standard deviation (Gaussian on the range [0, 1], where 1
@@ -225,12 +245,11 @@ halocliplib = devlib(deps=[yuvlib, denblurlib], defs=r'''
 __global__ void apply_gamma(float *dst, float4 *src, float gamma) {
     GET_IDX(i);
     float4 pix = src[i];
-    float ls = powf(fmaxf(0.0f, src[i].z), gamma);
-    dst[i] = ls * pix.x;
+    dst[i] = powf(pix.x, gamma);
 }
 
 __global__ void
-haloclip(float4 *pixbuf, const float *denbuf, float gamma) {
+haloclip(float4 *pixbuf, const float *denbuf, float gamma_m_1) {
     GET_IDX(i);
     float4 pix = pixbuf[i];
     float areaval = denbuf[i];
@@ -240,12 +259,45 @@ haloclip(float4 *pixbuf, const float *denbuf, float gamma) {
         return;
     }
 
-    float ls = powf(pix.z, gamma) / fmaxf(1.0f, areaval);
-
+    float ls = powf(pix.w, gamma_m_1) / fmaxf(1.0f, areaval);
     scale_float4(pix, ls);
-
     yuvo2rgb(pix);
+    pixbuf[i] = pix;
+}
+''')
 
+smearcliplib = devlib(deps=[yuvlib, fullblurlib], defs=r'''
+// Scale src[i] via scale_float4() by max(0, w - 1) / w, or by 0 when w <= 0;
+// store in dst[i]. NOTE(review): gamma_m_1 is never read, no gamma applied.
+__global__ void apply_gamma_full_hi(float4 *dst, float4 *src, float gamma_m_1) {
+    GET_IDX(i);
+    float4 pix = src[i];
+    float ls = 0.0f;
+    if (pix.w > 0.0f)
+        ls = fmaxf(0.0f, pix.w - 1.0f) / pix.w;
+    scale_float4(pix, ls);
+    dst[i] = pix;
+}
+
+__global__ void
+smearclip(float4 *pixbuf, const float4 *smearbuf, float gamma_m_1) {
+    GET_IDX(i);
+    float4 pix = pixbuf[i];
+    float4 areaval = smearbuf[i];
+    // Add the smear buffer's value to the pixel, channel by channel.
+    pix.x += areaval.x;
+    pix.y += areaval.y;
+    pix.z += areaval.z;
+    pix.w += areaval.w;
+    // Non-positive w after the add: write an all-zero pixel and stop.
+    if (pix.w <= 0) {
+        pixbuf[i] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+        return;
+    }
+    // Scale by w^gamma_m_1, then run yuvo2rgb() before storing back in place.
+    float ls = powf(pix.w, gamma_m_1);
+    scale_float4(pix, ls);
+    yuvo2rgb(pix);
     pixbuf[i] = pix;
 }
 ''')
diff --git a/cuburn/filters.py b/cuburn/filters.py
index 93c7d91..322a2bc 100644
--- a/cuburn/filters.py
+++ b/cuburn/filters.py
@@ -28,6 +28,8 @@ def mkdsc(dim, ch):
                   format=cuda.array_format.FLOAT)
 
 class Filter(object):
+    # Set to True if the filter requires a full 4-channel side buffer
+    full_side = False
     def apply(self, fb, gprof, params, dim, tc, stream=None):
         """
         Queue the application of this filter. When the live stream finishes
@@ -92,23 +94,43 @@ class Logscale(Filter, ClsMod):
 class HaloClip(Filter, ClsMod):
     lib = code.filters.halocliplib
     def apply(self, fb, gprof, params, dim, tc, stream=None):
-        gam = f32(1 / params.gamma(tc) - 1)
+        gam = f32(1 / gprof.filters.colorclip.gamma(tc) - 1)  # colorclip's gamma
 
         dsc = mkdsc(dim, 1)
         tref = mktref(self.mod, 'chan1_src')
 
         set_blur_width(self.mod, fb.pool, stream=stream)
         launch2('apply_gamma', self.mod, stream, dim,
-                fb.d_side, fb.d_front, gam)
-        tref.set_address_2d(fb.d_side, dsc, 4 * params.astride)
+                fb.d_side, fb.d_front, f32(0.1))  # NOTE(review): fixed 0.1, not gam
+        tref.set_address_2d(fb.d_side, dsc, 4 * dim.astride)
         launch2('den_blur_1c', self.mod, stream, dim,
-               fb.d_back, i32(0), i32(0), texrefs=[tref])
-        tref.set_address_2d(fb.d_back, dsc, 4 * params.astride)
+               fb.d_back, i32(2), i32(0), texrefs=[tref])
+        tref.set_address_2d(fb.d_back, dsc, 4 * dim.astride)
         launch2('den_blur_1c', self.mod, stream, dim,
-               fb.d_side, i32(1), i32(0), texrefs=[tref])
+               fb.d_side, i32(3), i32(0), texrefs=[tref])
 
         launch2('haloclip', self.mod, stream, dim,
-                fb.d_front, fb.d_side)
+                fb.d_front, fb.d_side, gam)  # gam = 1/gamma - 1
+
+class SmearClip(Filter, ClsMod):
+    full_side = True  # d_side is used as a 4-channel buffer by this filter
+    lib = code.filters.smearcliplib
+    def apply(self, fb, gprof, params, dim, tc, stream=None):
+        gam = f32(1 / gprof.filters.colorclip.gamma(tc) - 1)  # colorclip's gamma
+        dsc = mkdsc(dim, 4)
+        tref = mktref(self.mod, 'chan4_src')
+        # Flow: front -> side, blur side -> back, blur back -> side, then smearclip.
+        set_blur_width(self.mod, fb.pool, params.width(tc), stream)
+        launch2('apply_gamma_full_hi', self.mod, stream, dim,
+                fb.d_side, fb.d_front, gam)
+        tref.set_address_2d(fb.d_side, dsc, 16 * dim.astride)
+        launch2('full_blur', self.mod, stream, dim,
+               fb.d_back, i32(2), i32(0), texrefs=[tref])
+        tref.set_address_2d(fb.d_back, dsc, 16 * dim.astride)
+        launch2('full_blur', self.mod, stream, dim,
+               fb.d_side, i32(3), i32(0), texrefs=[tref])
+        launch2('smearclip', self.mod, stream, dim,
+                fb.d_front, fb.d_side, gam)  # final pass rewrites d_front in place
 
 class ColorClip(Filter, ClsMod):
     lib = code.filters.colorcliplib
@@ -124,6 +146,6 @@ class ColorClip(Filter, ClsMod):
 
 # Ungainly but practical.
 filter_map = dict(bilateral=Bilateral, logscale=Logscale, haloclip=HaloClip,
-                  colorclip=ColorClip)
+                  colorclip=ColorClip, smearclip=SmearClip)
 def create(gprof):
     return [filter_map[f]() for f in gprof.filter_order]
diff --git a/cuburn/genome/specs.py b/cuburn/genome/specs.py
index 9f420c4..d9332d3 100644
--- a/cuburn/genome/specs.py
+++ b/cuburn/genome/specs.py
@@ -51,7 +51,8 @@ filters = (
     , 'minimum': scalespline(0, max=1, d='Proportional min radius')
     , 'curve': scalespline(0.6, d='Power of filter radius with density')
     }
-  , 'haloclip': {'gamma': scalespline(4)}
+  , 'haloclip': {}
+  , 'smearclip': {'width': scalespline(0.75, d='Spatial stdev of filter')}
   , 'logscale': {'brightness': scalespline(4, d='Log-scale brightness')}
   })
 
@@ -96,7 +97,7 @@ edge.update(type='edge', author=author, blend=blend, link=link, time=time,
 anim = dict(base)
 anim.update(type='animation', authors=list_(author), link=link, time=time)
 
-default_filters = ['bilateral', 'logscale', 'colorclip']
+default_filters = ['bilateral', 'logscale', 'smearclip']
 # Yeah, now I'm just messing around.
 prof_filters = dict([(fk, dict([(k, refscalar(1, '.'.join(['filters', fk, k])))
                            for k in fv])) for fk, fv in filters.items()])
diff --git a/cuburn/render.py b/cuburn/render.py
index b272a8e..8ffc5cc 100644
--- a/cuburn/render.py
+++ b/cuburn/render.py
@@ -113,7 +113,7 @@ class Framebuffers(object):
         try:
             self.d_front = cuda.mem_alloc(16 * nbins)
             self.d_back  = cuda.mem_alloc(16 * nbins)
-            self.d_side  = cuda.mem_alloc(8 * nbins)
+            self.d_side  = cuda.mem_alloc(16 * nbins)
             self.nbins = nbins
         except cuda.MemoryError, e:
             # If a frame that's too large sneaks by the task distributor, we