diff --git a/cuburn/code/filter.py b/cuburn/code/filter.py
index f4f102b..a41674d 100644
--- a/cuburn/code/filter.py
+++ b/cuburn/code/filter.py
@@ -117,7 +117,7 @@ void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf,
         float den = denbuf[idx];
 
         if (in.w > 0 && den > 0) {
-            float ls = k1 * 12 * logf(1.0 + in.w * k2) / in.w;
+            float ls = k1 * logf(1.0f + in.w * k2) / in.w;
             in.x *= ls;
             in.y *= ls;
             in.z *= ls;
@@ -200,19 +200,6 @@ void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf,
             atomicAdd(out+3, de_a[si]);
         }
 
-        if (threadIdx.y == 5000) {
-            for (int i = threadIdx.x; i < FW; i += 32) {
-                idx = {{features.acc_stride}} * (imrow + 32)
-                    + blockIdx.x * 32 + i + W2;
-                int si = 32 * FW + i;
-                float *out = reinterpret_cast<float*>(&outbuf[idx]);
-                atomicAdd(out,   0.2 + de_r[si]);
-                atomicAdd(out+1, de_g[si]);
-                atomicAdd(out+2, de_b[si]);
-                atomicAdd(out+3, de_a[si]);
-            }
-        }
-
         __syncthreads();
         // TODO: shift instead of copying
         int tid = threadIdx.y * 32 + threadIdx.x;
@@ -239,9 +226,11 @@ void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf,
     def invoke(self, mod, abufd, obufd, dbufd):
         # TODO: add no-est version
         # TODO: come up with a general way to average these parameters
+
         k1 = self.cp.brightness * 268 / 256
-        area = self.features.width * self.features.height / self.cp.ppu ** 2
+        area = self.features.acc_width * self.features.acc_height / self.cp.ppu ** 2
         k2 = 1 / (area * self.cp.adj_density)
+        print k1, k2, area
 
         if self.cp.estimator == 0:
             fun = mod.get_function("logscale")
@@ -251,7 +240,7 @@ void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf,
         else:
             fun = mod.get_function("density_est")
             t = fun(abufd, obufd, dbufd, np.float32(k1), np.float32(k2),
-                    block=(32, 32, 1), grid=(self.features.acc_stride/32 - 1, 1),
+                    block=(32, 32, 1), grid=(self.features.acc_width/32, 1),
                     time_kernel=True)
             print "Density estimation: %g" % t
 
diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py
index 15f7ce4..b714569 100644
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@@ -8,6 +8,7 @@ import pycuda.driver as cuda
 from pycuda.driver import In, Out, InOut
 from pycuda.compiler import SourceModule
 import numpy as np
+from scipy import ndimage
 
 from fr0stlib.pyflam3 import flam3_interpolate
 from cuburn.code import mwc, variations, filter
@@ -125,8 +126,8 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
 
         int ix = trunca(cx+ditherx), iy = trunca(cy+dithery);
 
-        if (ix < 0 || ix >= {{features.width}} ||
-            iy < 0 || iy >= {{features.height}} ) {
+        if (ix < 0 || ix >= {{features.acc_width}} ||
+            iy < 0 || iy >= {{features.acc_height}} ) {
             consec_bad++;
             if (consec_bad > {{features.max_oob}}) {
                 x = mwc_next_11(&rctx);
@@ -137,7 +138,7 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
             continue;
         }
 
-        int i = iy * {{features.width}} + ix;
+        int i = iy * {{features.acc_stride}} + ix;
 
         float4 outcol = tex2D(palTex, color, {{packer.get('cp_step_frac')}});
         float4 pix = accbuf[i];
@@ -148,6 +149,7 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
         accbuf[i] = pix;    // TODO: atomic operations (or better)
         denbuf[i] += 1.0f;
     }
+    asm volatile ("membar.cta;");
 }
 """)
         return tmpl.substitute(
@@ -158,8 +160,8 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
 def render(features, cps):
     # TODO: make this adjustable via genome
     nsteps = 1000
-    abuf = np.zeros((features.height, features.width, 4), dtype=np.float32)
-    dbuf = np.zeros((features.height, features.width), dtype=np.float32)
+    abuf = np.zeros((features.acc_height, features.acc_stride, 4), dtype=np.float32)
+    dbuf = np.zeros((features.acc_height, features.acc_stride), dtype=np.float32)
     seeds = mwc.MWC.make_seeds(512 * nsteps)
 
     iter = IterCode(features)
@@ -221,9 +223,13 @@ def render(features, cps):
 
     f = np.float32
 
-    npix = features.width * features.height
+    npix = features.acc_width * features.acc_height
 
+    # TODO: just allocate obufd on the device instead of uploading abuf
     obufd = cuda.to_device(abuf)
+    dbuf = cuda.from_device_like(dbufd, dbuf)
+    dbuf = ndimage.filters.gaussian_filter(dbuf, 0.6)
+    dbufd = cuda.to_device(dbuf)
     de.invoke(mod, abufd, obufd, dbufd)
 
     fun = mod.get_function("colorclip")
diff --git a/cuburn/render.py b/cuburn/render.py
index f025b27..9267f14 100644
--- a/cuburn/render.py
+++ b/cuburn/render.py
@@ -38,7 +38,9 @@ class Genome(pyflam3.Genome):
         """
         # TODO: when reading as a property during packing, this may be
         # calculated 6 times instead of 1
-        return ( affine.translate(0.5 * cp.width, 0.5 * cp.height)
+        # TODO: also requires knowing gutter width
+        g = Features.gutter
+        return ( affine.translate(0.5 * cp.width + g, 0.5 * cp.height + g)
                * affine.scale(cp.ppu, cp.ppu)
                * affine.translate(-cp._center[0], -cp._center[1])
                * affine.rotate(cp.rotate * 2 * np.pi / 360,
@@ -90,6 +92,10 @@ class Features(object):
     # performance too much. Power-of-two, please.
     palette_height = 16
 
+    # Maximum width, in pixels, of DE and other spatial filters. The
+    # accumulation buffer is padded by this amount on every edge.
+    gutter = 16
+
     def __init__(self, genomes):
         any = lambda l: bool(filter(None, map(l, genomes)))
         self.max_ntemporal_samples = max(
@@ -112,9 +118,9 @@ class Features(object):
 
         self.width = genomes[0].width
         self.height = genomes[0].height
-        self.acc_width = genomes[0].width
-        self.acc_height = genomes[0].height
-        self.acc_stride = genomes[0].width
+        self.acc_width = genomes[0].width + 2 * self.gutter
+        self.acc_height = genomes[0].height + 2 * self.gutter
+        self.acc_stride = genomes[0].width + 2 * self.gutter
 
 class XFormFeatures(object):
     def __init__(self, xforms, xform_id):
diff --git a/main.py b/main.py
index 5316f5b..bf1ce85 100644
--- a/main.py
+++ b/main.py
@@ -44,6 +44,10 @@ def main(args):
         genomes = Genome.from_string(fp.read())
     anim = Animation(genomes)
     accum, den = render(anim.features, genomes)
+    # Crop the filter gutter that pads every edge of the accumulation buffer.
+    g = anim.features.gutter
+    rows, cols = accum.shape[:2]
+    accum = accum[g:rows - g, g:cols - g].copy()
 
     noalpha = np.delete(accum, 3, axis=2)
     scipy.misc.imsave('rendered.png', noalpha)