diff --git a/cuburn/code/filtering.py b/cuburn/code/filtering.py
index 3a64d48..2ad17bf 100644
--- a/cuburn/code/filtering.py
+++ b/cuburn/code/filtering.py
@@ -15,7 +15,7 @@ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
     float4 pix = pixbuf[i];
 
     if (pix.w <= 0) {
-        pixbuf[i] = make_float4(bkgd.x, bkgd.y, bkgd.z, 0);
+        pixbuf[i] = make_float4(bkgd.x, bkgd.y, bkgd.z, 0.0f);
         return;
     }
 
@@ -121,7 +121,9 @@ __device__ void de_add(int ibase, int ii, int jj, float4 scaled) {
 __global__
 void logscale(float4 *pixbuf, float4 *outbuf, float k1, float k2) {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
+    float den;
     float4 pix = pixbuf[i];
+    read_pix(pix, den);
 
     float ls = fmaxf(0, k1 * logf(1.0f + pix.w * k2) / pix.w);
     pix.x *= ls;
@@ -138,7 +140,7 @@ void logscale(float4 *pixbuf, float4 *outbuf, float k1, float k2) {
 #define MAX_SD 4.33333333f
 
 __global__
-void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf,
+void density_est(float4 *pixbuf, float4 *outbuf,
                  float est_sd, float neg_est_curve, float est_min,
                  float k1, float k2) {
     for (int i = threadIdx.x + 32*threadIdx.y; i < FW2; i += 32)
@@ -151,7 +153,8 @@ void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf,
                 + blockIdx.x * 32 + threadIdx.x + W2;
 
         float4 in = pixbuf[idx];
-        float den = denbuf[idx];
+        float den;
+        read_pix(in, den);
 
         if (in.w > 0 && den > 0) {
             float ls = k1 * logf(1.0f + in.w * k2) / in.w;
@@ -279,7 +282,7 @@ void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf,
 
 ''')
 
-    def invoke(self, mod, cp, abufd, obufd, dbufd, stream=None):
+    def invoke(self, mod, cp, abufd, obufd, stream=None):
         # TODO: add no-est version
         # TODO: come up with a general way to average these parameters
 
@@ -301,7 +304,7 @@ void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf,
             neg_est_curve = np.float32(-cp.estimator_curve)
             est_min = np.float32(cp.estimator_minimum / 3.)
             fun = mod.get_function("density_est")
-            fun(abufd, obufd, dbufd, est_sd, neg_est_curve, est_min, k1, k2,
+            fun(abufd, obufd, est_sd, neg_est_curve, est_min, k1, k2,
                 block=(32, 32, 1), grid=(self.features.acc_width/32, 1),
                 stream=stream)
 
diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py
index a5f87f1..d0f978e 100644
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@@ -16,13 +16,33 @@ class IterCode(HunkOCode):
         bodies = [self._xfbody(i,x) for i,x in enumerate(self.features.xforms)]
         bodies.append(iterbody)
         self.defs = '\n'.join(bodies)
+        self.decls += self.pix_helpers.substitute(features=features)
 
     decls = """
 // Note: for normalized lookups, uchar4 actually returns floats
 texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> palTex;
 __shared__ iter_info info;
+
 """
 
+    pix_helpers = Template("""
+__device__
+void read_pix(float4 &pix, float &den) {
+    den = pix.w;
+    {{if features.pal_has_alpha}}
+    read_half(pix.z, pix.w, pix.z, den);
+    {{endif}}
+}
+
+__device__
+void write_pix(float4 &pix, float den) {
+    {{if features.pal_has_alpha}}
+    write_half(pix.z, pix.z, pix.w, den);
+    {{endif}}
+    pix.w = den;
+}
+""")
+
     def _xfbody(self, xfid, xform):
         px = self.packer.view('info', 'xf%d_' % xfid)
         px.sub('xf', 'cp.xforms[%d]' % xfid)
@@ -204,12 +224,17 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
 
         float4 outcol = tex2D(palTex, color, {{packer.get("cp_step_frac")}});
         float4 pix = accbuf[i];
+        float den;
+        // TODO: unify read/write_pix cycle when alpha is needed
+        read_pix(pix, den);
         pix.x += outcol.x;
         pix.y += outcol.y;
         pix.z += outcol.z;
         pix.w += outcol.w;
-        accbuf[i] = pix;    // TODO: atomic operations (or better)
-        denbuf[i] += 1.0f;
+        den += 1.0f;
+
+        write_pix(pix, den);
+        accbuf[i] = pix;
     }
 }
 ''')
diff --git a/cuburn/code/util.py b/cuburn/code/util.py
index 83bee2f..5c119a9 100644
--- a/cuburn/code/util.py
+++ b/cuburn/code/util.py
@@ -50,7 +50,7 @@ class BaseCode(HunkOCode):
 #include<stdio.h>
 """
 
-    defs = """
+    defs = r"""
 #undef M_E
 #undef M_LOG2E
 #undef M_LOG10E
@@ -103,6 +103,39 @@ void zero_dptr(float* dptr, int size) {
         dptr[i] = 0.0f;
     }
 }
+
+__device__
+void read_half(float &x, float &y, float xy, float den) {
+    asm("\n\t{"
+        "\n\t   .reg .u16       x, y;"
+        "\n\t   .reg .f32       rc;"
+        "\n\t   mov.b32         {x, y},     %2;"
+        "\n\t   mul.f32         rc,         %3,     0f37800000;" // 1/65536.
+        "\n\t   cvt.rn.f32.u16     %0,         x;"
+        "\n\t   cvt.rn.f32.u16     %1,         y;"
+        "\n\t   mul.f32         %0,         %0,     rc;"
+        "\n\t   mul.f32         %1,         %1,     rc;"
+        "\n\t}"
+        : "=f"(x), "=f"(y) : "f"(xy), "f"(den));
+}
+
+__device__
+void write_half(float &xy, float x, float y, float den) {
+    asm("\n\t{"
+        "\n\t   .reg .u16       x, y;"
+        "\n\t   .reg .f32       rc, xf, yf;"
+        "\n\t   rcp.approx.f32  rc,         %3;"
+        "\n\t   mul.f32         rc,         rc,     65536.0;"
+        "\n\t   mul.f32         xf,         %1,     rc;"
+        "\n\t   mul.f32         yf,         %2,     rc;"
+        "\n\t   cvt.rni.u16.f32 x,  xf;"
+        "\n\t   cvt.rni.u16.f32 y,  yf;"
+        "\n\t   mov.b32         %0,         {x, y};"
+        "\n\t}"
+        : "=f"(xy) : "f"(x), "f"(y), "f"(den));
+}
+
+
 """
 
     @staticmethod
diff --git a/cuburn/render.py b/cuburn/render.py
index 2698e0a..90a8a7a 100644
--- a/cuburn/render.py
+++ b/cuburn/render.py
@@ -229,7 +229,6 @@ class _AnimRenderer(object):
         memset(byref(self._cen_cp), 0, sizeof(self._cen_cp))
 
         self.nbins = anim.features.acc_height * anim.features.acc_stride
-        self.d_den = cuda.mem_alloc(4 * self.nbins)
         self.d_accum = cuda.mem_alloc(16 * self.nbins)
         self.d_out = cuda.mem_alloc(16 * self.nbins)
         self.d_infos = cuda.mem_alloc(anim._iter.packer.align * self.ncps)
@@ -245,8 +244,6 @@ class _AnimRenderer(object):
         a._interp(cen_time, cen_cp)
         palette = self._interp_colors(cen_time, cen_cp)
 
-        util.BaseCode.zero_dptr(a.mod, self.d_den, self.nbins,
-                                self.stream)
         util.BaseCode.zero_dptr(a.mod, self.d_accum, 4 * self.nbins,
                                 self.stream)
 
@@ -311,7 +308,7 @@ class _AnimRenderer(object):
             # TODO: get block config from IterCode
             # TODO: print timing information
             iter_fun(self.d_seeds[b], np.uint64(d_info_off),
-                     self.d_accum, self.d_den, texrefs=[tref],
+                     self.d_accum, texrefs=[tref],
                      block=(32, 16, 1), grid=(len(block_times), 1),
                      stream=self.stream)
 
@@ -327,8 +324,7 @@ class _AnimRenderer(object):
 
         util.BaseCode.zero_dptr(a.mod, self.d_out, 4 * self.nbins,
                                 self.stream)
-        a._de.invoke(a.mod, Genome(cen_cp),
-                     self.d_accum, self.d_out, self.d_den,
+        a._de.invoke(a.mod, Genome(cen_cp), self.d_accum, self.d_out,
                      self.stream)
 
         f = np.float32
@@ -381,11 +377,6 @@ class _AnimRenderer(object):
         g = a.features.gutter
         obuf_dim = (a.features.acc_height, a.features.acc_stride, 4)
         out = cuda.from_device(self.d_out, obuf_dim, np.float32)
-        #dacc = cuda.from_device(self.d_accum, obuf_dim, np.float32)
-        #daccw = dacc[:,:,3]
-        #print daccw.sum()
-        # TODO: performance?
-        g = a.features.gutter
         out = np.delete(out, np.s_[:g], axis=0)
         out = np.delete(out, np.s_[:g], axis=1)
         out = np.delete(out, np.s_[-g:], axis=0)
@@ -447,6 +438,10 @@ class Features(object):
         else:
             self.final_xform_index = None
 
+        alphas = np.array([c.color[3] for g in genomes
+                           for c in g.palette.entries])
+        self.pal_has_alpha = np.any(alphas != 1.0)
+
         self.max_cps = max([cp.ntemporal_samples for cp in genomes])
 
         self.width = genomes[0].width