diff --git a/cuburn/code/interp.py b/cuburn/code/interp.py
index 0cbae08..5b4961f 100644
--- a/cuburn/code/interp.py
+++ b/cuburn/code/interp.py
@@ -270,8 +270,8 @@ void interp_palette_hsv_flat(mwc_st *rctxs,
     uint32_t r = min(255, (uint32_t) (rgba.x * 255.0f + 0.49f * mwc_next_11(rctx)));
     uint32_t g = min(255, (uint32_t) (rgba.y * 255.0f + 0.49f * mwc_next_11(rctx)));
     uint32_t b = min(255, (uint32_t) (rgba.z * 255.0f + 0.49f * mwc_next_11(rctx)));
-    out.x = (1 << 22) | (r << 4);
-    out.y = (g << 18) | b;
+    out.y = (1 << 22) | (r << 4);
+    out.x = (g << 18) | b;
     surf2Dwrite(out, flatpal, 8 * threadIdx.x, blockIdx.x);
     rctxs[gid] = rctx;
 }
diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py
index b996481..45eae76 100644
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@@ -187,8 +187,10 @@ void iter(
         uint64_t out_ptr,
         mwc_st *msts,
         float4 *points,
-        const iter_params *all_params,
-        int nsamps_to_generate
+        const iter_params *all_params
+{{if info.acc_mode == 'atomic'}}
+        , uint64_t atom_ptr
+{{endif}}
 ) {
     const iter_params *global_params = &(all_params[blockIdx.x]);

@@ -207,10 +209,14 @@ void iter(
     int this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;
     mwc_st rctx = msts[this_rb_idx];

-{{if info.acc_mode != 'deferred'}}
+{{if info.acc_mode == 'global'}}
     __shared__ float time_frac;
     time_frac = blockIdx.x / (float) gridDim.x;
 {{else}}
+    {{if info.acc_mode == 'atomic'}}
+    // TODO: spare the register, reuse at call site?
+    int time = blockIdx.x >> 4;
+    {{endif}}
     float color_dither = 0.49f * mwc_next_11(rctx);
 {{endif}}

@@ -224,10 +230,10 @@ void iter(
     // Shared memory size can be reduced by a factor of four using a slower
     // 4-stage reduce, but on Fermi hardware shmem use isn't a bottleneck
     __shared__ float swap[{{4*NTHREADS}}];
-    __shared__ float cosel[{{NWARPS}}];
+    __shared__ float cosel[{{2*NWARPS}}];

     // This is normally done after the swap-sync in the main loop
-    if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
+    if (threadIdx.y == 0 && threadIdx.x < {{NWARPS*2}})
         cosel[threadIdx.x] = mwc_next_01(rctx);
     __syncthreads();
 {{endif}}
@@ -295,7 +301,7 @@ void iter(
         __syncthreads();

         // We select the next xforms here, since we've just synced.
-        if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
+        if (threadIdx.y == 0 && threadIdx.x < {{NWARPS*2}})
             cosel[threadIdx.x] = mwc_next_01(rctx);

         fuse = swap[sr];
@@ -345,15 +351,60 @@ void iter(
     }

     uint32_t i = iy * acc_size.stride + ix;

 {{if info.acc_mode == 'atomic'}}
-    float4 outcol = tex2D(palTex, cc, time_frac);
-    float *accbuf_f = reinterpret_cast<float*>(out_ptr + (16*i));
-    atomicAdd(accbuf_f, outcol.x);
-    atomicAdd(accbuf_f+1, outcol.y);
-    atomicAdd(accbuf_f+2, outcol.z);
-    atomicAdd(accbuf_f+3, 1.0f);
-{{elif info.acc_mode == 'global'}}
+    asm volatile ({{crep("""
+{
+    .reg .pred  p;
+    .reg .u32   off, color, hi, lo, d, r, g, b;
+    .reg .f32   colorf, rf, gf, bf, df;
+    .reg .u64   ptr, val;
+
+    // TODO: coord dithering better, or pre-supersampled palette?
+    fma.rn.ftz.f32      colorf, %0, 255.0, %1;
+    cvt.rni.u32.f32     color, colorf;
+    shl.b32             color, color, 3;
+
+    suld.b.2d.v2.b32.clamp {lo, hi}, [flatpal, {color, %2}];
+    mov.b64             val, {lo, hi};
+    shl.b32             off, %3, 3;
+    cvt.u64.u32         ptr, off;
+    add.u64             ptr, ptr, %4;
+    setp.le.f32         p, %5, 0.98;
+@p  red.global.add.u64  [ptr], val;
+@p  bra                 oflow_end;
+    atom.global.add.u64 val, [ptr], val;
+    mov.b64             {lo, hi}, val;
+    setp.lo.u32         p, hi, (256 << 22);
+@p  bra                 oflow_end;
+    atom.global.exch.b64 val, [ptr], 0;
+    mov.b64             {lo, hi}, val;
+    shr.u32             d, hi, 22;
+    bfe.u32             r, hi, 4, 18;
+    bfe.u32             g, lo, 18, 14;
+    bfi.b32             g, hi, g, 14, 4;
+    and.b32             b, lo, ((1<<18)-1);
+    cvt.rn.f32.u32      rf, r;
+    cvt.rn.f32.u32      gf, g;
+    cvt.rn.f32.u32      bf, b;
+    cvt.rn.f32.u32      df, d;
+    mul.rn.ftz.f32      rf, rf, (1.0/255.0);
+    mul.rn.ftz.f32      gf, gf, (1.0/255.0);
+    mul.rn.ftz.f32      bf, bf, (1.0/255.0);
+    shl.b32             off, %3, 4;
+    cvt.u64.u32         ptr, off;
+    add.u64             ptr, ptr, %6;
+    red.global.add.f32  [ptr], rf;
+    red.global.add.f32  [ptr+4], gf;
+    red.global.add.f32  [ptr+8], bf;
+    red.global.add.f32  [ptr+12], df;
+oflow_end:
+}
+    """)}} ::  "f"(cc), "f"(color_dither), "r"(time), "r"(i),
+               "l"(atom_ptr), "f"(cosel[threadIdx.y + {{NWARPS}}]),
+               "l"(out_ptr));
+{{endif}}
+
+{{if info.acc_mode == 'global'}}
     float4 outcol = tex2D(palTex, cc, time_frac);
     float4 *accbuf = reinterpret_cast<float4*>(out_ptr + (16*i));
     float4 pix = *accbuf;
@@ -380,6 +431,47 @@ void iter(
     return;
 }

+{{if info.acc_mode == 'atomic'}}
+__global__
+void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
+    int i = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
+    asm volatile ({{crep("""
+{
+    .reg .u32   off, hi, lo, d, r, g, b;
+    .reg .u64   val, ptr;
+    .reg .f32   rf, gf, bf, df, rg, gg, bg, dg;
+
+    // TODO: use explicit movs to handle this
+    shl.b32             off, %0, 3;
+    cvt.u64.u32         ptr, off;
+    add.u64             ptr, ptr, %1;
+    ld.global.v2.u32    {lo, hi}, [ptr];
+    shl.b32             off, %0, 4;
+    cvt.u64.u32         ptr, off;
+    add.u64             ptr, ptr, %2;
+    ld.global.v4.f32    {rg,gg,bg,dg}, [ptr];
+    shr.u32             d, hi, 22;
+    bfe.u32             r, hi, 4, 18;
+    bfe.u32             g, lo, 18, 14;
+    bfi.b32             g, hi, g, 14, 4;
+    and.b32             b, lo, ((1<<18)-1);
+    cvt.rn.f32.u32      rf, r;
+    cvt.rn.f32.u32      gf, g;
+    cvt.rn.f32.u32      bf, b;
+    cvt.rn.f32.u32      df, d;
+    fma.rn.ftz.f32      rg, rf, (1.0/255.0), rg;
+    fma.rn.ftz.f32      gg, gf, (1.0/255.0), gg;
+    fma.rn.ftz.f32      bg, bf, (1.0/255.0), bg;
+    add.rn.ftz.f32      dg, df, dg;
+    st.global.v4.f32    [ptr], {rg,gg,bg,dg};
+}
+    """)}} :: "r"(i), "l"(atom_ptr), "l"(out_ptr));
+}
+
+{{endif}}
+
+{{if info.acc_mode == 'deferred'}}
+
 // Block size, shared accumulation bits, shared accumulation width.
 #define BS 1024
 #define SHAB 12

@@ -462,8 +554,8 @@ write_shmem(
     shl.b32         color, color, 3;
     cvt.rni.u32.f32 time, %1;

-    suld.b.2d.v2.b32.clamp {hi, lo}, [flatpal, {color, time}];
-    ld.shared.v2.u32 {hiw, low}, [shoff];
+    suld.b.2d.v2.b32.clamp {lo, hi}, [flatpal, {color, time}];
+    ld.shared.v2.u32 {low, hiw}, [shoff];
     add.cc.u32      lo, lo, low;
     addc.u32        hi, hi, hiw;
     setp.hs.u32     q, hi, (1023 << 22);
@@ -513,7 +605,7 @@ oflow_write_end:
     asm({{crep("""
 {
     .reg .u32 hi, lo;
-    ld.shared.v2.u32 {hi, lo}, [%4];
+    ld.shared.v2.u32 {lo, hi}, [%4];
     shr.u32         %0, hi, 22;
     bfe.u32         %1, hi, 4, 18;
     bfe.u32         %2, lo, 18, 14;
@@ -530,7 +622,7 @@ oflow_write_end:
         glo_idx += (BS << 8);
     }
 }
-
+{{endif}}
 ''', 'iter_kern')
         return tmpl.substitute(
                 info = self.info,
diff --git a/cuburn/render.py b/cuburn/render.py
index 8986dae..977a73a 100644
--- a/cuburn/render.py
+++ b/cuburn/render.py
@@ -8,7 +8,7 @@ from itertools import cycle, repeat, chain, izip
 from ctypes import *
 from cStringIO import StringIO
 import numpy as np
-from numpy import int32 as i32
+from numpy import int32 as i32, uint64 as u64
 from scipy import ndimage

 from fr0stlib import pyflam3
@@ -110,7 +110,6 @@ class Renderer(object):
         reset_rb_fun = self.mod.get_function("reset_rb")
         packer_fun = self.mod.get_function("interp_iter_params")
         iter_fun = self.mod.get_function("iter")
-        write_fun = self.mod.get_function("write_shmem")

         info = self.info

@@ -119,6 +118,7 @@ class Renderer(object):
         filt_stream = cuda.Stream()
         if info.acc_mode == 'deferred':
             write_stream = cuda.Stream()
+            write_fun = self.mod.get_function("write_shmem")
         else:
             write_stream = iter_stream

@@ -132,6 +132,9 @@ class Renderer(object):
         # Extra padding in accum helps with write_shmem overruns
         d_accum = cuda.mem_alloc(16 * nbins + (1<<16))
         d_out = cuda.mem_alloc(16 * nbins)
+        if info.acc_mode == 'atomic':
+            d_atom = cuda.mem_alloc(8 * nbins)
+            flush_fun = self.mod.get_function("flush_atom")

         acc_size = np.array([info.acc_width, info.acc_height, info.acc_stride])
         d_acc_size = self.mod.get_global('acc_size')[0]
@@ -191,7 +194,7 @@ class Renderer(object):
         d_palint_vals = cuda.to_device(
                 np.concatenate(map(info.db.palettes.get, pals[1::2])))

-        if info.acc_mode == 'deferred':
+        if info.acc_mode in ('deferred', 'atomic'):
             palette_fun = self.mod.get_function("interp_palette_hsv_flat")
             dsc = cuda.ArrayDescriptor3D()
             dsc.height = info.palette_height
@@ -227,7 +230,7 @@ class Renderer(object):

         for idx, start, stop in times:
             twidth = np.float32((stop-start) / info.palette_height)
-            if info.acc_mode == 'deferred':
+            if info.acc_mode in ('deferred', 'atomic'):
                 palette_fun(d_seeds, d_palint_times, d_palint_vals,
                             np.float32(start), twidth,
                             block=(256,1,1), grid=(info.palette_height,1),
@@ -257,6 +260,8 @@ class Renderer(object):
                 #print '%60s %g' % ('_'.join(n), i)

             util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, write_stream)
+            if info.acc_mode == 'atomic':
+                util.BaseCode.fill_dptr(self.mod, d_atom, 2 * nbins, write_stream)
             nrounds = ( (info.density * info.width * info.height)
                       / (ntemporal_samples * 256 * 256) ) + 1
             if info.acc_mode == 'deferred':
@@ -273,9 +278,16 @@ class Renderer(object):
                           block=(1024, 1, 1), grid=(nwriteblocks, 1),
                           stream=write_stream)
             else:
-                iter_fun(np.uint64(d_accum), d_seeds, d_points, d_infos,
-                         block=(32, self._iter.NTHREADS/32, 1),
+                args = [u64(d_accum), d_seeds, d_points, d_infos]
+                if info.acc_mode == 'atomic':
+                    args.append(u64(d_atom))
+                iter_fun(*args, block=(32, self._iter.NTHREADS/32, 1),
                          grid=(ntemporal_samples, nrounds),
                          stream=iter_stream)
+                if info.acc_mode == 'atomic':
+                    nblocks = int(np.ceil(np.sqrt(nbins/float(512))))
+                    flush_fun(u64(d_accum), u64(d_atom), i32(nbins),
+                              block=(512, 1, 1), grid=(nblocks, nblocks),
+                              stream=iter_stream)
             util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
             _sync_stream(filt_stream, write_stream)
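
A note on the packed accumulator format used above (commentary only, not part of the patch): each bin of the d_atom buffer is one 64-bit word holding a 10-bit hit count plus 18-bit red, green, and blue sums, with the high word laid out as [count:10][r:18][g_hi:4] and the low word as [g_lo:14][b:18]. The texel written by interp_palette_hsv_flat seeds exactly one sample in that layout (the (1 << 22) term is the count increment), so a single 64-bit atomic add accumulates all four fields at once. Roughly 98% of the time a warp takes the fire-and-forget red.global.add path; the occasional atom.global.add checks whether the count has reached 256, and if so the word is exchanged out and folded into the float4 accumulator, the same fold flush_atom performs on whatever remains after the iter launch. A minimal C sketch of that unpacking step, assuming the bit layout above (the helper name unpack_atom_cell is made up for illustration):

    #include <stdint.h>

    /* Hypothetical helper mirroring the layout implied by the PTX:
     * hi = [count:10][r:18][g_hi:4], lo = [g_lo:14][b:18]. Channel sums
     * are scaled by 1/255, as in flush_atom. */
    static void unpack_atom_cell(uint64_t cell, float rgba[4])
    {
        uint32_t lo = (uint32_t) cell;          /* low word  (out.x in interp) */
        uint32_t hi = (uint32_t) (cell >> 32);  /* high word (out.y in interp) */

        uint32_t count = hi >> 22;                          /* 10-bit hit count */
        uint32_t r     = (hi >> 4) & ((1u << 18) - 1);      /* 18-bit red sum   */
        uint32_t g     = (lo >> 18) | ((hi & 0xfu) << 14);  /* 18-bit green sum */
        uint32_t b     = lo & ((1u << 18) - 1);             /* 18-bit blue sum  */

        rgba[0] = r * (1.0f / 255.0f);
        rgba[1] = g * (1.0f / 255.0f);
        rgba[2] = b * (1.0f / 255.0f);
        rgba[3] = (float) count;
    }

The 256-count flush threshold keeps the channel fields safe: 256 samples of at most 255 each sum to 65,280, comfortably below the 2^18 capacity of each field.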