diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py
index acc6af9..73b2808 100644
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@@ -185,10 +185,6 @@ void iter(
 ) {
     const iter_params *global_params = &(all_params[blockIdx.x]);
 
-{{if info.acc_mode != 'deferred'}}
-    __shared__ float time_frac;
-    time_frac = blockIdx.x / (float) gridDim.x;
-{{endif}}
 
     // load params to shared memory cooperatively
     for (int i = threadIdx.y * blockDim.x + threadIdx.x;
@@ -204,6 +200,13 @@ void iter(
     int this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;
     mwc_st rctx = msts[this_rb_idx];
 
+{{if info.acc_mode != 'deferred'}}
+    __shared__ float time_frac;
+    time_frac = blockIdx.x / (float) gridDim.x;
+{{else}}
+    float color_dither = 0.49f * mwc_next_11(rctx);
+{{endif}}
+
     // TODO: 4th channel unused. Kill or use for something helpful
     float4 old_point = points[this_rb_idx];
     float x = old_point.x, y = old_point.y, color = old_point.z;
@@ -351,7 +354,7 @@ void iter(
         *accbuf = pix;
 {{elif info.acc_mode == 'deferred'}}
         // 'color' gets the top 9 bits. TODO: add dithering via precalc.
-        uint32_t icolor = fminf(1.0f, cc) * 511.0f;
+        uint32_t icolor = fminf(1.0f, cc) * 511.0f + color_dither;
         asm("bfi.b32    %0, %1, %0, 23, 9;" : "+r"(i) : "r"(icolor));
         *log = i;
 {{endif}}