diff --git a/cuburn/device_code.py b/cuburn/device_code.py
index cd418ad..15762d3 100644
--- a/cuburn/device_code.py
+++ b/cuburn/device_code.py
@@ -90,7 +90,7 @@ class IterThread(PTXEntryPoint):
             reg.pred('p_last_cp')
             op.ldu.u32(num_cps, addr(g_num_cps))
             op.setp.ge.u32(p_last_cp, cp_idx, num_cps)
-            op.bra.uni('all_cps_done', ifp=p_last_cp)
+            op.bra('all_cps_done', ifp=p_last_cp)
 
         with block('Load CP address'):
             op.mov.u32(cpA, g_cp_array)
@@ -149,7 +149,7 @@ class IterThread(PTXEntryPoint):
             for xf in features.xforms:
                 label('XFORM_%d' % xf.id)
                 variations.apply_xform(xo, yo, coloro, xi, yi, colori, xf.id)
-                op.bra.uni("xform_done")
+                op.bra("xform_done")
 
         label("xform_done")
         with block("Test if we're still in FUSE"):
@@ -161,7 +161,7 @@ class IterThread(PTXEntryPoint):
 
         reg.pred('p_point_is_valid')
         with block("Write the result"):
-            hist.scatter(xo, yo, coloro, 0, p_point_is_valid)
+            hist.scatter(xo, yo, coloro, 0, p_point_is_valid, 'ldst')
             with block():
                 reg.u32('num_writes')
                 op.ld.local.u32(num_writes, addr(l_num_writes))
@@ -212,16 +212,15 @@ class IterThread(PTXEntryPoint):
         comment('Shuffle points between threads')
         shuf.shuffle(xi, yi, colori, consec_bad)
 
-        with block("If first warp, pick new thread offset"):
-            reg.u32('warpid')
+        with block("If in first warp, pick new offset"):
+            reg.u32('tid')
             reg.pred('first_warp')
-            op.mov.u32(warpid, '%tid.x')
-            op.shr.b32(warpid, warpid, 5)
-            op.setp.eq.u32(first_warp, warpid, 0)
-            #std.asrt("Looks like we're not the first warp", notp=first_warp,
-                    #ret=True)
-            op.bra.uni(iter_loop_choose_xform, ifp=first_warp)
-        op.bra.uni(iter_loop_start)
+            op.mov.u32(tid, '%tid.x')
+            assert ctx.warps_per_cta <= 32, \
+                   "Special-case for CTAs with >1024 threads not implemented"
+            op.setp.lo.u32(first_warp, tid, 32)
+            op.bra(iter_loop_choose_xform, ifp=first_warp)
+        op.bra(iter_loop_start)
 
         label('all_cps_done')
         # TODO this is for testing, move it to a debug statement
@@ -258,14 +257,15 @@ class IterThread(PTXEntryPoint):
         super(IterThread, self)._call(ctx, func, texrefs=[tr])
 
     def call_teardown(self, ctx):
-        shape = (ctx.grid[0], ctx.block[0]/32, 32)
+        w = ctx.warps_per_cta
+        shape = (ctx.grid[0], w, 32)
 
         def print_thing(s, a):
             print '%s:' % s
             for i, r in enumerate(a):
-                for j in range(0,len(r),8):
+                for j in range(0,len(r),w):
                     print '%2d\t%s' % (i,
-                        '\t'.join(['%g '%np.mean(r[k]) for k in range(j,j+8)]))
+                        '\t'.join(['%g '%np.mean(r[k]) for k in range(j,j+w)]))
 
         num_rounds_dp, num_rounds_l = ctx.mod.get_global('g_num_rounds')
         num_writes_dp, num_writes_l = ctx.mod.get_global('g_num_writes')
@@ -484,6 +484,11 @@ class HistScatter(PTXFragment):
             op.mov.u32(hist_bin_addr, g_hist_bins)
             op.mad.lo.u32(hist_bin_addr, hist_index, 16, hist_bin_addr)
 
+            if type == 'fake_notex':
+                op.st.local.u32(addr(l_scatter_fake_adr), hist_bin_addr)
+                op.st.local.f32(addr(l_scatter_fake_alpha), color)
+                return
+
             reg.f32('r g b a norm_time')
             cp.get(cpA, norm_time, 'cp.norm_time')
             palette.look_up(r, g, b, a, color, norm_time)
diff --git a/cuburn/render.py b/cuburn/render.py
index 284e2b1..79b3370 100644
--- a/cuburn/render.py
+++ b/cuburn/render.py
@@ -154,7 +154,7 @@ class Animation(object):
         the active device.
         """
         # TODO: user-configurable test control
-        self.ctx = LaunchContext([IterThread], block=(256,1,1), grid=(28,1),
+        self.ctx = LaunchContext([IterThread], block=(512,1,1), grid=(28,1),
                                  tests=True)
         # TODO: user-configurable verbosity control
         self.ctx.compile(verbose=3, anim=self, features=self.features)
diff --git a/helpers/shuf.py b/helpers/shuf.py
index 5ea1ae2..47fd9c9 100644
--- a/helpers/shuf.py
+++ b/helpers/shuf.py
@@ -280,6 +280,21 @@ def shuf_better(a):
 
 print '    With better shuffle: %g' % monte(make(), shuf_better, 1000, 32)
 
+# Repeat the four comparisons with the module-level `t` rebound to 512 and
+# then 1024 (presumably read by make()/monte(); confirm against their bodies).
+_shuf_cases = (
+    ('    With no shuffle:     %g', shuf_none),
+    ('    With full shuffle:   %g', shuf_all),
+    ('    With simple shuffle: %g', shuf_simple),
+    ('    With better shuffle: %g', shuf_better),
+)
+for _label, t in (('32*16', 512), ('32*32', 1024)):
+    print 'For %s:' % _label
+    for _fmt, _shuf_fn in _shuf_cases:
+        print _fmt % monte(make(), _shuf_fn, 1000, 32)
+
+
+
 print """
 Okay I actually intended this to be a blog post but I started writing before
 having done any of the math. Actually the simple shuffle looks like it's