diff --git a/cuburn/device_code.py b/cuburn/device_code.py
index 15762d3..5df1763 100644
--- a/cuburn/device_code.py
+++ b/cuburn/device_code.py
@@ -36,20 +36,18 @@ class IterThread(PTXEntryPoint):
 
     @ptx_func
     def entry(self):
-        # For now, we indulge in the luxury of shared memory.
         # Index number of current CP, shared across CTA
         mem.shared.u32('s_cp_idx')
 
         # Number of samples that have been generated so far in this CTA
         # If this number is negative, we're still fusing points, so this
         # behaves slightly differently (see ``fuse_loop_start``)
+        # TODO: replace (or at least simplify) this logic
         mem.shared.s32('s_num_samples')
         op.st.shared.s32(addr(s_num_samples), -(features.num_fuse_samples+1))
 
         mem.shared.f32('s_xf_sel', ctx.warps_per_cta)
 
-        #std.store_per_thread(g_whatever, 1234)
-
         # TODO: temporary, for testing
         mem.local.u32('l_num_rounds')
         mem.local.u32('l_num_writes')
@@ -264,8 +262,10 @@ class IterThread(PTXEntryPoint):
             print '%s:' % s
             for i, r in enumerate(a):
                 for j in range(0,len(r),w):
-                    print '%2d\t%s' % (i,
-                        '\t'.join(['%g '%np.mean(r[k]) for k in range(j,j+w)]))
+                    print '%2d' % i,
+                    for k in range(j,j+w,8):
+                        print '\t' + ' '.join(
+                            ['%8g'%np.mean(r[l]) for l in range(k,k+8)])
 
         num_rounds_dp, num_rounds_l = ctx.mod.get_global('g_num_rounds')
         num_writes_dp, num_writes_l = ctx.mod.get_global('g_num_writes')
@@ -536,9 +536,9 @@ class ShufflePoints(PTXFragment):
     def shuffle(self, *args, **kwargs):
         """
         Shuffle the data from each register in args across threads. Keyword
-        argument ``bar`` specifies which barrier to use.
+        argument ``bar`` specifies which barrier to use (default is 2).
         """
-        bar = kwargs.pop('bar', 8)
+        bar = kwargs.pop('bar', 2)
         with block("Shuffle across threads"):
             reg.u32('shuf_read shuf_write')
             with block("Calculate read and write offsets"):
diff --git a/cuburn/ptx.py b/cuburn/ptx.py
index e471505..5b03e81 100644
--- a/cuburn/ptx.py
+++ b/cuburn/ptx.py
@@ -458,7 +458,6 @@ class Mem(object):
 class _MemFactory(_CallChain):
     """Actual `mem` object"""
     def _call(self, type, name, array=False, init=None):
-        assert len(type) == 2
         memobj = Mem(type, name, array, init)
         if array is True:
             array = ['[]']
@@ -468,7 +467,7 @@ class _MemFactory(_CallChain):
             array = []
         if init:
             array += [' = ', init]
-        self.block.code(op='.%s.%s ' % (type[0], type[1]), vars=[name, array])
+        self.block.code(op='.%s ' % ' .'.join(type), vars=[name, array])
         self.block.inject(name, memobj)
 
     # TODO: move vec, addr here, or make this public