From 6ed8907fcb2ff7c11a89204bba226b28b5e1ed9e Mon Sep 17 00:00:00 2001
From: Steven Robertson <steven@strobe.cc>
Date: Sun, 12 Sep 2010 13:45:55 -0400
Subject: [PATCH] LaunchContext.get_per_thread

---
 cuburn/device_code.py | 43 ++++++++++++++-----------------------------
 cuburn/ptx.py         | 15 +++++++++++----
 cuburn/render.py      |  2 +-
 3 files changed, 26 insertions(+), 34 deletions(-)

diff --git a/cuburn/device_code.py b/cuburn/device_code.py
index 8911016..ba0e23f 100644
--- a/cuburn/device_code.py
+++ b/cuburn/device_code.py
@@ -255,29 +255,20 @@ class IterThread(PTXEntryPoint):
         super(IterThread, self)._call(ctx, func, texrefs=[tr])
 
     def call_teardown(self, ctx):
-        w = ctx.warps_per_cta
-        shape = (ctx.grid[0], w, 32)
-
         def print_thing(s, a):
             print '%s:' % s
             for i, r in enumerate(a):
-                for j in range(0,len(r),w):
+                for j in range(0,len(r),ctx.warps_per_cta):
                     print '%2d' % i,
-                    for k in range(j,j+w,8):
+                    for k in range(j,j+ctx.warps_per_cta,8):
                         print '\t' + ' '.join(
                             ['%8g'%np.mean(r[l]) for l in range(k,k+8)])
 
-        num_rounds_dp, num_rounds_l = ctx.mod.get_global('g_num_rounds')
-        num_writes_dp, num_writes_l = ctx.mod.get_global('g_num_writes')
-        whatever_dp, whatever_l = ctx.mod.get_global('g_whatever')
-        rounds = cuda.from_device(num_rounds_dp, shape, np.int32)
-        writes = cuda.from_device(num_writes_dp, shape, np.int32)
-        whatever = cuda.from_device(whatever_dp, shape, np.int32)
+        rounds = ctx.get_per_thread('g_num_rounds', np.int32, shaped=True)
+        writes = ctx.get_per_thread('g_num_writes', np.int32, shaped=True)
         print_thing("Rounds", rounds)
         print_thing("Writes", writes)
-        #print_thing("Whatever", whatever)
-
-        print np.sum(rounds)
+        print "Total number of rounds:", np.sum(rounds)
 
         dp, l = ctx.mod.get_global('g_num_cps_started')
         cps_started = cuda.from_device(dp, 1, np.uint32)
@@ -641,13 +632,13 @@ class MWCRNG(PTXFragment):
         rand.shuffle(mults)
         # Copy multipliers and seeds to the device
         multdp, multl = ctx.mod.get_global('mwc_rng_mults')
-        cuda.memcpy_htod_async(multdp, mults.tostring()[:multl])
+        cuda.memcpy_htod(multdp, mults.tostring()[:multl])
         # Intentionally excludes both 0 and (2^32-1), as they can lead to
         # degenerate sequences of period 0
         states = np.array(rand.randint(1, 0xffffffff, size=2*ctx.nthreads),
                           dtype=np.uint32)
         statedp, statel = ctx.mod.get_global('mwc_rng_state')
-        cuda.memcpy_htod_async(statedp, states.tostring())
+        cuda.memcpy_htod(statedp, states.tostring())
         self.threads_ready = ctx.nthreads
 
     def call_setup(self, ctx):
@@ -696,10 +687,8 @@ class MWCRNGTest(PTXTest):
 
     def call_setup(self, ctx):
         # Get current multipliers and seeds from the device
-        multdp, multl = ctx.mod.get_global('mwc_rng_mults')
-        self.mults = cuda.from_device(multdp, ctx.nthreads, np.uint32)
-        statedp, statel = ctx.mod.get_global('mwc_rng_state')
-        self.fullstates = cuda.from_device(statedp, ctx.nthreads, np.uint64)
+        self.mults = ctx.get_per_thread('mwc_rng_mults', np.uint32)
+        self.fullstates = ctx.get_per_thread('mwc_rng_state', np.uint64)
         self.sums = np.zeros(ctx.nthreads, np.uint64)
 
         print "Running %d states forward %d rounds" % \
@@ -714,18 +703,15 @@ class MWCRNGTest(PTXTest):
         print "Done on host, took %g seconds" % ctime
 
     def call_teardown(self, ctx):
-        multdp, multl = ctx.mod.get_global('mwc_rng_mults')
-        statedp, statel = ctx.mod.get_global('mwc_rng_state')
-
-        dfullstates = cuda.from_device(statedp, ctx.nthreads, np.uint64)
+        dfullstates = ctx.get_per_thread('mwc_rng_state', np.uint64)
         if not (dfullstates == self.fullstates).all():
             print "State discrepancy"
             print dfullstates
             print self.fullstates
             raise PTXTestFailure("MWC RNG state discrepancy")
 
-        sumdp, suml = ctx.mod.get_global('mwc_rng_test_sums')
-        dsums = cuda.from_device(sumdp, ctx.nthreads, np.uint64)
+
+        dsums = ctx.get_per_thread('mwc_rng_test_sums', np.uint64)
         if not (dsums == self.sums).all():
             print "Sum discrepancy"
             print dsums
@@ -794,9 +780,8 @@ class MWCRNGFloatsTest(PTXTest):
                 ]
 
         for fkind, rkind, exp, lim in tests:
-            dp, l = ctx.mod.get_global(
-                    'mwc_rng_float_%s_test_%s' % (fkind, rkind))
-            vals = cuda.from_device(dp, ctx.nthreads, np.float32)
+            name = 'mwc_rng_float_%s_test_%s' % (fkind, rkind)
+            vals = ctx.get_per_thread(name, np.float32)
             avg = np.mean(vals)
             if np.abs(avg - exp) > tol:
                 raise PTXTestFailure("%s %s %g too far from %g" %
diff --git a/cuburn/ptx.py b/cuburn/ptx.py
index 125ae3c..25cb997 100644
--- a/cuburn/ptx.py
+++ b/cuburn/ptx.py
@@ -696,21 +696,28 @@ class _PTXStdLib(PTXFragment):
 
     @ptx_func
     def store_per_thread(self, *args):
-        """Store b32 at `base+gtid*4`. Super-common debug pattern."""
+        """For each pair of arguments ``addr, val``, write ``val`` to the
+        address given by ``addr+sizeof(val)*gtid``. If ``val`` is not a
+        register, size will be taken from ``addr``; if ``addr`` is not a Mem
+        instance, size defaults to 4."""
         with block("Per-thread storing values"):
             reg.u32('spt_base spt_offset')
             self.get_gtid(spt_offset)
-            op.mul.lo.u32(spt_offset, spt_offset, 4)
             for i in range(0, len(args), 2):
                 base, val = args[i], args[i+1]
+                width = 4
+                if isinstance(base, Mem):
+                    width = int(base.type[-1][-2:])/8
+                if isinstance(val, Reg):
+                    width = int(val.type[-2:])/8
                 op.mov.u32(spt_base, base)
-                op.add.u32(spt_base, spt_base, spt_offset)
+                op.mad.lo.u32(spt_base, spt_offset, width, spt_base)
                 if isinstance(val, float):
                     # Turn a constant float into the big-endian PTX binary f32
                     # representation, 0fXXXXXXXX (where XX is hex byte)
                     val = '0f%x%x%x%x' % reversed(map(ord,
                                                       struct.pack('f', val)))
-                op.st.b32(addr(spt_base), val)
+                op._call(['st', 'b%d' % (width*8)], addr(spt_base), val)
 
     @ptx_func
     def set_is_first_thread(self, p_dst):
diff --git a/cuburn/render.py b/cuburn/render.py
index 79b3370..6748613 100644
--- a/cuburn/render.py
+++ b/cuburn/render.py
@@ -87,7 +87,7 @@ class Frame(object):
         center = self.center_cp
         ncps = center.nbatches * center.ntemporal_samples
 
-        if ncps < ctx.ctas:
+        if ncps < ctx.nctas:
             raise NotImplementedError(
                 "Distribution of a CP across multiple CTAs not yet done")