From 3265982fecd8a303de7fc59f1b90f4cc17553b94 Mon Sep 17 00:00:00 2001
From: Steven Robertson
Date: Sun, 12 Sep 2010 11:13:53 -0400
Subject: [PATCH] Change 'ctx.threads' to 'ctx.nthreads', as it should have
 been from the start

---
 cuburn/cuda.py        |    6 +++---
 cuburn/device_code.py |   44 +++++++++++++++++++++----------------------
 cuburn/ptx.py         |    6 +++---
 3 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/cuburn/cuda.py b/cuburn/cuda.py
index 08aaa5f..ecfc088 100644
--- a/cuburn/cuda.py
+++ b/cuburn/cuda.py
@@ -30,7 +30,7 @@ class LaunchContext(object):
 
         `block`:    3-tuple of (x,y,z); dimensions of each CTA.
         `grid`:     2-tuple of (x,y); dimensions of the grid of CTAs.
-        `threads`:  Number of active threads on device as a whole.
+        `nthreads`: Number of active threads on device as a whole.
         `mod`:      Final compiled module. Unavailable during assembly.
     """
 
@@ -41,11 +41,11 @@ class LaunchContext(object):
         self.stream = cuda.Stream()
 
     @property
-    def threads(self):
+    def nthreads(self):
        return reduce(lambda a, b: a*b, self.block + self.grid)
 
     @property
-    def ctas(self):
+    def nctas(self):
         return self.grid[0] * self.grid[1]
 
     @property
diff --git a/cuburn/device_code.py b/cuburn/device_code.py
index 5df1763..8911016 100644
--- a/cuburn/device_code.py
+++ b/cuburn/device_code.py
@@ -30,9 +30,9 @@ class IterThread(PTXEntryPoint):
         mem.global_.u32('g_num_cps')
         mem.global_.u32('g_num_cps_started')
         # TODO move into debug statement
-        mem.global_.u32('g_num_rounds', ctx.threads)
-        mem.global_.u32('g_num_writes', ctx.threads)
-        mem.global_.b32('g_whatever', ctx.threads)
+        mem.global_.u32('g_num_rounds', ctx.nthreads)
+        mem.global_.u32('g_num_writes', ctx.nthreads)
+        mem.global_.b32('g_whatever', ctx.nthreads)
 
     @ptx_func
     def entry(self):
@@ -567,8 +567,8 @@ class MWCRNG(PTXFragment):
 
     @ptx_func
     def module_setup(self):
-        mem.global_.u32('mwc_rng_mults', ctx.threads)
-        mem.global_.u64('mwc_rng_state', ctx.threads)
+        mem.global_.u32('mwc_rng_mults', ctx.nthreads)
+        mem.global_.u64('mwc_rng_state', ctx.nthreads)
 
     @ptx_func
     def entry_setup(self):
@@ -637,21 +637,21 @@ class MWCRNG(PTXFragment):
             stream = cuda.Stream()
         # Randomness in choosing multipliers is good, but larger multipliers
         # have longer periods, which is also good. This is a compromise.
-        mults = np.array(mults[:ctx.threads*4])
+        mults = np.array(mults[:ctx.nthreads*4])
         rand.shuffle(mults)
         # Copy multipliers and seeds to the device
         multdp, multl = ctx.mod.get_global('mwc_rng_mults')
         cuda.memcpy_htod_async(multdp, mults.tostring()[:multl])
         # Intentionally excludes both 0 and (2^32-1), as they can lead to
         # degenerate sequences of period 0
-        states = np.array(rand.randint(1, 0xffffffff, size=2*ctx.threads),
+        states = np.array(rand.randint(1, 0xffffffff, size=2*ctx.nthreads),
                           dtype=np.uint32)
         statedp, statel = ctx.mod.get_global('mwc_rng_state')
         cuda.memcpy_htod_async(statedp, states.tostring())
-        self.threads_ready = ctx.threads
+        self.threads_ready = ctx.nthreads
 
     def call_setup(self, ctx):
-        if self.threads_ready < ctx.threads:
+        if self.threads_ready < ctx.nthreads:
             self.seed(ctx)
 
     def tests(self):
@@ -668,7 +668,7 @@ class MWCRNGTest(PTXTest):
 
     @ptx_func
     def module_setup(self):
-        mem.global_.u64('mwc_rng_test_sums', ctx.threads)
+        mem.global_.u64('mwc_rng_test_sums', ctx.nthreads)
 
     @ptx_func
     def entry(self):
@@ -697,10 +697,10 @@
     def call_setup(self, ctx):
         # Get current multipliers and seeds from the device
         multdp, multl = ctx.mod.get_global('mwc_rng_mults')
-        self.mults = cuda.from_device(multdp, ctx.threads, np.uint32)
+        self.mults = cuda.from_device(multdp, ctx.nthreads, np.uint32)
         statedp, statel = ctx.mod.get_global('mwc_rng_state')
-        self.fullstates = cuda.from_device(statedp, ctx.threads, np.uint64)
-        self.sums = np.zeros(ctx.threads, np.uint64)
+        self.fullstates = cuda.from_device(statedp, ctx.nthreads, np.uint64)
+        self.sums = np.zeros(ctx.nthreads, np.uint64)
 
         print "Running %d states forward %d rounds" % \
             (len(self.mults), self.rounds)
@@ -717,7 +717,7 @@
         multdp, multl = ctx.mod.get_global('mwc_rng_mults')
         statedp, statel = ctx.mod.get_global('mwc_rng_state')
 
-        dfullstates = cuda.from_device(statedp, ctx.threads, np.uint64)
+        dfullstates = cuda.from_device(statedp, ctx.nthreads, np.uint64)
         if not (dfullstates == self.fullstates).all():
             print "State discrepancy"
             print dfullstates
@@ -725,7 +725,7 @@
             raise PTXTestFailure("MWC RNG state discrepancy")
 
         sumdp, suml = ctx.mod.get_global('mwc_rng_test_sums')
-        dsums = cuda.from_device(sumdp, ctx.threads, np.uint64)
+        dsums = cuda.from_device(sumdp, ctx.nthreads, np.uint64)
         if not (dsums == self.sums).all():
             print "Sum discrepancy"
             print dsums
@@ -746,12 +746,12 @@ class MWCRNGFloatsTest(PTXTest):
 
     @ptx_func
     def module_setup(self):
-        mem.global_.f32('mwc_rng_float_01_test_sums', ctx.threads)
-        mem.global_.f32('mwc_rng_float_01_test_mins', ctx.threads)
-        mem.global_.f32('mwc_rng_float_01_test_maxs', ctx.threads)
-        mem.global_.f32('mwc_rng_float_11_test_sums', ctx.threads)
-        mem.global_.f32('mwc_rng_float_11_test_mins', ctx.threads)
-        mem.global_.f32('mwc_rng_float_11_test_maxs', ctx.threads)
+        mem.global_.f32('mwc_rng_float_01_test_sums', ctx.nthreads)
+        mem.global_.f32('mwc_rng_float_01_test_mins', ctx.nthreads)
+        mem.global_.f32('mwc_rng_float_01_test_maxs', ctx.nthreads)
+        mem.global_.f32('mwc_rng_float_11_test_sums', ctx.nthreads)
+        mem.global_.f32('mwc_rng_float_11_test_mins', ctx.nthreads)
+        mem.global_.f32('mwc_rng_float_11_test_maxs', ctx.nthreads)
 
     @ptx_func
     def loop(self, kind):
@@ -796,7 +796,7 @@
         for fkind, rkind, exp, lim in tests:
             dp, l = ctx.mod.get_global(
                 'mwc_rng_float_%s_test_%s' % (fkind, rkind))
-            vals = cuda.from_device(dp, ctx.threads, np.float32)
+            vals = cuda.from_device(dp, ctx.nthreads, np.float32)
             avg = np.mean(vals)
             if np.abs(avg - exp) > tol:
                 raise PTXTestFailure("%s %s %g too far from %g" %
diff --git a/cuburn/ptx.py b/cuburn/ptx.py
index 5b03e81..125ae3c 100644
--- a/cuburn/ptx.py
+++ b/cuburn/ptx.py
@@ -415,7 +415,7 @@ class Mem(object):
     Reserve memory, optionally with an array size attached.
 
     >>> mem.global_.u32('global_scalar')
-    >>> mem.local.u32('context_sized_local_array', ctx.threads*4)
+    >>> mem.local.u32('context_sized_local_array', ctx.nthreads*4)
     >>> mem.shared.u32('shared_array', 12)
     >>> mem.const.u32('const_array_of_unknown_length', True)
 
@@ -678,7 +678,7 @@ class _PTXStdLib(PTXFragment):
         # multiple devices first, which we definitely do not yet do
         self.block.code(prefix='.version 2.1', semi=False)
         self.block.code(prefix='.target sm_21', semi=False)
-        mem.global_.u32('g_std_exit_err', ctx.threads)
+        mem.global_.u32('g_std_exit_err', ctx.nthreads)
 
     @ptx_func
     def get_gtid(self, dst):
@@ -812,7 +812,7 @@
         at the start of your entry. Yes, it's a hacky solution.
         """
         dp, l = ctx.mod.get_global('g_std_exit_err')
-        errs = cuda.from_device(dp, ctx.threads, np.uint32)
+        errs = cuda.from_device(dp, ctx.nthreads, np.uint32)
         if np.sum(errs) != 0:
             print "Some threads terminated unsuccessfully."
             for i, msg in enumerate(self.asserts):
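
For context on what the renamed properties compute: `nthreads` multiplies together every block and grid dimension, while `nctas` counts CTAs alone. A minimal sketch of that arithmetic, runnable without a GPU; the (16, 16, 1) block and (40, 1) grid are arbitrary example values, not anything prescribed by the patch:

    # Standalone sketch of the renamed LaunchContext properties.
    block = (16, 16, 1)    # threads per CTA: 16*16*1 = 256
    grid = (40, 1)         # CTAs in the grid: 40*1 = 40

    nthreads = reduce(lambda a, b: a * b, block + grid)   # 256 * 40
    nctas = grid[0] * grid[1]

    print nthreads    # 10240 -- one RNG state, test sum, etc. per thread
    print nctas       # 40

Every per-thread global allocation touched by the patch (`mwc_rng_state`, `mwc_rng_test_sums`, `g_std_exit_err`, and so on) is sized by this product, which is also why `call_setup` reseeds whenever `threads_ready < ctx.nthreads`.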