From 3265982fecd8a303de7fc59f1b90f4cc17553b94 Mon Sep 17 00:00:00 2001
From: Steven Robertson
Date: Sun, 12 Sep 2010 11:13:53 -0400
Subject: [PATCH] Change 'ctx.threads' to 'ctx.nthreads', as it should have
 been from the start

---
 cuburn/cuda.py        |    6 +++---
 cuburn/device_code.py |   44 +++++++++++++++++++++----------------------
 cuburn/ptx.py         |    6 +++---
 3 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/cuburn/cuda.py b/cuburn/cuda.py
index 08aaa5f..ecfc088 100644
--- a/cuburn/cuda.py
+++ b/cuburn/cuda.py
@@ -30,7 +30,7 @@ class LaunchContext(object):
 
         `block`:    3-tuple of (x,y,z); dimensions of each CTA.
         `grid`:     2-tuple of (x,y); dimensions of the grid of CTAs.
-        `threads`:  Number of active threads on device as a whole.
+        `nthreads`: Number of active threads on device as a whole.
         `mod`:      Final compiled module. Unavailable during assembly.
     """
 
@@ -41,11 +41,11 @@ class LaunchContext(object):
         self.stream = cuda.Stream()
 
     @property
-    def threads(self):
+    def nthreads(self):
        return reduce(lambda a, b: a*b, self.block + self.grid)
 
     @property
-    def ctas(self):
+    def nctas(self):
         return self.grid[0] * self.grid[1]
 
     @property
diff --git a/cuburn/device_code.py b/cuburn/device_code.py
index 5df1763..8911016 100644
--- a/cuburn/device_code.py
+++ b/cuburn/device_code.py
@@ -30,9 +30,9 @@ class IterThread(PTXEntryPoint):
         mem.global_.u32('g_num_cps')
         mem.global_.u32('g_num_cps_started')
         # TODO move into debug statement
-        mem.global_.u32('g_num_rounds', ctx.threads)
-        mem.global_.u32('g_num_writes', ctx.threads)
-        mem.global_.b32('g_whatever', ctx.threads)
+        mem.global_.u32('g_num_rounds', ctx.nthreads)
+        mem.global_.u32('g_num_writes', ctx.nthreads)
+        mem.global_.b32('g_whatever', ctx.nthreads)
 
     @ptx_func
     def entry(self):
@@ -567,8 +567,8 @@ class MWCRNG(PTXFragment):
 
     @ptx_func
     def module_setup(self):
-        mem.global_.u32('mwc_rng_mults', ctx.threads)
-        mem.global_.u64('mwc_rng_state', ctx.threads)
+        mem.global_.u32('mwc_rng_mults', ctx.nthreads)
+        mem.global_.u64('mwc_rng_state', ctx.nthreads)
 
     @ptx_func
     def entry_setup(self):
@@ -637,21 +637,21 @@ class MWCRNG(PTXFragment):
             stream = cuda.Stream()
         # Randomness in choosing multipliers is good, but larger multipliers
         # have longer periods, which is also good. This is a compromise.
-        mults = np.array(mults[:ctx.threads*4])
+        mults = np.array(mults[:ctx.nthreads*4])
         rand.shuffle(mults)
         # Copy multipliers and seeds to the device
         multdp, multl = ctx.mod.get_global('mwc_rng_mults')
         cuda.memcpy_htod_async(multdp, mults.tostring()[:multl])
         # Intentionally excludes both 0 and (2^32-1), as they can lead to
         # degenerate sequences of period 0
-        states = np.array(rand.randint(1, 0xffffffff, size=2*ctx.threads),
+        states = np.array(rand.randint(1, 0xffffffff, size=2*ctx.nthreads),
                           dtype=np.uint32)
         statedp, statel = ctx.mod.get_global('mwc_rng_state')
         cuda.memcpy_htod_async(statedp, states.tostring())
-        self.threads_ready = ctx.threads
+        self.threads_ready = ctx.nthreads
 
     def call_setup(self, ctx):
-        if self.threads_ready < ctx.threads:
+        if self.threads_ready < ctx.nthreads:
             self.seed(ctx)
 
     def tests(self):
@@ -668,7 +668,7 @@ class MWCRNGTest(PTXTest):
 
     @ptx_func
     def module_setup(self):
-        mem.global_.u64('mwc_rng_test_sums', ctx.threads)
+        mem.global_.u64('mwc_rng_test_sums', ctx.nthreads)
 
     @ptx_func
     def entry(self):
@@ -697,10 +697,10 @@
     def call_setup(self, ctx):
         # Get current multipliers and seeds from the device
         multdp, multl = ctx.mod.get_global('mwc_rng_mults')
-        self.mults = cuda.from_device(multdp, ctx.threads, np.uint32)
+        self.mults = cuda.from_device(multdp, ctx.nthreads, np.uint32)
         statedp, statel = ctx.mod.get_global('mwc_rng_state')
-        self.fullstates = cuda.from_device(statedp, ctx.threads, np.uint64)
-        self.sums = np.zeros(ctx.threads, np.uint64)
+        self.fullstates = cuda.from_device(statedp, ctx.nthreads, np.uint64)
+        self.sums = np.zeros(ctx.nthreads, np.uint64)
 
         print "Running %d states forward %d rounds" % \
             (len(self.mults), self.rounds)
@@ -717,7 +717,7 @@
         multdp, multl = ctx.mod.get_global('mwc_rng_mults')
         statedp, statel = ctx.mod.get_global('mwc_rng_state')
 
-        dfullstates = cuda.from_device(statedp, ctx.threads, np.uint64)
+        dfullstates = cuda.from_device(statedp, ctx.nthreads, np.uint64)
         if not (dfullstates == self.fullstates).all():
             print "State discrepancy"
             print dfullstates
@@ -725,7 +725,7 @@
             raise PTXTestFailure("MWC RNG state discrepancy")
 
         sumdp, suml = ctx.mod.get_global('mwc_rng_test_sums')
-        dsums = cuda.from_device(sumdp, ctx.threads, np.uint64)
+        dsums = cuda.from_device(sumdp, ctx.nthreads, np.uint64)
         if not (dsums == self.sums).all():
             print "Sum discrepancy"
             print dsums
@@ -746,12 +746,12 @@ class MWCRNGFloatsTest(PTXTest):
 
     @ptx_func
     def module_setup(self):
-        mem.global_.f32('mwc_rng_float_01_test_sums', ctx.threads)
-        mem.global_.f32('mwc_rng_float_01_test_mins', ctx.threads)
-        mem.global_.f32('mwc_rng_float_01_test_maxs', ctx.threads)
-        mem.global_.f32('mwc_rng_float_11_test_sums', ctx.threads)
-        mem.global_.f32('mwc_rng_float_11_test_mins', ctx.threads)
-        mem.global_.f32('mwc_rng_float_11_test_maxs', ctx.threads)
+        mem.global_.f32('mwc_rng_float_01_test_sums', ctx.nthreads)
+        mem.global_.f32('mwc_rng_float_01_test_mins', ctx.nthreads)
+        mem.global_.f32('mwc_rng_float_01_test_maxs', ctx.nthreads)
+        mem.global_.f32('mwc_rng_float_11_test_sums', ctx.nthreads)
+        mem.global_.f32('mwc_rng_float_11_test_mins', ctx.nthreads)
+        mem.global_.f32('mwc_rng_float_11_test_maxs', ctx.nthreads)
 
     @ptx_func
     def loop(self, kind):
@@ -796,7 +796,7 @@
         for fkind, rkind, exp, lim in tests:
             dp, l = ctx.mod.get_global(
                 'mwc_rng_float_%s_test_%s' % (fkind, rkind))
-            vals = cuda.from_device(dp, ctx.threads, np.float32)
+            vals = cuda.from_device(dp, ctx.nthreads, np.float32)
             avg = np.mean(vals)
             if np.abs(avg - exp) > tol:
                 raise PTXTestFailure("%s %s %g too far from %g" %
diff --git a/cuburn/ptx.py b/cuburn/ptx.py
index 5b03e81..125ae3c 100644
--- a/cuburn/ptx.py
+++ b/cuburn/ptx.py
@@ -415,7 +415,7 @@ class Mem(object):
     Reserve memory, optionally with an array size attached.
 
     >>> mem.global_.u32('global_scalar')
-    >>> mem.local.u32('context_sized_local_array', ctx.threads*4)
+    >>> mem.local.u32('context_sized_local_array', ctx.nthreads*4)
     >>> mem.shared.u32('shared_array', 12)
     >>> mem.const.u32('const_array_of_unknown_length', True)
 
@@ -678,7 +678,7 @@ class _PTXStdLib(PTXFragment):
         # multiple devices first, which we definitely do not yet do
         self.block.code(prefix='.version 2.1', semi=False)
         self.block.code(prefix='.target sm_21', semi=False)
-        mem.global_.u32('g_std_exit_err', ctx.threads)
+        mem.global_.u32('g_std_exit_err', ctx.nthreads)
 
     @ptx_func
     def get_gtid(self, dst):
@@ -812,7 +812,7 @@
         at the start of your entry. Yes, it's a hacky solution.
         """
         dp, l = ctx.mod.get_global('g_std_exit_err')
-        errs = cuda.from_device(dp, ctx.threads, np.uint32)
+        errs = cuda.from_device(dp, ctx.nthreads, np.uint32)
         if np.sum(errs) != 0:
             print "Some threads terminated unsuccessfully."
             for i, msg in enumerate(self.asserts):
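
For context on what the renamed properties compute: `nthreads` multiplies together every block and grid dimension, while `nctas` counts CTAs alone. A minimal sketch of that arithmetic, runnable without a GPU; the (16, 16, 1) block and (40, 1) grid are arbitrary example values, not anything prescribed by the patch:

    # Standalone sketch of the renamed LaunchContext properties.
    block = (16, 16, 1)    # threads per CTA: 16*16*1 = 256
    grid = (40, 1)         # CTAs in the grid: 40*1 = 40

    nthreads = reduce(lambda a, b: a * b, block + grid)   # 256 * 40
    nctas = grid[0] * grid[1]

    print nthreads    # 10240 -- one RNG state, test sum, etc. per thread
    print nctas       # 40

Every per-thread global allocation touched by the patch (`mwc_rng_state`, `mwc_rng_test_sums`, `g_std_exit_err`, and so on) is sized by this product, which is also why `call_setup` reseeds whenever `threads_ready < ctx.nthreads`.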