Mirror of https://github.com/stevenrobertson/cuburn.git (synced 2025-02-05 11:40:04 -05:00)
Change 'ctx.threads' to 'ctx.nthreads', as it should have been from the start
This commit is contained in:
parent a439bf671d
commit 3265982fec
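For orientation, here is a minimal sketch of what the two renamed properties compute, based on the LaunchContext hunks below. The class and example numbers are illustrative stand-ins, not the module's actual layout; the real LaunchContext also wraps the compiled module and a pycuda stream.

    from functools import reduce

    # Illustrative stand-in for the block/grid attributes LaunchContext carries.
    class _LaunchShape(object):
        def __init__(self, block, grid):
            self.block = block   # 3-tuple (x, y, z): threads per CTA
            self.grid = grid     # 2-tuple (x, y): CTAs in the grid

        @property
        def nthreads(self):
            # Product of all five dimensions: total active threads on the device
            return reduce(lambda a, b: a * b, self.block + self.grid)

        @property
        def nctas(self):
            # Total number of CTAs launched
            return self.grid[0] * self.grid[1]

    # Example: a 256-thread CTA (256,1,1) on a 64x1 grid gives 16384 threads, 64 CTAs.
    shape = _LaunchShape((256, 1, 1), (64, 1))
    assert shape.nthreads == 16384 and shape.nctas == 64

The rename simply makes both values read as counts ('nthreads', 'nctas') rather than as collections.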
@@ -30,7 +30,7 @@ class LaunchContext(object):
     `block`: 3-tuple of (x,y,z); dimensions of each CTA.
     `grid`: 2-tuple of (x,y); dimensions of the grid of CTAs.
-    `threads`: Number of active threads on device as a whole.
+    `nthreads`: Number of active threads on device as a whole.
     `mod`: Final compiled module. Unavailable during assembly.
 
     """
@@ -41,11 +41,11 @@ class LaunchContext(object):
         self.stream = cuda.Stream()
 
     @property
-    def threads(self):
+    def nthreads(self):
         return reduce(lambda a, b: a*b, self.block + self.grid)
 
     @property
-    def ctas(self):
+    def nctas(self):
         return self.grid[0] * self.grid[1]
 
     @property
@@ -30,9 +30,9 @@ class IterThread(PTXEntryPoint):
         mem.global_.u32('g_num_cps')
         mem.global_.u32('g_num_cps_started')
         # TODO move into debug statement
-        mem.global_.u32('g_num_rounds', ctx.threads)
-        mem.global_.u32('g_num_writes', ctx.threads)
-        mem.global_.b32('g_whatever', ctx.threads)
+        mem.global_.u32('g_num_rounds', ctx.nthreads)
+        mem.global_.u32('g_num_writes', ctx.nthreads)
+        mem.global_.b32('g_whatever', ctx.nthreads)
 
     @ptx_func
     def entry(self):
@@ -567,8 +567,8 @@ class MWCRNG(PTXFragment):
 
     @ptx_func
     def module_setup(self):
-        mem.global_.u32('mwc_rng_mults', ctx.threads)
-        mem.global_.u64('mwc_rng_state', ctx.threads)
+        mem.global_.u32('mwc_rng_mults', ctx.nthreads)
+        mem.global_.u64('mwc_rng_state', ctx.nthreads)
 
     @ptx_func
     def entry_setup(self):
@@ -637,21 +637,21 @@ class MWCRNG(PTXFragment):
         stream = cuda.Stream()
         # Randomness in choosing multipliers is good, but larger multipliers
         # have longer periods, which is also good. This is a compromise.
-        mults = np.array(mults[:ctx.threads*4])
+        mults = np.array(mults[:ctx.nthreads*4])
         rand.shuffle(mults)
         # Copy multipliers and seeds to the device
         multdp, multl = ctx.mod.get_global('mwc_rng_mults')
         cuda.memcpy_htod_async(multdp, mults.tostring()[:multl])
         # Intentionally excludes both 0 and (2^32-1), as they can lead to
         # degenerate sequences of period 0
-        states = np.array(rand.randint(1, 0xffffffff, size=2*ctx.threads),
+        states = np.array(rand.randint(1, 0xffffffff, size=2*ctx.nthreads),
                           dtype=np.uint32)
         statedp, statel = ctx.mod.get_global('mwc_rng_state')
         cuda.memcpy_htod_async(statedp, states.tostring())
-        self.threads_ready = ctx.threads
+        self.threads_ready = ctx.nthreads
 
     def call_setup(self, ctx):
-        if self.threads_ready < ctx.threads:
+        if self.threads_ready < ctx.nthreads:
             self.seed(ctx)
 
     def tests(self):
@@ -668,7 +668,7 @@ class MWCRNGTest(PTXTest):
 
     @ptx_func
     def module_setup(self):
-        mem.global_.u64('mwc_rng_test_sums', ctx.threads)
+        mem.global_.u64('mwc_rng_test_sums', ctx.nthreads)
 
     @ptx_func
     def entry(self):
@@ -697,10 +697,10 @@ class MWCRNGTest(PTXTest):
     def call_setup(self, ctx):
         # Get current multipliers and seeds from the device
         multdp, multl = ctx.mod.get_global('mwc_rng_mults')
-        self.mults = cuda.from_device(multdp, ctx.threads, np.uint32)
+        self.mults = cuda.from_device(multdp, ctx.nthreads, np.uint32)
         statedp, statel = ctx.mod.get_global('mwc_rng_state')
-        self.fullstates = cuda.from_device(statedp, ctx.threads, np.uint64)
-        self.sums = np.zeros(ctx.threads, np.uint64)
+        self.fullstates = cuda.from_device(statedp, ctx.nthreads, np.uint64)
+        self.sums = np.zeros(ctx.nthreads, np.uint64)
 
         print "Running %d states forward %d rounds" % \
                 (len(self.mults), self.rounds)
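The call_setup above pulls the multipliers and the packed 64-bit states back to the host before running them forward for comparison against the device; the host loop itself falls outside this hunk. A minimal sketch of a host-side multiply-with-carry step over packed states (carry in the high 32 bits, current value in the low 32 bits) could look like the following; the function name and the accumulation detail are assumptions for illustration, not the test's actual code.

    import numpy as np

    def mwc_advance(mults, fullstates, rounds):
        # Illustrative sketch: advance packed 64-bit MWC states on the host.
        # One step computes mult * value + carry; the low 32 bits of the result
        # are the next output value, the high 32 bits are the next carry.
        mults = np.asarray(mults, dtype=np.uint64)
        states = np.asarray(fullstates, dtype=np.uint64).copy()
        sums = np.zeros(len(states), dtype=np.uint64)
        for _ in range(rounds):
            carry = states >> np.uint64(32)
            value = states & np.uint64(0xffffffff)
            states = mults * value + carry
            sums += states & np.uint64(0xffffffff)
        return states, sums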
@@ -717,7 +717,7 @@ class MWCRNGTest(PTXTest):
         multdp, multl = ctx.mod.get_global('mwc_rng_mults')
         statedp, statel = ctx.mod.get_global('mwc_rng_state')
 
-        dfullstates = cuda.from_device(statedp, ctx.threads, np.uint64)
+        dfullstates = cuda.from_device(statedp, ctx.nthreads, np.uint64)
         if not (dfullstates == self.fullstates).all():
             print "State discrepancy"
             print dfullstates
@@ -725,7 +725,7 @@ class MWCRNGTest(PTXTest):
             raise PTXTestFailure("MWC RNG state discrepancy")
 
         sumdp, suml = ctx.mod.get_global('mwc_rng_test_sums')
-        dsums = cuda.from_device(sumdp, ctx.threads, np.uint64)
+        dsums = cuda.from_device(sumdp, ctx.nthreads, np.uint64)
         if not (dsums == self.sums).all():
             print "Sum discrepancy"
             print dsums
@@ -746,12 +746,12 @@ class MWCRNGFloatsTest(PTXTest):
 
     @ptx_func
     def module_setup(self):
-        mem.global_.f32('mwc_rng_float_01_test_sums', ctx.threads)
-        mem.global_.f32('mwc_rng_float_01_test_mins', ctx.threads)
-        mem.global_.f32('mwc_rng_float_01_test_maxs', ctx.threads)
-        mem.global_.f32('mwc_rng_float_11_test_sums', ctx.threads)
-        mem.global_.f32('mwc_rng_float_11_test_mins', ctx.threads)
-        mem.global_.f32('mwc_rng_float_11_test_maxs', ctx.threads)
+        mem.global_.f32('mwc_rng_float_01_test_sums', ctx.nthreads)
+        mem.global_.f32('mwc_rng_float_01_test_mins', ctx.nthreads)
+        mem.global_.f32('mwc_rng_float_01_test_maxs', ctx.nthreads)
+        mem.global_.f32('mwc_rng_float_11_test_sums', ctx.nthreads)
+        mem.global_.f32('mwc_rng_float_11_test_mins', ctx.nthreads)
+        mem.global_.f32('mwc_rng_float_11_test_maxs', ctx.nthreads)
 
     @ptx_func
     def loop(self, kind):
@@ -796,7 +796,7 @@ class MWCRNGFloatsTest(PTXTest):
         for fkind, rkind, exp, lim in tests:
             dp, l = ctx.mod.get_global(
                     'mwc_rng_float_%s_test_%s' % (fkind, rkind))
-            vals = cuda.from_device(dp, ctx.threads, np.float32)
+            vals = cuda.from_device(dp, ctx.nthreads, np.float32)
             avg = np.mean(vals)
             if np.abs(avg - exp) > tol:
                 raise PTXTestFailure("%s %s %g too far from %g" %
@@ -415,7 +415,7 @@ class Mem(object):
         Reserve memory, optionally with an array size attached.
 
         >>> mem.global_.u32('global_scalar')
-        >>> mem.local.u32('context_sized_local_array', ctx.threads*4)
+        >>> mem.local.u32('context_sized_local_array', ctx.nthreads*4)
         >>> mem.shared.u32('shared_array', 12)
         >>> mem.const.u32('const_array_of_unknown_length', True)
 
@@ -678,7 +678,7 @@ class _PTXStdLib(PTXFragment):
         # multiple devices first, which we definitely do not yet do
         self.block.code(prefix='.version 2.1', semi=False)
         self.block.code(prefix='.target sm_21', semi=False)
-        mem.global_.u32('g_std_exit_err', ctx.threads)
+        mem.global_.u32('g_std_exit_err', ctx.nthreads)
 
     @ptx_func
     def get_gtid(self, dst):
@@ -812,7 +812,7 @@ class _PTXStdLib(PTXFragment):
         at the start of your entry. Yes, it's a hacky solution.
         """
         dp, l = ctx.mod.get_global('g_std_exit_err')
-        errs = cuda.from_device(dp, ctx.threads, np.uint32)
+        errs = cuda.from_device(dp, ctx.nthreads, np.uint32)
         if np.sum(errs) != 0:
             print "Some threads terminated unsuccessfully."
             for i, msg in enumerate(self.asserts):