mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 11:40:04 -05:00
Fix occupancy issues (1 block/SM when shuffle was on).
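There are 16 bar.sync() registers available per *chip*, not per block, and I was using number 8 in the shuffle code. Evidently the driver rewrites them per SM, but does not compact their range, so using barrier number 8 presumably reserved barriers 0 through 8 for each block and left room for only one block per SM. Good to know.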
This commit is contained in:
parent
ce0571deaf
commit
a439bf671d
@ -36,20 +36,18 @@ class IterThread(PTXEntryPoint):
|
|||||||
|
|
||||||
@ptx_func
|
@ptx_func
|
||||||
def entry(self):
|
def entry(self):
|
||||||
# For now, we indulge in the luxury of shared memory.
|
|
||||||
# Index number of current CP, shared across CTA
|
# Index number of current CP, shared across CTA
|
||||||
mem.shared.u32('s_cp_idx')
|
mem.shared.u32('s_cp_idx')
|
||||||
|
|
||||||
# Number of samples that have been generated so far in this CTA
|
# Number of samples that have been generated so far in this CTA
|
||||||
# If this number is negative, we're still fusing points, so this
|
# If this number is negative, we're still fusing points, so this
|
||||||
# behaves slightly differently (see ``fuse_loop_start``)
|
# behaves slightly differently (see ``fuse_loop_start``)
|
||||||
|
# TODO: replace (or at least simplify) this logic
|
||||||
mem.shared.s32('s_num_samples')
|
mem.shared.s32('s_num_samples')
|
||||||
op.st.shared.s32(addr(s_num_samples), -(features.num_fuse_samples+1))
|
op.st.shared.s32(addr(s_num_samples), -(features.num_fuse_samples+1))
|
||||||
|
|
||||||
mem.shared.f32('s_xf_sel', ctx.warps_per_cta)
|
mem.shared.f32('s_xf_sel', ctx.warps_per_cta)
|
||||||
|
|
||||||
#std.store_per_thread(g_whatever, 1234)
|
|
||||||
|
|
||||||
# TODO: temporary, for testing
|
# TODO: temporary, for testing
|
||||||
mem.local.u32('l_num_rounds')
|
mem.local.u32('l_num_rounds')
|
||||||
mem.local.u32('l_num_writes')
|
mem.local.u32('l_num_writes')
|
||||||
@ -264,8 +262,10 @@ class IterThread(PTXEntryPoint):
|
|||||||
print '%s:' % s
|
print '%s:' % s
|
||||||
for i, r in enumerate(a):
|
for i, r in enumerate(a):
|
||||||
for j in range(0,len(r),w):
|
for j in range(0,len(r),w):
|
||||||
print '%2d\t%s' % (i,
|
print '%2d' % i,
|
||||||
'\t'.join(['%g '%np.mean(r[k]) for k in range(j,j+w)]))
|
for k in range(j,j+w,8):
|
||||||
|
print '\t' + ' '.join(
|
||||||
|
['%8g'%np.mean(r[l]) for l in range(k,k+8)])
|
||||||
|
|
||||||
num_rounds_dp, num_rounds_l = ctx.mod.get_global('g_num_rounds')
|
num_rounds_dp, num_rounds_l = ctx.mod.get_global('g_num_rounds')
|
||||||
num_writes_dp, num_writes_l = ctx.mod.get_global('g_num_writes')
|
num_writes_dp, num_writes_l = ctx.mod.get_global('g_num_writes')
|
||||||
@ -536,9 +536,9 @@ class ShufflePoints(PTXFragment):
|
|||||||
def shuffle(self, *args, **kwargs):
|
def shuffle(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
Shuffle the data from each register in args across threads. Keyword
|
Shuffle the data from each register in args across threads. Keyword
|
||||||
argument ``bar`` specifies which barrier to use.
|
argument ``bar`` specifies which barrier to use (default is 2).
|
||||||
"""
|
"""
|
||||||
bar = kwargs.pop('bar', 8)
|
bar = kwargs.pop('bar', 2)
|
||||||
with block("Shuffle across threads"):
|
with block("Shuffle across threads"):
|
||||||
reg.u32('shuf_read shuf_write')
|
reg.u32('shuf_read shuf_write')
|
||||||
with block("Calculate read and write offsets"):
|
with block("Calculate read and write offsets"):
|
||||||
|
@ -458,7 +458,6 @@ class Mem(object):
|
|||||||
class _MemFactory(_CallChain):
|
class _MemFactory(_CallChain):
|
||||||
"""Actual `mem` object"""
|
"""Actual `mem` object"""
|
||||||
def _call(self, type, name, array=False, init=None):
|
def _call(self, type, name, array=False, init=None):
|
||||||
assert len(type) == 2
|
|
||||||
memobj = Mem(type, name, array, init)
|
memobj = Mem(type, name, array, init)
|
||||||
if array is True:
|
if array is True:
|
||||||
array = ['[]']
|
array = ['[]']
|
||||||
@ -468,7 +467,7 @@ class _MemFactory(_CallChain):
|
|||||||
array = []
|
array = []
|
||||||
if init:
|
if init:
|
||||||
array += [' = ', init]
|
array += [' = ', init]
|
||||||
self.block.code(op='.%s.%s ' % (type[0], type[1]), vars=[name, array])
|
self.block.code(op='.%s ' % ' .'.join(type), vars=[name, array])
|
||||||
self.block.inject(name, memobj)
|
self.block.inject(name, memobj)
|
||||||
|
|
||||||
# TODO: move vec, addr here, or make this public
|
# TODO: move vec, addr here, or make this public
|
||||||
|
Loading…
Reference in New Issue
Block a user