mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 03:30:05 -05:00
Experiments with larger CTAs for IterThread
This commit is contained in:
parent
e2b1c161cf
commit
c13f6a06cf
@ -90,7 +90,7 @@ class IterThread(PTXEntryPoint):
|
||||
reg.pred('p_last_cp')
|
||||
op.ldu.u32(num_cps, addr(g_num_cps))
|
||||
op.setp.ge.u32(p_last_cp, cp_idx, num_cps)
|
||||
op.bra.uni('all_cps_done', ifp=p_last_cp)
|
||||
op.bra('all_cps_done', ifp=p_last_cp)
|
||||
|
||||
with block('Load CP address'):
|
||||
op.mov.u32(cpA, g_cp_array)
|
||||
@ -149,7 +149,7 @@ class IterThread(PTXEntryPoint):
|
||||
for xf in features.xforms:
|
||||
label('XFORM_%d' % xf.id)
|
||||
variations.apply_xform(xo, yo, coloro, xi, yi, colori, xf.id)
|
||||
op.bra.uni("xform_done")
|
||||
op.bra("xform_done")
|
||||
|
||||
label("xform_done")
|
||||
with block("Test if we're still in FUSE"):
|
||||
@ -161,7 +161,7 @@ class IterThread(PTXEntryPoint):
|
||||
|
||||
reg.pred('p_point_is_valid')
|
||||
with block("Write the result"):
|
||||
hist.scatter(xo, yo, coloro, 0, p_point_is_valid)
|
||||
hist.scatter(xo, yo, coloro, 0, p_point_is_valid, 'ldst')
|
||||
with block():
|
||||
reg.u32('num_writes')
|
||||
op.ld.local.u32(num_writes, addr(l_num_writes))
|
||||
@ -212,16 +212,15 @@ class IterThread(PTXEntryPoint):
|
||||
comment('Shuffle points between threads')
|
||||
shuf.shuffle(xi, yi, colori, consec_bad)
|
||||
|
||||
with block("If first warp, pick new thread offset"):
|
||||
reg.u32('warpid')
|
||||
with block("If in first warp, pick new offset"):
|
||||
reg.u32('tid')
|
||||
reg.pred('first_warp')
|
||||
op.mov.u32(warpid, '%tid.x')
|
||||
op.shr.b32(warpid, warpid, 5)
|
||||
op.setp.eq.u32(first_warp, warpid, 0)
|
||||
#std.asrt("Looks like we're not the first warp", notp=first_warp,
|
||||
#ret=True)
|
||||
op.bra.uni(iter_loop_choose_xform, ifp=first_warp)
|
||||
op.bra.uni(iter_loop_start)
|
||||
op.mov.u32(tid, '%tid.x')
|
||||
assert ctx.warps_per_cta <= 32, \
|
||||
"Special-case for CTAs with >1024 threads not implemented"
|
||||
op.setp.lo.u32(first_warp, tid, 32)
|
||||
op.bra(iter_loop_choose_xform, ifp=first_warp)
|
||||
op.bra(iter_loop_start)
|
||||
|
||||
label('all_cps_done')
|
||||
# TODO this is for testing, move it to a debug statement
|
||||
@ -258,14 +257,15 @@ class IterThread(PTXEntryPoint):
|
||||
super(IterThread, self)._call(ctx, func, texrefs=[tr])
|
||||
|
||||
def call_teardown(self, ctx):
|
||||
shape = (ctx.grid[0], ctx.block[0]/32, 32)
|
||||
w = ctx.warps_per_cta
|
||||
shape = (ctx.grid[0], w, 32)
|
||||
|
||||
def print_thing(s, a):
|
||||
print '%s:' % s
|
||||
for i, r in enumerate(a):
|
||||
for j in range(0,len(r),8):
|
||||
for j in range(0,len(r),w):
|
||||
print '%2d\t%s' % (i,
|
||||
'\t'.join(['%g '%np.mean(r[k]) for k in range(j,j+8)]))
|
||||
'\t'.join(['%g '%np.mean(r[k]) for k in range(j,j+w)]))
|
||||
|
||||
num_rounds_dp, num_rounds_l = ctx.mod.get_global('g_num_rounds')
|
||||
num_writes_dp, num_writes_l = ctx.mod.get_global('g_num_writes')
|
||||
@ -484,6 +484,11 @@ class HistScatter(PTXFragment):
|
||||
op.mov.u32(hist_bin_addr, g_hist_bins)
|
||||
op.mad.lo.u32(hist_bin_addr, hist_index, 16, hist_bin_addr)
|
||||
|
||||
if type == 'fake_notex':
|
||||
op.st.local.u32(addr(l_scatter_fake_adr), hist_bin_addr)
|
||||
op.st.local.f32(addr(l_scatter_fake_alpha), color)
|
||||
return
|
||||
|
||||
reg.f32('r g b a norm_time')
|
||||
cp.get(cpA, norm_time, 'cp.norm_time')
|
||||
palette.look_up(r, g, b, a, color, norm_time)
|
||||
|
@ -154,7 +154,7 @@ class Animation(object):
|
||||
the active device.
|
||||
"""
|
||||
# TODO: user-configurable test control
|
||||
self.ctx = LaunchContext([IterThread], block=(256,1,1), grid=(28,1),
|
||||
self.ctx = LaunchContext([IterThread], block=(512,1,1), grid=(28,1),
|
||||
tests=True)
|
||||
# TODO: user-configurable verbosity control
|
||||
self.ctx.compile(verbose=3, anim=self, features=self.features)
|
||||
|
@ -280,6 +280,21 @@ def shuf_better(a):
|
||||
|
||||
print ' With better shuffle: %g' % monte(make(), shuf_better, 1000, 32)
|
||||
|
||||
print 'For 32*16:'
|
||||
t = 512
|
||||
print ' With no shuffle: %g' % monte(make(), shuf_none, 1000, 32)
|
||||
print ' With full shuffle: %g' % monte(make(), shuf_all, 1000, 32)
|
||||
print ' With simple shuffle: %g' % monte(make(), shuf_simple, 1000, 32)
|
||||
print ' With better shuffle: %g' % monte(make(), shuf_better, 1000, 32)
|
||||
print 'For 32*32:'
|
||||
t = 1024
|
||||
print ' With no shuffle: %g' % monte(make(), shuf_none, 1000, 32)
|
||||
print ' With full shuffle: %g' % monte(make(), shuf_all, 1000, 32)
|
||||
print ' With simple shuffle: %g' % monte(make(), shuf_simple, 1000, 32)
|
||||
print ' With better shuffle: %g' % monte(make(), shuf_better, 1000, 32)
|
||||
|
||||
|
||||
|
||||
print """
|
||||
Okay I actually intended this to be a blog post but I started writing before
|
||||
having done any of the math. Actually the simple shuffle looks like it's
|
||||
|
Loading…
Reference in New Issue
Block a user