mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 11:40:04 -05:00
Simultaneous occupancy microbenchmark
This commit is contained in:
parent
3e4e1d88a2
commit
d01de61952
105
bench.py
105
bench.py
@ -104,9 +104,11 @@ class L2WriteCombining(PTXTest):
|
|||||||
op.setp.ge.u32(p_done, x, 2)
|
op.setp.ge.u32(p_done, x, 2)
|
||||||
op.bra.uni(l2_restart, ifnotp=p_done)
|
op.bra.uni(l2_restart, ifnotp=p_done)
|
||||||
|
|
||||||
|
def call_setup(self, ctx):
|
||||||
|
self.scratch = np.zeros(self.block_size*ctx.nctas/4, np.uint64)
|
||||||
|
self.times_bytes = np.zeros((4, ctx.nthreads), np.uint64, 'F')
|
||||||
|
|
||||||
def _call(self, ctx, func):
|
def _call(self, ctx, func):
|
||||||
self.scratch = np.zeros(self.block_size*ctx.ctas/4, np.uint64)
|
|
||||||
self.times_bytes = np.zeros((4, ctx.threads), np.uint64, 'F')
|
|
||||||
super(L2WriteCombining, self)._call(ctx, func,
|
super(L2WriteCombining, self)._call(ctx, func,
|
||||||
cuda.InOut(self.times_bytes), cuda.InOut(self.scratch))
|
cuda.InOut(self.times_bytes), cuda.InOut(self.scratch))
|
||||||
|
|
||||||
@ -118,6 +120,102 @@ class L2WriteCombining(PTXTest):
|
|||||||
print "Bytes for uncoa was %g ± %g" % pm(self.times_bytes[3])
|
print "Bytes for uncoa was %g ± %g" % pm(self.times_bytes[3])
|
||||||
print
|
print
|
||||||
|
|
||||||
|
class SimulOccupancy(PTXTest):
|
||||||
|
"""
|
||||||
|
Test to discover whether Fermi GPUs will launch multiple entry points
|
||||||
|
in the same kernel on the same CTA simultaneously.
|
||||||
|
"""
|
||||||
|
entry_name = 'simul1'
|
||||||
|
# Only has to be big enough to hold the kernel on the device for a while
|
||||||
|
rounds = 1000000
|
||||||
|
|
||||||
|
def deps(self):
|
||||||
|
return [MWCRNG]
|
||||||
|
|
||||||
|
@ptx_func
|
||||||
|
def module_setup(self):
|
||||||
|
n = self.entry_name + '_'
|
||||||
|
mem.global_.u64(n+'start', ctx.nthreads)
|
||||||
|
mem.global_.u64(n+'end', ctx.nthreads)
|
||||||
|
mem.global_.u32(n+'smid', ctx.nthreads)
|
||||||
|
mem.global_.u32(n+'warpid_start', ctx.nthreads)
|
||||||
|
mem.global_.u32(n+'warpid_end', ctx.nthreads)
|
||||||
|
|
||||||
|
@ptx_func
|
||||||
|
def entry(self):
|
||||||
|
n = self.entry_name + '_'
|
||||||
|
reg.u64('now')
|
||||||
|
reg.u32('warpid')
|
||||||
|
op.mov.u64(now, '%clock64')
|
||||||
|
op.mov.u32(warpid, '%warpid')
|
||||||
|
std.store_per_thread(n+'start', now,
|
||||||
|
n+'warpid_start', warpid)
|
||||||
|
|
||||||
|
reg.u32('loopct rnd')
|
||||||
|
reg.pred('p_done')
|
||||||
|
op.mov.u32(loopct, self.rounds)
|
||||||
|
label('loopstart')
|
||||||
|
mwc.next_b32(rnd)
|
||||||
|
std.store_per_thread(n+'smid', rnd)
|
||||||
|
op.sub.u32(loopct, loopct, 1)
|
||||||
|
op.setp.eq.u32(p_done, loopct, 0)
|
||||||
|
op.bra.uni(loopstart, ifnotp=p_done)
|
||||||
|
|
||||||
|
reg.u32('smid')
|
||||||
|
op.mov.u32(smid, '%smid')
|
||||||
|
op.mov.u32(warpid, '%warpid')
|
||||||
|
op.mov.u64(now, '%clock64')
|
||||||
|
std.store_per_thread(n+'end', now,
|
||||||
|
n+'smid', smid,
|
||||||
|
n+'warpid_end', warpid)
|
||||||
|
|
||||||
|
def _call(self, ctx, func):
|
||||||
|
stream1, stream2 = cuda.Stream(), cuda.Stream()
|
||||||
|
self._call2(ctx, stream1)
|
||||||
|
_SimulOccupancy._call2(ctx, stream2)
|
||||||
|
stream2.synchronize()
|
||||||
|
stream1.synchronize()
|
||||||
|
|
||||||
|
@instmethod
|
||||||
|
def _call2(self, ctx, stream):
|
||||||
|
func = ctx.mod.get_function(self.entry_name)
|
||||||
|
func.prepare([], ctx.block)
|
||||||
|
# TODO: load number of SMs from ctx
|
||||||
|
func.launch_grid_async(7, 1, stream)
|
||||||
|
|
||||||
|
def call_teardown(self, ctx):
|
||||||
|
sm_log = [[] for i in range(7)]
|
||||||
|
self._teardown(ctx, sm_log)
|
||||||
|
_SimulOccupancy._teardown(ctx, sm_log)
|
||||||
|
for sm in range(len(sm_log)):
|
||||||
|
print "\nPrinting log for SM %d" % sm
|
||||||
|
for t, ev in sorted(sm_log[sm]):
|
||||||
|
print '%6d %s' % (t/1000, ev)
|
||||||
|
|
||||||
|
@instmethod
|
||||||
|
def _teardown(self, ctx, sm_log):
|
||||||
|
# For this method, the GPU is intentionally underloaded; trim results
|
||||||
|
th = 7 * ctx.threads_per_cta
|
||||||
|
n = self.entry_name + '_'
|
||||||
|
start = ctx.get_per_thread(n+'start', np.uint64)[:th]
|
||||||
|
end = ctx.get_per_thread(n+'end', np.uint64)[:th]
|
||||||
|
smid = ctx.get_per_thread(n+'smid', np.uint32)[:th]
|
||||||
|
warpid_start = ctx.get_per_thread(n+'warpid_start', np.uint32)[:th]
|
||||||
|
warpid_end = ctx.get_per_thread(n+'warpid_end', np.uint32)[:th]
|
||||||
|
for i in range(0, th, 32):
|
||||||
|
sm_log[smid[i]].append((start[i], "%s%4d entered SM" % (n, i/32)))
|
||||||
|
sm_log[smid[i]].append((end[i], "%s%4d left SM" % (n, i/32)))
|
||||||
|
if not np.alltrue(np.equal(warpid_start, warpid_end)):
|
||||||
|
print "Warp IDs changed. Do further research."
|
||||||
|
|
||||||
|
class _SimulOccupancy(SimulOccupancy):
    """
    Second entry point ('simul2') for the simultaneous-occupancy test.

    Don't call this one: SimulOccupancy launches it and gathers its results
    through the inherited _call2 and _teardown, so the usual driver hooks
    are overridden to do nothing.
    """
    entry_name = 'simul2'

    def call(self, ctx):
        """Intentionally a no-op; this entry is launched by SimulOccupancy."""

    def call_teardown(self, ctx):
        """Intentionally a no-op; results are reported by SimulOccupancy."""
|
||||||
|
|
||||||
def printover(a, r, s=1):
|
def printover(a, r, s=1):
|
||||||
for i in range(0, len(a), r*s):
|
for i in range(0, len(a), r*s):
|
||||||
for j in range(i, i+r*s, s):
|
for j in range(i, i+r*s, s):
|
||||||
@ -126,10 +224,11 @@ def printover(a, r, s=1):
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
# TODO: block/grid auto-optimization
|
# TODO: block/grid auto-optimization
|
||||||
ctx = LaunchContext([L2WriteCombining, MWCRNGTest],
|
ctx = LaunchContext([L2WriteCombining, SimulOccupancy, _SimulOccupancy],
|
||||||
block=(128,1,1), grid=(7*8,1), tests=True)
|
block=(128,1,1), grid=(7*8,1), tests=True)
|
||||||
ctx.compile(verbose=3)
|
ctx.compile(verbose=3)
|
||||||
ctx.run_tests()
|
ctx.run_tests()
|
||||||
|
SimulOccupancy.call(ctx)
|
||||||
L2WriteCombining.call(ctx)
|
L2WriteCombining.call(ctx)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
Loading…
Reference in New Issue
Block a user