diff --git a/bench.py b/bench.py deleted file mode 100644 index 8151775..0000000 --- a/bench.py +++ /dev/null @@ -1,237 +0,0 @@ -#!/usr/bin/python -# -*- encoding: utf-8 -*- - -""" -Various micro-benchmarks and other experiments. -""" -import numpy as np -import pycuda.autoinit -import pycuda.driver as cuda -from cuburn.ptx import PTXFragment, PTXTest, ptx_func, instmethod -from cuburn.cuda import LaunchContext -from cuburn.device_code import MWCRNG, MWCRNGTest - -class L2WriteCombining(PTXTest): - """ - Test of L2 write combining. - """ - entry_name = 'l2_write_combining' - entry_params = [('u64', 'a_report_addr'), ('u64', 'a_scratch_addr')] - - block_size = 2**20 # 1MB/CTA. - rounds = int(1e6) - - @ptx_func - def entry(self): - mem.shared.u32('s_offset') - reg.u32('bytes_written offset write_size laneid ctaid rounds x') - reg.u64('scratch_addr scratch_offset clka clkb bytes') - reg.pred('p_write p_loop_wrsz p_is_first p_done p_coalesced') - - op.mov.u32(laneid, '%laneid') - op.setp.eq.u32(p_is_first, laneid, 0) - - op.ld.param.u32(scratch_addr, addr(a_scratch_addr)) - op.mov.u32(ctaid, '%ctaid.x') - op.cvt.u64.u32(scratch_offset, ctaid) - op.mad.lo.u64(scratch_addr, scratch_offset, self.block_size, - scratch_addr) - - op.mov.u32(x, 0) - - label('l2_restart') - comment("If CTA is even, do coalesced first") - op.and_.b32(ctaid, ctaid, 1) - op.setp.eq.u32(p_coalesced, ctaid, 0) - op.bra.uni('l2_loop_start') - - label('l2_loop_start') - op.st.shared.u32(addr(s_offset), 0, ifp=p_is_first) - op.mov.u32(rounds, 0) - op.mov.u32(write_size, 16) - op.mov.u64(clka, '%clock64') - op.mov.u64(bytes, 0) - - label('l2_loop') - comment("Increment offset across the CTA") - op.atom.shared.add.u32(offset, addr(s_offset), write_size, - ifp=p_is_first) - - comment("Find write address from current offset and lane") - op.ld.shared.u32(offset, addr(s_offset)) - op.add.u32(offset, offset, laneid) - op.mul.lo.u32(offset, offset, 8) - op.and_.b32(offset, offset, self.block_size-1) - - op.cvt.u64.u32(scratch_offset, offset) - op.add.u64(scratch_offset, scratch_offset, scratch_addr) - - comment("If lane < write_size, write to address") - op.setp.lt.u32(p_write, laneid, write_size) - op.st.u64(addr(scratch_offset), scratch_offset, ifp=p_write) - - comment("Add to number of bytes written") - op.add.u64(bytes, bytes, 8, ifp=p_write) - - comment("If uncoalesced, store new write size") - op.add.u32(write_size, write_size, 1, ifnotp=p_coalesced) - op.setp.gt.u32(p_loop_wrsz, write_size, 32) - op.mov.u32(write_size, 2, ifp=p_loop_wrsz) - - comment("Loop!") - op.add.u32(rounds, rounds, 1) - op.setp.ge.u32(p_done, rounds, self.rounds) - op.bra.uni(l2_loop, ifnotp=p_done) - - label('l2_loop_end') - op.mov.u64(clkb, '%clock64') - op.sub.u64(clka, clkb, clka) - with block("Store the time l2_loop took"): - reg.u64('report_addr report_offset') - reg.u32('gtid') - std.get_gtid(gtid) - op.mul.lo.u32(gtid, gtid, 32) - op.add.u32(gtid, gtid, 16, ifnotp=p_coalesced) - op.cvt.u64.u32(report_offset, gtid) - op.ld.param.u64(report_addr, addr(a_report_addr)) - op.add.u64(report_addr, report_addr, report_offset) - op.st.u64(addr(report_addr), clka) - op.st.u64(addr(report_addr,8), bytes) - - comment("If we did coalesced, go back and do uncoalesced") - op.add.u32(ctaid, ctaid, 1) - op.add.u32(x, x, 1) - op.setp.ge.u32(p_done, x, 2) - op.bra.uni(l2_restart, ifnotp=p_done) - - def call_setup(self, ctx): - self.scratch = np.zeros(self.block_size*ctx.nctas/4, np.uint64) - self.times_bytes = np.zeros((4, ctx.nthreads), np.uint64, 'F') - - def _call(self, ctx, func): - super(L2WriteCombining, self)._call(ctx, func, - cuda.InOut(self.times_bytes), cuda.InOut(self.scratch)) - - def call_teardown(self, ctx): - pm = lambda a: (np.mean(a), np.std(a) / np.sqrt(len(a))) - print "Clks for coa was %g ± %g" % pm(self.times_bytes[0]) - print "Bytes for coa was %g ± %g" % pm(self.times_bytes[1]) - print "Clks for uncoa was %g ± %g" % pm(self.times_bytes[2]) - print "Bytes for uncoa was %g ± %g" % pm(self.times_bytes[3]) - print - -class SimulOccupancy(PTXTest): - """ - Test to discover whether Fermi GPUs will launch multiple entry points - in the same kernel on the same CTA simultaneously. - """ - entry_name = 'simul1' - # Only has to be big enough to hold the kernel on the device for a while - rounds = 1000000 - - def deps(self): - return [MWCRNG] - - @ptx_func - def module_setup(self): - n = self.entry_name + '_' - mem.global_.u64(n+'start', ctx.nthreads) - mem.global_.u64(n+'end', ctx.nthreads) - mem.global_.u32(n+'smid', ctx.nthreads) - mem.global_.u32(n+'warpid_start', ctx.nthreads) - mem.global_.u32(n+'warpid_end', ctx.nthreads) - - @ptx_func - def entry(self): - n = self.entry_name + '_' - reg.u64('now') - reg.u32('warpid') - op.mov.u64(now, '%clock64') - op.mov.u32(warpid, '%warpid') - std.store_per_thread(n+'start', now, - n+'warpid_start', warpid) - - reg.u32('loopct rnd') - reg.pred('p_done') - op.mov.u32(loopct, self.rounds) - label('loopstart') - mwc.next_b32(rnd) - std.store_per_thread(n+'smid', rnd) - op.sub.u32(loopct, loopct, 1) - op.setp.eq.u32(p_done, loopct, 0) - op.bra.uni(loopstart, ifnotp=p_done) - - reg.u32('smid') - op.mov.u32(smid, '%smid') - op.mov.u32(warpid, '%warpid') - op.mov.u64(now, '%clock64') - std.store_per_thread(n+'end', now, - n+'smid', smid, - n+'warpid_end', warpid) - - def _call(self, ctx, func): - stream1, stream2 = cuda.Stream(), cuda.Stream() - self._call2(ctx, stream1) - _SimulOccupancy._call2(ctx, stream2) - stream2.synchronize() - stream1.synchronize() - - @instmethod - def _call2(self, ctx, stream): - func = ctx.mod.get_function(self.entry_name) - func.prepare([], ctx.block) - # TODO: load number of SMs from ctx - func.launch_grid_async(7, 1, stream) - - def call_teardown(self, ctx): - sm_log = [[] for i in range(7)] - self._teardown(ctx, sm_log) - _SimulOccupancy._teardown(ctx, sm_log) - for sm in range(len(sm_log)): - print "\nPrinting log for SM %d" % sm - for t, ev in sorted(sm_log[sm]): - print '%6d %s' % (t/1000, ev) - - @instmethod - def _teardown(self, ctx, sm_log): - # For this method, the GPU is intentionally underloaded; trim results - th = 7 * ctx.threads_per_cta - n = self.entry_name + '_' - start = ctx.get_per_thread(n+'start', np.uint64)[:th] - end = ctx.get_per_thread(n+'end', np.uint64)[:th] - smid = ctx.get_per_thread(n+'smid', np.uint32)[:th] - warpid_start = ctx.get_per_thread(n+'warpid_start', np.uint32)[:th] - warpid_end = ctx.get_per_thread(n+'warpid_end', np.uint32)[:th] - for i in range(0, th, 32): - sm_log[smid[i]].append((start[i], "%s%4d entered SM" % (n, i/32))) - sm_log[smid[i]].append((end[i], "%s%4d left SM" % (n, i/32))) - if not np.alltrue(np.equal(warpid_start, warpid_end)): - print "Warp IDs changed. Do further research." - -class _SimulOccupancy(SimulOccupancy): - # Don't call this one - entry_name = 'simul2' - def call(self, ctx): - pass - def call_teardown(self, ctx): - pass - -def printover(a, r, s=1): - for i in range(0, len(a), r*s): - for j in range(i, i+r*s, s): - if j < len(a): print a[j], - print - -def main(): - # TODO: block/grid auto-optimization - ctx = LaunchContext([L2WriteCombining, SimulOccupancy, _SimulOccupancy], - block=(128,1,1), grid=(7*8,1), tests=True) - ctx.compile(verbose=3) - ctx.run_tests() - SimulOccupancy.call(ctx) - L2WriteCombining.call(ctx) - -if __name__ == "__main__": - main() - -