From aa065dc25d3e7b9344dc1183fdcb473413e57698 Mon Sep 17 00:00:00 2001
From: Steven Robertson
Date: Tue, 7 Sep 2010 12:44:12 -0400
Subject: [PATCH] Add the first of many microbenchmarks

---
 bench.py          | 145 ++++++++++++++++++++++++++++++++++++++++++++++
 cuburnlib/cuda.py |   4 ++
 cuburnlib/ptx.py  |  29 ++++------
 3 files changed, 160 insertions(+), 18 deletions(-)
 create mode 100644 bench.py

diff --git a/bench.py b/bench.py
new file mode 100644
index 0000000..5b64a9e
--- /dev/null
+++ b/bench.py
@@ -0,0 +1,145 @@
+#!/usr/bin/python
+# -*- encoding: utf-8 -*-
+
+"""
+Various micro-benchmarks and other experiments.
+"""
+import numpy as np
+import pycuda.autoinit
+import pycuda.driver as cuda
+from cuburnlib.ptx import PTXFragment, PTXTest, ptx_func
+from cuburnlib.cuda import LaunchContext
+from cuburnlib.device_code import MWCRNG
+
+class L2WriteCombining(PTXTest):
+    """
+    Test of L2 write combining.
+    """
+    entry_name = 'l2_write_combining'
+    entry_params = [('u64', 'a_report_addr'), ('u64', 'a_scratch_addr')]
+
+    block_size = 2**20 # 1MB/CTA.
+    rounds = int(1e6)
+
+    @ptx_func
+    def entry(self):
+        mem.shared.u32('s_offset')
+        reg.u32('bytes_written offset write_size laneid ctaid rounds x')
+        reg.u64('scratch_addr scratch_offset clka clkb bytes')
+        reg.pred('p_write p_loop_wrsz p_is_first p_done p_coalesced')
+
+        op.mov.u32(laneid, '%laneid')
+        op.setp.eq.u32(p_is_first, laneid, 0)
+
+        op.ld.param.u32(scratch_addr, addr(a_scratch_addr))
+        op.mov.u32(ctaid, '%ctaid.x')
+        op.cvt.u64.u32(scratch_offset, ctaid)
+        op.mad.lo.u64(scratch_addr, scratch_offset, self.block_size,
+                      scratch_addr)
+
+        op.mov.u32(x, 0)
+
+        label('l2_restart')
+        comment("If CTA is even, do coalesced first")
+        op.and_.b32(ctaid, ctaid, 1)
+        op.setp.eq.u32(p_coalesced, ctaid, 0)
+        op.bra.uni('l2_loop_start')
+
+        label('l2_loop_start')
+        op.st.shared.u32(addr(s_offset), 0, ifp=p_is_first)
+        op.mov.u32(rounds, 0)
+        op.mov.u32(write_size, 16)
+        op.mov.u64(clka, '%clock64')
+        op.mov.u64(bytes, 0)
+
+        label('l2_loop')
+        comment("Increment offset across the CTA")
+        op.atom.shared.add.u32(offset, addr(s_offset), write_size,
+                               ifp=p_is_first)
+
+        comment("Find write address from current offset and lane")
+        op.ld.shared.u32(offset, addr(s_offset))
+        op.add.u32(offset, offset, laneid)
+        op.mul.lo.u32(offset, offset, 8)
+        op.and_.b32(offset, offset, self.block_size-1)
+
+        op.cvt.u64.u32(scratch_offset, offset)
+        op.add.u64(scratch_offset, scratch_offset, scratch_addr)
+
+        comment("If lane < write_size, write to address")
+        op.setp.lt.u32(p_write, laneid, write_size)
+        op.st.u64(addr(scratch_offset), scratch_offset, ifp=p_write)
+
+        comment("Add to number of bytes written")
+        op.add.u64(bytes, bytes, 8, ifp=p_write)
+
+        comment("If uncoalesced, store new write size")
+        op.add.u32(write_size, write_size, 1, ifnotp=p_coalesced)
+        op.setp.gt.u32(p_loop_wrsz, write_size, 32)
+        op.mov.u32(write_size, 2, ifp=p_loop_wrsz)
+
+        comment("Loop!")
+        op.add.u32(rounds, rounds, 1)
+        op.setp.ge.u32(p_done, rounds, self.rounds)
+        op.bra.uni(l2_loop, ifnotp=p_done)
+
+        label('l2_loop_end')
+        op.mov.u64(clkb, '%clock64')
+        op.sub.u64(clka, clkb, clka)
+        with block("Store the time l2_loop took"):
+            reg.u64('report_addr report_offset')
+            reg.u32('gtid')
+            std.get_gtid(gtid)
+            op.mul.lo.u32(gtid, gtid, 32)
+            op.add.u32(gtid, gtid, 16, ifnotp=p_coalesced)
+            op.cvt.u64.u32(report_offset, gtid)
+            op.ld.param.u64(report_addr, addr(a_report_addr))
+            op.add.u64(report_addr, report_addr, report_offset)
+            op.st.u64(addr(report_addr), clka)
+            op.st.u64(addr(report_addr,8), bytes)
+
+        comment("If we did coalesced, go back and do uncoalesced")
+        op.add.u32(ctaid, ctaid, 1)
+        op.add.u32(x, x, 1)
+        op.setp.ge.u32(p_done, x, 2)
+        op.bra.uni(l2_restart, ifnotp=p_done)
+
+
+    def call(self, ctx):
+        scratch = np.zeros(self.block_size*ctx.ctas/4, np.uint64)
+        times_bytes = np.zeros((4, ctx.threads), np.uint64, 'F')
+        func = ctx.mod.get_function(self.entry_name)
+        dtime = func(cuda.InOut(times_bytes), cuda.InOut(scratch),
+                     block=ctx.block, grid=ctx.grid, time_kernel=True)
+
+        #printover(times_bytes[0], 6, 32)
+        #printover(times_bytes[1], 6)
+        #printover(times_bytes[2], 6, 32)
+        #printover(times_bytes[3], 6)
+        #printover(scratch[i:i+16], 8)
+
+        print "\nTotal time was %g seconds" % dtime
+        pm = lambda a: (np.mean(a), np.std(a) / np.sqrt(len(a)))
+        print "Clks for coa was %g ± %g" % pm(times_bytes[0])
+        print "Bytes for coa was %g ± %g" % pm(times_bytes[1])
+        print "Clks for uncoa was %g ± %g" % pm(times_bytes[2])
+        print "Bytes for uncoa was %g ± %g" % pm(times_bytes[3])
+        print ''
+
+def printover(a, r, s=1):
+    for i in range(0, len(a), r*s):
+        for j in range(i, i+r*s, s):
+            if j < len(a): print a[j],
+        print
+
+def main():
+    # TODO: block/grid auto-optimization
+    ctx = LaunchContext([L2WriteCombining], block=(128,1,1), grid=(7*8,1),
+                        tests=True)
+    ctx.compile(verbose=3)
+    ctx.ptx.instances[L2WriteCombining].call(ctx)
+
+if __name__ == "__main__":
+    main()
+
+
diff --git a/cuburnlib/cuda.py b/cuburnlib/cuda.py
index 7acc673..053c51a 100644
--- a/cuburnlib/cuda.py
+++ b/cuburnlib/cuda.py
@@ -42,6 +42,10 @@ class LaunchContext(object):
     def threads(self):
         return reduce(lambda a, b: a*b, self.block + self.grid)
 
+    @property
+    def ctas(self):
+        return self.grid[0] * self.grid[1]
+
     def compile(self, verbose=False, **kwargs):
         kwargs['ctx'] = self
         self.ptx = PTXModule(self.entry_types, kwargs, self.build_tests)
diff --git a/cuburnlib/ptx.py b/cuburnlib/ptx.py
index 98ed379..a72f72a 100644
--- a/cuburnlib/ptx.py
+++ b/cuburnlib/ptx.py
@@ -337,8 +337,8 @@ class _CallChain(object):
         self.__chain = []
         return r
     def __getattr__(self, name):
-        if name == 'global_':
-            name = 'global'
+        if name.endswith('_'):
+            name = name[:-1]
         self.__chain.append(name)
         # Another great crime against the universe:
         return self
@@ -630,23 +630,15 @@ class _PTXStdLib(PTXFragment):
     def get_gtid(self, dst):
         """
         Get the global thread ID (the position of this thread in a grid of
-        blocks of threads). Notably, this assumes that both grid and block are
-        one-dimensional, which in most cases is true.
+        blocks of threads). This assumes that both grid and block are
+        one-dimensional! (This is always true for cuburn.)
         """
         with block("Load GTID into %s" % str(dst)):
-            reg.u16('tmp')
-            reg.u32('cta ncta tid gtid')
-
-            op.mov.u16(tmp, '%ctaid.x')
-            op.cvt.u32.u16(cta, tmp)
-            op.mov.u16(tmp, '%ntid.x')
-            op.cvt.u32.u16(ncta, tmp)
-            op.mul.lo.u32(gtid, cta, ncta)
-
-            op.mov.u16(tmp, '%tid.x')
-            op.cvt.u32.u16(tid, tmp)
-            op.add.u32(gtid, gtid, tid)
-            op.mov.b32(dst, gtid)
+            reg.u32('cta ncta tid')
+            op.mov.u32(cta, '%ctaid.x')
+            op.mov.u32(ncta, '%ntid.x')
+            op.mov.u32(tid, '%tid.x')
+            op.mad.lo.u32(dst, cta, ncta, tid)
 
     @ptx_func
     def store_per_thread(self, base, val):
@@ -792,7 +784,8 @@ class PTXModule(object):
             params = [Reg('.param.' + str(type), name)
                       for (type, name) in ent.entry_params]
             _block.code(op='.entry %s ' % ent.entry_name, semi=False,
-                vars=['(', ['%s %s' % (r.type, r.name) for r in params], ')'])
+                vars=['(', ', '.join(['%s %s' % (r.type, r.name)
+                                      for r in params]), ')'])
             with Block(_block):
                 [_block.inject(r.name, r) for r in params]
                 for dep in insts:
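
A note on reading the report buffer (not part of the patch): each thread owns a
32-byte record, with clocks at +0 and bytes written at +8 for the coalesced
pass, and the same pair at +16/+24 for the uncoalesced pass; call() views the
whole buffer as the (4, threads) array whose rows it prints. The sketch below
is illustrative only, and the summarize() helper is hypothetical, but a
reduction along these lines turns the raw counters into a bytes-per-clock
figure for each pass:

import numpy as np

def summarize(times_bytes):
    # times_bytes: the (4, threads) uint64 report array filled by the kernel;
    # rows are [coa clks, coa bytes, uncoa clks, uncoa bytes].
    tb = times_bytes.astype(np.float64)
    coa = tb[1] / np.maximum(tb[0], 1.0)    # bytes per clock, coalesced
    uncoa = tb[3] / np.maximum(tb[2], 1.0)  # bytes per clock, uncoalesced
    return np.mean(coa), np.mean(uncoa)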
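
Similarly, the simplified get_gtid() in the ptx.py hunk collapses the old u16
shuffle into a single mad.lo.u32. As a plain-Python sanity check (illustrative,
not part of the patch), the index it computes for a 1-D grid of 1-D blocks is:

def gtid(ctaid_x, ntid_x, tid_x):
    # mad.lo.u32 dst, cta, ncta, tid  computes  dst = ctaid.x * ntid.x + tid.x
    return ctaid_x * ntid_x + tid_x

# e.g. thread 5 of block 3, with 128 threads per block:
assert gtid(ctaid_x=3, ntid_x=128, tid_x=5) == 389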