mirror of
				https://github.com/stevenrobertson/cuburn.git
				synced 2025-11-04 02:10:45 -05:00 
			
		
		
		
	Add the first of many microbenchmarks
This commit is contained in:
		
							
								
								
									
										145
									
								
								bench.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										145
									
								
								bench.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,145 @@
 | 
				
			|||||||
 | 
					#!/usr/bin/python
 | 
				
			||||||
 | 
					# -*- encoding: utf-8 -*-
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
 | 
					Various micro-benchmarks and other experiments.
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
 | 
					import numpy as np
 | 
				
			||||||
 | 
					import pycuda.autoinit
 | 
				
			||||||
 | 
					import pycuda.driver as cuda
 | 
				
			||||||
 | 
					from cuburnlib.ptx import PTXFragment, PTXTest, ptx_func
 | 
				
			||||||
 | 
					from cuburnlib.cuda import LaunchContext
 | 
				
			||||||
 | 
					from cuburnlib.device_code import MWCRNG
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class L2WriteCombining(PTXTest):
					    """
					    Test of L2 write combining.

					    Every thread runs the timed store loop twice: once in "coalesced" mode,
					    where the number of lanes that write stays fixed at 16, and once in
					    "uncoalesced" mode, where that number changes on every iteration. CTAs
					    with an even index run the coalesced pass first, odd ones run it second.

					    For each pass a thread reports the elapsed ``%clock64`` ticks and the
					    number of bytes it stored; ``call`` prints the mean and standard error of
					    both figures over all threads.
					    """
					    entry_name = 'l2_write_combining'
					    entry_params = [('u64', 'a_report_addr'), ('u64', 'a_scratch_addr')]

					    # Bytes of scratch memory addressed by each CTA. Must stay a power of two:
					    # the kernel wraps offsets with ``& (block_size - 1)``.
					    block_size = 2**20 # 1MB/CTA.
					    # Iterations of the timed store loop per pass.
					    rounds = int(1e6)

					    @ptx_func
					    def entry(self):
					        """
					        Emit the PTX body of the ``l2_write_combining`` entry point.

					        The names used below (``mem``, ``reg``, ``op``, ``label``, ``comment``,
					        ``addr``, ``block``, ``std``) are not defined in this file; presumably
					        the ``ptx_func`` decorator makes them available -- confirm in
					        ``cuburnlib.ptx``.
					        """
					        mem.shared.u32('s_offset')
					        reg.u32('bytes_written offset write_size laneid ctaid rounds x')
					        reg.u64('scratch_addr scratch_offset clka clkb bytes')
					        reg.pred('p_write p_loop_wrsz p_is_first p_done p_coalesced')

					        op.mov.u32(laneid, '%laneid')
					        op.setp.eq.u32(p_is_first, laneid, 0)

					        # The scratch pointer is declared as a u64 parameter and lives in a
					        # u64 register, so it has to be loaded at full width; a 32-bit load
					        # would drop the upper half of the device address. This matches the
					        # load of a_report_addr further down.
					        op.ld.param.u64(scratch_addr, addr(a_scratch_addr))
					        op.mov.u32(ctaid, '%ctaid.x')
					        op.cvt.u64.u32(scratch_offset, ctaid)
					        # scratch_addr = base + ctaid * block_size: one region per CTA.
					        op.mad.lo.u64(scratch_addr, scratch_offset, self.block_size,
					                      scratch_addr)

					        # x counts completed passes (the kernel stops after two).
					        op.mov.u32(x, 0)

					        label('l2_restart')
					        comment("If CTA is even, do coalesced first")
					        # Only the low bit of ctaid is kept. It is incremented after each
					        # pass, so the second trip through here selects the other mode.
					        op.and_.b32(ctaid, ctaid, 1)
					        op.setp.eq.u32(p_coalesced, ctaid, 0)
					        op.bra.uni('l2_loop_start')

					        label('l2_loop_start')
					        op.st.shared.u32(addr(s_offset), 0, ifp=p_is_first)
					        op.mov.u32(rounds, 0)
					        op.mov.u32(write_size, 16)
					        op.mov.u64(clka, '%clock64')
					        op.mov.u64(bytes, 0)

					        label('l2_loop')
					        comment("Increment offset across the CTA")
					        op.atom.shared.add.u32(offset, addr(s_offset), write_size,
					                               ifp=p_is_first)

					        comment("Find write address from current offset and lane")
					        op.ld.shared.u32(offset, addr(s_offset))
					        op.add.u32(offset, offset, laneid)
					        # Each lane stores one u64, hence the factor of 8 bytes.
					        op.mul.lo.u32(offset, offset, 8)
					        # Wrap the byte offset so it stays inside this CTA's region.
					        op.and_.b32(offset, offset, self.block_size-1)

					        op.cvt.u64.u32(scratch_offset, offset)
					        op.add.u64(scratch_offset, scratch_offset, scratch_addr)

					        comment("If lane < write_size, write to address")
					        op.setp.lt.u32(p_write, laneid, write_size)
					        # The value stored is the address itself.
					        op.st.u64(addr(scratch_offset), scratch_offset, ifp=p_write)

					        comment("Add to number of bytes written")
					        op.add.u64(bytes, bytes, 8, ifp=p_write)

					        comment("If uncoalesced, store new write size")
					        # Grows by one per iteration and restarts at 2 once it exceeds 32.
					        op.add.u32(write_size, write_size, 1, ifnotp=p_coalesced)
					        op.setp.gt.u32(p_loop_wrsz, write_size, 32)
					        op.mov.u32(write_size, 2, ifp=p_loop_wrsz)

					        comment("Loop!")
					        op.add.u32(rounds, rounds, 1)
					        op.setp.ge.u32(p_done, rounds, self.rounds)
					        op.bra.uni(l2_loop, ifnotp=p_done)

					        label('l2_loop_end')
					        op.mov.u64(clkb, '%clock64')
					        op.sub.u64(clka, clkb, clka)
					        with block("Store the time l2_loop took"):
					            reg.u64('report_addr report_offset')
					            reg.u32('gtid')
					            std.get_gtid(gtid)
					            # 32 bytes of report space per thread: (clocks, bytes) for the
					            # coalesced pass at +0/+8 and for the uncoalesced pass at
					            # +16/+24. This mirrors the (4, threads) array built in call().
					            op.mul.lo.u32(gtid, gtid, 32)
					            op.add.u32(gtid, gtid, 16, ifnotp=p_coalesced)
					            op.cvt.u64.u32(report_offset, gtid)
					            op.ld.param.u64(report_addr, addr(a_report_addr))
					            op.add.u64(report_addr, report_addr, report_offset)
					            op.st.u64(addr(report_addr), clka)
					            op.st.u64(addr(report_addr,8), bytes)

					        comment("If we did coalesced, go back and do uncoalesced")
					        op.add.u32(ctaid, ctaid, 1)
					        op.add.u32(x, x, 1)
					        op.setp.ge.u32(p_done, x, 2)
					        op.bra.uni(l2_restart, ifnotp=p_done)


					    def call(self, ctx):
					        """
					        Launch the kernel once and print timing statistics.

					        ``ctx`` is the launch context; this method reads its ``ctas``,
					        ``threads``, ``mod``, ``block`` and ``grid`` attributes.
					        """
					        # Floor division keeps the element count an integer on every Python
					        # version (identical to the original `/` on Python 2 ints).
					        # NOTE(review): block_size bytes per CTA only needs
					        # block_size*ctas/8 uint64 elements; dividing by 4 allocates twice
					        # that. Left unchanged -- confirm whether the slack is intended.
					        scratch = np.zeros(self.block_size*ctx.ctas//4, np.uint64)
					        # Fortran order keeps the four u64 values of one thread contiguous,
					        # matching the 32-byte per-thread stride used by the kernel.
					        times_bytes = np.zeros((4, ctx.threads), np.uint64, 'F')
					        func = ctx.mod.get_function(self.entry_name)
					        dtime = func(cuda.InOut(times_bytes), cuda.InOut(scratch),
					                     block=ctx.block, grid=ctx.grid, time_kernel=True)

					        # Debugging aids, disabled by default:
					        #printover(times_bytes[0], 6, 32)
					        #printover(times_bytes[1], 6)
					        #printover(times_bytes[2], 6, 32)
					        #printover(times_bytes[3], 6)
					        #printover(scratch[i:i+16], 8)

					        def pm(a):
					            # (mean, standard error of the mean) for the "%g ± %g" formats.
					            return (np.mean(a), np.std(a) / np.sqrt(len(a)))

					        # Single-argument print(...) produces the same output as the print
					        # statement on Python 2 and is also valid Python 3 syntax.
					        print("\nTotal time was %g seconds" % dtime)
					        print("Clks for coa was %g ± %g" % pm(times_bytes[0]))
					        print("Bytes for coa was %g ± %g" % pm(times_bytes[1]))
					        print("Clks for uncoa was %g ± %g" % pm(times_bytes[2]))
					        print("Bytes for uncoa was %g ± %g" % pm(times_bytes[3]))
					        print('')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def printover(a, r, s=1):
					    """
					    Print every ``s``-th element of ``a``, ``r`` elements per output line.

					    Elements are separated by single spaces and each row ends with a
					    newline. The last row is shorter when the data runs out. An empty ``a``
					    prints nothing.

					    Raises ValueError (from ``range``) when ``r * s`` is zero.
					    """
					    total = len(a)
					    row_span = r * s
					    for row_start in range(0, total, row_span):
					        # Clamp the row to the end of the data instead of testing each index.
					        row_end = min(row_start + row_span, total)
					        # One joined print per row gives the same text as the Python 2
					        # "print x," idiom and is also valid on Python 3.
					        print(' '.join(str(a[j]) for j in range(row_start, row_end, s)))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def main():
					    """Build a launch context for L2WriteCombining, compile it and run it."""
					    # TODO: block/grid auto-optimization
					    benchmark = L2WriteCombining
					    launch_shape = dict(block=(128,1,1), grid=(7*8,1))
					    ctx = LaunchContext([benchmark], tests=True, **launch_shape)
					    ctx.compile(verbose=3)
					    # The compiled module keeps one instance per entry type; run ours.
					    instance = ctx.ptx.instances[benchmark]
					    instance.call(ctx)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if __name__ == "__main__":
 | 
				
			||||||
 | 
					    main()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -42,6 +42,10 @@ class LaunchContext(object):
 | 
				
			|||||||
    def threads(self):
 | 
					    def threads(self):
 | 
				
			||||||
        return reduce(lambda a, b: a*b, self.block + self.grid)
 | 
					        return reduce(lambda a, b: a*b, self.block + self.grid)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @property
					    def ctas(self):
					        """
					        Product of the first two entries of ``self.grid`` (presumably the
					        number of CTAs in the launch -- confirm against the callers).
					        """
					        grid_x, grid_y = self.grid[0], self.grid[1]
					        return grid_x * grid_y
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def compile(self, verbose=False, **kwargs):
 | 
					    def compile(self, verbose=False, **kwargs):
 | 
				
			||||||
        kwargs['ctx'] = self
 | 
					        kwargs['ctx'] = self
 | 
				
			||||||
        self.ptx = PTXModule(self.entry_types, kwargs, self.build_tests)
 | 
					        self.ptx = PTXModule(self.entry_types, kwargs, self.build_tests)
 | 
				
			||||||
 | 
				
			|||||||
@ -337,8 +337,8 @@ class _CallChain(object):
 | 
				
			|||||||
        self.__chain = []
 | 
					        self.__chain = []
 | 
				
			||||||
        return r
 | 
					        return r
 | 
				
			||||||
    def __getattr__(self, name):
 | 
					    def __getattr__(self, name):
 | 
				
			||||||
        if name == 'global_':
 | 
					        if name.endswith('_'):
 | 
				
			||||||
            name = 'global'
 | 
					            name = name[:-1]
 | 
				
			||||||
        self.__chain.append(name)
 | 
					        self.__chain.append(name)
 | 
				
			||||||
        # Another great crime against the universe:
 | 
					        # Another great crime against the universe:
 | 
				
			||||||
        return self
 | 
					        return self
 | 
				
			||||||
@ -630,23 +630,15 @@ class _PTXStdLib(PTXFragment):
 | 
				
			|||||||
    def get_gtid(self, dst):
 | 
					    def get_gtid(self, dst):
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        Get the global thread ID (the position of this thread in a grid of
 | 
					        Get the global thread ID (the position of this thread in a grid of
 | 
				
			||||||
        blocks of threads). Notably, this assumes that both grid and block are
 | 
					        blocks of threads). This assumes that both grid and block are
 | 
				
			||||||
        one-dimensional, which in most cases is true.
 | 
					        one-dimensional! (This is always true for cuburn.)
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        with block("Load GTID into %s" % str(dst)):
 | 
					        with block("Load GTID into %s" % str(dst)):
 | 
				
			||||||
            reg.u16('tmp')
 | 
					            reg.u32('cta ncta tid')
 | 
				
			||||||
            reg.u32('cta ncta tid gtid')
 | 
					            op.mov.u32(cta, '%ctaid.x')
 | 
				
			||||||
 | 
					            op.mov.u32(ncta, '%ntid.x')
 | 
				
			||||||
            op.mov.u16(tmp, '%ctaid.x')
 | 
					            op.mov.u32(tid, '%tid.x')
 | 
				
			||||||
            op.cvt.u32.u16(cta, tmp)
 | 
					            op.mad.lo.u32(dst, cta, ncta, tid)
 | 
				
			||||||
            op.mov.u16(tmp, '%ntid.x')
 | 
					 | 
				
			||||||
            op.cvt.u32.u16(ncta, tmp)
 | 
					 | 
				
			||||||
            op.mul.lo.u32(gtid, cta, ncta)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            op.mov.u16(tmp, '%tid.x')
 | 
					 | 
				
			||||||
            op.cvt.u32.u16(tid, tmp)
 | 
					 | 
				
			||||||
            op.add.u32(gtid, gtid, tid)
 | 
					 | 
				
			||||||
            op.mov.b32(dst, gtid)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @ptx_func
 | 
					    @ptx_func
 | 
				
			||||||
    def store_per_thread(self, base, val):
 | 
					    def store_per_thread(self, base, val):
 | 
				
			||||||
@ -792,7 +784,8 @@ class PTXModule(object):
 | 
				
			|||||||
            params = [Reg('.param.' + str(type), name)
 | 
					            params = [Reg('.param.' + str(type), name)
 | 
				
			||||||
                      for (type, name) in ent.entry_params]
 | 
					                      for (type, name) in ent.entry_params]
 | 
				
			||||||
            _block.code(op='.entry %s ' % ent.entry_name, semi=False,
 | 
					            _block.code(op='.entry %s ' % ent.entry_name, semi=False,
 | 
				
			||||||
                vars=['(', ['%s %s' % (r.type, r.name) for r in params], ')'])
 | 
					                vars=['(', ', '.join(['%s %s' % (r.type, r.name)
 | 
				
			||||||
 | 
					                                      for r in params]), ')'])
 | 
				
			||||||
            with Block(_block):
 | 
					            with Block(_block):
 | 
				
			||||||
                [_block.inject(r.name, r) for r in params]
 | 
					                [_block.inject(r.name, r) for r in params]
 | 
				
			||||||
                for dep in insts:
 | 
					                for dep in insts:
 | 
				
			||||||
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user