cuburn/bench.py

#!/usr/bin/python
# -*- encoding: utf-8 -*-

"""
Various micro-benchmarks and other experiments.
"""
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from cuburn.ptx import PTXFragment, PTXTest, ptx_func, instmethod
from cuburn.cuda import LaunchContext
from cuburn.device_code import MWCRNG, MWCRNGTest

class L2WriteCombining(PTXTest):
    """
    Test of L2 write combining.
    """
    entry_name = 'l2_write_combining'
    entry_params = [('u64', 'a_report_addr'), ('u64', 'a_scratch_addr')]

    block_size = 2**20 # 1MB/CTA.
    rounds = int(1e6)

    @ptx_func
    def entry(self):
        mem.shared.u32('s_offset')
        reg.u32('bytes_written offset write_size laneid ctaid rounds x')
        reg.u64('scratch_addr scratch_offset clka clkb bytes')
        reg.pred('p_write p_loop_wrsz p_is_first p_done p_coalesced')

        op.mov.u32(laneid, '%laneid')
        op.setp.eq.u32(p_is_first, laneid, 0)

        op.ld.param.u32(scratch_addr, addr(a_scratch_addr))
        op.mov.u32(ctaid, '%ctaid.x')
        op.cvt.u64.u32(scratch_offset, ctaid)
        op.mad.lo.u64(scratch_addr, scratch_offset, self.block_size,
                      scratch_addr)

        op.mov.u32(x, 0)

        label('l2_restart')
        comment("If CTA is even, do coalesced first")
        op.and_.b32(ctaid, ctaid, 1)
        op.setp.eq.u32(p_coalesced, ctaid, 0)
        op.bra.uni('l2_loop_start')

        label('l2_loop_start')
        op.st.shared.u32(addr(s_offset), 0, ifp=p_is_first)
        op.mov.u32(rounds, 0)
        op.mov.u32(write_size, 16)
        op.mov.u64(clka, '%clock64')
        op.mov.u64(bytes, 0)

        label('l2_loop')
        comment("Increment offset across the CTA")
        op.atom.shared.add.u32(offset, addr(s_offset), write_size,
                               ifp=p_is_first)

        comment("Find write address from current offset and lane")
        op.ld.shared.u32(offset, addr(s_offset))
        op.add.u32(offset, offset, laneid)
        op.mul.lo.u32(offset, offset, 8)
        op.and_.b32(offset, offset, self.block_size-1)

        op.cvt.u64.u32(scratch_offset, offset)
        op.add.u64(scratch_offset, scratch_offset, scratch_addr)

        comment("If lane < write_size, write to address")
        op.setp.lt.u32(p_write, laneid, write_size)
        op.st.u64(addr(scratch_offset), scratch_offset, ifp=p_write)

        comment("Add to number of bytes written")
        op.add.u64(bytes, bytes, 8, ifp=p_write)

        comment("If uncoalesced, store new write size")
        op.add.u32(write_size, write_size, 1, ifnotp=p_coalesced)
        op.setp.gt.u32(p_loop_wrsz, write_size, 32)
        op.mov.u32(write_size, 2, ifp=p_loop_wrsz)

        comment("Loop!")
        op.add.u32(rounds, rounds, 1)
        op.setp.ge.u32(p_done, rounds, self.rounds)
        op.bra.uni(l2_loop, ifnotp=p_done)

        label('l2_loop_end')
        op.mov.u64(clkb, '%clock64')
        op.sub.u64(clka, clkb, clka)
        with block("Store the time l2_loop took"):
            reg.u64('report_addr report_offset')
            reg.u32('gtid')
            std.get_gtid(gtid)
            op.mul.lo.u32(gtid, gtid, 32)
            op.add.u32(gtid, gtid, 16, ifnotp=p_coalesced)
            op.cvt.u64.u32(report_offset, gtid)
            op.ld.param.u64(report_addr, addr(a_report_addr))
            op.add.u64(report_addr, report_addr, report_offset)
            op.st.u64(addr(report_addr), clka)
            op.st.u64(addr(report_addr,8), bytes)

        comment("If we did coalesced, go back and do uncoalesced")
        op.add.u32(ctaid, ctaid, 1)
        op.add.u32(x, x, 1)
        op.setp.ge.u32(p_done, x, 2)
        op.bra.uni(l2_restart, ifnotp=p_done)

    def call_setup(self, ctx):
        self.scratch = np.zeros(self.block_size*ctx.nctas/4, np.uint64)
        self.times_bytes = np.zeros((4, ctx.nthreads), np.uint64, 'F')

    def _call(self, ctx, func):
        super(L2WriteCombining, self)._call(ctx, func,
                cuda.InOut(self.times_bytes), cuda.InOut(self.scratch))

    def call_teardown(self, ctx):
        pm = lambda a: (np.mean(a), np.std(a) / np.sqrt(len(a)))
        print "Clks for coa was %g ± %g" % pm(self.times_bytes[0])
        print "Bytes for coa was %g ± %g" % pm(self.times_bytes[1])
        print "Clks for uncoa was %g ± %g" % pm(self.times_bytes[2])
        print "Bytes for uncoa was %g ± %g" % pm(self.times_bytes[3])
        print

class SimulOccupancy(PTXTest):
    """
    Test to discover whether Fermi GPUs will launch multiple entry points
    in the same kernel on the same CTA simultaneously.
    """
    entry_name = 'simul1'
    # Only has to be big enough to hold the kernel on the device for a while
    rounds = 1000000

    def deps(self):
        return [MWCRNG]

    @ptx_func
    def module_setup(self):
        n = self.entry_name + '_'
        mem.global_.u64(n+'start', ctx.nthreads)
        mem.global_.u64(n+'end', ctx.nthreads)
        mem.global_.u32(n+'smid', ctx.nthreads)
        mem.global_.u32(n+'warpid_start', ctx.nthreads)
        mem.global_.u32(n+'warpid_end', ctx.nthreads)

    @ptx_func
    def entry(self):
        n = self.entry_name + '_'
        reg.u64('now')
        reg.u32('warpid')
        op.mov.u64(now, '%clock64')
        op.mov.u32(warpid, '%warpid')
        std.store_per_thread(n+'start', now,
                             n+'warpid_start', warpid)

        reg.u32('loopct rnd')
        reg.pred('p_done')
        op.mov.u32(loopct, self.rounds)
        label('loopstart')
        mwc.next_b32(rnd)
        std.store_per_thread(n+'smid', rnd)
        op.sub.u32(loopct, loopct, 1)
        op.setp.eq.u32(p_done, loopct, 0)
        op.bra.uni(loopstart, ifnotp=p_done)

        reg.u32('smid')
        op.mov.u32(smid, '%smid')
        op.mov.u32(warpid, '%warpid')
        op.mov.u64(now, '%clock64')
        std.store_per_thread(n+'end', now,
                             n+'smid', smid,
                             n+'warpid_end', warpid)

    def _call(self, ctx, func):
        stream1, stream2 = cuda.Stream(), cuda.Stream()
        self._call2(ctx, stream1)
        _SimulOccupancy._call2(ctx, stream2)
        stream2.synchronize()
        stream1.synchronize()

    @instmethod
    def _call2(self, ctx, stream):
        func = ctx.mod.get_function(self.entry_name)
        func.prepare([], ctx.block)
        # TODO: load number of SMs from ctx
        func.launch_grid_async(7, 1, stream)

    def call_teardown(self, ctx):
        sm_log = [[] for i in range(7)]
        self._teardown(ctx, sm_log)
        _SimulOccupancy._teardown(ctx, sm_log)
        for sm in range(len(sm_log)):
            print "\nPrinting log for SM %d" % sm
            for t, ev in sorted(sm_log[sm]):
                print '%6d %s' % (t/1000, ev)

    @instmethod
    def _teardown(self, ctx, sm_log):
        # For this method, the GPU is intentionally underloaded; trim results
        th = 7 * ctx.threads_per_cta
        n = self.entry_name + '_'
        start = ctx.get_per_thread(n+'start', np.uint64)[:th]
        end = ctx.get_per_thread(n+'end', np.uint64)[:th]
        smid = ctx.get_per_thread(n+'smid', np.uint32)[:th]
        warpid_start = ctx.get_per_thread(n+'warpid_start', np.uint32)[:th]
        warpid_end = ctx.get_per_thread(n+'warpid_end', np.uint32)[:th]
        for i in range(0, th, 32):
            sm_log[smid[i]].append((start[i], "%s%4d entered SM" % (n, i/32)))
            sm_log[smid[i]].append((end[i],   "%s%4d left SM" % (n, i/32)))
        if not np.alltrue(np.equal(warpid_start, warpid_end)):
            print "Warp IDs changed. Do further research."

class _SimulOccupancy(SimulOccupancy):
    # Don't call this one
    entry_name = 'simul2'
    def call(self, ctx):
        pass
    def call_teardown(self, ctx):
        pass

def printover(a, r, s=1):
    for i in range(0, len(a), r*s):
        for j in range(i, i+r*s, s):
            if j < len(a): print a[j],
        print

def main():
    # TODO: block/grid auto-optimization
    ctx = LaunchContext([L2WriteCombining, SimulOccupancy, _SimulOccupancy],
                        block=(128,1,1), grid=(7*8,1), tests=True)
    ctx.compile(verbose=3)
    ctx.run_tests()
    SimulOccupancy.call(ctx)
    L2WriteCombining.call(ctx)

if __name__ == "__main__":
    main()
Add the first of many microbenchmarks 2010-09-07 12:44:12 -04:00			`#!/usr/bin/python`
			`# -- encoding: utf-8 --`

			`"""`
			`Various micro-benchmarks and other experiments.`
			`"""`
			`import numpy as np`
			`import pycuda.autoinit`
			`import pycuda.driver as cuda`
Rename "cuburnlib" (stupid) to "cuburn" (stupid but shorter) --HG-- rename : cuburnlib/__init__.py => cuburn/__init__.py rename : cuburnlib/cuda.py => cuburn/cuda.py rename : cuburnlib/device_code.py => cuburn/device_code.py rename : cuburnlib/ptx.py => cuburn/ptx.py rename : cuburnlib/render.py => cuburn/render.py 2010-09-10 14:48:34 -04:00			`from cuburn.ptx import PTXFragment, PTXTest, ptx_func, instmethod`
			`from cuburn.cuda import LaunchContext`
			`from cuburn.device_code import MWCRNG, MWCRNGTest`
Add the first of many microbenchmarks 2010-09-07 12:44:12 -04:00
			`class L2WriteCombining(PTXTest):`
			`"""`
			`Test of L2 write combining.`
			`"""`
			`entry_name = 'l2_write_combining'`
			`entry_params = [('u64', 'a_report_addr'), ('u64', 'a_scratch_addr')]`

			`block_size = 2**20 # 1MB/CTA.`
			`rounds = int(1e6)`

			`@ptx_func`
			`def entry(self):`
			`mem.shared.u32('s_offset')`
			`reg.u32('bytes_written offset write_size laneid ctaid rounds x')`
			`reg.u64('scratch_addr scratch_offset clka clkb bytes')`
			`reg.pred('p_write p_loop_wrsz p_is_first p_done p_coalesced')`

			`op.mov.u32(laneid, '%laneid')`
			`op.setp.eq.u32(p_is_first, laneid, 0)`

			`op.ld.param.u32(scratch_addr, addr(a_scratch_addr))`
			`op.mov.u32(ctaid, '%ctaid.x')`
			`op.cvt.u64.u32(scratch_offset, ctaid)`
			`op.mad.lo.u64(scratch_addr, scratch_offset, self.block_size,`
			`scratch_addr)`

			`op.mov.u32(x, 0)`

			`label('l2_restart')`
			`comment("If CTA is even, do coalesced first")`
			`op.and_.b32(ctaid, ctaid, 1)`
			`op.setp.eq.u32(p_coalesced, ctaid, 0)`
			`op.bra.uni('l2_loop_start')`

			`label('l2_loop_start')`
			`op.st.shared.u32(addr(s_offset), 0, ifp=p_is_first)`
			`op.mov.u32(rounds, 0)`
			`op.mov.u32(write_size, 16)`
			`op.mov.u64(clka, '%clock64')`
			`op.mov.u64(bytes, 0)`

			`label('l2_loop')`
			`comment("Increment offset across the CTA")`
			`op.atom.shared.add.u32(offset, addr(s_offset), write_size,`
			`ifp=p_is_first)`

			`comment("Find write address from current offset and lane")`
			`op.ld.shared.u32(offset, addr(s_offset))`
			`op.add.u32(offset, offset, laneid)`
			`op.mul.lo.u32(offset, offset, 8)`
			`op.and_.b32(offset, offset, self.block_size-1)`

			`op.cvt.u64.u32(scratch_offset, offset)`
			`op.add.u64(scratch_offset, scratch_offset, scratch_addr)`

			`comment("If lane < write_size, write to address")`
			`op.setp.lt.u32(p_write, laneid, write_size)`
			`op.st.u64(addr(scratch_offset), scratch_offset, ifp=p_write)`

			`comment("Add to number of bytes written")`
			`op.add.u64(bytes, bytes, 8, ifp=p_write)`

			`comment("If uncoalesced, store new write size")`
			`op.add.u32(write_size, write_size, 1, ifnotp=p_coalesced)`
			`op.setp.gt.u32(p_loop_wrsz, write_size, 32)`
			`op.mov.u32(write_size, 2, ifp=p_loop_wrsz)`

			`comment("Loop!")`
			`op.add.u32(rounds, rounds, 1)`
			`op.setp.ge.u32(p_done, rounds, self.rounds)`
			`op.bra.uni(l2_loop, ifnotp=p_done)`

			`label('l2_loop_end')`
			`op.mov.u64(clkb, '%clock64')`
			`op.sub.u64(clka, clkb, clka)`
			`with block("Store the time l2_loop took"):`
			`reg.u64('report_addr report_offset')`
			`reg.u32('gtid')`
			`std.get_gtid(gtid)`
			`op.mul.lo.u32(gtid, gtid, 32)`
			`op.add.u32(gtid, gtid, 16, ifnotp=p_coalesced)`
			`op.cvt.u64.u32(report_offset, gtid)`
			`op.ld.param.u64(report_addr, addr(a_report_addr))`
			`op.add.u64(report_addr, report_addr, report_offset)`
			`op.st.u64(addr(report_addr), clka)`
			`op.st.u64(addr(report_addr,8), bytes)`

			`comment("If we did coalesced, go back and do uncoalesced")`
			`op.add.u32(ctaid, ctaid, 1)`
			`op.add.u32(x, x, 1)`
			`op.setp.ge.u32(p_done, x, 2)`
			`op.bra.uni(l2_restart, ifnotp=p_done)`

Simultaneous occupancy microbenchmark 2010-09-12 16:23:24 -04:00			`def call_setup(self, ctx):`
			`self.scratch = np.zeros(self.block_size*ctx.nctas/4, np.uint64)`
			`self.times_bytes = np.zeros((4, ctx.nthreads), np.uint64, 'F')`

Refactor call() to be more elegant 2010-09-10 14:43:20 -04:00			`def _call(self, ctx, func):`
			`super(L2WriteCombining, self)._call(ctx, func,`
Fixed bench.py, with the help of Device Assertions™!* * Not actually a trademark 2010-09-11 00:16:43 -04:00			`cuda.InOut(self.times_bytes), cuda.InOut(self.scratch))`
Refactor call() to be more elegant 2010-09-10 14:43:20 -04:00
			`def call_teardown(self, ctx):`
Add the first of many microbenchmarks 2010-09-07 12:44:12 -04:00			`pm = lambda a: (np.mean(a), np.std(a) / np.sqrt(len(a)))`
Refactor call() to be more elegant 2010-09-10 14:43:20 -04:00			`print "Clks for coa was %g ± %g" % pm(self.times_bytes[0])`
			`print "Bytes for coa was %g ± %g" % pm(self.times_bytes[1])`
			`print "Clks for uncoa was %g ± %g" % pm(self.times_bytes[2])`
			`print "Bytes for uncoa was %g ± %g" % pm(self.times_bytes[3])`
Test to make sure floating point numbers were in the right range. 2010-09-10 19:36:39 -04:00			`print`
Add the first of many microbenchmarks 2010-09-07 12:44:12 -04:00
Simultaneous occupancy microbenchmark 2010-09-12 16:23:24 -04:00			`class SimulOccupancy(PTXTest):`
			`"""`
			`Test to discover whether Fermi GPUs will launch multiple entry points`
			`in the same kernel on the same CTA simultaneously.`
			`"""`
			`entry_name = 'simul1'`
			`# Only has to be big enough to hold the kernel on the device for a while`
			`rounds = 1000000`

			`def deps(self):`
			`return [MWCRNG]`

			`@ptx_func`
			`def module_setup(self):`
			`n = self.entry_name + '_'`
			`mem.global_.u64(n+'start', ctx.nthreads)`
			`mem.global_.u64(n+'end', ctx.nthreads)`
			`mem.global_.u32(n+'smid', ctx.nthreads)`
			`mem.global_.u32(n+'warpid_start', ctx.nthreads)`
			`mem.global_.u32(n+'warpid_end', ctx.nthreads)`

			`@ptx_func`
			`def entry(self):`
			`n = self.entry_name + '_'`
			`reg.u64('now')`
			`reg.u32('warpid')`
			`op.mov.u64(now, '%clock64')`
			`op.mov.u32(warpid, '%warpid')`
			`std.store_per_thread(n+'start', now,`
			`n+'warpid_start', warpid)`

			`reg.u32('loopct rnd')`
			`reg.pred('p_done')`
			`op.mov.u32(loopct, self.rounds)`
			`label('loopstart')`
			`mwc.next_b32(rnd)`
			`std.store_per_thread(n+'smid', rnd)`
			`op.sub.u32(loopct, loopct, 1)`
			`op.setp.eq.u32(p_done, loopct, 0)`
			`op.bra.uni(loopstart, ifnotp=p_done)`

			`reg.u32('smid')`
			`op.mov.u32(smid, '%smid')`
			`op.mov.u32(warpid, '%warpid')`
			`op.mov.u64(now, '%clock64')`
			`std.store_per_thread(n+'end', now,`
			`n+'smid', smid,`
			`n+'warpid_end', warpid)`

			`def _call(self, ctx, func):`
			`stream1, stream2 = cuda.Stream(), cuda.Stream()`
			`self._call2(ctx, stream1)`
			`_SimulOccupancy._call2(ctx, stream2)`
			`stream2.synchronize()`
			`stream1.synchronize()`

			`@instmethod`
			`def _call2(self, ctx, stream):`
			`func = ctx.mod.get_function(self.entry_name)`
			`func.prepare([], ctx.block)`
			`# TODO: load number of SMs from ctx`
			`func.launch_grid_async(7, 1, stream)`

			`def call_teardown(self, ctx):`
			`sm_log = [[] for i in range(7)]`
			`self._teardown(ctx, sm_log)`
			`_SimulOccupancy._teardown(ctx, sm_log)`
			`for sm in range(len(sm_log)):`
			`print "\nPrinting log for SM %d" % sm`
			`for t, ev in sorted(sm_log[sm]):`
			`print '%6d %s' % (t/1000, ev)`

			`@instmethod`
			`def _teardown(self, ctx, sm_log):`
			`# For this method, the GPU is intentionally underloaded; trim results`
			`th = 7 * ctx.threads_per_cta`
			`n = self.entry_name + '_'`
			`start = ctx.get_per_thread(n+'start', np.uint64)[:th]`
			`end = ctx.get_per_thread(n+'end', np.uint64)[:th]`
			`smid = ctx.get_per_thread(n+'smid', np.uint32)[:th]`
			`warpid_start = ctx.get_per_thread(n+'warpid_start', np.uint32)[:th]`
			`warpid_end = ctx.get_per_thread(n+'warpid_end', np.uint32)[:th]`
			`for i in range(0, th, 32):`
			`sm_log[smid[i]].append((start[i], "%s%4d entered SM" % (n, i/32)))`
			`sm_log[smid[i]].append((end[i], "%s%4d left SM" % (n, i/32)))`
			`if not np.alltrue(np.equal(warpid_start, warpid_end)):`
			`print "Warp IDs changed. Do further research."`

			`class _SimulOccupancy(SimulOccupancy):`
			`# Don't call this one`
			`entry_name = 'simul2'`
			`def call(self, ctx):`
			`pass`
			`def call_teardown(self, ctx):`
			`pass`

Add the first of many microbenchmarks 2010-09-07 12:44:12 -04:00			`def printover(a, r, s=1):`
			`for i in range(0, len(a), r*s):`
			`for j in range(i, i+r*s, s):`
			`if j < len(a): print a[j],`
			`print`

			`def main():`
			`# TODO: block/grid auto-optimization`
Simultaneous occupancy microbenchmark 2010-09-12 16:23:24 -04:00			`ctx = LaunchContext([L2WriteCombining, SimulOccupancy, _SimulOccupancy],`
Refactor call() to be more elegant 2010-09-10 14:43:20 -04:00			`block=(128,1,1), grid=(7*8,1), tests=True)`
Add the first of many microbenchmarks 2010-09-07 12:44:12 -04:00			`ctx.compile(verbose=3)`
Refactor call() to be more elegant 2010-09-10 14:43:20 -04:00			`ctx.run_tests()`
Simultaneous occupancy microbenchmark 2010-09-12 16:23:24 -04:00			`SimulOccupancy.call(ctx)`
instmethod decorator: another hack (to get around ctx.ptx.instances[]) 2010-09-08 13:12:46 -04:00			`L2WriteCombining.call(ctx)`
Add the first of many microbenchmarks 2010-09-07 12:44:12 -04:00
			`if __name__ == "__main__":`
			`main()`