diff --git a/bench.py b/bench.py index ac655b4..495ed6e 100644 --- a/bench.py +++ b/bench.py @@ -9,7 +9,7 @@ import pycuda.autoinit import pycuda.driver as cuda from cuburnlib.ptx import PTXFragment, PTXTest, ptx_func, instmethod from cuburnlib.cuda import LaunchContext -from cuburnlib.device_code import MWCRNG +from cuburnlib.device_code import MWCRNG, MWCRNGTest class L2WriteCombining(PTXTest): """ @@ -104,26 +104,18 @@ class L2WriteCombining(PTXTest): op.setp.ge.u32(p_done, x, 2) op.bra.uni(l2_restart, ifnotp=p_done) - @instmethod - def call(self, ctx): - scratch = np.zeros(self.block_size*ctx.ctas/4, np.uint64) - times_bytes = np.zeros((4, ctx.threads), np.uint64, 'F') - func = ctx.mod.get_function(self.entry_name) - dtime = func(cuda.InOut(times_bytes), cuda.InOut(scratch), - block=ctx.block, grid=ctx.grid, time_kernel=True) + def _call(self, ctx, func): + self.scratch = np.zeros(self.block_size*ctx.ctas/4, np.uint64) + self.times_bytes = np.zeros((4, ctx.threads), np.uint64, 'F') + super(L2WriteCombining, self)._call(ctx, func, + cuda.InOut(self.scratch), cuda.InOut(self.times_bytes)) - #printover(times_bytes[0], 6, 32) - #printover(times_bytes[1], 6) - #printover(times_bytes[2], 6, 32) - #printover(times_bytes[3], 6) - #printover(scratch[i:i+16], 8) - - print "\nTotal time was %g seconds" % dtime + def call_teardown(self, ctx): pm = lambda a: (np.mean(a), np.std(a) / np.sqrt(len(a))) - print "Clks for coa was %g ± %g" % pm(times_bytes[0]) - print "Bytes for coa was %g ± %g" % pm(times_bytes[1]) - print "Clks for uncoa was %g ± %g" % pm(times_bytes[2]) - print "Bytes for uncoa was %g ± %g" % pm(times_bytes[3]) + print "Clks for coa was %g ± %g" % pm(self.times_bytes[0]) + print "Bytes for coa was %g ± %g" % pm(self.times_bytes[1]) + print "Clks for uncoa was %g ± %g" % pm(self.times_bytes[2]) + print "Bytes for uncoa was %g ± %g" % pm(self.times_bytes[3]) print '' def printover(a, r, s=1): @@ -134,9 +126,10 @@ def printover(a, r, s=1): def main(): # TODO: block/grid auto-optimization - ctx = LaunchContext([L2WriteCombining], block=(128,1,1), grid=(7*8,1), - tests=True) + ctx = LaunchContext([L2WriteCombining, MWCRNGTest], + block=(128,1,1), grid=(7*8,1), tests=True) ctx.compile(verbose=3) + ctx.run_tests() L2WriteCombining.call(ctx) if __name__ == "__main__": diff --git a/cuburnlib/cuda.py b/cuburnlib/cuda.py index 71222c1..dd39a67 100644 --- a/cuburnlib/cuda.py +++ b/cuburnlib/cuda.py @@ -10,7 +10,7 @@ import pycuda.gl.autoinit import numpy as np -from cuburnlib.ptx import PTXModule +from cuburnlib.ptx import PTXModule, PTXTest, PTXTestFailure class LaunchContext(object): """ @@ -72,29 +72,34 @@ class LaunchContext(object): entry.entry_name, func.num_regs, func.shared_size_bytes, func.local_size_bytes) - def set_up(self): - for inst in self.ptx.deporder(self.ptx.instances.values(), - self.ptx.instances): - inst.device_init(self) + def call_setup(self, entry_inst): + for inst in self.ptx.entry_deps[type(entry_inst)]: + inst.call_setup(self) - def run(self): - if not self.setup_done: self.set_up() - - def run_test(self, test_type): - if not self.setup_done: self.set_up() - inst = self.ptx.instances[test_type] - print "Running test: %s... " % inst.name - try: - cuda.Context.synchronize() - if inst.call(self): - print "Test %s passed." % inst.name + def call_teardown(self, entry_inst): + okay = True + for inst in reversed(self.ptx.entry_deps[type(entry_inst)]): + if inst is entry_inst and isinstance(entry_inst, PTXTest): + try: + inst.call_teardown(self) + except PTXTestFailure, e: + print "PTX Test %s failed!" % inst.entry_name, e + okay = False else: - print "Test %s FAILED." % inst.name - except Exception, e: - print "Test %s FAILED (exception thrown)." % inst.name - raise e + inst.call_teardown(self) + return okay def run_tests(self): - map(self.run_test, self.ptx.tests) - + if not self.ptx.tests: + print "No tests to run." + return True + all_okay = True + for test in self.ptx.tests: + cuda.Context.synchronize() + if test.call(self): + print "Test %s passed." % test.entry_name + else: + print "Test %s FAILED." % test.entry_name + all_okay = False + return all_okay diff --git a/cuburnlib/device_code.py b/cuburnlib/device_code.py index bc978e2..6873eac 100644 --- a/cuburnlib/device_code.py +++ b/cuburnlib/device_code.py @@ -148,18 +148,19 @@ class IterThread(PTXEntryPoint): CPDataStream.print_record(ctx, cp_stream, 5) self.cps_uploaded = True - @instmethod - def call(self, ctx): + def call_setup(self, ctx): if not self.cps_uploaded: raise Error("Cannot call IterThread before uploading CPs") num_cps_st_dp, num_cps_st_l = ctx.mod.get_global('g_num_cps_started') cuda.memset_d32(num_cps_st_dp, 0, 1) - func = ctx.mod.get_function('iter_thread') + def _call(self, ctx, func): + # Get texture reference from the Palette + # TODO: more elegant method than reaching into ctx.ptx? tr = ctx.ptx.instances[PaletteLookup].texref - dtime = func(block=ctx.block, grid=ctx.grid, time_kernel=True, - texrefs=[tr]) + super(IterThread, self)._call(ctx, func, texrefs=[tr]) + def call_teardown(self, ctx): shape = (ctx.grid[0], ctx.block[0]/32, 32) num_rounds_dp, num_rounds_l = ctx.mod.get_global('g_num_rounds') num_writes_dp, num_writes_l = ctx.mod.get_global('g_num_writes') @@ -325,7 +326,7 @@ class PaletteLookup(PTXFragment): self.texref.set_address_mode(1, cuda.address_mode.CLAMP) self.texref.set_array(dev_array) - def device_init(self, ctx): + def call_setup(self, ctx): assert self.texref, "Must upload palette texture before launch!" class HistScatter(PTXFragment): @@ -368,7 +369,7 @@ class HistScatter(PTXFragment): op.red.add.f32(addr(hist_bin_addr,12), a) - def device_init(self, ctx): + def call_setup(self, ctx): hist_bins_dp, hist_bins_l = ctx.mod.get_global('g_hist_bins') cuda.memset_d32(hist_bins_dp, 0, hist_bins_l/4) @@ -383,14 +384,10 @@ class MWCRNG(PTXFragment): shortname = "mwc" def __init__(self): - self.rand = np.random self.threads_ready = 0 if not os.path.isfile('primes.bin'): raise EnvironmentError('primes.bin not found') - def set_seed(self, seed): - self.rand = np.random.mtrand.RandomState(seed) - @ptx_func def module_setup(self): mem.global_.u32('mwc_rng_mults', ctx.threads) @@ -450,11 +447,12 @@ class MWCRNG(PTXFragment): op.cvt.rn.f32.s32(dst_reg, mwc_st) op.mul.f32(dst_reg, dst_reg, '0f30000000') # 1./(1<<31) - def device_init(self, ctx): - if self.threads_ready >= ctx.threads: - # Already set up enough random states, don't push again - return - + @instmethod + def seed(self, ctx, rand=np.random): + """ + Seed the random number generators with values taken from a + ``np.random`` instance. + """ # Load raw big-endian u32 multipliers from primes.bin. with open('primes.bin') as primefp: dt = np.dtype(np.uint32).newbyteorder('B') @@ -463,18 +461,22 @@ class MWCRNG(PTXFragment): # Randomness in choosing multipliers is good, but larger multipliers # have longer periods, which is also good. This is a compromise. mults = np.array(mults[:ctx.threads*4]) - self.rand.shuffle(mults) + rand.shuffle(mults) # Copy multipliers and seeds to the device multdp, multl = ctx.mod.get_global('mwc_rng_mults') cuda.memcpy_htod_async(multdp, mults.tostring()[:multl]) # Intentionally excludes both 0 and (2^32-1), as they can lead to # degenerate sequences of period 0 - states = np.array(self.rand.randint(1, 0xffffffff, size=2*ctx.threads), + states = np.array(rand.randint(1, 0xffffffff, size=2*ctx.threads), dtype=np.uint32) statedp, statel = ctx.mod.get_global('mwc_rng_state') cuda.memcpy_htod_async(statedp, states.tostring()) self.threads_ready = ctx.threads + def call_setup(self, ctx): + if self.threads_ready < ctx.threads: + self.seed(ctx) + def tests(self): return [MWCRNGTest] @@ -515,7 +517,7 @@ class MWCRNGTest(PTXTest): op.mad.lo.u32(adr, offset, 8, adr) op.st.global_.u64(addr(adr), sum) - def call(self, ctx): + def call_setup(self, ctx): # Get current multipliers and seeds from the device multdp, multl = ctx.mod.get_global('mwc_rng_mults') mults = cuda.from_device(multdp, ctx.threads, np.uint32) @@ -533,15 +535,13 @@ class MWCRNGTest(PTXTest): ctime = time.time() - ctime print "Done on host, took %g seconds" % ctime - func = ctx.mod.get_function('MWC_RNG_test') - dtime = func(block=ctx.block, grid=ctx.grid, time_kernel=True) - print "Done on device, took %g seconds (%gx)" % (dtime, ctime/dtime) + def call_teardown(self, ctx): dfullstates = cuda.from_device(statedp, ctx.threads, np.uint64) if not (dfullstates == fullstates).all(): print "State discrepancy" print dfullstates print fullstates - return False + raise PTXTestFailure("MWC RNG state discrepancy") sumdp, suml = ctx.mod.get_global('mwc_rng_test_sums') dsums = cuda.from_device(sumdp, ctx.threads, np.uint64) @@ -549,11 +549,7 @@ class MWCRNGTest(PTXTest): print "Sum discrepancy" print dsums print sums - return False - return True - -class CameraCoordTransform(PTXFragment): - pass + raise PTXTestFailure("MWC RNG sum discrepancy") class CPDataStream(DataStream): """DataStream which stores the control points.""" diff --git a/cuburnlib/ptx.py b/cuburnlib/ptx.py index 5171a32..2529fe0 100644 --- a/cuburnlib/ptx.py +++ b/cuburnlib/ptx.py @@ -578,13 +578,23 @@ class PTXFragment(object): """ return [] - def device_init(self, ctx): + def call_setup(self, ctx): """ Do stuff on the host to prepare the device for execution. 'ctx' is a LaunchContext or similar. This will get called (in dependency order, of - course) *either* before any entry point invocation, or before *each* - invocation, I'm not sure which yet. (For now it's "each".) + course) before each function invocation. """ + # I haven't found a good way to get outside context in for this method. + # As a result, this is usually just a check to see if some other + # necessary method has been called before trying to launch. + pass + + def call_teardown(self, ctx): + """ + As with ``call_setup``, but after a call and in reverse order. + """ + # Exceptions raised here will propagate from the invocation in Python, + # so this is a good place to do error checking. pass def instmethod(func): @@ -599,8 +609,6 @@ def instmethod(func): return classmethod(wrap) class PTXEntryPoint(PTXFragment): - # Human-readable entry point name - name = "" # Device code entry name entry_name = "" # List of (type, name) pairs for entry params, e.g. [('u32', 'thing')] @@ -615,28 +623,44 @@ class PTXEntryPoint(PTXFragment): """ raise NotImplementedError + def _call(self, ctx, func, *args, **kwargs): + """ + Override this if you need to change how a function is called. + """ + # TODO: global debugging / verbosity + print "Invoking PTX function '%s' on device" % self.entry_name + kwargs.setdefault('block', ctx.block) + kwargs.setdefault('grid', ctx.grid) + dtime = func(time_kernel=True, *args, **kwargs) + print "'%s' completed in %gs" % (self.entry_name, dtime) + @instmethod - def call(self, ctx): + def call(self, ctx, *args, **kwargs): """ - Calls the entry point on the device. Haven't worked out the details - of this one yet. + Calls the entry point on the device, performing any setup and teardown + needed. """ - pass + ctx.call_setup(self) + func = ctx.mod.get_function(self.entry_name) + self._call(ctx, func, *args, **kwargs) + return ctx.call_teardown(self) + +class PTXTestFailure(Exception): pass class PTXTest(PTXEntryPoint): - """PTXTests are semantically equivalent to PTXEntryPoints, but they - differ slightly in use. In particular: + """PTXTests are semantically equivalent to PTXEntryPoints, but they differ + slightly in the way they are invoked: - * The "name" property should describe the test being performed, - * ctx.stream will be synchronized before 'call' is run, and should be - synchronized afterwards (i.e. sync it yourself or don't use it), - * call() should return True to indicate that a test passed, or - False (or raise an exception) if it failed. + * The active context will be synchronized before each call, + * call_teardown() should raise ``PTXTestFailure`` if a test failed. + This exception will be caught and cleanup will be completed + (unless another exception is raised). """ pass class _PTXStdLib(PTXFragment): shortname = "std" + def __init__(self, block): # Only module that gets the privilege of seeing 'block' directly. self.block = block @@ -728,6 +752,7 @@ class PTXModule(object): insts, tests, all_deps, entry_deps = ( self.deptrace(block, entries, build_tests)) self.instances = insts + self.entry_deps = entry_deps self.tests = tests inject = dict(inject) diff --git a/cuburnlib/render.py b/cuburnlib/render.py index 290129a..0705d5d 100644 --- a/cuburnlib/render.py +++ b/cuburnlib/render.py @@ -130,7 +130,6 @@ class Animation(object): # TODO: allow animation-long override of certain parameters (size, etc) frame = Frame(self._frame, time) frame.upload_data(self.ctx, self.filters, time) - self.ctx.set_up() IterThread.call(self.ctx) return HistScatter.get_bins(self.ctx, self.features)