diff --git a/cuburnlib/cuda.py b/cuburnlib/cuda.py index d681045..d367a60 100644 --- a/cuburnlib/cuda.py +++ b/cuburnlib/cuda.py @@ -34,7 +34,6 @@ class LaunchContext(object): """ def __init__(self, entries, block=(1,1,1), grid=(1,1), seed=None, tests=False): - self.devinfo = pycuda.tools.DeviceData() self.entry_types = entries self.block, self.grid, self.build_tests = block, grid, tests self.rand = np.random.mtrand.RandomState(seed) diff --git a/cuburnlib/device_code.py b/cuburnlib/device_code.py index 439737b..49e8ad3 100644 --- a/cuburnlib/device_code.py +++ b/cuburnlib/device_code.py @@ -175,7 +175,6 @@ class MWCRNG(PTXFragment): states = np.array(ctx.rand.randint(1, 0xffffffff, size=2*ctx.threads), dtype=np.uint32) statedp, statel = ctx.mod.get_global('mwc_rng_state') - print states, len(states.tostring()) cuda.memcpy_htod_async(statedp, states.tostring()) self.threads_ready = ctx.threads @@ -184,7 +183,7 @@ class MWCRNG(PTXFragment): class MWCRNGTest(PTXTest): name = "MWC RNG sum-of-threads" - rounds = 10000 + rounds = 5000 entry_name = 'MWC_RNG_test' entry_params = '' @@ -227,7 +226,7 @@ class MWCRNGTest(PTXTest): fullstates = cuda.from_device(statedp, ctx.threads, np.uint64) sums = np.zeros(ctx.threads, np.uint64) - print "Running states forward %d rounds" % self.rounds + print "Running %d states forward %d rounds" % (len(mults), self.rounds) ctime = time.time() for i in range(self.rounds): states = fullstates & 0xffffffff @@ -241,7 +240,6 @@ class MWCRNGTest(PTXTest): dtime = func(block=ctx.block, grid=ctx.grid, time_kernel=True) print "Done on device, took %g seconds (%gx)" % (dtime, ctime/dtime) dfullstates = cuda.from_device(statedp, ctx.threads, np.uint64) - print dfullstates, fullstates if not (dfullstates == fullstates).all(): print "State discrepancy" print dfullstates @@ -250,7 +248,6 @@ class MWCRNGTest(PTXTest): sumdp, suml = ctx.mod.get_global('mwc_rng_test_sums') dsums = cuda.from_device(sumdp, ctx.threads, np.uint64) - print dsums, sums if not (dsums == sums).all(): print "Sum discrepancy" print dsums @@ -259,30 +256,7 @@ class MWCRNGTest(PTXTest): return True class CameraCoordTransform(PTXFragment): - # This is here until I get the device stream packer going, or decide on - # how to handle C struct addressing if we go for unpacked structures - prelude = ".global .u32 camera_coords[8];" - - def _cam_coord_xf(self, x, y, dreg): - """ - Given `.f32 x, y`, a coordinate in IFS space, writes the integer - offset from the start of the sampling lattice into `.u32 dreg`. - """ - - return """{ - .pred is_badval; - // TODO: This will change when data streaming is done - .reg .u32 camera_coord_address; - mov.u32 camera_coord_address, camera_coords; - // TODO: see if preloading everything hurts register count - .reg .f32 width_scale, width_upper_bound, height_scale, height_upper_bound; - ldu.v4.f32 {width_scale, width_upper_bound, - height_scale, height_upper_bound}, - [camera_coord_address+0]; - .reg .f32 x_xf, y_xf; - mad.rz.f32 x_xf, x, width_scale""" - # TODO unfinished - - + # TODO finish + pass diff --git a/cuburnlib/ptx.py b/cuburnlib/ptx.py index 7e67e37..437446c 100644 --- a/cuburnlib/ptx.py +++ b/cuburnlib/ptx.py @@ -51,7 +51,7 @@ from collections import namedtuple # reg.u32('hooray_reg') # load_zero(hooray_reg) # -# But using blocks alone to track names, it would turn in to this ugliness:: +# But using blocks alone to track names, it would turn in to this mess:: # # def load_zero(block, dest_reg): # block.op.mov.u32(block.op.dest_reg, 0) @@ -229,9 +229,6 @@ class _PTXFuncWrapper(object): func = types.FunctionType(self.func.func_code, newglobals, self.func.func_name, self.func.func_defaults, self.func.func_closure) - # TODO: if we generate a new dict every time, we can kill the - # _BlockInjector and move BI.inject() back to _Block, but I don't want - # to delete working code just yet with block.injector(func.func_globals): func(*args, **kwargs) @@ -348,37 +345,6 @@ class _RegFactory(_CallChain): self.block.code(op='.reg .' + type, vars=_softjoin(names, ', ')) [self.block.inject(r.name, r) for r in regs] -# Pending resolution of the op(regs, guard=x) debate -#class Pred(object): - #""" - #Allows for predicated execution of operations. - - #>>> pred('p_some_test p_another_test') - #>>> op.setp.eq.u32(p_some_test, reg1, reg2) - #>>> op.setp.and.eq.u32(p_another_test, reg1, reg2, p_some_test) - #>>> with p_some_test.is_set(): - #>>> op.ld.global.u32(reg1, addr(areg)) - - #Predication supports nested function calls, and will cover all code - #generated inside the predicate block: - - #>>> with p_another_test.is_unset(): - #>>> some_ptxdsl_function(reg2) - #>>> op.st.global.u32(addr(areg), reg2) - - #It is a syntax error to declare registers, - #However, multiple predicate blocks cannot be nested. Doing so is a syntax - #error. - - #>>> with p_some_test.is_set(): - #>>> with p_another_test.is_unset(): - #>>> pass - #SyntaxError: ... - #""" - #def __init__(self, name): - #self.name = name - #def is_set(self, isnot=False): - class Op(_CallChain): """ Performs an operation. @@ -470,7 +436,7 @@ class _MemFactory(_CallChain): class Label(object): """ - Specifies the target for a branch. Scoped in PTX? TODO: test that it is. + Specifies the target for a branch. >>> label('infinite_loop') >>> op.bra.uni('label') @@ -704,7 +670,6 @@ class PTXModule(object): self.__needs_recompilation = False self.assemble(block, all_deps, entry_deps) self.instances.pop(_PTXStdLib) - print self.instances if not formatter: formatter = PTXFormatter()