diff --git a/cuburn/cuda.py b/cuburn/cuda.py deleted file mode 100644 index 275394b..0000000 --- a/cuburn/cuda.py +++ /dev/null @@ -1,124 +0,0 @@ -# These imports are order-sensitive! -#import pyglet -#import pyglet.gl as gl -#gl.get_current_context() - -import pycuda.driver as cuda -from pycuda.compiler import SourceModule -import pycuda.tools -#import pycuda.gl as cudagl -#import pycuda.gl.autoinit -import pycuda.autoinit - -import numpy as np - -from cuburn.ptx import PTXFormatter - -class Module(object): - def __init__(self, entries): - self.entries = entries - self.source = self.compile(entries) - self.mod = self.assemble(self.source) - - @staticmethod - def compile(entries): - formatter = PTXFormatter() - for entry in entries: - entry.format_source(formatter) - return formatter.get_source() - - def assemble(self, src): - # TODO: make this a debugging option - with open('/tmp/cuburn.ptx', 'w') as f: f.write(src) - try: - mod = cuda.module_from_buffer(src, - [(cuda.jit_option.OPTIMIZATION_LEVEL, 0), - (cuda.jit_option.TARGET_FROM_CUCONTEXT, 1)]) - except (cuda.CompileError, cuda.RuntimeError), e: - # TODO: if output not written above, print different message - # TODO: read assembler output and recover Python source lines - print "Compile error. Source is at /tmp/cuburn.ptx" - print e - raise e - return mod - -class LaunchContext(object): - def __init__(self, entries, block=(1,1,1), grid=(1,1), tests=False): - self.entry_types = entries - self.block, self.grid, self.build_tests = block, grid, tests - self.setup_done = False - self.stream = cuda.Stream() - - @property - def nthreads(self): - return reduce(lambda a, b: a*b, self.block + self.grid) - - @property - def nctas(self): - return self.grid[0] * self.grid[1] - - @property - def threads_per_cta(self): - return self.block[0] * self.block[1] * self.block[2] - - @property - def warps_per_cta(self): - return self.threads_per_cta / 32 - - def compile(self, verbose=False, **kwargs): - kwargs['ctx'] = self - self.ptx = PTXModule(self.entry_types, kwargs, self.build_tests) - # TODO: make this optional and let user choose path - if verbose: - for entry in self.ptx.entries: - func = self.mod.get_function(entry.entry_name) - print "Compiled %s: used %d regs, %d sm, %d local" % ( - entry.entry_name, func.num_regs, - func.shared_size_bytes, func.local_size_bytes) - - def call_setup(self, entry_inst): - for inst in self.ptx.entry_deps[type(entry_inst)]: - inst.call_setup(self) - - def call_teardown(self, entry_inst): - okay = True - for inst in reversed(self.ptx.entry_deps[type(entry_inst)]): - if inst is entry_inst and isinstance(entry_inst, PTXTest): - try: - inst.call_teardown(self) - except PTXTestFailure, e: - print "\nTest %s FAILED!" % inst.entry_name - print "Reason:", e - print - okay = False - else: - inst.call_teardown(self) - return okay - - def run_tests(self): - if not self.ptx.tests: - print "No tests to run." - return True - all_okay = True - for test in self.ptx.tests: - cuda.Context.synchronize() - if test.call(self): - print "Test %s passed.\n" % test.entry_name - else: - all_okay = False - return all_okay - - def get_per_thread(self, name, dtype, shaped=False): - """ - Convenience function to get the contents of the global memory variable - ``name`` from the device as a numpy array of type ``dtype``, as might - be stored by _PTXStdLib.store_per_thread. If ``shaped`` is True, the - array will be 3D, as (cta_no, warp_no, lane_no). 
- """ - if shaped: - shape = (self.nctas, self.warps_per_cta, 32) - else: - shape = self.nthreads - dp, l = self.mod.get_global(name) - return cuda.from_device(dp, shape, dtype) - diff --git a/cuburn/device_code.py b/cuburn/device_code.py index 8523d87..a07045e 100644 --- a/cuburn/device_code.py +++ b/cuburn/device_code.py @@ -9,10 +9,10 @@ import struct import pycuda.driver as cuda import numpy as np -from cuburn.ptx import * +from pyptx import ptx, run from cuburn.variations import Variations -class IterThread(PTXEntryPoint): +class IterThread(object): entry_name = 'iter_thread' entry_params = [] @@ -23,7 +23,6 @@ class IterThread(PTXEntryPoint): return [MWCRNG, CPDataStream, HistScatter, Variations, ShufflePoints, Timeouter] - @ptx_func def module_setup(self): mem.global_.u32('g_cp_array', cp.stream_size*features.max_ntemporal_samples) @@ -34,7 +33,6 @@ class IterThread(PTXEntryPoint): mem.global_.u32('g_num_writes', ctx.nthreads) mem.global_.b32('g_whatever', ctx.nthreads) - @ptx_func def entry(self): # Index number of current CP, shared across CTA mem.shared.u32('s_cp_idx') @@ -205,7 +203,6 @@ class IterThread(PTXEntryPoint): std.store_per_thread(g_num_rounds, num_rounds, g_num_writes, num_writes) - @instmethod def upload_cp_stream(self, ctx, cp_stream, num_cps): cp_array_dp, cp_array_l = ctx.mod.get_global('g_cp_array') assert len(cp_stream) <= cp_array_l, "Stream too big!" @@ -250,12 +247,11 @@ class IterThread(PTXEntryPoint): cps_started = cuda.from_device(dp, 1, np.uint32) print "CPs started:", cps_started -class CameraTransform(PTXFragment): +class CameraTransform(object): shortname = 'camera' def deps(self): return [CPDataStream] - @ptx_func def rotate(self, rotated_x, rotated_y, x, y): """ Rotate an IFS-space coordinate as defined by the camera. @@ -293,7 +289,6 @@ class CameraTransform(PTXFragment): op.mov.f32(rotated_x, x) op.mov.f32(rotated_y, y) - @ptx_func def get_norm(self, norm_x, norm_y, x, y): """ Find the [0,1]-normalized floating-point histogram coordinates @@ -309,7 +304,6 @@ class CameraTransform(PTXFragment): cam_offset, 'cp.camera.norm_offset[1]') op.fma.f32(norm_y, norm_y, cam_scale, cam_offset) - @ptx_func def get_index(self, index, x, y, pred=None): """ Find the histogram index (as a u32) from the IFS spatial coordinate in @@ -349,7 +343,7 @@ class CameraTransform(PTXFragment): op.mad.lo.u32(index, index_y, features.hist_stride, index_x) op.mov.u32(index, 0xffffffff, ifnotp=pred) -class PaletteLookup(PTXFragment): +class PaletteLookup(object): shortname = "palette" # Resolution of texture on device. Bigger = more palette rez, maybe slower texheight = 16 @@ -360,11 +354,9 @@ class PaletteLookup(PTXFragment): def deps(self): return [CPDataStream] - @ptx_func def module_setup(self): mem.global_.texref('t_palette') - @ptx_func def look_up(self, r, g, b, a, color, norm_time, ifp): """ Look up the values of ``r, g, b, a`` corresponding to ``color_coord`` @@ -376,7 +368,6 @@ class PaletteLookup(PTXFragment): if features.non_box_temporal_filter: raise NotImplementedError("Non-box temporal filters not supported") - @instmethod def upload_palette(self, ctx, frame, cp_list): """ Extract the palette from the given list of interpolated CPs, and upload @@ -409,25 +400,22 @@ class PaletteLookup(PTXFragment): def call_setup(self, ctx): assert self.texref, "Must upload palette texture before launch!" 
-class HistScatter(PTXFragment): +class HistScatter(object): shortname = "hist" def deps(self): return [CPDataStream, CameraTransform, PaletteLookup] - @ptx_func def module_setup(self): mem.global_.f32('g_hist_bins', features.hist_height * features.hist_stride * 4) comment("Target to ensure fake local values get written") mem.global_.f32('g_hist_dummy') - @ptx_func def entry_setup(self): comment("Fake bins for fake scatter") mem.local.f32('l_scatter_fake_adr') mem.local.f32('l_scatter_fake_alpha') - @ptx_func def entry_teardown(self): with block("Store fake histogram bins to dummy global"): reg.b32('hist_dummy') @@ -436,7 +424,6 @@ class HistScatter(PTXFragment): op.ld.local.b32(hist_dummy, addr(l_scatter_fake_alpha)) op.st.volatile.b32(addr(g_hist_dummy), hist_dummy) - @ptx_func def scatter(self, hist_index, color, xf_idx, p_valid, type='ldst'): """ Scatter the given point directly to the histogram bins. I think this @@ -479,7 +466,6 @@ class HistScatter(PTXFragment): hist_bins_dp, hist_bins_l = ctx.mod.get_global('g_hist_bins') cuda.memset_d32(hist_bins_dp, 0, hist_bins_l/4) - @instmethod def get_bins(self, ctx, features): hist_bins_dp, hist_bins_l = ctx.mod.get_global('g_hist_bins') return cuda.from_device(hist_bins_dp, @@ -487,18 +473,16 @@ class HistScatter(PTXFragment): dtype=np.float32) -class ShufflePoints(PTXFragment): +class ShufflePoints(object): """ Shuffle points in shared memory. See helpers/shuf.py for details. """ shortname = "shuf" - @ptx_func def module_setup(self): # TODO: if needed, merge this shared memory block with others mem.shared.f32('s_shuf_data', ctx.threads_per_cta) - @ptx_func def shuffle(self, *args, **kwargs): """ Shuffle the data from each register in args across threads. Keyword @@ -523,57 +507,71 @@ class ShufflePoints(PTXFragment): op.bar.sync(bar) op.ld.volatile.shared.b32(var, addr(shuf_read)) + class MWCRNG(object): - def __init__(self, entry, seed=None): + """ + Marsaglia multiply-with-carry random number generator. Produces very long + periods with sufficient statistical properties using only three 32-bit + state registers. Since each thread uses a separate multiplier, no two + threads will ever be on the same sequence, but beyond this the independence + of each thread's sequence was not explicitly tested. + + The RNG must be seeded at least once per entry point using the ``seed`` + method. 
+ """ + def __init__(self, entry): # TODO: install this in data directory or something if not os.path.isfile('primes.bin'): raise EnvironmentError('primes.bin not found') - self.threads_ready = 0 + self.nthreads_ready = 0 self.mults, self.state = None, None - self.entry = entry - entry.add_param('mwc_mults', entry.types.u32) - entry.add_param('mwc_states', entry.types.u32) - r, o = entry.regs, entry.ops - with entry.head as e: - #mwc_mult_addr = gtid * 4 + e.params.mwc_mults - gtid = o.mad.lo(e.special.ctaid_x, ctx.threads_per_cta, - e.special.tid_x) - mwc_mult_addr = o.mad.lo.u32(gtid, 4, e.params.mwc_mults) - r.mwc_mult = o.load.u32(mwc_mult_addr) - mwc_state_addr = o.mad.lo.u32(gtid, 8, e.params.mwc_states) - r.mwc_state, r.mwc_carry = o.load.u64(mwc_state_addr) - with entry.tail as e: - #gtid = e.special.ctaid_x * ctx.threads_per_cta + e.special.tid_x - gtid = o.mad.lo(e.special.ctaid_x, ctx.threads_per_cta, - e.special.tid_x) - mwc_state_addr = o.mad.lo.u32(gtid, 8, e.params.mwc_states) - o.store.v2(mwc_state_addr, (r.mwc_state, r.mwc_carry)) + entry.add_ptr_param('mwc_mults', 'u32') + entry.add_ptr_param('mwc_states', 'u32') - def next_b32(self): - e, r, o = self.entry, self.entry.regs, self.entry.ops - mwc_out = o.cvt.u64(r.mwc_carry) - mwc_out = o.mad.wide.u32(r.mwc_mult, r.mwc_state, mwc_out) - r.mwc_state, r.mwc_carry = o.mov(mwc_out) + with entry.head(): + self.entry_head(entry) + entry.tail_callback(self.entry_tail, entry) + + def entry_head(self, entry): + e, r, o, m, p, s = entry.locals + gtid = s.ctaid_x * s.ntid_x + s.tid_x + r.mwc_mult, r.mwc_state, r.mwc_carry = r.u32(), r.u32(), r.u32() + r.mwc_mult = o.ld(p.mwc_mults[gtid]) + r.mwc_state, r.mwc_carry = o.ld.v2(p.mwc_states[2*gtid]) + + def entry_tail(self, entry): + e, r, o, m, p, s = entry.locals + gtid = s.ctaid_x * s.ntid_x + s.tid_x + o.st.v2.u32(p.mwc_states[2*gtid], r.mwc_state, r.mwc_carry) + + def next_b32(self, entry): + e, r, o, m, p, s = entry.locals + carry = o.cvt.u64(r.mwc_carry) + mwc_out = o.mad.wide(r.mwc_mult, r.mwc_state, carry) + r.mwc_state, r.mwc_carry = o.split.v2(mwc_out) return r.mwc_state - def next_f32_01(self): - e, r, o = self.entry, self.entry.regs, self.entry.ops + def next_f32_01(self, entry): + e, r, o, m, p, s = entry.locals mwc_float = o.cvt.rn.f32.u32(self.next_b32()) - # TODO: check the precision on the uploaded types here return o.mul.f32(mwc_float, 1./(1<<32)) - def next_f32_11(self): - e, r, o = self.entry, self.entry.regs, self.entry.ops + def next_f32_11(self, entry): + e, r, o, m, p, s = entry.locals mwc_float = o.cvt.rn.f32.s32(self.next_b32()) return o.mul.f32(mwc_float, 1./(1<<31)) - def call_setup(self, ctx, force=False): + def seed(self, ctx, seed=None, force=False): """ Seed the random number generators with values taken from a ``np.random`` instance. """ if force or self.nthreads_ready < ctx.nthreads: + if seed: + rand = np.random.RandomState(seed) + else: + rand = np.random # Load raw big-endian u32 multipliers from primes.bin. with open('primes.bin') as primefp: dt = np.dtype(np.uint32).newbyteorder('B') @@ -582,73 +580,83 @@ class MWCRNG(object): # have longer periods, which is also good. This is a compromise. 
mults = np.array(mults[:ctx.nthreads*4]) rand.shuffle(mults) - locked_mults = ctx.hostpool.allocate(ctx.nthreads, np.uint32) - locked_mults[:] = mults[ctx.nthreads] - self.mults = ctx.pool.allocate(4*ctx.nthreads) - cuda.memcpy_htod_async(self.mults, locked_mults.base, ctx.stream) + #locked_mults = ctx.hostpool.allocate(ctx.nthreads, np.uint32) + #locked_mults[:] = mults[ctx.nthreads] + #self.mults = ctx.pool.allocate(4*ctx.nthreads) + #cuda.memcpy_htod_async(self.mults, locked_mults.base, ctx.stream) + self.mults = cuda.mem_alloc(4*ctx.nthreads) + cuda.memcpy_htod(self.mults, mults[:ctx.nthreads].tostring()) # Intentionally excludes both 0 and (2^32-1), as they can lead to # degenerate sequences of period 0 states = np.array(rand.randint(1, 0xffffffff, size=2*ctx.nthreads), dtype=np.uint32) - locked_states = ctx.hostpool.allocate(2*ctx.nthreads, np.uint32) - locked_states[:] = states - self.states = ctx.pool.allocate(8*ctx.nthreads) - cuda.memcpy_htod_async(self.states, locked_states, ctx.stream) + #locked_states = ctx.hostpool.allocate(2*ctx.nthreads, np.uint32) + #locked_states[:] = states + #self.states = ctx.pool.allocate(8*ctx.nthreads) + #cuda.memcpy_htod_async(self.states, locked_states, ctx.stream) + self.states = cuda.mem_alloc(8*ctx.nthreads) + cuda.memcpy_htod(self.states, states.tostring()) self.nthreads_ready = ctx.nthreads ctx.set_param('mwc_mults', self.mults) ctx.set_param('mwc_states', self.states) -class MWCRNGTest(PTXEntry): +class MWCRNGTest(object): + """ + Test the ``MWCRNG`` class. This is not a test of the generator's + statistical properties, but merely a test that the generator is implemented + correctly on the GPU. + """ rounds = 5000 def __init__(self, entry): - self.entry = entry self.mwc = MWCRNG(entry) + entry.add_ptr_param('mwc_test_sums', 'u64') - entry.add_param('mwc_test_sums', entry.types.u32) with entry.body(): - self.entry_body() + self.entry_body(entry) - def entry_body(self): - e, r, o = self.entry, self.entry.regs, self.entry.ops + def entry_body(self, entry): + e, r, o, m, p, s = entry.locals + r.sum = r.u64(0) + r.count = r.f32(self.rounds) + start = e.label() + r.sum = r.sum + o.cvt.u64.u32(self.mwc.next_b32(e)) + r.count = r.count - 1 + with r.count > 0: + o.bra.uni(start) + e.comment('yay') + gtid = s.ctaid_x * s.ntid_x + s.tid_x + o.st(p.mwc_test_sums[gtid], r.sum) - r.sum = 0 - with e.std.loop(self.rounds) as mwc_rng_sum: - addend = o.cvt.u64.u32(self.mwc.next_b32()) - r.sum = o.add.u64(r.sum, addend) + def run_test(self, ctx): + self.mwc.seed(ctx) + mults = cuda.from_device(self.mwc.mults, ctx.nthreads, np.uint32) + states = cuda.from_device(self.mwc.states, ctx.nthreads, np.uint64) - e.std.store_per_thread(e.params.mwc_test_sums, r.sum) - - def call(self, ctx): - # Generate current state, upload it to GPU - self.mwc.call_setup(ctx, force=True) - mults, fullstates = self.mwc.mults, self.mwc.fullstates - sums = np.zeros_like(fullstates) - - # Run two trials, to ensure device state is getting saved properly for trial in range(2): print "Trial %d, on CPU: " % trial, + sums = np.zeros_like(states) ctime = time.time() for i in range(self.rounds): - states = fullstates & 0xffffffff - carries = fullstates >> 32 - fullstates = self.mults * states + carries - sums += fullstates & 0xffffffff + vals = states & 0xffffffff + carries = states >> 32 + states = mults * vals + carries + sums += states & 0xffffffff ctime = time.time() - ctime print "Took %g seconds." 
% ctime print "Trial %d, on device: " % trial, - dsums = np.empty_like(sums) - ctx.set_param('mwc_test_sums', cuda.Out(dsums)) - print "Took %g seconds." % ctx.call() - + dsums = cuda.mem_alloc(8*ctx.nthreads) + ctx.set_param('mwc_test_sums', dsums) + print "Took %g seconds." % ctx.call_timed() + print ctx.nthreads + dsums = cuda.from_device(dsums, ctx.nthreads, np.uint64) if not np.all(np.equal(sums, dsums)): print "Sum discrepancy!" print sums print dsums - raise TODOSomeKindOfException() -class MWCRNGFloatsTest(PTXTest): +class MWCRNGFloatsTest(object): """ Note this only tests that the distributions are in the correct range, *not* that they have good random properties. MWC is a suitable algorithm, but @@ -660,7 +668,6 @@ class MWCRNGFloatsTest(PTXTest): def deps(self): return [MWCRNG] - @ptx_func def module_setup(self): mem.global_.f32('mwc_rng_float_01_test_sums', ctx.nthreads) mem.global_.f32('mwc_rng_float_01_test_mins', ctx.nthreads) @@ -669,7 +676,6 @@ class MWCRNGFloatsTest(PTXTest): mem.global_.f32('mwc_rng_float_11_test_mins', ctx.nthreads) mem.global_.f32('mwc_rng_float_11_test_maxs', ctx.nthreads) - @ptx_func def loop(self, kind): with block('Sum %d floats in %s' % (self.rounds, kind)): reg.f32('loopct val rsum rmin rmax') @@ -691,7 +697,6 @@ class MWCRNGFloatsTest(PTXTest): 'mwc_rng_float_%s_test_mins' % kind, rmin, 'mwc_rng_float_%s_test_maxs' % kind, rmax) - @ptx_func def entry(self): self.loop('01') self.loop('11') @@ -721,15 +726,14 @@ class MWCRNGFloatsTest(PTXTest): raise PTXTestFailure("%s %s %g violates hard limit %g" % (fkind, rkind, lim(vals), exp)) -class CPDataStream(DataStream): +class CPDataStream(object): """DataStream which stores the control points.""" shortname = 'cp' -class Timeouter(PTXFragment): +class Timeouter(object): """Time-out infinite loops so that data can still be retrieved.""" shortname = 'timeout' - @ptx_func def entry_setup(self): mem.shared.u64('s_timeouter_start_time') with block("Load start time for this block"): @@ -737,7 +741,6 @@ class Timeouter(PTXFragment): op.mov.u64(now, '%clock64') op.st.shared.u64(addr(s_timeouter_start_time), now) - @ptx_func def check_time(self, secs): """ Drop this into your mainloop somewhere. diff --git a/cuburn/ptx.py b/cuburn/ptx.py deleted file mode 100644 index 6d81856..0000000 --- a/cuburn/ptx.py +++ /dev/null @@ -1,913 +0,0 @@ -""" -PTX DSL, a domain-specific language for NVIDIA's PTX. -""" - -# If you see 'import inspect', you know you're in for a good time -import inspect -import struct -from cStringIO import StringIO -from collections import namedtuple -from math import * - -import numpy as np -import pycuda.driver as cuda - -from pprint import pprint - -PTX_VERSION=(2, 1) - -Type = namedtuple('Type', 'name kind bits bytes') -TYPES = {} -for kind in 'busf': - for width in [8, 16, 32, 64]: - TYPES[kind+str(width)] = Type(kind+str(width), kind, width, width / 8) -del TYPES['f8'] -TYPES['pred'] = Type('pred', 'p', 0, 0) - -class Statement(object): - """ - Representation of a PTX statement. 
- """ - known_opnames = ('add addc sub subc mul mad mul24 mad24 sad div rem abs ' - 'neg min max popc clz bfind brev bfe bfi prmt testp copysign rcp ' - 'sqrt rsqrt sin cos lg2 ex2 set setp selp slct and or xor not ' - 'cnot shl shr mov ld ldu st prefetch prefetchu isspacep cvta cvt ' - 'tex txq suld sust sured suq bra call ret exit bar membar atom red ' - 'vote vadd vsub vabsdiff vmin vmax vshl vshr vmad vset').split() - - def __init__(self, name, args, line_info = None): - self.opname = name - self.fullname, self.operands, self.rtype = self.parse(name, args) - self.result = None - self.python_line = line_info - self.ptx_line = None - - @staticmethod - def parse(name, args): - """ - Parses and expands a (possibly incomplete) PTX statement, returning the - complete operation name and destination register type. - - ``name`` is a list of the parts of the operation name (as would be - given by ``'add.u32'.split()``, for example). - ``args`` is a list of the arguments to the operation, excluding the - destination register. - - Returns a 3-tuple of ``(fullname, args, rtype)``, where ``fullname`` is - the fully-expanded name of the operation, ``args`` is the list of - arguments with all untyped values converted to ``Immediate`` values of - the appropriate type, and ``type`` is the expected result type of the - statement. If the statement does not have a destination register, - ``type`` will be None. - """ - # TODO: .ftz insertion - - if name[0] in 'tex txq suld sust sured suq call'.split(): - raise NotImplementedError("No support for %s yet" % name[0]) - - # Make sure we don't modify the caller's list/tuple - name, args = list(name), list(args) - - # Six constants that just have to be unique from each other - # 'stype', 'dtype', 'ignore', 'u32', 'pred', 'memory' - ST, DT, IG, U3, PR, ME = range(6) - - if name[0] in ('add addc sub subc mul mul24 div rem min max and or ' - 'xor not cnot copysign').split(): - atypes = [ST, ST] - elif name[0] in ('abs neg popc clz bfind brev testp rcp sqrt rsqrt sin ' - 'cos lg2 ex2 mov cvt cvta isspacep split').split(): - atypes = [ST] - elif name[0] == 'mad' and name[1] == 'wide': - atypes = [ST, ST, DT] - elif name[0] in 'mad mad24 sad'.split(): - atypes = [ST, ST, ST] - elif name[0] == 'bfe': - atypes = [ST, U3, U3] - elif name[0] == 'bfi': - atypes = [ST, ST, U3, U3] - elif name[0] == 'prmt': - atypes = [U3, U3, U3] - elif name[0] in 'ld ldu prefetch prefetchu': - atypes = [ME] - elif name[0] == 'st': - atypes = [ME, ST] - elif name[0] in 'set setp selp'.split(): - atypes = [ST, ST, IG] - elif name[0] == 'slct': - atypes = [DT, DT, ST] - elif name[0] in ('shl', 'shr'): - atypes = [ST, U3] - elif name[0] in ('atom', 'red'): - if name[1] == 'cas': - atypes = [ME, ST, ST] - else: - atypes = [ME, ST] - elif name[0] in 'ret exit membar'.split(): - atypes = [] - elif name[0] == 'vote': - atypes = [PR] - elif name[0] in 'bar': - atypes = [U3, IG, IG] - else: - raise NotImplementedError("Don't recognize the %s statement. " - "If you think this is a bug, and it may well be, please " - "report it!" 
% name[0]) - - if (len(args) < len(filter(lambda t: t != IG, atypes)) or - len(args) > len(atypes)): - print args - print atypes - raise ValueError("Incorrect number of args for '%s'" % name[0]) - - stype, dtype = None, None - did_inference = False - - if isinstance(args[0], Pointer): - # Get stype from pointer (explicit stype overrides this) - if name[0] in 'ld ldu st'.split(): - stype = args[0].dtype - did_inference = True - # Get sspace from pointer if missing - if name[0] in 'ld ldu st prefetch atom red'.split(): - sspos = 2 if len(name) > 1 and name[1] == 'volatile' else 1 - if (len(name) <= sspos or name[sspos] not in - 'global local shared param const'.split()): - name.insert(sspos, args[0].sspace) - - # These instructions lack an stype suffix - if name[0] in ('prmt prefetch prefetchu isspacep bra ret exit membar ' - 'bar vote'.split()): - # False (as opposed to None) prevents stype inference attempt - stype = False - else: - # These instructions require a dtype - if name[0] in 'set slct cvt': - if name[-1] not in TYPES: - raise SyntaxError("'%s' requires a dtype." % name[0]) - if name[-2] in TYPES: - dtype, stype = TYPES[name[-2]], TYPES[name[-1]] - else: - dtype = TYPES[name[-1]] - else: - if name[-1] in TYPES: - stype = TYPES[name[-1]] - did_inference = False - - # stype wasn't explicitly set, try to infer it from the arguments - if stype is None: - maybe_typed = [a for a, t in zip(args, atypes) if t == ST] - types = [a.type for a in maybe_typed if isinstance(a, Register)] - if not types: - raise TypeError("Not enough information to infer type. " - "Explicitly specify the source argument type.") - stype = types[0] - did_inference = True - - if did_inference: - name.append(stype.name) - - # These instructions require a 'b32'-type argument, despite working - # on u32 and s32 types just fine, so change the name but not stype - if name[0] in 'popc clz bfind brev bfi and or xor not cnot shl'.split(): - name[-1] = 'b' + name[1:] - - # Calculate destination type (may influence some args too) - if (name[0] in 'popc clz bfind prmt'.split() or - name[:3] == ['bar', 'red', 'popc'] or - name[:2] == ['vote', 'ballot']): - dtype = TYPES['u32'] - elif (name[0] in 'testp setp isspacep vote'.split() or - name[:2] == ['bar', 'red']): - dtype = TYPES['pred'] - elif (name[0] in 'st prefetch prefetchu bra ret exit bar membar ' - 'red'.split()): - dtype = None - elif name[0] in ('mul', 'mad') and name[1] == 'wide': - dtype = TYPES[stype.kind + str(2*stype.bits)] - elif dtype is None: - dtype = stype - - atype_dict = {ST: stype, DT: dtype, U3: TYPES['u32']} - - # Wrap any untyped immediates - for idx, arg in enumerate(args): - if not isinstance(arg, Register): - t = atype_dict.get(atypes[idx]) - args[idx] = Immediate(None, t, arg) - - if did_inference: - for i, (arg, atype) in enumerate(zip(args, atypes)): - if atype in atype_dict and arg.type != atype_dict[atype]: - raise TypeError("Arg %d differs from expected type %s. " - "If this is intentional, explicitly specify the " - "source argument type." % (i, atype.name)) - if name[0] in 'ld ldu st red atom'.split(): - if (isinstance(args[0], Pointer) and - args[0].dtype.bits != stype.bits): - raise TypeError("The inferred type %s differs in size " - "from the referent's type %s. If this is intentional, " - "explicitly specify the source argument type." % - (stype.name, args[0].dtype.name)) - - return name, tuple(args), dtype - -class Register(object): - """ - The workhorse. 
- """ - def __init__(self, entry, type): - self.entry, self.type = entry, type - # Ordinary register naming / lifetime tracking - self.name, self.inferred_name, self.rebound_to = None, None, None - # Immediate value binding and other non-user-exposed hackery - self._ptx = None - - def _set_val(self, val): - if not isinstance(val, Register): - val = Immediate(self.entry, self.type, val) - self.entry.add_rebinding(self, val) - val = property(lambda s: s, _set_val) - def __repr__(self): - s = super(Register, self).__repr__()[:-1] - return s + ': type=%s, name=%s, inferred_name=%s>' % ( - self.type.name, self.name, self.inferred_name) - def get_name(self): - if self._ptx is not None: - return str(self._ptx) - if self.rebound_to: - return self.rebound_to.get_name() - return self.name or self.inferred_name - - def _infer_name(self, depth=2): - """ - To produce more readable code, this method reaches in to the stack and - tries to find the name of this register in the calling method's locals. - If a register is still unbound at code generation time, this name will - be preferred over a meaningless ``rXX``-style identifier. - - This best-guess effort should have absolutely no semantic impact on the - generated PTX, and is only here for readability, so we don't sweat the - potential corner cases associated with it. - - ``depth`` is the index of the relevant frame in this function's stack. - """ - if self.inferred_name is None: - frame = inspect.stack()[depth][0] - for key, val in frame.f_locals.items(): - if self is val: - self.inferred_name = key - break - -class Pointer(Register): - """ - A register which knows (in Python, at least) the type, state space, and - address of a datum in memory. - """ - # TODO: use u64 as type if device has >=4GB of memory - ptr_type = TYPES['u32'] - def __init__(self, entry, sspace, dtype): - super(Pointer, self).__init__(entry, self.ptr_type) - self.sspace, self.dtype = sspace, dtype - -class Immediate(Register): - """ - An Immediate is the DSL's way of storing PTX immediate values. It differs - from a Register in two respects: - - - A non-Register value can be assigned to the ``val`` property (or passed - to ``__init__``). If the value is an int or float, it will be coerced to - follow PTX's strict parsing rules for the type of the ``Immediate``; - otherwise, it'll simply be coerced to ``str`` and pasted in the PTX. - - - The ``type`` can be None, which disables all coercion and introspection. - This is practical for labels and the like. - """ - def __init__(self, entry, type, val=None): - super(Immediate, self).__init__(entry, type) - self.val = val - def _set_val(self, val): - self._ptx = self.coerce(self.type, val) - val = property(lambda s: s._ptx, _set_val) - def __repr__(self): - return object.__repr__(self)[:-1] + ': type=%s, value=%s>' % ( - self.type.name, self._ptx) - @staticmethod - def coerce(type, val): - if type is None or not isinstance(val, (int, long, float)): - return val - if type.kind == 'u' and val < 0: - raise ValueError("Can't convert (< 0) val to unsigned") - # Maybe more later? - if type.kind in 'us': - return int(val) - if type.kind in 'f': - return float(val) - raise TypeError("Immediates not supported for type %s" % type.name) - -class Regs(object): - """ - The ``entry.regs`` object to which Registers are bound. 
- """ - def __init__(self, entry): - self.__dict__['_entry'] = entry - self.__dict__['_named_regs'] = dict() - def __create_register_func(self, type): - def f(*args, **kwargs): - return self._entry.create_register(type, *args, **kwargs) - return f - def __getattr__(self, name): - if name in TYPES: - return self.__create_register_func(TYPES[name]) - if name in self._named_regs: - return self._named_regs[name] - raise KeyError("Unrecognized register name %s" % name) - def __setattr__(self, name, val): - if name in self._named_regs: - self._named_regs[name].val = val - else: - if isinstance(val, Register): - assert val in self._entry._regs, "Reg from nowhere!" - val.name = name - self._named_regs[name] = val - else: - raise TypeError("What Is This %s You Have Given Me" % val) - - -class Memory(object): - """ - Memory objects reference device memory and and provide a convenient - shorthand for address calculations. - - The base address of a memory location may be retreived from the ``addr`` - property as a ``Pointer`` for manual address calculations. - - Somewhat more automatic address calculations can be performed using Python - bracket notation:: - - >>> r1 = o.ld(m.something[r2]) - >>> o.st(m.something[2*r2], r1) - - If the value passed in the brackets is u32, it will *not* be coerced to - u64 until being added to the base pointer. To access arrays that are more - than 4GB in size, you must coerce the input type to u64 yourself. - - Currently, all steps in an address calculation are performed for each - access, and so for inner loops manual address calculation (or simply saving - the resulting register for reuse in the next memory operation) may be more - efficient. Once the register lifetime profiler is complete, that behavior - may change. - """ - def __init__(self, entry, space, type, name): - self.entry, self.space, self.type, self.name = entry, space, type, name - @property - def addr(self): - ptr = Pointer(self.entry, self.space, self.type) - ptr._ptx = self.name - def __getitem__(self, key): - # TODO: make this multi-type-safe, perform strength reduction/precalc - ptr = Pointer(self.entry, self.space, self.type) - self.entry.add_stmt(['mad', 'lo', 'u32'], key, self.type.bytes, - self.addr, result=ptr) - return ptr - -class PtrParam(Memory): - """ - Entry parameters which contain pointers to memory locations, as created - through ``entry.add_ptr_param()``, use this type to hide the address load - from parameter space. - """ - # TODO: this assumes u32 addresses, which won't be true for long - @property - def addr(self): - ptr = Pointer(self.entry, self.space, self.type) - self.entry.add_stmt(['ld', 'param', ptr.type.name], - self.name, result=ptr) - return ptr - -class Params(object): - """ - The ``entry.params`` object to which parameters are bound. 
- """ - def __init__(self, entry): - # Boy this 'everything references entry` thing has gotten old - self.entry = entry - def __getattr__(self, name): - if name not in self.entry._params: - raise KeyError("Did not recognize parameter name.") - param = self.entry._params[name] - if isinstance(param, PtrParam): - return param - return self.entry.ops.ld(param.addr) - -class _DotNameHelper(object): - def __init__(self, callback, name = ()): - self.__callback = callback - self.__name = name - def __getattr__(self, name): - return _DotNameHelper(self.__callback, self.__name + (name,)) - def __call__(self, *args, **kwargs): - return self.__callback(self.__name, *args, **kwargs) - -RegUse = namedtuple('RegUse', 'src dst') -Rebinding = namedtuple('Rebinding', 'dst src') - -class Entry(object): - """ - Manager extraordinaire. - - TODO: document this. - """ - - def __init__(self, name, block_width, block_height=1, block_depth=1): - self.name = name - self.block = (block_width, block_height, block_depth) - self.threads_per_cta = block_width * block_height - self.body_seen = False - self.tail_cbs = [] - self.identifiers = set() - - self.ops = _DotNameHelper(self.add_stmt) - self._stmts = [] - self._labels = [] - self.regs = Regs(self) - self._regs = {} - - # Intended to be read by the ``params`` object below - self._params = {} - self.params = Params(self) - - def __enter__(self): - # May do more later - pass - - def __exit__(self, etype, eval, tb): - # May do more later - pass - - def add_stmt(self, name, *operands, **kwargs): - stmt = Statement(name, operands) - idx = len(self._stmts) - for operand in stmt.operands: - operand._infer_name(2) - use = self._regs.setdefault(operand, RegUse([], [])) - use.src.append(idx) - if stmt.rtype is not None: - result = kwargs.pop('result', None) - if result: - assert result.type == stmt.rtype, "Internal type error" - else: - result = Register(self, stmt.rtype) - stmt.result = result - self._regs[result] = RegUse(src=[], dst=[idx]) - if kwargs: - raise KeyError("Unrecognized keyword arguments: %s" % kwargs) - self._stmts.append(stmt) - return stmt.result - - def add_rebinding(self, dst, src): - idx = len(self._stmts) - self._regs[dst].dst.append(idx) - if not isinstance(src, Immediate): - self._regs[src].src.append(idx) - self._stmts.append(Rebinding(dst, src)) - - def create_register(self, type, initial=None): - r = Register(self, type) - self._regs[r] = RegUse([], []) - if initial: - r.val = initial - return r - - def head(self): - """ - Top-level code segment that will be placed at the start of the entry. - Useful for initialization of memory or registers by types that do - not implement an entry point themselves. - """ - # This may do more later - return self - - def body(self): - """ - Top-level code segment representing the body of the entry point. - """ - # This may do more later - assert not self.body_seen, "Only one body per entry allowed." - self.body_seen = True - return self - - def tail_callback(self, cb, *args, **kwargs): - """ - Registers a tail callback function. After the body segment is complete, - the tail callbacks will be called in reverse, such that each head/tail - pair nests in dependency order. - - Any arguments to this function will be passed to the callback. - """ - self.tail_cbs.append((cb, args, kwargs)) - - def add_param(self, ptype, name): - """ - Adds a parameter to this entry. ``type`` and ``name`` are strings. 
- """ - if ptype not in TYPES: - raise TypeError("Unrecognized PTX type name.") - self._params[name] = Memory(self, 'param', TYPES[ptype], name) - - def add_ptr_param(self, name, mtype): - """ - Adds a parameter to this entry which points to a location in global - memory. The resulting property of ``entry.params`` will be a - ``PtrParam`` for convenient access. - - ``name`` is the param name, and ``mtype`` is the base type of the - memory location being pointed to. The actual type of the pointer will - be chosen based on the amount of addressable memory on the device. - """ - if mtype not in TYPES: - raise TypeError("Unrecognized PTX type name.") - # TODO: add pointer size heuristic - self._params[name] = PtrParam(self, 'global', TYPES[mtype], name) - - def finalize(self): - """ - This method runs the tail callbacks and performs any introspection - necessary prior to emitting PTX. - """ - assert self.tail_cbs is not None, "Cannot finalize more than once!" - for cb, args, kwargs in reversed(self.tail_cbs): - cb(*args, **kwargs) - self.tail_cbs = None - - # This loop verifies rebinding of floating registers to named ones. - # If all of the conditions below are met, the src register's name will - # be allowed to match the dst register; otherwise, the src's value - # will be copied to the dst's with a ``mov`` instruction - for idx, stmt in enumerate(self._stmts): - if not isinstance(stmt, Rebinding): continue - dst, src = stmt - # src must be floating reg, not immediate or bound reg - # Examples: - # r.a = r.u32(4) - # b = r.u32(r.a) - move = isinstance(src, Immediate) or src.name is not None - # dst cannot be used between src's originating expression and - # the rebinding itself - # Example 1: - # r.a, r.b = r.u32(1), r.u32(1) - # x = o.add(r.a, r.b) - # r.b = o.add(r.a, x) - # r.a = x - # Example 2: - # r.a, r.b = r.u32(1), r.u32(1) - # label('start') - # x = o.add(r.a, r.b) - # y = o.add(r.a, x) - # r.a = x - # r.b = y - # bra.uni('start') - # TODO: incorporate branch tracking - if not move: - for oidx in (self._regs[dst].src + self._regs[dst].dst): - if oidx > self._regs[src].dst[0] and oidx < idx: - move = True - if move: - src.rebound_to = None - stmt = Statement(('mov',), (src,)) - stmt.result = dst - self._stmts[idx] = stmt - - # Identify all uses of registers by name in the program - bound = dict([(t, set()) for t in TYPES.values()]) - free = dict([(t, {}) for t in TYPES.values()]) - for stmt in self._stmts: - if isinstance(stmt, Rebinding): - regs = [stmt.src, stmt.dst] - else: - regs = filter(lambda r: r and not isinstance(r, Immediate), - (stmt.result,) + stmt.operands) - for reg in regs: - if reg.name: - bound[reg.type].add(reg.name) - else: - rl = free[reg.type].setdefault(reg.inferred_name, []) - if reg not in rl: - rl.append(reg) - - # Store the data required for register declarations - self.bound = bound - self.temporary = {} - - # Generate names for all unbound registers - # TODO: include memory, label, instr identifiers in this list - identifiers = set() - map(identifiers.update, bound.values()) - used_bases = set([i.rstrip('1234567890') for i in identifiers]) - for t, inames in free.items(): - for ibase, regs in inames.items(): - if ibase is None: - ibase = t.name + '_' - while ibase in used_bases: - ibase = ibase + '_' - trl = self.temporary.setdefault(t, []) - trl.append('%s<%d>' % (ibase, len(regs))) - for i, reg in enumerate(regs): - reg.name = ibase + str(i) - - def format_source(self, formatter): - assert self.tail_cbs is None, "Must finalize entry before formatting" 
- params = [v for k, v in sorted(self._params.items())] - formatter.entry_start(self.name, params, reqntid=self.block) - [formatter.regs(t, r) for t, r in sorted(self.bound.items()) if r] - formatter.comment("Temporary registers") - [formatter.regs(t, r) for t, r in sorted(self.temporary.items()) if r] - formatter.blank() - - for stmt in self._stmts: - if isinstance(stmt, Statement): - stmt.ptx_line = formatter.stmt(stmt) - formatter.entry_end() - - -class PTXFormatter(object): - def __init__(self, ptxver=PTX_VERSION, target='sm_21'): - self.indent_level = 0 - self.lines = ['.version %d.%d' % ptxver, '.target %s' % target] - - def blank(self): - self.lines.append('') - - def comment(self, text): - self.lines.append(' ' * self.indent_level + '// ' + text) - - def regs(self, type, names): - # TODO: indenting, length limits, etc. - self.lines.append(' ' * self.indent_level + '.reg .%s ' % (type.name) + - ', '.join(sorted(names)) + ';') - - def stmt(self, stmt): - res = ('%s, ' % stmt.result.get_name()) if stmt.result else '' - args = [o.get_name() for o in stmt.operands] - # Wrap the arg in brackets if needed (no good place to put this) - if stmt.fullname[0] in ('ld ldu st prefetch prefetchu isspacep ' - 'atom red'.split()): - args[0] = '[%s]' % args[0] - - self.lines.append(''.join([' ' * self.indent_level, - '%-12s ' % '.'.join(stmt.fullname), res, ', '.join(args), ';'])) - return len(self.lines) - - def entry_start(self, name, params, **directives): - """ - Define the start of an entry point. ``name`` and ``params`` should be - obvious, ``directives`` is a dictionary of performance tuning directive - strings. As a special case, if a ``directive`` value is a tuple, it - will be converted to a comma-separated string. - """ - for k, v in directives.items(): - if isinstance(v, tuple): - directives[k] = ','.join(map(str, v)) - dstr = ' '.join(['.%s %s' % i for i in directives.items()]) - # TODO: support full param options like alignment and array decls - # (base the param type off a memory type) - pstrs = ['.param .%s %s' % (p.type.name, p.name) for p in params] - pstr = '(%s)' % ', '.join(pstrs) - self.lines.append(' '.join(['.entry', name, pstr, dstr])) - self.lines.append('{') - self.indent_level += 4 - - def entry_end(self): - self.indent_level += 4 - self.lines.append('}') - - def get_source(self): - return '\n'.join(self.lines) - -_TExp = namedtuple('_TExp', 'type exprlist') -_DataCell = namedtuple('_DataCell', 'offset size texp') - -class DataStream(object): - """ - Simple interface between Python and PTX, designed to create and tightly - pack control structs. - - (In the original implementation, this actually used a stack with - variable positions determined at runtime. The resulting structure had to be - read strictly sequentially to be parsed, hence the name "stream".) - - Subclass this and give it a shortname, then depend on the subclass in your - PTX fragments. An instance-based approach is under consideration. - - >>> class ExampleDataStream(DataStream): - >>> shortname = "ex" - - Inside DSL functions, you can retrieve arbitrary Python expressions from - the data stream. - - >>> def example_func(): - >>> reg.u32('reg1 reg2 regA') - >>> op.mov.u32(regA, some_device_allocation_base_address) - >>> # From the structure at the base address in 'regA', load the value - >>> # of 'ctx.nthreads' into reg1 - >>> ex.get(regA, reg1, 'ctx.nthreads+padding') - - The expressions will be stored as strings and mapped to particular - positions in the struct. 
Later, the expressions will be evaluated and - coerced into a type matching the destination register: - - >>> data = ExampleDataStream.pack(ctx, padding=4) - - Expressions will be aligned and may be reused in such a way as to minimize - access times when taking device caching into account. This also implies - that the evaluated expressions should not modify any state. - - >>> def example_func_2(): - >>> reg.u32('reg1 reg2') - >>> reg.f32('regf') - >>> ex.get(regA, reg1, 'ctx.nthreads * 2') - >>> # Same expression, so load comes from same memory location - >>> ex.get(regA, reg2, 'ctx.nthreads * 2') - >>> # Vector loads are pre-coerced, so you can mix types - >>> ex.get_v2(regA, reg1, '4', regf, '5.5') - - You can even do device allocations in the file, using the post-finalized - variable '[prefix]_stream_size'. It's a DelayVar; simple things like - multiplying by a number work (as long as the DelayVar comes first), but - fancy things like multiplying two DelayVars aren't implemented yet. - - >>> class Whatever(PTXFragment): - >>> def module_setup(self): - >>> mem.global_.u32('ex_streams', ex.stream_size*1000) - """ - # Must be at least as large as the largest load (.v4.u32 = 16) - alignment = 16 - - def __init__(self): - self.texp_map = {} - self.cells = [] - self._size = 0 - self.free = {} - self.size_delayvars = [] - self.finalized = False - - _types = dict(s8='b', u8='B', s16='h', u16='H', s32='i', u32='I', f32='f', - s64='l', u64='L', f64='d') - def _get_type(self, regs): - size = int(regs[0].type[1:]) - for reg in regs: - if reg.type not in self._types: - raise TypeError("Register %s of type %s not supported" % - (reg.name, reg.type)) - if int(reg.type[1:]) != size: - raise TypeError("Can't vector-load different size regs") - return size/8, ''.join([self._types.get(r.type) for r in regs]) - - def _alloc(self, vsize, texp): - # A really crappy allocator. May later include optimizations for - # keeping common variables on the same cache line, etc. - alloc = vsize - idx = self.free.get(alloc) - while idx is None and alloc < self.alignment: - alloc *= 2 - idx = self.free.get(alloc) - if idx is None: - # No aligned free cells, allocate a new `align`-byte free cell - assert alloc not in self.free - self.free[alloc] = idx = len(self.cells) - self.cells.append(_DataCell(self._size, alloc, None)) - self._size += alloc - # Overwrite the free cell at `idx` with texp - assert self.cells[idx].texp is None - offset = self.cells[idx].offset - self.cells[idx] = _DataCell(offset, vsize, texp) - self.free.pop(alloc) - # Now reinsert the fragmented free cells. 
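# Sketch of what the reinsertion below does (not the original code): the unused
# tail of the cell is re-posted as power-of-two free cells, one per set bit of
# the remainder, so later allocations stay naturally aligned.
#
#   def split_fragments(offset, vsize, alloc):
#       remainder, foffset, fsize = alloc - vsize, offset + vsize, 1
#       while remainder:
#           if remainder & fsize:
#               yield foffset, fsize        # post a free cell of this size
#               foffset += fsize
#               remainder &= ~fsize
#           fsize *= 2
#
#   # e.g. taking 4 bytes of a 16-byte cell at offset 0 leaves cells (4, 4) and (8, 8)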
- fragments = alloc - vsize - foffset = offset + vsize - fsize = 1 - fidx = idx - while fsize < self.alignment: - if fragments & fsize: - assert fsize not in self.free - fidx += 1 - self.cells.insert(fidx, _DataCell(foffset, fsize, None)) - foffset += fsize - for k, v in filter(lambda (k, v): v >= fidx, self.free.items()): - self.free[k] = v+1 - self.free[fsize] = fidx - fsize *= 2 - return offset - - def _stream_get_internal(self, areg, dregs, exprs, ifp, ifnotp): - size, type = self._get_type(dregs) - vsize = size * len(dregs) - texp = _TExp(type, tuple(exprs)) - if texp in self.texp_map: - offset = self.texp_map[texp] - else: - offset = self._alloc(vsize, texp) - self.texp_map[texp] = offset - opname = ['ldu', 'b%d' % (size*8)] - if len(dregs) > 1: - opname.insert(1, 'v%d' % len(dregs)) - dregs = vec(*dregs) - op._call(opname, dregs, addr(areg, offset), ifp=ifp, ifnotp=ifnotp) - - def get(self, areg, dreg, expr, ifp=None, ifnotp=None): - self._stream_get_internal(areg, [dreg], [expr], ifp, ifnotp) - - def get_v2(self, areg, dreg1, expr1, dreg2, expr2, ifp=None, ifnotp=None): - self._stream_get_internal(areg, [dreg1, dreg2], [expr1, expr2], - ifp, ifnotp) - - # The interleaved signature makes calls easier to read - def get_v4(self, areg, d1, e1, d2, e2, d3, e3, d4, e4, - ifp=None, ifnotp=None): - self._stream_get_internal(areg, [d1, d2, d3, d4], [e1, e2, e3, e4], - ifp, ifnotp) - - @property - def stream_size(self): - if self.finalized: - return self._size - x = DelayVar("not_yet_determined") - self.size_delayvars.append(x) - return x - - def finalize_code(self): - self.finalized = True - for dv in self.size_delayvars: - dv.val = self._size - print "Finalized stream:" - self._print_format() - - def pack(self, ctx, _out_file_ = None, **kwargs): - """ - Evaluates all statements in the context of **kwargs. Take this code, - presumably inside a PTX func:: - - >>> ex.get(regA, reg1, 'sum([x+frob for x in xyz.things])') - - To pack this into a struct, call this method on an instance: - - >>> data = ExampleDataStream.pack(ctx, frob=4, xyz=xyz) - - This evaluates each Python expression from the stream with the provided - arguments as locals, coerces it to the appropriate type, and returns - the resulting structure as a string. - - The supplied LaunchContext is added to the namespace as ``ctx`` by - default. To supress, this, override ``ctx`` in the keyword arguments: - - >>> data = ExampleDataStream.pack(ctx, frob=5, xyz=xyz, ctx=None) - """ - out = StringIO() - cls.pack_into(out, kwargs) - return out.read() - - def pack_into(self, ctx, outfile, **kwargs): - """ - Like pack(), but write data to a file-like object at the file's current - offset instead of returning it as a string. 
- - >>> ex_stream.pack_into(ctx, strio_inst, frob=4, xyz=thing) - >>> ex_stream.pack_into(ctx, strio_inst, frob=6, xyz=another_thing) - """ - kwargs.setdefault('ctx', ctx) - for offset, size, texp in self.cells: - if texp: - type = texp.type - vals = [eval(e, globals(), kwargs) for e in texp.exprlist] - else: - type = 'x'*size # Padding bytes - vals = [] - outfile.write(struct.pack(type, *vals)) - - def _print_format(self, ctx=None, stream=None): - for cell in self.cells: - if cell.texp is None: - print '%3d %2d --' % (cell.offset, cell.size) - continue - print '%3d %2d %4s %s' % (cell.offset, cell.size, cell.texp.type, - cell.texp.exprlist[0]) - for exp in cell.texp.exprlist[1:]: - print '%11s %s' % ('', exp) - - def print_record(self, ctx, stream, limit=None): - for i in range(0, len(stream), self._size): - for cell in self.cells: - if cell.texp is None: - print '%3d %2d --' % (cell.offset, cell.size) - continue - s = '%3d %2d %4s' % (cell.offset, cell.size, cell.texp.type) - vals = struct.unpack(cell.texp.type, - stream[cell.offset:cell.offset+cell.size]) - for val, exp in zip(vals, cell.texp.exprlist): - print '%11s %-20s %s' % (s, val, exp) - s = '' - print '\n----\n' - if limit is not None: - limit -= 1 - if limit <= 0: break - diff --git a/cuburn/render.py b/cuburn/render.py index c25b4a9..5f71860 100644 --- a/cuburn/render.py +++ b/cuburn/render.py @@ -9,7 +9,6 @@ from fr0stlib import pyflam3 from fr0stlib.pyflam3._flam3 import * from fr0stlib.pyflam3.constants import * -from cuburn.cuda import LaunchContext from cuburn.device_code import * from cuburn.variations import Variations diff --git a/cuburn/variations.py b/cuburn/variations.py index 6aad6bf..355abb8 100644 --- a/cuburn/variations.py +++ b/cuburn/variations.py @@ -1,6 +1,6 @@ -from cuburn.ptx import PTXFragment, ptx_func +from pyptx import ptx -class Variations(PTXFragment): +class Variations(object): """ You know it. """ @@ -27,7 +27,6 @@ class Variations(PTXFragment): "waves2", "exp", "log", "sin", "cos", "tan", "sec", "csc", "cot", "sinh", "cosh", "tanh", "sech", "csch", "coth", "auger", "flux", ] - @ptx_func def xfg(self, dst, expr): """ Convenience wrapper around cp.get which loads the given property from @@ -37,19 +36,16 @@ class Variations(PTXFragment): # expression will be evaluated using each CP in stream packing. cp.get(cpA, dst, 'cp.xforms[%d].%s' % (self.xform_idx, expr)) - @ptx_func def xfg_v2(self, dst1, expr1, dst2, expr2): cp.get_v2(cpA, dst1, 'cp.xforms[%d].%s' % (self.xform_idx, expr1), dst2, 'cp.xforms[%d].%s' % (self.xform_idx, expr2)) - @ptx_func def xfg_v4(self, d1, e1, d2, e2, d3, e3, d4, e4): cp.get_v4(cpA, d1, 'cp.xforms[%d].%s' % (self.xform_idx, e1), d2, 'cp.xforms[%d].%s' % (self.xform_idx, e2), d3, 'cp.xforms[%d].%s' % (self.xform_idx, e3), d4, 'cp.xforms[%d].%s' % (self.xform_idx, e4)) - @ptx_func def apply_xform(self, xo, yo, co, xi, yi, ci, xform_idx): """ Apply a transform. 
@@ -107,12 +103,10 @@ class Variations(PTXFragment): op.fma.rn.ftz.f32(xo, c10, yt, xo) op.fma.rn.ftz.f32(yo, c11, yt, yo) - @ptx_func def linear(self, xo, yo, xi, yi, wgt): op.fma.rn.ftz.f32(xo, xi, wgt, xo) op.fma.rn.ftz.f32(yo, yi, wgt, yo) - @ptx_func def sinusoidal(self, xo, yo, xi, yi, wgt): reg.f32('sinval') op.sin.approx.ftz.f32(sinval, xi) @@ -120,7 +114,6 @@ class Variations(PTXFragment): op.sin.approx.ftz.f32(sinval, yi) op.fma.rn.ftz.f32(yo, sinval, wgt, yo) - @ptx_func def spherical(self, xo, yo, xi, yi, wgt): reg.f32('r2') op.fma.rn.ftz.f32(r2, xi, xi, '1e-30') diff --git a/main.py b/main.py index 0040e84..6b3aa0d 100644 --- a/main.py +++ b/main.py @@ -15,11 +15,11 @@ from pprint import pprint from ctypes import * import numpy as np - np.set_printoptions(precision=5, edgeitems=20) +from pyptx import ptx, run + from cuburn.device_code import * -from cuburn.cuda import LaunchContext from fr0stlib.pyflam3 import * from fr0stlib.pyflam3._flam3 import * from cuburn.render import * @@ -32,10 +32,33 @@ def dump_3d(nda): f.write(' | '.join([' '.join( ['%4.1g\t' % x for x in pt]) for pt in row]) + '\n') +def disass(mod): + import subprocess + sys.stdout.flush() + with open('/tmp/pyptx.ptx', 'w') as fp: + fp.write(mod.source) + subprocess.check_call('ptxas -arch sm_21 /tmp/pyptx.ptx ' + '-o /tmp/elf.o'.split()) + subprocess.check_call('/home/steven/code/decuda/elfToCubin.py --nouveau ' + '/tmp/elf.o'.split()) + def main(args): - verbose = 1 - if '-d' in args: - verbose = 3 + mwcent = ptx.Entry("mwc_test", 512) + mwctest = MWCRNGTest(mwcent) + + # Get the source for saving and disassembly before potentially crashing + mod = ptx.Module([mwcent]) + print '\n'.join(['%4d %s' % t for t in enumerate(mod.source.split('\n'))]) + disass(mod) + + mod = run.Module([mwcent]) + mod.print_func_info() + + ctx = mod.get_context('mwc_test', 14) + mwctest.run_test(ctx) + + return + with open(args[-1]) as fp: genomes = Genome.from_string(fp.read())
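The Variations fragment touched above carries the per-xform math run by the iteration kernel. A rough host-side model of apply_xform covering the three variations visible in these hunks (linear, sinusoidal, spherical), assuming the usual flame ordering of affine transform followed by a weighted variation sum; the coefficient names and layout are illustrative only, while the variation formulas and the 1e-30 bias follow the ops shown:

    from math import sin

    def apply_xform_host(xi, yi, coefs, weights):
        # affine step; coefs assumed as ((c00, c01, c02), (c10, c11, c12))
        xt = coefs[0][0] * xi + coefs[0][1] * yi + coefs[0][2]
        yt = coefs[1][0] * xi + coefs[1][1] * yi + coefs[1][2]
        xo = yo = 0.0
        w = weights.get('linear', 0.0)          # linear: fma(xo, xi, wgt, xo)
        xo += w * xt; yo += w * yt
        w = weights.get('sinusoidal', 0.0)      # sinusoidal: sin.approx then fma
        xo += w * sin(xt); yo += w * sin(yt)
        w = weights.get('spherical', 0.0)       # spherical: 1e-30 keeps r2 nonzero
        r2 = xt * xt + yt * yt + 1e-30
        xo += w * xt / r2; yo += w * yt / r2
        return xo, yo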