diff --git a/bench.py b/bench.py index 5b64a9e..ac655b4 100644 --- a/bench.py +++ b/bench.py @@ -7,7 +7,7 @@ Various micro-benchmarks and other experiments. import numpy as np import pycuda.autoinit import pycuda.driver as cuda -from cuburnlib.ptx import PTXFragment, PTXTest, ptx_func +from cuburnlib.ptx import PTXFragment, PTXTest, ptx_func, instmethod from cuburnlib.cuda import LaunchContext from cuburnlib.device_code import MWCRNG @@ -104,7 +104,7 @@ class L2WriteCombining(PTXTest): op.setp.ge.u32(p_done, x, 2) op.bra.uni(l2_restart, ifnotp=p_done) - + @instmethod def call(self, ctx): scratch = np.zeros(self.block_size*ctx.ctas/4, np.uint64) times_bytes = np.zeros((4, ctx.threads), np.uint64, 'F') @@ -137,7 +137,7 @@ def main(): ctx = LaunchContext([L2WriteCombining], block=(128,1,1), grid=(7*8,1), tests=True) ctx.compile(verbose=3) - ctx.ptx.instances[L2WriteCombining].call(ctx) + L2WriteCombining.call(ctx) if __name__ == "__main__": main() diff --git a/cuburnlib/device_code.py b/cuburnlib/device_code.py index dfb5826..0462d9c 100644 --- a/cuburnlib/device_code.py +++ b/cuburnlib/device_code.py @@ -130,6 +130,7 @@ class IterThread(PTXTest): std.store_per_thread(g_num_rounds, num_rounds) std.store_per_thread(g_num_writes, num_writes) + @instmethod def upload_cp_stream(self, ctx, cp_stream, num_cps): cp_array_dp, cp_array_l = ctx.mod.get_global('g_cp_array') assert len(cp_stream) <= cp_array_l, "Stream too big!" @@ -139,6 +140,7 @@ class IterThread(PTXTest): cuda.memset_d32(num_cps_dp, num_cps, 1) self.cps_uploaded = True + @instmethod def call(self, ctx): if not self.cps_uploaded: raise Error("Cannot call IterThread before uploading CPs") diff --git a/cuburnlib/ptx.py b/cuburnlib/ptx.py index 7d4f8e6..ad6c85e 100644 --- a/cuburnlib/ptx.py +++ b/cuburnlib/ptx.py @@ -500,6 +500,9 @@ class PTXFragment(object): An object containing PTX DSL functions. The object, and all its dependencies, will be instantiated by a PTX module. 
Each object will be bound to the name given by ``shortname`` in the DSL namespace. + + Because of the instantiation weirdness, use the instmethod decorator on + instance methods that will be called from regular Python code. """ # Name under which to make this code available in ptx_funcs @@ -575,6 +578,17 @@ """ pass +def instmethod(func): + """ + Wrapper to allow instances to be retrieved from an active context. Use it + on methods which depend on state created during a compilation phase, but + are intended to be called from normal Python code. + """ + def wrap(cls, ctx, *args, **kwargs): + inst = ctx.ptx.instances[cls] + return func(inst, ctx, *args, **kwargs) + return classmethod(wrap) + class PTXEntryPoint(PTXFragment): # Human-readable entry point name name = "" @@ -591,6 +605,7 @@ """ raise NotImplementedError + @instmethod def call(self, ctx): """ Calls the entry point on the device. Haven't worked out the details @@ -819,7 +834,6 @@ print '\n'.join(["%03d %s" % (i+1, l) for (i, l) in enumerate(self.source.split('\n'))]) - def _flatten(val): if isinstance(val, (list, tuple)): return ''.join(map(_flatten, val)) @@ -883,7 +897,7 @@ >>> class ExampleDataStream(DataStream): >>> shortname = "ex" - Inside DSL functions, you can "retrieve" arbitrary Python expressions from + Inside DSL functions, you can retrieve arbitrary Python expressions from the data stream. >>> @ptx_func >>> def example_func(): >>> op.mov.u32(regA, some_device_allocation_base_address) >>> # From the structure at the base address in 'regA', load the value >>> # of 'ctx.nthreads' into reg1 - >>> ex.get(regA, reg1, 'ctx.nthreads') + >>> ex.get(regA, reg1, 'ctx.nthreads+padding') The expressions will be stored as strings and mapped to particular positions in the struct.
Later, the expressions will be evaluated and coerced into a type matching the destination register: - >>> # Fish the instance holding the data stream from the compiled module - >>> ex_stream = launch_context.ptx.instances[ExampleDataStream] - >>> # Evaluate the expressions in the current namespace, augmented with the - >>> # supplied objects - >>> data = ex_stream.pack(ctx=launch_context) + >>> data = ExampleDataStream.pack(ctx, padding=4) Expressions will be aligned and may be reused in such a way as to minimize access times when taking device caching into account. This also implies - that the evaluated expressions should not modify any state, but that should - be obvious, no? + that the evaluated expressions should not modify any state. >>> @ptx_func >>> def example_func_2(): @@ -1034,7 +1043,8 @@ class DataStream(PTXFragment): for dv in self.size_delayvars: dv.val = self._size - def pack(self, _out_file_ = None, **kwargs): + @instmethod + def pack(self, ctx, _out_file_ = None, **kwargs): """ Evaluates all statements in the context of **kwargs. Take this code, presumably inside a PTX func:: @@ -1043,25 +1053,31 @@ class DataStream(PTXFragment): To pack this into a struct, call this method on an instance: - >>> ex_stream = launch_context.ptx.instances[ExampleDataStream] - >>> data = ex_stream.pack(frob=4, xyz=xyz) + >>> data = ExampleDataStream.pack(ctx, frob=4, xyz=xyz) This evaluates each Python expression from the stream with the provided arguments as locals, coerces it to the appropriate type, and returns the resulting structure as a string. + + The supplied LaunchContext is added to the namespace as ``ctx`` by + default. 
To suppress this, override ``ctx`` in the keyword arguments: + + >>> data = ExampleDataStream.pack(ctx, frob=5, xyz=xyz, ctx=None) """ out = StringIO() - self.pack_into(out, kwargs) + self.pack_into(ctx, out, **kwargs) return out.read() - def pack_into(self, outfile, **kwargs): + @instmethod + def pack_into(self, ctx, outfile, **kwargs): """ Like pack(), but write data to a file-like object at the file's current offset instead of returning it as a string. - >>> ex_stream.pack_into(strio_inst, frob=4, xyz=thing) - >>> ex_stream.pack_into(strio_inst, frob=6, xyz=another_thing) + >>> ex_stream.pack_into(ctx, strio_inst, frob=4, xyz=thing) + >>> ex_stream.pack_into(ctx, strio_inst, frob=6, xyz=another_thing) """ + kwargs.setdefault('ctx', ctx) for offset, size, texp in self.cells: if texp: type = texp.type @@ -1071,7 +1087,8 @@ vals = [] outfile.write(struct.pack(type, *vals)) - def print_record(self): + @instmethod + def print_record(self, ctx): for cell in self.cells: if cell.texp is None: print '%3d %2d --' % (cell.offset, cell.size) diff --git a/cuburnlib/render.py b/cuburnlib/render.py index ea03df1..65e80f8 100644 --- a/cuburnlib/render.py +++ b/cuburnlib/render.py @@ -35,10 +35,9 @@ class Frame(pyflam3.Frame): "Distribution of a CP across multiple CTAs not yet done") # Interpolate each time step, calculate per-step variables, and pack # into the stream - cp_streamer = ctx.ptx.instances[CPDataStream] stream = StringIO() print "Data stream contents:" - cp_streamer.print_record() + CPDataStream.print_record(ctx) tcp = BaseGenome() for batch_idx in range(cp.nbatches): for time_idx in range(cp.ntemporal_samples): @@ -51,10 +50,8 @@ cp.width * cp.height) / ( cp.nbatches * cp.ntemporal_samples) - cp_streamer.pack_into(stream, - frame=self, - cp=tcp, - cp_idx=idx) + CPDataStream.pack_into(ctx, stream, + frame=self, cp=tcp, cp_idx=idx) stream.seek(0) return (stream.read(), cp.nbatches * cp.ntemporal_samples) @@ -108,8
+105,8 @@ class Animation(object): # TODO: allow animation-long override of certain parameters (size, etc) cp_stream, num_cps = self.frame.pack_stream(self.ctx, time) iter_thread = self.ctx.ptx.instances[IterThread] - iter_thread.upload_cp_stream(self.ctx, cp_stream, num_cps) - iter_thread.call(self.ctx) + IterThread.upload_cp_stream(self.ctx, cp_stream, num_cps) + IterThread.call(self.ctx) class Features(object): """