instmethod decorator: another hack (to get around ctx.ptx.instances[])

This commit is contained in:
Steven Robertson 2010-09-08 13:12:46 -04:00
parent 094890c324
commit 1f7b00b61e
4 changed files with 45 additions and 29 deletions

View File

@ -7,7 +7,7 @@ Various micro-benchmarks and other experiments.
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from cuburnlib.ptx import PTXFragment, PTXTest, ptx_func
from cuburnlib.ptx import PTXFragment, PTXTest, ptx_func, instmethod
from cuburnlib.cuda import LaunchContext
from cuburnlib.device_code import MWCRNG
@ -104,7 +104,7 @@ class L2WriteCombining(PTXTest):
op.setp.ge.u32(p_done, x, 2)
op.bra.uni(l2_restart, ifnotp=p_done)
@instmethod
def call(self, ctx):
scratch = np.zeros(self.block_size*ctx.ctas/4, np.uint64)
times_bytes = np.zeros((4, ctx.threads), np.uint64, 'F')
@ -137,7 +137,7 @@ def main():
ctx = LaunchContext([L2WriteCombining], block=(128,1,1), grid=(7*8,1),
tests=True)
ctx.compile(verbose=3)
ctx.ptx.instances[L2WriteCombining].call(ctx)
L2WriteCombining.call(ctx)
if __name__ == "__main__":
main()

View File

@ -130,6 +130,7 @@ class IterThread(PTXTest):
std.store_per_thread(g_num_rounds, num_rounds)
std.store_per_thread(g_num_writes, num_writes)
@instmethod
def upload_cp_stream(self, ctx, cp_stream, num_cps):
cp_array_dp, cp_array_l = ctx.mod.get_global('g_cp_array')
assert len(cp_stream) <= cp_array_l, "Stream too big!"
@ -139,6 +140,7 @@ class IterThread(PTXTest):
cuda.memset_d32(num_cps_dp, num_cps, 1)
self.cps_uploaded = True
@instmethod
def call(self, ctx):
if not self.cps_uploaded:
raise Error("Cannot call IterThread before uploading CPs")

View File

@ -500,6 +500,9 @@ class PTXFragment(object):
An object containing PTX DSL functions. The object, and all its
dependencies, will be instantiated by a PTX module. Each object will be
bound to the name given by ``shortname`` in the DSL namespace.
Because of the instantiation weirdness, use the instmethod decorator on
instance methods that will be called from regular Python code.
"""
# Name under which to make this code available in ptx_funcs
@ -575,6 +578,17 @@ class PTXFragment(object):
"""
pass
def instmethod(func):
    """
    Wrapper to allow instances to be retrieved from an active context. Use it
    on methods which depend on state created during a compilation phase, but
    are intended to be called from normal Python code.

    The decorated method becomes a classmethod whose first positional
    argument is the launch context; the live instance for the class is
    looked up in ``ctx.ptx.instances`` and the original function is then
    invoked on that instance.
    """
    def wrap(cls, ctx, *args, **kwargs):
        # Fetch the instance that the PTX module created for this class
        # during compilation.
        inst = ctx.ptx.instances[cls]
        # Propagate the wrapped method's return value; dropping it would
        # make methods like pack() (which returns the packed data) yield
        # None to callers.
        return func(inst, ctx, *args, **kwargs)
    # Preserve identity for introspection/debugging without adding an
    # import dependency on functools here.
    wrap.__name__ = func.__name__
    wrap.__doc__ = func.__doc__
    return classmethod(wrap)
class PTXEntryPoint(PTXFragment):
# Human-readable entry point name
name = ""
@ -591,6 +605,7 @@ class PTXEntryPoint(PTXFragment):
"""
raise NotImplementedError
@instmethod
def call(self, ctx):
"""
Calls the entry point on the device. Haven't worked out the details
@ -819,7 +834,6 @@ class PTXModule(object):
print '\n'.join(["%03d %s" % (i+1, l) for (i, l) in
enumerate(self.source.split('\n'))])
def _flatten(val):
if isinstance(val, (list, tuple)):
return ''.join(map(_flatten, val))
@ -883,7 +897,7 @@ class DataStream(PTXFragment):
>>> class ExampleDataStream(DataStream):
>>> shortname = "ex"
Inside DSL functions, you can "retrieve" arbitrary Python expressions from
Inside DSL functions, you can retrieve arbitrary Python expressions from
the data stream.
>>> @ptx_func
@ -892,22 +906,17 @@ class DataStream(PTXFragment):
>>> op.mov.u32(regA, some_device_allocation_base_address)
>>> # From the structure at the base address in 'regA', load the value
>>> # of 'ctx.nthreads' into reg1
>>> ex.get(regA, reg1, 'ctx.nthreads')
>>> ex.get(regA, reg1, 'ctx.nthreads+padding')
The expressions will be stored as strings and mapped to particular
positions in the struct. Later, the expressions will be evaluated and
coerced into a type matching the destination register:
>>> # Fish the instance holding the data stream from the compiled module
>>> ex_stream = launch_context.ptx.instances[ExampleDataStream]
>>> # Evaluate the expressions in the current namespace, augmented with the
>>> # supplied objects
>>> data = ex_stream.pack(ctx=launch_context)
>>> data = ExampleDataStream.pack(ctx, padding=4)
Expressions will be aligned and may be reused in such a way as to minimize
access times when taking device caching into account. This also implies
that the evaluated expressions should not modify any state, but that should
be obvious, no?
that the evaluated expressions should not modify any state.
>>> @ptx_func
>>> def example_func_2():
@ -1034,7 +1043,8 @@ class DataStream(PTXFragment):
for dv in self.size_delayvars:
dv.val = self._size
def pack(self, _out_file_ = None, **kwargs):
@instmethod
def pack(self, ctx, _out_file_ = None, **kwargs):
"""
Evaluates all statements in the context of **kwargs. Take this code,
presumably inside a PTX func::
@ -1043,25 +1053,31 @@ class DataStream(PTXFragment):
To pack this into a struct, call this method on an instance:
>>> ex_stream = launch_context.ptx.instances[ExampleDataStream]
>>> data = ex_stream.pack(frob=4, xyz=xyz)
>>> data = ExampleDataStream.pack(ctx, frob=4, xyz=xyz)
This evaluates each Python expression from the stream with the provided
arguments as locals, coerces it to the appropriate type, and returns
the resulting structure as a string.
The supplied LaunchContext is added to the namespace as ``ctx`` by
default. To suppress this, override ``ctx`` in the keyword arguments:
>>> data = ExampleDataStream.pack(ctx, frob=5, xyz=xyz, ctx=None)
"""
out = StringIO()
self.pack_into(out, kwargs)
cls.pack_into(out, kwargs)
return out.read()
def pack_into(self, outfile, **kwargs):
@instmethod
def pack_into(self, ctx, outfile, **kwargs):
"""
Like pack(), but write data to a file-like object at the file's current
offset instead of returning it as a string.
>>> ex_stream.pack_into(strio_inst, frob=4, xyz=thing)
>>> ex_stream.pack_into(strio_inst, frob=6, xyz=another_thing)
>>> ex_stream.pack_into(ctx, strio_inst, frob=4, xyz=thing)
>>> ex_stream.pack_into(ctx, strio_inst, frob=6, xyz=another_thing)
"""
kwargs.setdefault('ctx', ctx)
for offset, size, texp in self.cells:
if texp:
type = texp.type
@ -1071,7 +1087,8 @@ class DataStream(PTXFragment):
vals = []
outfile.write(struct.pack(type, *vals))
def print_record(self):
@instmethod
def print_record(self, ctx):
for cell in self.cells:
if cell.texp is None:
print '%3d %2d --' % (cell.offset, cell.size)

View File

@ -35,10 +35,9 @@ class Frame(pyflam3.Frame):
"Distribution of a CP across multiple CTAs not yet done")
# Interpolate each time step, calculate per-step variables, and pack
# into the stream
cp_streamer = ctx.ptx.instances[CPDataStream]
stream = StringIO()
print "Data stream contents:"
cp_streamer.print_record()
CPDataStream.print_record(ctx)
tcp = BaseGenome()
for batch_idx in range(cp.nbatches):
for time_idx in range(cp.ntemporal_samples):
@ -51,10 +50,8 @@ class Frame(pyflam3.Frame):
cp.width * cp.height) / (
cp.nbatches * cp.ntemporal_samples)
cp_streamer.pack_into(stream,
frame=self,
cp=tcp,
cp_idx=idx)
CPDataStream.pack_into(ctx, stream,
frame=self, cp=tcp, cp_idx=idx)
stream.seek(0)
return (stream.read(), cp.nbatches * cp.ntemporal_samples)
@ -108,8 +105,8 @@ class Animation(object):
# TODO: allow animation-long override of certain parameters (size, etc)
cp_stream, num_cps = self.frame.pack_stream(self.ctx, time)
iter_thread = self.ctx.ptx.instances[IterThread]
iter_thread.upload_cp_stream(self.ctx, cp_stream, num_cps)
iter_thread.call(self.ctx)
IterThread.upload_cp_stream(self.ctx, cp_stream, num_cps)
IterThread.call(self.ctx)
class Features(object):
"""