diff --git a/bench.py b/bench.py index 5b64a9e..ac655b4 100644 --- a/bench.py +++ b/bench.py @@ -7,7 +7,7 @@ Various micro-benchmarks and other experiments. import numpy as np import pycuda.autoinit import pycuda.driver as cuda -from cuburnlib.ptx import PTXFragment, PTXTest, ptx_func +from cuburnlib.ptx import PTXFragment, PTXTest, ptx_func, instmethod from cuburnlib.cuda import LaunchContext from cuburnlib.device_code import MWCRNG @@ -104,7 +104,7 @@ class L2WriteCombining(PTXTest): op.setp.ge.u32(p_done, x, 2) op.bra.uni(l2_restart, ifnotp=p_done) - + @instmethod def call(self, ctx): scratch = np.zeros(self.block_size*ctx.ctas/4, np.uint64) times_bytes = np.zeros((4, ctx.threads), np.uint64, 'F') @@ -137,7 +137,7 @@ def main(): ctx = LaunchContext([L2WriteCombining], block=(128,1,1), grid=(7*8,1), tests=True) ctx.compile(verbose=3) - ctx.ptx.instances[L2WriteCombining].call(ctx) + L2WriteCombining.call(ctx) if __name__ == "__main__": main() diff --git a/cuburnlib/device_code.py b/cuburnlib/device_code.py index dfb5826..0462d9c 100644 --- a/cuburnlib/device_code.py +++ b/cuburnlib/device_code.py @@ -130,6 +130,7 @@ class IterThread(PTXTest): std.store_per_thread(g_num_rounds, num_rounds) std.store_per_thread(g_num_writes, num_writes) + @instmethod def upload_cp_stream(self, ctx, cp_stream, num_cps): cp_array_dp, cp_array_l = ctx.mod.get_global('g_cp_array') assert len(cp_stream) <= cp_array_l, "Stream too big!" @@ -139,6 +140,7 @@ class IterThread(PTXTest): cuda.memset_d32(num_cps_dp, num_cps, 1) self.cps_uploaded = True + @instmethod def call(self, ctx): if not self.cps_uploaded: raise Error("Cannot call IterThread before uploading CPs") diff --git a/cuburnlib/ptx.py b/cuburnlib/ptx.py index 7d4f8e6..ad6c85e 100644 --- a/cuburnlib/ptx.py +++ b/cuburnlib/ptx.py @@ -500,6 +500,9 @@ class PTXFragment(object): An object containing PTX DSL functions. The object, and all its dependencies, will be instantiated by a PTX module. 
Each object will be bound to the name given by ``shortname`` in the DSL namespace. + + Because of the instantiation weirdness, use the instmethod decorator on + instance methods that will be called from regular Python code. """ # Name under which to make this code available in ptx_funcs @@ -575,6 +578,17 @@ """ pass +def instmethod(func): + """ + Wrapper to allow instances to be retrieved from an active context. Use it + on methods which depend on state created during a compilation phase, but + are intended to be called from normal Python code. + """ + def wrap(cls, ctx, *args, **kwargs): + inst = ctx.ptx.instances[cls] + return func(inst, ctx, *args, **kwargs) + return classmethod(wrap) + class PTXEntryPoint(PTXFragment): # Human-readable entry point name name = "" @@ -591,6 +605,7 @@ """ raise NotImplementedError + @instmethod def call(self, ctx): """ Calls the entry point on the device. Haven't worked out the details @@ -819,7 +834,6 @@ print '\n'.join(["%03d %s" % (i+1, l) for (i, l) in enumerate(self.source.split('\n'))]) - def _flatten(val): if isinstance(val, (list, tuple)): return ''.join(map(_flatten, val)) @@ -883,7 +897,7 @@ >>> class ExampleDataStream(DataStream): >>> shortname = "ex" - Inside DSL functions, you can "retrieve" arbitrary Python expressions from + Inside DSL functions, you can retrieve arbitrary Python expressions from the data stream. >>> @ptx_func >>> def example_func(): >>> op.mov.u32(regA, some_device_allocation_base_address) >>> # From the structure at the base address in 'regA', load the value >>> # of 'ctx.nthreads' into reg1 - >>> ex.get(regA, reg1, 'ctx.nthreads') + >>> ex.get(regA, reg1, 'ctx.nthreads+padding') The expressions will be stored as strings and mapped to particular positions in the struct.
Later, the expressions will be evaluated and coerced into a type matching the destination register: - >>> # Fish the instance holding the data stream from the compiled module - >>> ex_stream = launch_context.ptx.instances[ExampleDataStream] - >>> # Evaluate the expressions in the current namespace, augmented with the - >>> # supplied objects - >>> data = ex_stream.pack(ctx=launch_context) + >>> data = ExampleDataStream.pack(ctx, padding=4) Expressions will be aligned and may be reused in such a way as to minimize access times when taking device caching into account. This also implies - that the evaluated expressions should not modify any state, but that should - be obvious, no? + that the evaluated expressions should not modify any state. >>> @ptx_func >>> def example_func_2(): @@ -1034,7 +1043,8 @@ class DataStream(PTXFragment): for dv in self.size_delayvars: dv.val = self._size - def pack(self, _out_file_ = None, **kwargs): + @instmethod + def pack(self, ctx, _out_file_ = None, **kwargs): """ Evaluates all statements in the context of **kwargs. Take this code, presumably inside a PTX func:: @@ -1043,25 +1053,31 @@ class DataStream(PTXFragment): To pack this into a struct, call this method on an instance: - >>> ex_stream = launch_context.ptx.instances[ExampleDataStream] - >>> data = ex_stream.pack(frob=4, xyz=xyz) + >>> data = ExampleDataStream.pack(ctx, frob=4, xyz=xyz) This evaluates each Python expression from the stream with the provided arguments as locals, coerces it to the appropriate type, and returns the resulting structure as a string. + + The supplied LaunchContext is added to the namespace as ``ctx`` by + default. 
To suppress this, override ``ctx`` in the keyword arguments: + + >>> data = ExampleDataStream.pack(ctx, frob=5, xyz=xyz, ctx=None) """ out = StringIO() - self.pack_into(out, kwargs) + self.pack_into(ctx, out, **kwargs) return out.read() - def pack_into(self, outfile, **kwargs): + @instmethod + def pack_into(self, ctx, outfile, **kwargs): """ Like pack(), but write data to a file-like object at the file's current offset instead of returning it as a string. - >>> ex_stream.pack_into(strio_inst, frob=4, xyz=thing) - >>> ex_stream.pack_into(strio_inst, frob=6, xyz=another_thing) + >>> ex_stream.pack_into(ctx, strio_inst, frob=4, xyz=thing) + >>> ex_stream.pack_into(ctx, strio_inst, frob=6, xyz=another_thing) """ + kwargs.setdefault('ctx', ctx) for offset, size, texp in self.cells: if texp: type = texp.type @@ -1071,7 +1087,8 @@ vals = [] outfile.write(struct.pack(type, *vals)) - def print_record(self): + @instmethod + def print_record(self, ctx): for cell in self.cells: if cell.texp is None: print '%3d %2d --' % (cell.offset, cell.size) diff --git a/cuburnlib/render.py b/cuburnlib/render.py index ea03df1..65e80f8 100644 --- a/cuburnlib/render.py +++ b/cuburnlib/render.py @@ -35,10 +35,9 @@ class Frame(pyflam3.Frame): "Distribution of a CP across multiple CTAs not yet done") # Interpolate each time step, calculate per-step variables, and pack # into the stream - cp_streamer = ctx.ptx.instances[CPDataStream] stream = StringIO() print "Data stream contents:" - cp_streamer.print_record() + CPDataStream.print_record(ctx) tcp = BaseGenome() for batch_idx in range(cp.nbatches): for time_idx in range(cp.ntemporal_samples): @@ -51,10 +50,8 @@ cp.width * cp.height) / ( cp.nbatches * cp.ntemporal_samples) - cp_streamer.pack_into(stream, - frame=self, - cp=tcp, - cp_idx=idx) + CPDataStream.pack_into(ctx, stream, + frame=self, cp=tcp, cp_idx=idx) stream.seek(0) return (stream.read(), cp.nbatches * cp.ntemporal_samples) @@ -108,8
+105,8 @@ class Animation(object): # TODO: allow animation-long override of certain parameters (size, etc) cp_stream, num_cps = self.frame.pack_stream(self.ctx, time) iter_thread = self.ctx.ptx.instances[IterThread] - iter_thread.upload_cp_stream(self.ctx, cp_stream, num_cps) - iter_thread.call(self.ctx) + IterThread.upload_cp_stream(self.ctx, cp_stream, num_cps) + IterThread.call(self.ctx) class Features(object): """