cuburn/cuburn/cuda.py

# These imports are order-sensitive!
import pyglet
import pyglet.gl as gl
gl.get_current_context()
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import pycuda.tools
import pycuda.gl as cudagl
import pycuda.gl.autoinit
import numpy as np
from cuburn.ptx import PTXModule, PTXTest, PTXTestFailure

class LaunchContext(object):
    """
    Context collecting the information needed to create, run, and gather the
    results of a device computation. This may eventually also include an
    actual CUDA context, but for now it just uses the global one.

    To create the fastest device code across multiple device families, this
    context may decide to iteratively refine the final PTX by regenerating
    and recompiling it several times to optimize certain parameters of the
    launch, such as the distribution of threads throughout the device.

    The properties of this device which are tuned are listed below. Any PTX
    fragments which use this information must emit valid PTX for any state
    given below, but the PTX is only required to actually run with the final,
    fixed values of all tuned parameters below.

    `block`:    3-tuple of (x, y, z); dimensions of each CTA.
    `grid`:     2-tuple of (x, y); dimensions of the grid of CTAs.
    `nthreads`: Number of active threads on device as a whole.
    `mod`:      Final compiled module. Unavailable during assembly.
    """

    def __init__(self, entries, block=(1,1,1), grid=(1,1), tests=False):
        self.entry_types = entries
        self.block, self.grid, self.build_tests = block, grid, tests
        self.setup_done = False
        self.stream = cuda.Stream()

    @property
    def nthreads(self):
        return reduce(lambda a, b: a*b, self.block + self.grid)

    @property
    def nctas(self):
        return self.grid[0] * self.grid[1]

    @property
    def threads_per_cta(self):
        return self.block[0] * self.block[1] * self.block[2]

    @property
    def warps_per_cta(self):
        return self.threads_per_cta / 32
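
    # For example, with block=(128, 1, 1) and grid=(64, 1) (arbitrary shapes
    # chosen here only for illustration), the properties above give
    # threads_per_cta == 128, warps_per_cta == 128 / 32 == 4, nctas == 64,
    # and nthreads == 128 * 64 == 8192.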

    def compile(self, verbose=False, **kwargs):
        kwargs['ctx'] = self
        self.ptx = PTXModule(self.entry_types, kwargs, self.build_tests)
        # TODO: make this optional and let user choose path
        with open('/tmp/cuburn.ptx', 'w') as f:
            f.write(self.ptx.source)
        try:
            # TODO: detect/customize arch, code; verbose setting;
            #       keep directory enable/disable via debug
            self.mod = cuda.module_from_buffer(self.ptx.source,
                [(cuda.jit_option.OPTIMIZATION_LEVEL, 0),
                 (cuda.jit_option.TARGET_FROM_CUCONTEXT, 1)])
        except (cuda.CompileError, cuda.RuntimeError), e:
            # TODO: if output not written above, print different message
            print "Compile error. Source is at /tmp/cuburn.ptx"
            print e
            raise e
        if verbose:
            for entry in self.ptx.entries:
                func = self.mod.get_function(entry.entry_name)
                print "Compiled %s: used %d regs, %d sm, %d local" % (
                        entry.entry_name, func.num_regs,
                        func.shared_size_bytes, func.local_size_bytes)

    def call_setup(self, entry_inst):
        for inst in self.ptx.entry_deps[type(entry_inst)]:
            inst.call_setup(self)

    def call_teardown(self, entry_inst):
        okay = True
        for inst in reversed(self.ptx.entry_deps[type(entry_inst)]):
            if inst is entry_inst and isinstance(entry_inst, PTXTest):
                try:
                    inst.call_teardown(self)
                except PTXTestFailure, e:
                    print "\nTest %s FAILED!" % inst.entry_name
                    print "Reason:", e
                    print
                    okay = False
            else:
                inst.call_teardown(self)
        return okay

    def run_tests(self):
        if not self.ptx.tests:
            print "No tests to run."
            return True
        all_okay = True
        for test in self.ptx.tests:
            cuda.Context.synchronize()
            if test.call(self):
                print "Test %s passed.\n" % test.entry_name
            else:
                all_okay = False
        return all_okay

    def get_per_thread(self, name, dtype, shaped=False):
        """
        Convenience function to get the contents of the global memory variable
        ``name`` from the device as a numpy array of type ``dtype``, as might
        be stored by _PTXStdLib.store_per_thread. If ``shaped`` is True, the
        array will be 3D, as (cta_no, warp_no, lane_no).
        """
        if shaped:
            shape = (self.nctas, self.warps_per_cta, 32)
        else:
            shape = self.nthreads
        dp, l = self.mod.get_global(name)
        return cuda.from_device(dp, shape, dtype)
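
# Sketch of typical usage, assuming `MyEntry` is a PTXEntryPoint subclass
# defined elsewhere in cuburn.ptx and `g_results` is a hypothetical global
# written via store_per_thread; shapes are placeholders:
#
#   ctx = LaunchContext([MyEntry], block=(128, 1, 1), grid=(64, 1), tests=True)
#   ctx.compile(verbose=True)
#   if ctx.run_tests():
#       results = ctx.get_per_thread('g_results', np.float32, shaped=True)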