# cuburn/cuburnlib/cuda.py
# These imports are order-sensitive!
import pyglet
import pyglet.gl as gl
gl.get_current_context()
import pycuda.driver as cuda
import pycuda.tools
import pycuda.gl as cudagl
import pycuda.gl.autoinit
import numpy as np
from cuburnlib.ptx import PTXModule, PTXTest, PTXTestFailure
class LaunchContext(object):
"""
Context collecting the information needed to create, run, and gather the
2010-09-06 11:18:20 -04:00
results of a device computation. This may eventually also include an actual
CUDA context, but for now it just uses the global one.
2010-08-28 16:56:05 -04:00
To create the fastest device code across multiple device families, this
context may decide to iteratively refine the final PTX by regenerating
and recompiling it several times to optimize certain parameters of the
launch, such as the distribution of threads throughout the device.
The properties of this device which are tuned are listed below. Any PTX
fragments which use this information must emit valid PTX for any state
given below, but the PTX is only required to actually run with the final,
fixed values of all tuned parameters below.
`block`: 3-tuple of (x,y,z); dimensions of each CTA.
`grid`: 2-tuple of (x,y); dimensions of the grid of CTAs.
`threads`: Number of active threads on device as a whole.
`mod`: Final compiled module. Unavailable during assembly.
"""
2010-09-06 11:18:20 -04:00
def __init__(self, entries, block=(1,1,1), grid=(1,1), tests=False):
2010-08-28 16:56:05 -04:00
self.entry_types = entries
self.block, self.grid, self.build_tests = block, grid, tests
self.setup_done = False
@property
def threads(self):
return reduce(lambda a, b: a*b, self.block + self.grid)
2010-09-07 12:44:12 -04:00
@property
def ctas(self):
return self.grid[0] * self.grid[1]
@property
def threads_per_cta(self):
return self.block[0] * self.block[1] * self.block[2]
2010-09-10 12:53:40 -04:00
@property
def warps_per_cta(self):
return self.threads_per_cta / 32
2010-09-06 11:18:20 -04:00
def compile(self, verbose=False, **kwargs):
kwargs['ctx'] = self
self.ptx = PTXModule(self.entry_types, kwargs, self.build_tests)
2010-08-28 16:56:05 -04:00
try:
self.mod = cuda.module_from_buffer(self.ptx.source)
except (cuda.CompileError, cuda.RuntimeError), e:
print "Aww, dang, compile error. Here's the source:"
2010-09-06 11:18:20 -04:00
self.ptx.print_source()
2010-08-28 16:56:05 -04:00
raise e
if verbose:
2010-09-02 16:12:22 -04:00
if verbose >= 3:
2010-09-06 11:18:20 -04:00
self.ptx.print_source()
for entry in self.ptx.entries:
func = self.mod.get_function(entry.entry_name)
print "Compiled %s: used %d regs, %d sm, %d local" % (
entry.entry_name, func.num_regs,
func.shared_size_bytes, func.local_size_bytes)
2010-08-28 16:56:05 -04:00
2010-09-10 14:43:20 -04:00
def call_setup(self, entry_inst):
for inst in self.ptx.entry_deps[type(entry_inst)]:
inst.call_setup(self)
def call_teardown(self, entry_inst):
okay = True
for inst in reversed(self.ptx.entry_deps[type(entry_inst)]):
if inst is entry_inst and isinstance(entry_inst, PTXTest):
try:
inst.call_teardown(self)
except PTXTestFailure, e:
print "PTX Test %s failed!" % inst.entry_name, e
okay = False
2010-08-28 16:56:05 -04:00
else:
2010-09-10 14:43:20 -04:00
inst.call_teardown(self)
return okay
2010-08-28 16:56:05 -04:00
def run_tests(self):
2010-09-10 14:43:20 -04:00
if not self.ptx.tests:
print "No tests to run."
return True
all_okay = True
for test in self.ptx.tests:
cuda.Context.synchronize()
if test.call(self):
print "Test %s passed." % test.entry_name
else:
print "Test %s FAILED." % test.entry_name
all_okay = False
return all_okay
2010-08-28 16:56:05 -04:00