# cuburn/cuburnlib/cuda.py
# These imports are order-sensitive!
import pyglet
import pyglet.gl as gl
gl.get_current_context()
import pycuda.driver as cuda
import pycuda.tools
import pycuda.gl as cudagl
import pycuda.gl.autoinit
import numpy as np
from cuburnlib.ptx import PTXModule, PTXTest, PTXTestFailure
class LaunchContext(object):
"""
Context collecting the information needed to create, run, and gather the
2010-09-06 11:18:20 -04:00
results of a device computation. This may eventually also include an actual
CUDA context, but for now it just uses the global one.
2010-08-28 16:56:05 -04:00
To create the fastest device code across multiple device families, this
context may decide to iteratively refine the final PTX by regenerating
and recompiling it several times to optimize certain parameters of the
launch, such as the distribution of threads throughout the device.
The properties of this device which are tuned are listed below. Any PTX
fragments which use this information must emit valid PTX for any state
given below, but the PTX is only required to actually run with the final,
fixed values of all tuned parameters below.
`block`: 3-tuple of (x,y,z); dimensions of each CTA.
`grid`: 2-tuple of (x,y); dimensions of the grid of CTAs.
`threads`: Number of active threads on device as a whole.
`mod`: Final compiled module. Unavailable during assembly.
"""
2010-09-06 11:18:20 -04:00
def __init__(self, entries, block=(1,1,1), grid=(1,1), tests=False):
2010-08-28 16:56:05 -04:00
self.entry_types = entries
self.block, self.grid, self.build_tests = block, grid, tests
self.setup_done = False
@property
def threads(self):
return reduce(lambda a, b: a*b, self.block + self.grid)
2010-09-07 12:44:12 -04:00
@property
def ctas(self):
return self.grid[0] * self.grid[1]
@property
def threads_per_cta(self):
return self.block[0] * self.block[1] * self.block[2]
2010-09-10 12:53:40 -04:00
@property
def warps_per_cta(self):
return self.threads_per_cta / 32
2010-09-06 11:18:20 -04:00
def compile(self, verbose=False, **kwargs):
kwargs['ctx'] = self
self.ptx = PTXModule(self.entry_types, kwargs, self.build_tests)
2010-08-28 16:56:05 -04:00
try:
self.mod = cuda.module_from_buffer(self.ptx.source)
except (cuda.CompileError, cuda.RuntimeError), e:
print "Aww, dang, compile error. Here's the source:"
2010-09-06 11:18:20 -04:00
self.ptx.print_source()
2010-08-28 16:56:05 -04:00
raise e
if verbose:
2010-09-02 16:12:22 -04:00
if verbose >= 3:
2010-09-06 11:18:20 -04:00
self.ptx.print_source()
for entry in self.ptx.entries:
func = self.mod.get_function(entry.entry_name)
print "Compiled %s: used %d regs, %d sm, %d local" % (
entry.entry_name, func.num_regs,
func.shared_size_bytes, func.local_size_bytes)
2010-08-28 16:56:05 -04:00
2010-09-10 14:43:20 -04:00
def call_setup(self, entry_inst):
for inst in self.ptx.entry_deps[type(entry_inst)]:
inst.call_setup(self)
def call_teardown(self, entry_inst):
okay = True
for inst in reversed(self.ptx.entry_deps[type(entry_inst)]):
if inst is entry_inst and isinstance(entry_inst, PTXTest):
try:
inst.call_teardown(self)
except PTXTestFailure, e:
print "PTX Test %s failed!" % inst.entry_name, e
okay = False
2010-08-28 16:56:05 -04:00
else:
2010-09-10 14:43:20 -04:00
inst.call_teardown(self)
return okay
2010-08-28 16:56:05 -04:00
def run_tests(self):
2010-09-10 14:43:20 -04:00
if not self.ptx.tests:
print "No tests to run."
return True
all_okay = True
for test in self.ptx.tests:
cuda.Context.synchronize()
if test.call(self):
print "Test %s passed." % test.entry_name
else:
print "Test %s FAILED." % test.entry_name
all_okay = False
return all_okay
2010-08-28 16:56:05 -04:00