Known broken checkin because I'm nervous.

This commit is contained in:
Steven Robertson 2010-10-01 01:20:20 -04:00
parent b938c320a8
commit c0e3c1d599
5 changed files with 786 additions and 1164 deletions

View File

@ -0,0 +1,32 @@
from collections import namedtuple
# A debug flag: `level` is the debug level at which the flag turns on by
# default, `desc` is its human-readable help text.
Flag = namedtuple('Flag', ['level', 'desc'])
class DebugSettings(object):
    """
    Container for default debug settings.

    ``items`` maps flag names to objects with ``level`` and ``desc``
    attributes. Each flag name can be read as an attribute of this object:
    the result is the explicit override stored in ``values`` if one exists,
    otherwise whether the flag's level is at or below the current ``level``.
    """

    class UnknownFlag(AttributeError, KeyError):
        """
        Raised when an unregistered flag name is read.

        Derives from ``AttributeError`` so that ``hasattr`` and three-argument
        ``getattr`` behave normally, and from ``KeyError`` so that callers
        which caught the previously raised ``KeyError`` keep working.
        """

    def __init__(self, items):
        self.items = items
        self.values = {}
        self.level = 1

    def __getattr__(self, name):
        """
        Return the effective on/off state of the debug flag ``name``.

        Raises ``UnknownFlag`` if ``name`` is not a registered flag.
        """
        # __getattr__ only runs after normal lookup has failed. Read our own
        # state straight from __dict__: going through ``self.items`` on a
        # half-built instance (as created by copy or unpickling) would
        # re-enter this method and recurse without bound.
        state = self.__dict__
        try:
            items = state['items']
            values = state['values']
            level = state['level']
        except KeyError:
            raise AttributeError(name)
        if name not in items:
            raise self.UnknownFlag("Unknown debug flag name!")
        if name in values:
            return values[name]
        return items[name].level <= level

    def format_help(self):
        """
        Return one line per flag, formatted as "name level description".

        Names are left-justified to the longest flag name, capped at 30
        columns. Returns an empty string when no flags are registered.
        """
        if not self.items:
            return ''
        name_len = min(30, max(len(name) for name in self.items))
        # Build the row format, e.g. '%-12s %d %s'. The width must be
        # rendered into the string; it cannot be concatenated as an int.
        fmt = '%%-%ds %%d %%s' % name_len
        return '\n'.join(fmt % (k, v.level, v.desc)
                         for k, v in self.items.items())
# Registry of the known debug flags, keyed by flag name.
debug_flags = {
    'count_writes': Flag(
        3,
        "Count the number of points written per thread "
        "when doing iterations."),
    'count_rounds': Flag(
        3,
        "Count the number of times the iteration loop "
        "runs per thread when doing iterations."),
}

View File

@ -1,39 +1,48 @@
# These imports are order-sensitive! # These imports are order-sensitive!
import pyglet #import pyglet
import pyglet.gl as gl #import pyglet.gl as gl
gl.get_current_context() #gl.get_current_context()
import pycuda.driver as cuda import pycuda.driver as cuda
from pycuda.compiler import SourceModule from pycuda.compiler import SourceModule
import pycuda.tools import pycuda.tools
import pycuda.gl as cudagl #import pycuda.gl as cudagl
import pycuda.gl.autoinit #import pycuda.gl.autoinit
import pycuda.autoinit
import numpy as np import numpy as np
from cuburn.ptx import PTXModule, PTXTest, PTXTestFailure from cuburn.ptx import PTXFormatter
class Module(object):
    """
    A set of PTX entries, formatted to source and loaded through the driver.

    Construction formats every entry into a single PTX source string
    (``self.source``) and immediately hands it to the CUDA driver; the
    resulting module is stored in ``self.mod``.
    """

    # Where the generated PTX is dumped before assembly, for inspection.
    _PTX_DUMP_PATH = '/tmp/cuburn.ptx'

    def __init__(self, entries):
        self.entries = entries
        self.source = self.compile(entries)
        self.mod = self.assemble(self.source)

    @staticmethod
    def compile(entries):
        """
        Format each of ``entries`` into one formatter and return its source.
        """
        formatter = PTXFormatter()
        for entry in entries:
            entry.format_source(formatter)
        return formatter.get_source()

    def assemble(self, src):
        """
        Dump ``src`` to disk, then load it with ``cuda.module_from_buffer``.

        On ``cuda.CompileError`` or ``cuda.RuntimeError`` the error and the
        dump location are printed and the exception is re-raised.
        """
        # TODO: make this a debugging option
        with open(self._PTX_DUMP_PATH, 'w') as f:
            f.write(src)
        try:
            return cuda.module_from_buffer(src,
                [(cuda.jit_option.OPTIMIZATION_LEVEL, 0),
                 (cuda.jit_option.TARGET_FROM_CUCONTEXT, 1)])
        except (cuda.CompileError, cuda.RuntimeError) as e:
            # TODO: if output not written above, print different message
            # TODO: read assembler output and recover Python source lines
            print("Compile error. Source is at %s" % self._PTX_DUMP_PATH)
            print(e)
            # Bare raise keeps the original traceback; `raise e` would
            # restart it from this line under Python 2.
            raise
class LaunchContext(object): class LaunchContext(object):
"""
Context collecting the information needed to create, run, and gather the
results of a device computation. This may eventually also include an actual
CUDA context, but for now it just uses the global one.
To create the fastest device code across multiple device families, this
context may decide to iteratively refine the final PTX by regenerating
and recompiling it several times to optimize certain parameters of the
launch, such as the distribution of threads throughout the device.
The properties of this device which are tuned are listed below. Any PTX
fragments which use this information must emit valid PTX for any state
given below, but the PTX is only required to actually run with the final,
fixed values of all tuned parameters below.
`block`: 3-tuple of (x,y,z); dimensions of each CTA.
`grid`: 2-tuple of (x,y); dimensions of the grid of CTAs.
`nthreads`: Number of active threads on device as a whole.
`mod`: Final compiled module. Unavailable during assembly.
"""
def __init__(self, entries, block=(1,1,1), grid=(1,1), tests=False): def __init__(self, entries, block=(1,1,1), grid=(1,1), tests=False):
self.entry_types = entries self.entry_types = entries
self.block, self.grid, self.build_tests = block, grid, tests self.block, self.grid, self.build_tests = block, grid, tests
@ -60,18 +69,6 @@ class LaunchContext(object):
kwargs['ctx'] = self kwargs['ctx'] = self
self.ptx = PTXModule(self.entry_types, kwargs, self.build_tests) self.ptx = PTXModule(self.entry_types, kwargs, self.build_tests)
# TODO: make this optional and let user choose path # TODO: make this optional and let user choose path
with open('/tmp/cuburn.ptx', 'w') as f: f.write(self.ptx.source)
try:
# TODO: detect/customize arch, code; verbose setting;
# keep directory enable/disable via debug
self.mod = cuda.module_from_buffer(self.ptx.source,
[(cuda.jit_option.OPTIMIZATION_LEVEL, 0),
(cuda.jit_option.TARGET_FROM_CUCONTEXT, 1)])
except (cuda.CompileError, cuda.RuntimeError), e:
# TODO: if output not written above, print different message
print "Compile error. Source is at /tmp/cuburn.ptx"
print e
raise e
if verbose: if verbose:
for entry in self.ptx.entries: for entry in self.ptx.entries:
func = self.mod.get_function(entry.entry_name) func = self.mod.get_function(entry.entry_name)

View File

@ -523,175 +523,130 @@ class ShufflePoints(PTXFragment):
op.bar.sync(bar) op.bar.sync(bar)
op.ld.volatile.shared.b32(var, addr(shuf_read)) op.ld.volatile.shared.b32(var, addr(shuf_read))
class MWCRNG(PTXFragment): class MWCRNG(object):
shortname = "mwc" def __init__(self, entry, seed=None):
# TODO: install this in data directory or something
def __init__(self):
self.threads_ready = 0
if not os.path.isfile('primes.bin'): if not os.path.isfile('primes.bin'):
raise EnvironmentError('primes.bin not found') raise EnvironmentError('primes.bin not found')
self.threads_ready = 0
self.mults, self.state = None, None
@ptx_func self.entry = entry
def module_setup(self): entry.add_param('mwc_mults', entry.types.u32)
mem.global_.u32('mwc_rng_mults', ctx.nthreads) entry.add_param('mwc_states', entry.types.u32)
mem.global_.u64('mwc_rng_state', ctx.nthreads) r, o = entry.regs, entry.ops
with entry.head as e:
#mwc_mult_addr = gtid * 4 + e.params.mwc_mults
gtid = o.mad.lo(e.special.ctaid_x, ctx.threads_per_cta,
e.special.tid_x)
mwc_mult_addr = o.mad.lo.u32(gtid, 4, e.params.mwc_mults)
r.mwc_mult = o.load.u32(mwc_mult_addr)
mwc_state_addr = o.mad.lo.u32(gtid, 8, e.params.mwc_states)
r.mwc_state, r.mwc_carry = o.load.u64(mwc_state_addr)
with entry.tail as e:
#gtid = e.special.ctaid_x * ctx.threads_per_cta + e.special.tid_x
gtid = o.mad.lo(e.special.ctaid_x, ctx.threads_per_cta,
e.special.tid_x)
mwc_state_addr = o.mad.lo.u32(gtid, 8, e.params.mwc_states)
o.store.v2(mwc_state_addr, (r.mwc_state, r.mwc_carry))
@ptx_func def next_b32(self):
def entry_setup(self): e, r, o = self.entry, self.entry.regs, self.entry.ops
reg.u32('mwc_st mwc_mult mwc_car') mwc_out = o.cvt.u64(r.mwc_carry)
with block('Load MWC multipliers and states'): mwc_out = o.mad.wide.u32(r.mwc_mult, r.mwc_state, mwc_out)
reg.u32('mwc_off mwc_addr') r.mwc_state, r.mwc_carry = o.mov(mwc_out)
std.get_gtid(mwc_off) return r.mwc_state
op.mov.u32(mwc_addr, mwc_rng_mults)
op.mad.lo.u32(mwc_addr, mwc_off, 4, mwc_addr)
op.ld.global_.u32(mwc_mult, addr(mwc_addr))
op.mov.u32(mwc_addr, mwc_rng_state) def next_f32_01(self):
op.mad.lo.u32(mwc_addr, mwc_off, 8, mwc_addr) e, r, o = self.entry, self.entry.regs, self.entry.ops
op.ld.global_.v2.u32(vec(mwc_st, mwc_car), addr(mwc_addr)) mwc_float = o.cvt.rn.f32.u32(self.next_b32())
# TODO: check the precision on the uploaded types here
return o.mul.f32(mwc_float, 1./(1<<32))
@ptx_func def next_f32_11(self):
def entry_teardown(self): e, r, o = self.entry, self.entry.regs, self.entry.ops
with block('Save MWC states'): mwc_float = o.cvt.rn.f32.s32(self.next_b32())
reg.u32('mwc_off mwc_addr') return o.mul.f32(mwc_float, 1./(1<<31))
std.get_gtid(mwc_off)
op.mov.u32(mwc_addr, mwc_rng_state)
op.mad.lo.u32(mwc_addr, mwc_off, 8, mwc_addr)
op.st.global_.v2.u32(addr(mwc_addr), vec(mwc_st, mwc_car))
@ptx_func def call_setup(self, ctx, force=False):
def _next(self):
# Call from inside a block!
reg.u64('mwc_out')
op.cvt.u64.u32(mwc_out, mwc_car)
op.mad.wide.u32(mwc_out, mwc_st, mwc_mult, mwc_out)
op.mov.b64(vec(mwc_st, mwc_car), mwc_out)
@ptx_func
def next_b32(self, dst_reg):
with block('Load next random u32 into ' + dst_reg.name):
self._next()
op.mov.u32(dst_reg, mwc_st)
@ptx_func
def next_f32_01(self, dst_reg):
# TODO: verify that this is the fastest-performance method
# TODO: verify that this actually does what I think it does
with block('Load random float [0,1] into ' + dst_reg.name):
self._next()
op.cvt.rn.f32.u32(dst_reg, mwc_st)
op.mul.f32(dst_reg, dst_reg, '0f2F800000') # 1./(1<<32)
@ptx_func
def next_f32_11(self, dst_reg):
with block('Load random float [-1,1) into ' + dst_reg.name):
reg.u32('mwc_to_float')
self._next()
op.cvt.rn.f32.s32(dst_reg, mwc_st)
op.mul.f32(dst_reg, dst_reg, '0f30000000') # 1./(1<<31)
@instmethod
def seed(self, ctx, rand=np.random):
""" """
Seed the random number generators with values taken from a Seed the random number generators with values taken from a
``np.random`` instance. ``np.random`` instance.
""" """
if force or self.nthreads_ready < ctx.nthreads:
# Load raw big-endian u32 multipliers from primes.bin. # Load raw big-endian u32 multipliers from primes.bin.
with open('primes.bin') as primefp: with open('primes.bin') as primefp:
dt = np.dtype(np.uint32).newbyteorder('B') dt = np.dtype(np.uint32).newbyteorder('B')
mults = np.frombuffer(primefp.read(), dtype=dt) mults = np.frombuffer(primefp.read(), dtype=dt)
stream = cuda.Stream()
# Randomness in choosing multipliers is good, but larger multipliers # Randomness in choosing multipliers is good, but larger multipliers
# have longer periods, which is also good. This is a compromise. # have longer periods, which is also good. This is a compromise.
mults = np.array(mults[:ctx.nthreads*4]) mults = np.array(mults[:ctx.nthreads*4])
rand.shuffle(mults) rand.shuffle(mults)
# Copy multipliers and seeds to the device locked_mults = ctx.hostpool.allocate(ctx.nthreads, np.uint32)
multdp, multl = ctx.mod.get_global('mwc_rng_mults') locked_mults[:] = mults[ctx.nthreads]
cuda.memcpy_htod(multdp, mults.tostring()[:multl]) self.mults = ctx.pool.allocate(4*ctx.nthreads)
cuda.memcpy_htod_async(self.mults, locked_mults.base, ctx.stream)
# Intentionally excludes both 0 and (2^32-1), as they can lead to # Intentionally excludes both 0 and (2^32-1), as they can lead to
# degenerate sequences of period 0 # degenerate sequences of period 0
states = np.array(rand.randint(1, 0xffffffff, size=2*ctx.nthreads), states = np.array(rand.randint(1, 0xffffffff, size=2*ctx.nthreads),
dtype=np.uint32) dtype=np.uint32)
statedp, statel = ctx.mod.get_global('mwc_rng_state') locked_states = ctx.hostpool.allocate(2*ctx.nthreads, np.uint32)
cuda.memcpy_htod(statedp, states.tostring()) locked_states[:] = states
self.threads_ready = ctx.nthreads self.states = ctx.pool.allocate(8*ctx.nthreads)
cuda.memcpy_htod_async(self.states, locked_states, ctx.stream)
self.nthreads_ready = ctx.nthreads
ctx.set_param('mwc_mults', self.mults)
ctx.set_param('mwc_states', self.states)
def call_setup(self, ctx): class MWCRNGTest(PTXEntry):
if self.threads_ready < ctx.nthreads:
self.seed(ctx)
def tests(self):
return [MWCRNGTest, MWCRNGFloatsTest]
class MWCRNGTest(PTXTest):
name = "MWC RNG sum-of-threads"
rounds = 5000 rounds = 5000
entry_name = 'MWC_RNG_test'
entry_params = ''
def deps(self): def __init__(self, entry):
return [MWCRNG] self.entry = entry
self.mwc = MWCRNG(entry)
@ptx_func entry.add_param('mwc_test_sums', entry.types.u32)
def module_setup(self): with entry.body():
mem.global_.u64('mwc_rng_test_sums', ctx.nthreads) self.entry_body()
@ptx_func def entry_body(self):
def entry(self): e, r, o = self.entry, self.entry.regs, self.entry.ops
reg.u64('sum addl')
reg.u32('addend')
op.mov.u64(sum, 0)
with block('Sum next %d random numbers' % self.rounds):
reg.u32('loopct')
reg.pred('p')
op.mov.u32(loopct, self.rounds)
label('loopstart')
mwc.next_b32(addend)
op.cvt.u64.u32(addl, addend)
op.add.u64(sum, sum, addl)
op.sub.u32(loopct, loopct, 1)
op.setp.gt.u32(p, loopct, 0)
op.bra.uni(loopstart, ifp=p)
with block('Store sum and state'): r.sum = 0
reg.u32('adr offset') with e.std.loop(self.rounds) as mwc_rng_sum:
std.get_gtid(offset) addend = o.cvt.u64.u32(self.mwc.next_b32())
op.mov.u32(adr, mwc_rng_test_sums) r.sum = o.add.u64(r.sum, addend)
op.mad.lo.u32(adr, offset, 8, adr)
op.st.global_.u64(addr(adr), sum)
def call_setup(self, ctx): e.std.store_per_thread(e.params.mwc_test_sums, r.sum)
# Get current multipliers and seeds from the device
self.mults = ctx.get_per_thread('mwc_rng_mults', np.uint32)
self.fullstates = ctx.get_per_thread('mwc_rng_state', np.uint64)
self.sums = np.zeros(ctx.nthreads, np.uint64)
print "Running %d states forward %d rounds" % \ def call(self, ctx):
(len(self.mults), self.rounds) # Generate current state, upload it to GPU
self.mwc.call_setup(ctx, force=True)
mults, fullstates = self.mwc.mults, self.mwc.fullstates
sums = np.zeros_like(fullstates)
# Run two trials, to ensure device state is getting saved properly
for trial in range(2):
print "Trial %d, on CPU: " % trial,
ctime = time.time() ctime = time.time()
for i in range(self.rounds): for i in range(self.rounds):
states = self.fullstates & 0xffffffff states = fullstates & 0xffffffff
carries = self.fullstates >> 32 carries = fullstates >> 32
self.fullstates = self.mults * states + carries fullstates = self.mults * states + carries
self.sums += self.fullstates & 0xffffffff sums += fullstates & 0xffffffff
ctime = time.time() - ctime ctime = time.time() - ctime
print "Done on host, took %g seconds" % ctime print "Took %g seconds." % ctime
def call_teardown(self, ctx): print "Trial %d, on device: " % trial,
dfullstates = ctx.get_per_thread('mwc_rng_state', np.uint64) dsums = np.empty_like(sums)
if not (dfullstates == self.fullstates).all(): ctx.set_param('mwc_test_sums', cuda.Out(dsums))
print "State discrepancy" print "Took %g seconds." % ctx.call()
print dfullstates
print self.fullstates
raise PTXTestFailure("MWC RNG state discrepancy")
if not np.all(np.equal(sums, dsums)):
dsums = ctx.get_per_thread('mwc_rng_test_sums', np.uint64) print "Sum discrepancy!"
if not (dsums == self.sums).all(): print sums
print "Sum discrepancy"
print dsums print dsums
print self.sums raise TODOSomeKindOfException()
raise PTXTestFailure("MWC RNG sum discrepancy")
class MWCRNGFloatsTest(PTXTest): class MWCRNGFloatsTest(PTXTest):
""" """

File diff suppressed because it is too large Load Diff

View File

@ -16,6 +16,8 @@ from ctypes import *
import numpy as np import numpy as np
np.set_printoptions(precision=5, edgeitems=20)
from cuburn.device_code import * from cuburn.device_code import *
from cuburn.cuda import LaunchContext from cuburn.cuda import LaunchContext
from fr0stlib.pyflam3 import * from fr0stlib.pyflam3 import *