Known broken checkin because I'm nervous.

This commit is contained in:
Steven Robertson 2010-10-01 01:20:20 -04:00
parent b938c320a8
commit c0e3c1d599
5 changed files with 786 additions and 1164 deletions

View File

@@ -0,0 +1,32 @@
from collections import namedtuple

Flag = namedtuple('Flag', 'level desc')

class DebugSettings(object):
    """
    Container for default debug settings.
    """
    def __init__(self, items):
        self.items = items
        self.values = {}
        self.level = 1

    def __getattr__(self, name):
        if name not in self.items:
            raise KeyError("Unknown debug flag name!")
        if name in self.values:
            return self.values[name]
        return (self.items[name].level <= self.level)

    def format_help(self):
        name_len = min(30, max(map(len, self.items.keys())))
        fmt = '%-' + str(name_len) + 's %d %s'
        return '\n'.join([fmt % (k, v.level, v.desc)
                          for k, v in self.items.items()])

debug_flags = dict(
    count_writes = Flag(3, "Count the number of points written per thread "
                           "when doing iterations."),
    count_rounds = Flag(3, "Count the number of times the iteration loop "
                           "runs per thread when doing iterations.")
)
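A short usage sketch may help here; nothing below is part of this commit, and the `debug` name is illustrative. Flag lookups fall through `__getattr__`: an explicit entry in `values` wins, otherwise a flag reads as true when its level clears the global threshold.

# Illustrative only: exercising DebugSettings with the flags defined above.
debug = DebugSettings(debug_flags)
debug.level = 3                       # threshold now admits level-3 flags
assert debug.count_writes             # 3 <= 3, so the flag reads as True
debug.values['count_rounds'] = False  # explicit overrides take precedence
assert not debug.count_rounds
print debug.format_help()             # one "name level desc" row per flag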

View File

@@ -1,39 +1,48 @@
# These imports are order-sensitive!
import pyglet
import pyglet.gl as gl
gl.get_current_context()
#import pyglet
#import pyglet.gl as gl
#gl.get_current_context()
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import pycuda.tools
import pycuda.gl as cudagl
import pycuda.gl.autoinit
#import pycuda.gl as cudagl
#import pycuda.gl.autoinit
import pycuda.autoinit
import numpy as np

from cuburn.ptx import PTXModule, PTXTest, PTXTestFailure
from cuburn.ptx import PTXFormatter

class Module(object):
    def __init__(self, entries):
        self.entries = entries
        self.source = self.compile(entries)
        self.mod = self.assemble(self.source)

    @staticmethod
    def compile(entries):
        formatter = PTXFormatter()
        for entry in entries:
            entry.format_source(formatter)
        return formatter.get_source()

    def assemble(self, src):
        # TODO: make this a debugging option
        with open('/tmp/cuburn.ptx', 'w') as f: f.write(src)
        try:
            mod = cuda.module_from_buffer(src,
                [(cuda.jit_option.OPTIMIZATION_LEVEL, 0),
                 (cuda.jit_option.TARGET_FROM_CUCONTEXT, 1)])
        except (cuda.CompileError, cuda.RuntimeError), e:
            # TODO: if output not written above, print different message
            # TODO: read assembler output and recover Python source lines
            print "Compile error. Source is at /tmp/cuburn.ptx"
            print e
            raise e
        return mod

class LaunchContext(object):
    """
    Context collecting the information needed to create, run, and gather the
    results of a device computation. This may eventually also include an
    actual CUDA context, but for now it just uses the global one.

    To create the fastest device code across multiple device families, this
    context may decide to iteratively refine the final PTX by regenerating
    and recompiling it several times to optimize certain parameters of the
    launch, such as the distribution of threads throughout the device.

    The properties of this device which are tuned are listed below. Any PTX
    fragments which use this information must emit valid PTX for any state
    given below, but the PTX is only required to actually run with the final,
    fixed values of all tuned parameters below.

    `block`:    3-tuple of (x,y,z); dimensions of each CTA.
    `grid`:     2-tuple of (x,y); dimensions of the grid of CTAs.
    `nthreads`: Number of active threads on device as a whole.
    `mod`:      Final compiled module. Unavailable during assembly.
    """
    def __init__(self, entries, block=(1,1,1), grid=(1,1), tests=False):
        self.entry_types = entries
        self.block, self.grid, self.build_tests = block, grid, tests
@@ -60,18 +69,6 @@ class LaunchContext(object):
        kwargs['ctx'] = self
        self.ptx = PTXModule(self.entry_types, kwargs, self.build_tests)
        # TODO: make this optional and let user choose path
        with open('/tmp/cuburn.ptx', 'w') as f: f.write(self.ptx.source)
        try:
            # TODO: detect/customize arch, code; verbose setting;
            #       keep directory enable/disable via debug
            self.mod = cuda.module_from_buffer(self.ptx.source,
                [(cuda.jit_option.OPTIMIZATION_LEVEL, 0),
                 (cuda.jit_option.TARGET_FROM_CUCONTEXT, 1)])
        except (cuda.CompileError, cuda.RuntimeError), e:
            # TODO: if output not written above, print different message
            print "Compile error. Source is at /tmp/cuburn.ptx"
            print e
            raise e
        if verbose:
            for entry in self.ptx.entries:
                func = self.mod.get_function(entry.entry_name)
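A rough usage sketch, based only on the signature and docstring above; `SomeEntry` is a hypothetical PTX entry type, and the compile step partially shown in this hunk is assumed to have run:

# Illustrative only; SomeEntry stands in for a real PTX entry type.
ctx = LaunchContext([SomeEntry], block=(128,1,1), grid=(64,1), tests=True)
# After compilation, ctx.ptx holds the generated PTXModule and ctx.mod the
# loaded CUDA module; per the docstring, block and grid may be retuned
# before their final values are fixed.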

View File

@@ -523,175 +523,130 @@ class ShufflePoints(PTXFragment):
            op.bar.sync(bar)
            op.ld.volatile.shared.b32(var, addr(shuf_read))

class MWCRNG(PTXFragment):
    shortname = "mwc"

    def __init__(self):
        self.threads_ready = 0

class MWCRNG(object):
    def __init__(self, entry, seed=None):
        # TODO: install this in data directory or something
        if not os.path.isfile('primes.bin'):
            raise EnvironmentError('primes.bin not found')
        self.nthreads_ready = 0
        self.mults, self.states = None, None
    @ptx_func
    def module_setup(self):
        mem.global_.u32('mwc_rng_mults', ctx.nthreads)
        mem.global_.u64('mwc_rng_state', ctx.nthreads)

        self.entry = entry
        entry.add_param('mwc_mults', entry.types.u32)
        entry.add_param('mwc_states', entry.types.u32)
        r, o = entry.regs, entry.ops
        with entry.head as e:
            #mwc_mult_addr = gtid * 4 + e.params.mwc_mults
            gtid = o.mad.lo(e.special.ctaid_x, ctx.threads_per_cta,
                            e.special.tid_x)
            mwc_mult_addr = o.mad.lo.u32(gtid, 4, e.params.mwc_mults)
            r.mwc_mult = o.load.u32(mwc_mult_addr)
            mwc_state_addr = o.mad.lo.u32(gtid, 8, e.params.mwc_states)
            r.mwc_state, r.mwc_carry = o.load.u64(mwc_state_addr)
        with entry.tail as e:
            #gtid = e.special.ctaid_x * ctx.threads_per_cta + e.special.tid_x
            gtid = o.mad.lo(e.special.ctaid_x, ctx.threads_per_cta,
                            e.special.tid_x)
            mwc_state_addr = o.mad.lo.u32(gtid, 8, e.params.mwc_states)
            o.store.v2(mwc_state_addr, (r.mwc_state, r.mwc_carry))
    @ptx_func
    def entry_setup(self):
        reg.u32('mwc_st mwc_mult mwc_car')
        with block('Load MWC multipliers and states'):
            reg.u32('mwc_off mwc_addr')
            std.get_gtid(mwc_off)
            op.mov.u32(mwc_addr, mwc_rng_mults)
            op.mad.lo.u32(mwc_addr, mwc_off, 4, mwc_addr)
            op.ld.global_.u32(mwc_mult, addr(mwc_addr))

    def next_b32(self):
        e, r, o = self.entry, self.entry.regs, self.entry.ops
        mwc_out = o.cvt.u64(r.mwc_carry)
        mwc_out = o.mad.wide.u32(r.mwc_mult, r.mwc_state, mwc_out)
        r.mwc_state, r.mwc_carry = o.mov(mwc_out)
        return r.mwc_state

            op.mov.u32(mwc_addr, mwc_rng_state)
            op.mad.lo.u32(mwc_addr, mwc_off, 8, mwc_addr)
            op.ld.global_.v2.u32(vec(mwc_st, mwc_car), addr(mwc_addr))

    def next_f32_01(self):
        e, r, o = self.entry, self.entry.regs, self.entry.ops
        mwc_float = o.cvt.rn.f32.u32(self.next_b32())
        # TODO: check the precision on the uploaded types here
        return o.mul.f32(mwc_float, 1./(1<<32))

    @ptx_func
    def entry_teardown(self):
        with block('Save MWC states'):
            reg.u32('mwc_off mwc_addr')
            std.get_gtid(mwc_off)
            op.mov.u32(mwc_addr, mwc_rng_state)
            op.mad.lo.u32(mwc_addr, mwc_off, 8, mwc_addr)
            op.st.global_.v2.u32(addr(mwc_addr), vec(mwc_st, mwc_car))

    def next_f32_11(self):
        e, r, o = self.entry, self.entry.regs, self.entry.ops
        mwc_float = o.cvt.rn.f32.s32(self.next_b32())
        return o.mul.f32(mwc_float, 1./(1<<31))

    @ptx_func
    def _next(self):
        # Call from inside a block!
        reg.u64('mwc_out')
        op.cvt.u64.u32(mwc_out, mwc_car)
        op.mad.wide.u32(mwc_out, mwc_st, mwc_mult, mwc_out)
        op.mov.b64(vec(mwc_st, mwc_car), mwc_out)

    @ptx_func
    def next_b32(self, dst_reg):
        with block('Load next random u32 into ' + dst_reg.name):
            self._next()
            op.mov.u32(dst_reg, mwc_st)

    @ptx_func
    def next_f32_01(self, dst_reg):
        # TODO: verify that this is the fastest-performance method
        # TODO: verify that this actually does what I think it does
        with block('Load random float [0,1] into ' + dst_reg.name):
            self._next()
            op.cvt.rn.f32.u32(dst_reg, mwc_st)
            op.mul.f32(dst_reg, dst_reg, '0f2F800000') # 1./(1<<32)

    @ptx_func
    def next_f32_11(self, dst_reg):
        with block('Load random float [-1,1) into ' + dst_reg.name):
            reg.u32('mwc_to_float')
            self._next()
            op.cvt.rn.f32.s32(dst_reg, mwc_st)
            op.mul.f32(dst_reg, dst_reg, '0f30000000') # 1./(1<<31)
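Both hex-float literals above ('0f2F800000', '0f30000000') are raw IEEE-754 bit patterns in PTX syntax. A small host-side sanity check, offered as a sketch rather than code from this commit:

import struct
# 0x2F800000 decodes to 2**-32 and 0x30000000 to 2**-31; both are exact
# powers of two, so the equality comparisons below are safe.
assert struct.unpack('>f', struct.pack('>I', 0x2F800000))[0] == 1.0 / (1 << 32)
assert struct.unpack('>f', struct.pack('>I', 0x30000000))[0] == 1.0 / (1 << 31)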
    @instmethod
    def seed(self, ctx, rand=np.random):
    def call_setup(self, ctx, force=False, rand=np.random):
        """
        Seed the random number generators with values taken from a
        ``np.random`` instance.
        """
        # Load raw big-endian u32 multipliers from primes.bin.
        with open('primes.bin') as primefp:
            dt = np.dtype(np.uint32).newbyteorder('B')
            mults = np.frombuffer(primefp.read(), dtype=dt)
        stream = cuda.Stream()
        # Randomness in choosing multipliers is good, but larger multipliers
        # have longer periods, which is also good. This is a compromise.
        mults = np.array(mults[:ctx.nthreads*4])
        rand.shuffle(mults)
        # Copy multipliers and seeds to the device
        multdp, multl = ctx.mod.get_global('mwc_rng_mults')
        cuda.memcpy_htod(multdp, mults.tostring()[:multl])
        # Intentionally excludes both 0 and (2^32-1), as they can lead to
        # degenerate sequences of period 0
        states = np.array(rand.randint(1, 0xffffffff, size=2*ctx.nthreads),
                          dtype=np.uint32)
        statedp, statel = ctx.mod.get_global('mwc_rng_state')
        cuda.memcpy_htod(statedp, states.tostring())
        self.threads_ready = ctx.nthreads
        if force or self.nthreads_ready < ctx.nthreads:
            # Load raw big-endian u32 multipliers from primes.bin.
            with open('primes.bin') as primefp:
                dt = np.dtype(np.uint32).newbyteorder('B')
                mults = np.frombuffer(primefp.read(), dtype=dt)
            # Randomness in choosing multipliers is good, but larger
            # multipliers have longer periods, which is also good. This is
            # a compromise.
            mults = np.array(mults[:ctx.nthreads*4])
            rand.shuffle(mults)
            locked_mults = ctx.hostpool.allocate(ctx.nthreads, np.uint32)
            locked_mults[:] = mults[:ctx.nthreads]
            self.mults = ctx.pool.allocate(4*ctx.nthreads)
            cuda.memcpy_htod_async(self.mults, locked_mults.base, ctx.stream)
            # Intentionally excludes both 0 and (2^32-1), as they can lead to
            # degenerate sequences of period 0
            states = np.array(rand.randint(1, 0xffffffff, size=2*ctx.nthreads),
                              dtype=np.uint32)
            locked_states = ctx.hostpool.allocate(2*ctx.nthreads, np.uint32)
            locked_states[:] = states
            self.states = ctx.pool.allocate(8*ctx.nthreads)
            cuda.memcpy_htod_async(self.states, locked_states, ctx.stream)
            self.nthreads_ready = ctx.nthreads
        ctx.set_param('mwc_mults', self.mults)
        ctx.set_param('mwc_states', self.states)
    def call_setup(self, ctx):
        if self.threads_ready < ctx.nthreads:
            self.seed(ctx)

    def tests(self):
        return [MWCRNGTest, MWCRNGFloatsTest]
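The new call_setup above stages data in page-locked host memory and copies it asynchronously on the context's stream. Stripped of the cuburn pool helpers, the same PyCUDA pattern looks roughly like this (a sketch, not code from this commit):

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda

stream = cuda.Stream()
# Page-locked (pinned) host buffer, required for truly async DMA transfers
locked = cuda.pagelocked_empty(1024, np.uint32)
locked[:] = np.arange(1024, dtype=np.uint32)
dbuf = cuda.mem_alloc(locked.nbytes)
cuda.memcpy_htod_async(dbuf, locked, stream)  # returns before the copy ends
stream.synchronize()                          # wait before reusing 'locked'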
class MWCRNGTest(PTXTest):
    name = "MWC RNG sum-of-threads"

class MWCRNGTest(PTXEntry):
    rounds = 5000
    entry_name = 'MWC_RNG_test'
    entry_params = ''

    def deps(self):
        return [MWCRNG]

    def __init__(self, entry):
        self.entry = entry
        self.mwc = MWCRNG(entry)

    @ptx_func
    def module_setup(self):
        mem.global_.u64('mwc_rng_test_sums', ctx.nthreads)

        entry.add_param('mwc_test_sums', entry.types.u32)
        with entry.body():
            self.entry_body()
    @ptx_func
    def entry(self):
        reg.u64('sum addl')
        reg.u32('addend')
        op.mov.u64(sum, 0)
        with block('Sum next %d random numbers' % self.rounds):
            reg.u32('loopct')
            reg.pred('p')
            op.mov.u32(loopct, self.rounds)
            label('loopstart')
            mwc.next_b32(addend)
            op.cvt.u64.u32(addl, addend)
            op.add.u64(sum, sum, addl)
            op.sub.u32(loopct, loopct, 1)
            op.setp.gt.u32(p, loopct, 0)
            op.bra.uni(loopstart, ifp=p)

    def entry_body(self):
        e, r, o = self.entry, self.entry.regs, self.entry.ops

        with block('Store sum and state'):
            reg.u32('adr offset')
            std.get_gtid(offset)
            op.mov.u32(adr, mwc_rng_test_sums)
            op.mad.lo.u32(adr, offset, 8, adr)
            op.st.global_.u64(addr(adr), sum)

        r.sum = 0
        with e.std.loop(self.rounds) as mwc_rng_sum:
            addend = o.cvt.u64.u32(self.mwc.next_b32())
            r.sum = o.add.u64(r.sum, addend)

    def call_setup(self, ctx):
        # Get current multipliers and seeds from the device
        self.mults = ctx.get_per_thread('mwc_rng_mults', np.uint32)
        self.fullstates = ctx.get_per_thread('mwc_rng_state', np.uint64)
        self.sums = np.zeros(ctx.nthreads, np.uint64)

        e.std.store_per_thread(e.params.mwc_test_sums, r.sum)
print "Running %d states forward %d rounds" % \
(len(self.mults), self.rounds)
ctime = time.time()
for i in range(self.rounds):
states = self.fullstates & 0xffffffff
carries = self.fullstates >> 32
self.fullstates = self.mults * states + carries
self.sums += self.fullstates & 0xffffffff
ctime = time.time() - ctime
print "Done on host, took %g seconds" % ctime
    def call(self, ctx):
        # Generate current state, upload it to GPU
        self.mwc.call_setup(ctx, force=True)
        mults, fullstates = self.mwc.mults, self.mwc.fullstates
        sums = np.zeros_like(fullstates)
    def call_teardown(self, ctx):
        dfullstates = ctx.get_per_thread('mwc_rng_state', np.uint64)
        if not (dfullstates == self.fullstates).all():
            print "State discrepancy"
            print dfullstates
            print self.fullstates
            raise PTXTestFailure("MWC RNG state discrepancy")
        # Run two trials, to ensure device state is getting saved properly
        for trial in range(2):
            print "Trial %d, on CPU: " % trial,
            ctime = time.time()
            for i in range(self.rounds):
                states = fullstates & 0xffffffff
                carries = fullstates >> 32
                fullstates = mults * states + carries
                sums += fullstates & 0xffffffff
            ctime = time.time() - ctime
            print "Took %g seconds." % ctime
            print "Trial %d, on device: " % trial,
            dsums = np.empty_like(sums)
            ctx.set_param('mwc_test_sums', cuda.Out(dsums))
            print "Took %g seconds." % ctx.call()
        dsums = ctx.get_per_thread('mwc_rng_test_sums', np.uint64)
        if not (dsums == self.sums).all():
            print "Sum discrepancy"
            print dsums
            print self.sums
            raise PTXTestFailure("MWC RNG sum discrepancy")
            if not np.all(np.equal(sums, dsums)):
                print "Sum discrepancy!"
                print sums
                print dsums
                raise TODOSomeKindOfException()
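The host-side loops above both implement the multiply-with-carry recurrence that the device code evaluates per thread. Condensed into a standalone sketch (the multiplier below is illustrative, not necessarily one from primes.bin):

import numpy as np

def mwc_step(mults, fullstates):
    # One MWC step per lane. The 64-bit state packs (carry << 32) | state;
    # the update is state' = mult * state + carry, which cannot overflow
    # 64 bits when mult, state, and carry are all 32-bit values.
    states = fullstates & 0xffffffff
    carries = fullstates >> 32
    return mults * states + carries

mults = np.array([4294963023], dtype=np.uint64)    # illustrative multiplier
state = np.array([(7 << 32) | 12345], dtype=np.uint64)
for _ in range(5):
    state = mwc_step(mults, state)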
class MWCRNGFloatsTest(PTXTest):
    """

File diff suppressed because it is too large

View File

@@ -16,6 +16,8 @@ from ctypes import *
import numpy as np
np.set_printoptions(precision=5, edgeitems=20)
from cuburn.device_code import *
from cuburn.cuda import LaunchContext
from fr0stlib.pyflam3 import *