mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-03-15 16:01:29 -04:00
Known broken checkin because I'm nervous.
This commit is contained in:
parent
b938c320a8
commit
c0e3c1d599
@ -0,0 +1,32 @@
|
|||||||
|
|
||||||
|
from collections import namedtuple
|
||||||
|
|
||||||
|
# Record describing a single debug flag: `level` is compared against
# DebugSettings.level to decide whether the flag is on by default, and `desc`
# is the human-readable help text.
Flag = namedtuple('Flag', ['level', 'desc'])
|
||||||
|
|
||||||
|
class _UnknownFlagError(AttributeError, KeyError):
    """
    Raised when an unknown debug flag is requested from DebugSettings.

    Derives from AttributeError so that ``hasattr``, ``getattr`` with a
    default, ``copy`` and friends behave normally, and from KeyError so that
    existing callers which catch KeyError keep working.
    """


class DebugSettings(object):
    """
    Container for default debug settings.

    `items` maps flag names to objects with `level` and `desc` attributes
    (such as ``Flag``). A flag is read as an attribute of this object: an
    explicit entry in `values` wins; otherwise the flag is on when its own
    level is less than or equal to `level`.
    """
    def __init__(self, items):
        self.items = items
        self.values = {}
        self.level = 1

    def __getattr__(self, name):
        """
        Return the state of the debug flag called `name`.

        Only invoked when normal attribute lookup fails. Raises an error that
        is both an AttributeError and a KeyError if `name` is not a known
        flag.
        """
        # Go through __dict__ directly: if `items` has not been assigned yet
        # (e.g. while the instance is being copied or unpickled), reading
        # self.items would re-enter __getattr__ and recurse forever.
        items = self.__dict__.get('items')
        if items is None or name not in items:
            raise _UnknownFlagError("Unknown debug flag name!")
        values = self.__dict__.get('values', {})
        if name in values:
            return values[name]
        return (items[name].level <= self.level)

    def format_help(self):
        """
        Return one line per flag giving its name, level and description.

        Names are left-justified to the longest flag name, capped at 30
        columns. Returns an empty string when no flags are registered.
        """
        if not self.items:
            # max() of an empty sequence would raise ValueError.
            return ''
        name_len = min(30, max(len(k) for k in self.items))
        # Build e.g. '%-12s %d %s'; the width has to be formatted in, since a
        # str cannot be concatenated with an int.
        fmt = '%%-%ds %%d %%s' % name_len
        return '\n'.join(fmt % (k, v.level, v.desc)
                         for k, v in self.items.items())
|
||||||
|
|
||||||
|
# Registry of the known debug flags, keyed by flag name. Each entry records
# the level associated with the flag and a description of what it does.
debug_flags = {
    'count_writes': Flag(
        3,
        "Count the number of points written per thread "
        "when doing iterations."),
    'count_rounds': Flag(
        3,
        "Count the number of times the iteration loop "
        "runs per thread when doing iterations."),
}
|
||||||
|
|
@ -1,39 +1,48 @@
|
|||||||
# These imports are order-sensitive!
|
# These imports are order-sensitive!
|
||||||
import pyglet
|
#import pyglet
|
||||||
import pyglet.gl as gl
|
#import pyglet.gl as gl
|
||||||
gl.get_current_context()
|
#gl.get_current_context()
|
||||||
|
|
||||||
import pycuda.driver as cuda
|
import pycuda.driver as cuda
|
||||||
from pycuda.compiler import SourceModule
|
from pycuda.compiler import SourceModule
|
||||||
import pycuda.tools
|
import pycuda.tools
|
||||||
import pycuda.gl as cudagl
|
#import pycuda.gl as cudagl
|
||||||
import pycuda.gl.autoinit
|
#import pycuda.gl.autoinit
|
||||||
|
import pycuda.autoinit
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from cuburn.ptx import PTXModule, PTXTest, PTXTestFailure
|
from cuburn.ptx import PTXFormatter
|
||||||
|
|
||||||
|
class Module(object):
    """
    Generates PTX source from a list of entries and loads it as a module.

    After construction, `entries` holds the list that was passed in, `source`
    the text produced by `compile`, and `mod` the object returned by
    ``cuda.module_from_buffer`` for that text.
    """
    def __init__(self, entries):
        self.entries = entries
        self.source = self.compile(entries)
        self.mod = self.assemble(self.source)

    @staticmethod
    def compile(entries):
        """
        Return the source produced by formatting every entry in `entries`.

        Each entry is asked to write itself into a single shared
        ``PTXFormatter`` via ``entry.format_source(formatter)``; the result is
        whatever ``formatter.get_source()`` returns afterwards.
        """
        formatter = PTXFormatter()
        for entry in entries:
            entry.format_source(formatter)
        return formatter.get_source()

    def assemble(self, src):
        """
        Load `src` through ``cuda.module_from_buffer`` and return the module.

        The source is first written to /tmp/cuburn.ptx so it can be inspected
        if loading fails. On ``cuda.CompileError`` or ``cuda.RuntimeError``
        a short notice and the error are printed and the exception is
        re-raised.
        """
        # TODO: make this a debugging option
        with open('/tmp/cuburn.ptx', 'w') as f:
            f.write(src)
        try:
            mod = cuda.module_from_buffer(src,
                [(cuda.jit_option.OPTIMIZATION_LEVEL, 0),
                 (cuda.jit_option.TARGET_FROM_CUCONTEXT, 1)])
        except (cuda.CompileError, cuda.RuntimeError) as e:
            # TODO: if output not written above, print different message
            # TODO: read assembler output and recover Python source lines
            # Single-argument print(...) behaves the same as the print
            # statement on Python 2 and is valid on Python 3.
            print("Compile error. Source is at /tmp/cuburn.ptx")
            print(e)
            # Bare raise keeps the original traceback; `raise e` would
            # restart it from this line on Python 2.
            raise
        return mod
|
||||||
|
|
||||||
class LaunchContext(object):
|
class LaunchContext(object):
|
||||||
"""
|
|
||||||
Context collecting the information needed to create, run, and gather the
|
|
||||||
results of a device computation. This may eventually also include an actual
|
|
||||||
CUDA context, but for now it just uses the global one.
|
|
||||||
|
|
||||||
To create the fastest device code across multiple device families, this
|
|
||||||
context may decide to iteratively refine the final PTX by regenerating
|
|
||||||
and recompiling it several times to optimize certain parameters of the
|
|
||||||
launch, such as the distribution of threads throughout the device.
|
|
||||||
The properties of this device which are tuned are listed below. Any PTX
|
|
||||||
fragments which use this information must emit valid PTX for any state
|
|
||||||
given below, but the PTX is only required to actually run with the final,
|
|
||||||
fixed values of all tuned parameters below.
|
|
||||||
|
|
||||||
`block`: 3-tuple of (x,y,z); dimensions of each CTA.
|
|
||||||
`grid`: 2-tuple of (x,y); dimensions of the grid of CTAs.
|
|
||||||
`nthreads`: Number of active threads on device as a whole.
|
|
||||||
`mod`: Final compiled module. Unavailable during assembly.
|
|
||||||
|
|
||||||
"""
|
|
||||||
def __init__(self, entries, block=(1,1,1), grid=(1,1), tests=False):
|
def __init__(self, entries, block=(1,1,1), grid=(1,1), tests=False):
|
||||||
self.entry_types = entries
|
self.entry_types = entries
|
||||||
self.block, self.grid, self.build_tests = block, grid, tests
|
self.block, self.grid, self.build_tests = block, grid, tests
|
||||||
@ -60,18 +69,6 @@ class LaunchContext(object):
|
|||||||
kwargs['ctx'] = self
|
kwargs['ctx'] = self
|
||||||
self.ptx = PTXModule(self.entry_types, kwargs, self.build_tests)
|
self.ptx = PTXModule(self.entry_types, kwargs, self.build_tests)
|
||||||
# TODO: make this optional and let user choose path
|
# TODO: make this optional and let user choose path
|
||||||
with open('/tmp/cuburn.ptx', 'w') as f: f.write(self.ptx.source)
|
|
||||||
try:
|
|
||||||
# TODO: detect/customize arch, code; verbose setting;
|
|
||||||
# keep directory enable/disable via debug
|
|
||||||
self.mod = cuda.module_from_buffer(self.ptx.source,
|
|
||||||
[(cuda.jit_option.OPTIMIZATION_LEVEL, 0),
|
|
||||||
(cuda.jit_option.TARGET_FROM_CUCONTEXT, 1)])
|
|
||||||
except (cuda.CompileError, cuda.RuntimeError), e:
|
|
||||||
# TODO: if output not written above, print different message
|
|
||||||
print "Compile error. Source is at /tmp/cuburn.ptx"
|
|
||||||
print e
|
|
||||||
raise e
|
|
||||||
if verbose:
|
if verbose:
|
||||||
for entry in self.ptx.entries:
|
for entry in self.ptx.entries:
|
||||||
func = self.mod.get_function(entry.entry_name)
|
func = self.mod.get_function(entry.entry_name)
|
||||||
|
@ -523,175 +523,130 @@ class ShufflePoints(PTXFragment):
|
|||||||
op.bar.sync(bar)
|
op.bar.sync(bar)
|
||||||
op.ld.volatile.shared.b32(var, addr(shuf_read))
|
op.ld.volatile.shared.b32(var, addr(shuf_read))
|
||||||
|
|
||||||
class MWCRNG(PTXFragment):
|
class MWCRNG(object):
|
||||||
shortname = "mwc"
|
def __init__(self, entry, seed=None):
|
||||||
|
# TODO: install this in data directory or something
|
||||||
def __init__(self):
|
|
||||||
self.threads_ready = 0
|
|
||||||
if not os.path.isfile('primes.bin'):
|
if not os.path.isfile('primes.bin'):
|
||||||
raise EnvironmentError('primes.bin not found')
|
raise EnvironmentError('primes.bin not found')
|
||||||
|
self.threads_ready = 0
|
||||||
|
self.mults, self.state = None, None
|
||||||
|
|
||||||
@ptx_func
|
self.entry = entry
|
||||||
def module_setup(self):
|
entry.add_param('mwc_mults', entry.types.u32)
|
||||||
mem.global_.u32('mwc_rng_mults', ctx.nthreads)
|
entry.add_param('mwc_states', entry.types.u32)
|
||||||
mem.global_.u64('mwc_rng_state', ctx.nthreads)
|
r, o = entry.regs, entry.ops
|
||||||
|
with entry.head as e:
|
||||||
|
#mwc_mult_addr = gtid * 4 + e.params.mwc_mults
|
||||||
|
gtid = o.mad.lo(e.special.ctaid_x, ctx.threads_per_cta,
|
||||||
|
e.special.tid_x)
|
||||||
|
mwc_mult_addr = o.mad.lo.u32(gtid, 4, e.params.mwc_mults)
|
||||||
|
r.mwc_mult = o.load.u32(mwc_mult_addr)
|
||||||
|
mwc_state_addr = o.mad.lo.u32(gtid, 8, e.params.mwc_states)
|
||||||
|
r.mwc_state, r.mwc_carry = o.load.u64(mwc_state_addr)
|
||||||
|
with entry.tail as e:
|
||||||
|
#gtid = e.special.ctaid_x * ctx.threads_per_cta + e.special.tid_x
|
||||||
|
gtid = o.mad.lo(e.special.ctaid_x, ctx.threads_per_cta,
|
||||||
|
e.special.tid_x)
|
||||||
|
mwc_state_addr = o.mad.lo.u32(gtid, 8, e.params.mwc_states)
|
||||||
|
o.store.v2(mwc_state_addr, (r.mwc_state, r.mwc_carry))
|
||||||
|
|
||||||
@ptx_func
|
def next_b32(self):
|
||||||
def entry_setup(self):
|
e, r, o = self.entry, self.entry.regs, self.entry.ops
|
||||||
reg.u32('mwc_st mwc_mult mwc_car')
|
mwc_out = o.cvt.u64(r.mwc_carry)
|
||||||
with block('Load MWC multipliers and states'):
|
mwc_out = o.mad.wide.u32(r.mwc_mult, r.mwc_state, mwc_out)
|
||||||
reg.u32('mwc_off mwc_addr')
|
r.mwc_state, r.mwc_carry = o.mov(mwc_out)
|
||||||
std.get_gtid(mwc_off)
|
return r.mwc_state
|
||||||
op.mov.u32(mwc_addr, mwc_rng_mults)
|
|
||||||
op.mad.lo.u32(mwc_addr, mwc_off, 4, mwc_addr)
|
|
||||||
op.ld.global_.u32(mwc_mult, addr(mwc_addr))
|
|
||||||
|
|
||||||
op.mov.u32(mwc_addr, mwc_rng_state)
|
def next_f32_01(self):
|
||||||
op.mad.lo.u32(mwc_addr, mwc_off, 8, mwc_addr)
|
e, r, o = self.entry, self.entry.regs, self.entry.ops
|
||||||
op.ld.global_.v2.u32(vec(mwc_st, mwc_car), addr(mwc_addr))
|
mwc_float = o.cvt.rn.f32.u32(self.next_b32())
|
||||||
|
# TODO: check the precision on the uploaded types here
|
||||||
|
return o.mul.f32(mwc_float, 1./(1<<32))
|
||||||
|
|
||||||
@ptx_func
|
def next_f32_11(self):
|
||||||
def entry_teardown(self):
|
e, r, o = self.entry, self.entry.regs, self.entry.ops
|
||||||
with block('Save MWC states'):
|
mwc_float = o.cvt.rn.f32.s32(self.next_b32())
|
||||||
reg.u32('mwc_off mwc_addr')
|
return o.mul.f32(mwc_float, 1./(1<<31))
|
||||||
std.get_gtid(mwc_off)
|
|
||||||
op.mov.u32(mwc_addr, mwc_rng_state)
|
|
||||||
op.mad.lo.u32(mwc_addr, mwc_off, 8, mwc_addr)
|
|
||||||
op.st.global_.v2.u32(addr(mwc_addr), vec(mwc_st, mwc_car))
|
|
||||||
|
|
||||||
@ptx_func
|
def call_setup(self, ctx, force=False):
|
||||||
def _next(self):
|
|
||||||
# Call from inside a block!
|
|
||||||
reg.u64('mwc_out')
|
|
||||||
op.cvt.u64.u32(mwc_out, mwc_car)
|
|
||||||
op.mad.wide.u32(mwc_out, mwc_st, mwc_mult, mwc_out)
|
|
||||||
op.mov.b64(vec(mwc_st, mwc_car), mwc_out)
|
|
||||||
|
|
||||||
@ptx_func
|
|
||||||
def next_b32(self, dst_reg):
|
|
||||||
with block('Load next random u32 into ' + dst_reg.name):
|
|
||||||
self._next()
|
|
||||||
op.mov.u32(dst_reg, mwc_st)
|
|
||||||
|
|
||||||
@ptx_func
|
|
||||||
def next_f32_01(self, dst_reg):
|
|
||||||
# TODO: verify that this is the fastest-performance method
|
|
||||||
# TODO: verify that this actually does what I think it does
|
|
||||||
with block('Load random float [0,1] into ' + dst_reg.name):
|
|
||||||
self._next()
|
|
||||||
op.cvt.rn.f32.u32(dst_reg, mwc_st)
|
|
||||||
op.mul.f32(dst_reg, dst_reg, '0f2F800000') # 1./(1<<32)
|
|
||||||
|
|
||||||
@ptx_func
|
|
||||||
def next_f32_11(self, dst_reg):
|
|
||||||
with block('Load random float [-1,1) into ' + dst_reg.name):
|
|
||||||
reg.u32('mwc_to_float')
|
|
||||||
self._next()
|
|
||||||
op.cvt.rn.f32.s32(dst_reg, mwc_st)
|
|
||||||
op.mul.f32(dst_reg, dst_reg, '0f30000000') # 1./(1<<31)
|
|
||||||
|
|
||||||
@instmethod
|
|
||||||
def seed(self, ctx, rand=np.random):
|
|
||||||
"""
|
"""
|
||||||
Seed the random number generators with values taken from a
|
Seed the random number generators with values taken from a
|
||||||
``np.random`` instance.
|
``np.random`` instance.
|
||||||
"""
|
"""
|
||||||
# Load raw big-endian u32 multipliers from primes.bin.
|
if force or self.nthreads_ready < ctx.nthreads:
|
||||||
with open('primes.bin') as primefp:
|
# Load raw big-endian u32 multipliers from primes.bin.
|
||||||
dt = np.dtype(np.uint32).newbyteorder('B')
|
with open('primes.bin') as primefp:
|
||||||
mults = np.frombuffer(primefp.read(), dtype=dt)
|
dt = np.dtype(np.uint32).newbyteorder('B')
|
||||||
stream = cuda.Stream()
|
mults = np.frombuffer(primefp.read(), dtype=dt)
|
||||||
# Randomness in choosing multipliers is good, but larger multipliers
|
# Randomness in choosing multipliers is good, but larger multipliers
|
||||||
# have longer periods, which is also good. This is a compromise.
|
# have longer periods, which is also good. This is a compromise.
|
||||||
mults = np.array(mults[:ctx.nthreads*4])
|
mults = np.array(mults[:ctx.nthreads*4])
|
||||||
rand.shuffle(mults)
|
rand.shuffle(mults)
|
||||||
# Copy multipliers and seeds to the device
|
locked_mults = ctx.hostpool.allocate(ctx.nthreads, np.uint32)
|
||||||
multdp, multl = ctx.mod.get_global('mwc_rng_mults')
|
locked_mults[:] = mults[ctx.nthreads]
|
||||||
cuda.memcpy_htod(multdp, mults.tostring()[:multl])
|
self.mults = ctx.pool.allocate(4*ctx.nthreads)
|
||||||
# Intentionally excludes both 0 and (2^32-1), as they can lead to
|
cuda.memcpy_htod_async(self.mults, locked_mults.base, ctx.stream)
|
||||||
# degenerate sequences of period 0
|
# Intentionally excludes both 0 and (2^32-1), as they can lead to
|
||||||
states = np.array(rand.randint(1, 0xffffffff, size=2*ctx.nthreads),
|
# degenerate sequences of period 0
|
||||||
dtype=np.uint32)
|
states = np.array(rand.randint(1, 0xffffffff, size=2*ctx.nthreads),
|
||||||
statedp, statel = ctx.mod.get_global('mwc_rng_state')
|
dtype=np.uint32)
|
||||||
cuda.memcpy_htod(statedp, states.tostring())
|
locked_states = ctx.hostpool.allocate(2*ctx.nthreads, np.uint32)
|
||||||
self.threads_ready = ctx.nthreads
|
locked_states[:] = states
|
||||||
|
self.states = ctx.pool.allocate(8*ctx.nthreads)
|
||||||
|
cuda.memcpy_htod_async(self.states, locked_states, ctx.stream)
|
||||||
|
self.nthreads_ready = ctx.nthreads
|
||||||
|
ctx.set_param('mwc_mults', self.mults)
|
||||||
|
ctx.set_param('mwc_states', self.states)
|
||||||
|
|
||||||
def call_setup(self, ctx):
|
class MWCRNGTest(PTXEntry):
|
||||||
if self.threads_ready < ctx.nthreads:
|
|
||||||
self.seed(ctx)
|
|
||||||
|
|
||||||
def tests(self):
|
|
||||||
return [MWCRNGTest, MWCRNGFloatsTest]
|
|
||||||
|
|
||||||
class MWCRNGTest(PTXTest):
|
|
||||||
name = "MWC RNG sum-of-threads"
|
|
||||||
rounds = 5000
|
rounds = 5000
|
||||||
entry_name = 'MWC_RNG_test'
|
|
||||||
entry_params = ''
|
|
||||||
|
|
||||||
def deps(self):
|
def __init__(self, entry):
|
||||||
return [MWCRNG]
|
self.entry = entry
|
||||||
|
self.mwc = MWCRNG(entry)
|
||||||
|
|
||||||
@ptx_func
|
entry.add_param('mwc_test_sums', entry.types.u32)
|
||||||
def module_setup(self):
|
with entry.body():
|
||||||
mem.global_.u64('mwc_rng_test_sums', ctx.nthreads)
|
self.entry_body()
|
||||||
|
|
||||||
@ptx_func
|
def entry_body(self):
|
||||||
def entry(self):
|
e, r, o = self.entry, self.entry.regs, self.entry.ops
|
||||||
reg.u64('sum addl')
|
|
||||||
reg.u32('addend')
|
|
||||||
op.mov.u64(sum, 0)
|
|
||||||
with block('Sum next %d random numbers' % self.rounds):
|
|
||||||
reg.u32('loopct')
|
|
||||||
reg.pred('p')
|
|
||||||
op.mov.u32(loopct, self.rounds)
|
|
||||||
label('loopstart')
|
|
||||||
mwc.next_b32(addend)
|
|
||||||
op.cvt.u64.u32(addl, addend)
|
|
||||||
op.add.u64(sum, sum, addl)
|
|
||||||
op.sub.u32(loopct, loopct, 1)
|
|
||||||
op.setp.gt.u32(p, loopct, 0)
|
|
||||||
op.bra.uni(loopstart, ifp=p)
|
|
||||||
|
|
||||||
with block('Store sum and state'):
|
r.sum = 0
|
||||||
reg.u32('adr offset')
|
with e.std.loop(self.rounds) as mwc_rng_sum:
|
||||||
std.get_gtid(offset)
|
addend = o.cvt.u64.u32(self.mwc.next_b32())
|
||||||
op.mov.u32(adr, mwc_rng_test_sums)
|
r.sum = o.add.u64(r.sum, addend)
|
||||||
op.mad.lo.u32(adr, offset, 8, adr)
|
|
||||||
op.st.global_.u64(addr(adr), sum)
|
|
||||||
|
|
||||||
def call_setup(self, ctx):
|
e.std.store_per_thread(e.params.mwc_test_sums, r.sum)
|
||||||
# Get current multipliers and seeds from the device
|
|
||||||
self.mults = ctx.get_per_thread('mwc_rng_mults', np.uint32)
|
|
||||||
self.fullstates = ctx.get_per_thread('mwc_rng_state', np.uint64)
|
|
||||||
self.sums = np.zeros(ctx.nthreads, np.uint64)
|
|
||||||
|
|
||||||
print "Running %d states forward %d rounds" % \
|
def call(self, ctx):
|
||||||
(len(self.mults), self.rounds)
|
# Generate current state, upload it to GPU
|
||||||
ctime = time.time()
|
self.mwc.call_setup(ctx, force=True)
|
||||||
for i in range(self.rounds):
|
mults, fullstates = self.mwc.mults, self.mwc.fullstates
|
||||||
states = self.fullstates & 0xffffffff
|
sums = np.zeros_like(fullstates)
|
||||||
carries = self.fullstates >> 32
|
|
||||||
self.fullstates = self.mults * states + carries
|
|
||||||
self.sums += self.fullstates & 0xffffffff
|
|
||||||
ctime = time.time() - ctime
|
|
||||||
print "Done on host, took %g seconds" % ctime
|
|
||||||
|
|
||||||
def call_teardown(self, ctx):
|
# Run two trials, to ensure device state is getting saved properly
|
||||||
dfullstates = ctx.get_per_thread('mwc_rng_state', np.uint64)
|
for trial in range(2):
|
||||||
if not (dfullstates == self.fullstates).all():
|
print "Trial %d, on CPU: " % trial,
|
||||||
print "State discrepancy"
|
ctime = time.time()
|
||||||
print dfullstates
|
for i in range(self.rounds):
|
||||||
print self.fullstates
|
states = fullstates & 0xffffffff
|
||||||
raise PTXTestFailure("MWC RNG state discrepancy")
|
carries = fullstates >> 32
|
||||||
|
fullstates = self.mults * states + carries
|
||||||
|
sums += fullstates & 0xffffffff
|
||||||
|
ctime = time.time() - ctime
|
||||||
|
print "Took %g seconds." % ctime
|
||||||
|
|
||||||
|
print "Trial %d, on device: " % trial,
|
||||||
|
dsums = np.empty_like(sums)
|
||||||
|
ctx.set_param('mwc_test_sums', cuda.Out(dsums))
|
||||||
|
print "Took %g seconds." % ctx.call()
|
||||||
|
|
||||||
dsums = ctx.get_per_thread('mwc_rng_test_sums', np.uint64)
|
if not np.all(np.equal(sums, dsums)):
|
||||||
if not (dsums == self.sums).all():
|
print "Sum discrepancy!"
|
||||||
print "Sum discrepancy"
|
print sums
|
||||||
print dsums
|
print dsums
|
||||||
print self.sums
|
raise TODOSomeKindOfException()
|
||||||
raise PTXTestFailure("MWC RNG sum discrepancy")
|
|
||||||
|
|
||||||
class MWCRNGFloatsTest(PTXTest):
|
class MWCRNGFloatsTest(PTXTest):
|
||||||
"""
|
"""
|
||||||
|
1594
cuburn/ptx.py
1594
cuburn/ptx.py
File diff suppressed because it is too large
Load Diff
2
main.py
2
main.py
@ -16,6 +16,8 @@ from ctypes import *
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
np.set_printoptions(precision=5, edgeitems=20)
|
||||||
|
|
||||||
from cuburn.device_code import *
|
from cuburn.device_code import *
|
||||||
from cuburn.cuda import LaunchContext
|
from cuburn.cuda import LaunchContext
|
||||||
from fr0stlib.pyflam3 import *
|
from fr0stlib.pyflam3 import *
|
||||||
|
Loading…
Reference in New Issue
Block a user