mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 11:40:04 -05:00
186 lines
6.6 KiB
Python
186 lines
6.6 KiB
Python
import os
|
|
import time
|
|
|
|
import pycuda.driver as cuda
|
|
import numpy as np
|
|
|
|
from cuburnlib.ptx import PTXFragment, PTXEntryPoint, PTXTest
|
|
|
|
class MWCRNG(PTXFragment):
    """
    PTX fragment providing a per-thread multiply-with-carry (MWC) RNG.

    Each thread owns one 32-bit multiplier (`mwc_rng_mults`) and one 64-bit
    state word (`mwc_rng_state`) in device global memory. `entry_start`
    loads them into the registers `mwc_mult`, `mwc_st` and `mwc_car`,
    `entry_end` writes the state back, and the `mwc_next_b32` substitution
    advances the generator by one step.

    Raises EnvironmentError on construction if `primes.bin` is not present
    in the current working directory.
    """

    def __init__(self):
        # Number of threads whose multipliers/states have been uploaded.
        self.threads_ready = 0
        if not os.path.isfile('primes.bin'):
            raise EnvironmentError('primes.bin not found')

    # One multiplier and one 64-bit state word per thread.
    prelude = (".global .u32 mwc_rng_mults[{{ctx.threads}}];\n"
               ".global .u64 mwc_rng_state[{{ctx.threads}}];")

    def _next_b32(self, dreg):
        """
        Return PTX which advances the RNG one step and stores the new 32-bit
        state in the register named by `dreg`.

        The emitted code computes `mwc_st * mwc_mult + mwc_car` as a 64-bit
        value, unpacks it back into the `{mwc_st, mwc_car}` register pair,
        and copies `mwc_st` to `dreg`.
        """
        # TODO: make sure PTX optimizes away superfluous move instrs
        return """
        {
        // MWC next b32
        .reg .u64 mwc_out;
        cvt.u64.u32 mwc_out, mwc_car;
        mad.wide.u32 mwc_out, mwc_st, mwc_mult, mwc_out;
        mov.b64 {mwc_st, mwc_car}, mwc_out;
        mov.u32 %s, mwc_st;
        }
        """ % dreg

    def subs(self, ctx):
        """Return the template substitutions exported by this fragment."""
        return {'mwc_next_b32': self._next_b32}

    # Declares the RNG registers at entry scope, then loads this thread's
    # multiplier (4-byte stride) and state (8-byte stride). The index comes
    # from `get_gtid`, a template helper defined outside this file --
    # presumably the global thread id.
    entry_start = """
        .reg .u32 mwc_st, mwc_mult, mwc_car;
        {
        // MWC load multipliers and RNG states
        .reg .u32 mwc_off, mwc_addr;
        {{ get_gtid('mwc_off') }}
        mov.u32 mwc_addr, mwc_rng_mults;
        mad.lo.u32 mwc_addr, mwc_off, 4, mwc_addr;
        ld.global.u32 mwc_mult, [mwc_addr];
        mov.u32 mwc_addr, mwc_rng_state;
        mad.lo.u32 mwc_addr, mwc_off, 8, mwc_addr;
        ld.global.v2.u32 {mwc_st, mwc_car}, [mwc_addr];
        }
        """

    # Writes the {mwc_st, mwc_car} pair back to this thread's state slot.
    entry_end = """
        {
        // MWC save states
        .reg .u32 mwc_addr, mwc_off;
        {{ get_gtid('mwc_off') }}
        mov.u32 mwc_addr, mwc_rng_state;
        mad.lo.u32 mwc_addr, mwc_off, 8, mwc_addr;
        st.global.v2.u32 [mwc_addr], {mwc_st, mwc_car};
        }
        """

    def set_up(self, ctx):
        """
        Upload multipliers and random initial states to the device.

        Does nothing if a previous call already initialised at least
        `ctx.threads` threads. Raises EnvironmentError if `primes.bin` holds
        fewer multipliers than there are threads.
        """
        if self.threads_ready >= ctx.threads:
            return

        # Load raw big-endian u32 multipliers from primes.bin. The file is
        # binary data, so it must be opened in binary mode.
        with open('primes.bin', 'rb') as primefp:
            dt = np.dtype(np.uint32).newbyteorder('B')
            mults = np.frombuffer(primefp.read(), dtype=dt)

        if len(mults) < ctx.threads:
            raise EnvironmentError(
                'primes.bin holds %d multipliers, but %d threads need one'
                % (len(mults), ctx.threads))

        # Randomness in choosing multipliers is good, but larger multipliers
        # have longer periods, which is also good. This is a compromise.
        # Converting to native-order uint32 here (by value) matters: the raw
        # bytes of this array are what get copied to the device, and leaving
        # it big-endian would upload byte-swapped multipliers.
        mults = np.array(mults[:ctx.threads*4], dtype=np.uint32)
        ctx.rand.shuffle(mults)

        # Copy multipliers and seeds to the device
        multdp, multl = ctx.mod.get_global('mwc_rng_mults')
        cuda.memcpy_htod_async(multdp, mults.tostring()[:multl])

        # Intentionally excludes both 0 and (2^32-1), as they can lead to
        # degenerate sequences of period 0
        states = np.array(ctx.rand.randint(1, 0xffffffff, size=2*ctx.threads),
                          dtype=np.uint32)
        statedp, statel = ctx.mod.get_global('mwc_rng_state')
        cuda.memcpy_htod_async(statedp, states.tostring())
        self.threads_ready = ctx.threads

    def tests(self, ctx):
        """Return the test classes which exercise this fragment."""
        return [MWCRNGTest]
|
|
|
|
class MWCRNGTest(PTXTest):
    """
    Checks the device MWC RNG against a host-side NumPy re-implementation.

    Every thread draws `rounds` values and sums them on the device. `call`
    replays the same recurrence on the host from the multipliers and states
    read back from the device, then compares both the final states and the
    per-thread sums.
    """
    name = "MWC RNG sum-of-threads"
    deps = [MWCRNG]
    rounds = 10000

    # One 64-bit accumulator per thread.
    prelude = ".global .u64 mwc_rng_test_sums[{{ctx.threads}}];"

    def entry(self, ctx):
        """
        Return the `(name, params, body)` triple for the test kernel, which
        sums `rounds` RNG outputs per thread into `mwc_rng_test_sums`.
        """
        return ('MWC_RNG_test', '', """
        .reg .u64 sum, addl;
        .reg .u32 addend;
        mov.u64 sum, 0;
        {
        .reg .u32 loopct;
        .reg .pred p;
        mov.u32 loopct, %s;
        loopstart:
        {{ mwc_next_b32('addend') }}
        cvt.u64.u32 addl, addend;
        add.u64 sum, sum, addl;
        sub.u32 loopct, loopct, 1;
        setp.gt.u32 p, loopct, 0;
        @p bra.uni loopstart;
        }
        {
        .reg .u32 addr, offset;
        {{ get_gtid('offset') }}
        mov.u32 addr, mwc_rng_test_sums;
        mad.lo.u32 addr, offset, 8, addr;
        st.global.u64 [addr], sum;
        }
        """ % self.rounds)

    def call(self, ctx):
        """
        Run the reference computation on the host and the kernel on the
        device, print timings, and return True only if both the final RNG
        states and the per-thread sums match.
        """
        # NOTE: every print below is a single-argument call, which produces
        # the same output under Python 2's print statement and Python 3's
        # print function.

        # Get current multipliers and seeds from the device
        multdp, multl = ctx.mod.get_global('mwc_rng_mults')
        mults = cuda.from_device(multdp, ctx.threads, np.uint32)
        statedp, statel = ctx.mod.get_global('mwc_rng_state')
        fullstates = cuda.from_device(statedp, ctx.threads, np.uint64)
        sums = np.zeros(ctx.threads, np.uint64)

        print("Running states forward %d rounds" % self.rounds)
        ctime = time.time()
        for i in range(self.rounds):
            # Low word is the state, high word the carry, mirroring the
            # {mwc_st, mwc_car} register pair used on the device.
            states = fullstates & 0xffffffff
            carries = fullstates >> 32
            fullstates = mults * states + carries
            sums = sums + (fullstates & 0xffffffff)
        ctime = time.time() - ctime
        print("Done on host, took %g seconds" % ctime)

        func = ctx.mod.get_function('MWC_RNG_test')
        dtime = func(block=ctx.block, grid=ctx.grid, time_kernel=True)
        print("Done on device, took %g seconds (%gx)" % (dtime, ctime/dtime))

        dfullstates = cuda.from_device(statedp, ctx.threads, np.uint64)
        print("%s %s" % (dfullstates, fullstates))
        if not (dfullstates == fullstates).all():
            print("State discrepancy")
            print(dfullstates)
            print(fullstates)
            return False

        sumdp, suml = ctx.mod.get_global('mwc_rng_test_sums')
        dsums = cuda.from_device(sumdp, ctx.threads, np.uint64)
        print("%s %s" % (dsums, sums))
        if not (dsums == sums).all():
            print("Sum discrepancy")
            print(dsums)
            print(sums)
            return False
        return True
|
|
|
|
class CameraCoordTransform(PTXFragment):
    """
    Work-in-progress fragment for mapping IFS-space coordinates to offsets
    in the sampling lattice. The PTX emitted by `_cam_coord_xf` is not yet
    complete; see the TODO in that method.
    """
    # This is here until I get the device stream packer going, or decide on
    # how to handle C struct addressing if we go for unpacked structures
    prelude = ".global .u32 camera_coords[8];"

    def _cam_coord_xf(self, x, y, dreg):
        """
        Given `.f32 x, y`, a coordinate in IFS space, writes the integer
        offset from the start of the sampling lattice into `.u32 dreg`.
        """
        # TODO unfinished: the returned PTX stops part-way through. The
        # final `mad` is missing its addend operand, the opening brace is
        # never closed, and `x`, `y` and `dreg` are not yet substituted
        # into the template.
        return """{
        .reg .pred is_badval;
        // TODO: This will change when data streaming is done
        .reg .u32 camera_coord_address;
        mov.u32 camera_coord_address, camera_coords;
        // TODO: see if preloading everything hurts register count
        .reg .f32 width_scale, width_upper_bound, height_scale, height_upper_bound;
        ldu.v4.f32 {width_scale, width_upper_bound,
        height_scale, height_upper_bound},
        [camera_coord_address+0];
        .reg .f32 x_xf, y_xf;
        mad.rz.f32 x_xf, x, width_scale"""
|
|
|
|
|
|
|
|
|