cuburn/cuburnlib/device_code.py
2010-08-30 14:45:44 -04:00

186 lines
6.6 KiB
Python

import os
import time
import pycuda.driver as cuda
import numpy as np
from cuburnlib.ptx import PTXFragment, PTXEntryPoint, PTXTest
class MWCRNG(PTXFragment):
    """
    PTX fragment implementing a per-thread multiply-with-carry (MWC) RNG.

    Every thread owns one 32-bit multiplier (``mwc_rng_mults``) and one
    64-bit state word (``mwc_rng_state``) which is handled as a pair of
    32-bit halves, ``mwc_st`` (low) and ``mwc_car`` (high).  ``entry_start``
    loads them into registers, ``entry_end`` stores the state back, and the
    ``mwc_next_b32`` substitution advances the generator by one step.

    The PTX text below is deliberately kept flush-left: whitespace inside the
    literals is part of the emitted code.
    """

    def __init__(self):
        """
        Create the fragment.

        Raises EnvironmentError if ``primes.bin`` is not present in the
        current working directory.
        """
        # Number of threads whose multipliers and states were last uploaded
        self.threads_ready = 0
        if not os.path.isfile('primes.bin'):
            raise EnvironmentError('primes.bin not found')

    prelude = (".global .u32 mwc_rng_mults[{{ctx.threads}}];\n"
               ".global .u64 mwc_rng_state[{{ctx.threads}}];")

    def _next_b32(self, dreg):
        """
        Return PTX that advances this thread's generator one step and copies
        the new low state word into the ``.u32`` register named by `dreg`.
        """
        # TODO: make sure PTX optimizes away superfluous move instrs
        return """
{
// MWC next b32
.reg .u64 mwc_out;
cvt.u64.u32 mwc_out, mwc_car;
mad.wide.u32 mwc_out, mwc_st, mwc_mult, mwc_out;
mov.b64 {mwc_st, mwc_car}, mwc_out;
mov.u32 %s, mwc_st;
}
""" % dreg

    def subs(self, ctx):
        """Expose `_next_b32` to PTX templates under the name ``mwc_next_b32``."""
        return {'mwc_next_b32': self._next_b32}

    entry_start = """
.reg .u32 mwc_st, mwc_mult, mwc_car;
{
// MWC load multipliers and RNG states
.reg .u32 mwc_off, mwc_addr;
{{ get_gtid('mwc_off') }}
mov.u32 mwc_addr, mwc_rng_mults;
mad.lo.u32 mwc_addr, mwc_off, 4, mwc_addr;
ld.global.u32 mwc_mult, [mwc_addr];
mov.u32 mwc_addr, mwc_rng_state;
mad.lo.u32 mwc_addr, mwc_off, 8, mwc_addr;
ld.global.v2.u32 {mwc_st, mwc_car}, [mwc_addr];
}
"""

    entry_end = """
{
// MWC save states
.reg .u32 mwc_addr, mwc_off;
{{ get_gtid('mwc_off') }}
mov.u32 mwc_addr, mwc_rng_state;
mad.lo.u32 mwc_addr, mwc_off, 8, mwc_addr;
st.global.v2.u32 [mwc_addr], {mwc_st, mwc_car};
}
"""

    def set_up(self, ctx):
        """
        Upload multipliers and random initial states for ``ctx.threads``
        threads.

        Does nothing when at least that many threads were already prepared.
        Reads ``ctx.threads``, ``ctx.rand`` (``shuffle``/``randint``) and
        ``ctx.mod`` (``get_global``).

        Raises EnvironmentError if ``primes.bin`` holds fewer multipliers
        than there are threads.
        """
        if self.threads_ready >= ctx.threads:
            return

        # Load raw big-endian u32 multipliers from primes.bin.  The file is
        # binary data, so it must not be opened in text mode.
        with open('primes.bin', 'rb') as primefp:
            dt = np.dtype(np.uint32).newbyteorder('B')
            mults = np.frombuffer(primefp.read(), dtype=dt)

        # Randomness in choosing multipliers is good, but larger multipliers
        # have longer periods, which is also good. This is a compromise.
        # astype() also converts to native byte order (and makes a writable
        # copy), so the bytes sent to the device encode the intended values
        # rather than their byte-swapped counterparts.
        mults = mults[:ctx.threads*4].astype(np.uint32)
        if mults.size < ctx.threads:
            raise EnvironmentError(
                'primes.bin holds %d multipliers, need at least %d'
                % (mults.size, ctx.threads))
        ctx.rand.shuffle(mults)

        # Copy multipliers and seeds to the device.  The sources are
        # temporary, pageable host buffers, so use the synchronous copy; the
        # async variant requires page-locked memory.
        multdp, multl = ctx.mod.get_global('mwc_rng_mults')
        cuda.memcpy_htod(multdp, mults.tostring()[:multl])

        # Intentionally excludes both 0 and (2^32-1), as they can lead to
        # degenerate sequences of period 0
        states = np.array(ctx.rand.randint(1, 0xffffffff, size=2*ctx.threads),
                          dtype=np.uint32)
        statedp, statel = ctx.mod.get_global('mwc_rng_state')
        cuda.memcpy_htod(statedp, states.tostring())
        self.threads_ready = ctx.threads

    def tests(self, ctx):
        """Return the test classes that exercise this fragment."""
        return [MWCRNGTest]
class MWCRNGTest(PTXTest):
    """
    Cross-checks the device MWC generator against a NumPy replay on the host.

    The kernel steps each thread's generator `rounds` times, summing the
    32-bit outputs into ``mwc_rng_test_sums``.  `call` performs the same
    recurrence on the host from the same multipliers and starting states,
    then compares final states and sums.

    PTX text is kept flush-left because whitespace inside the literal is part
    of the emitted code.
    """
    name = "MWC RNG sum-of-threads"
    deps = [MWCRNG]
    rounds = 10000

    prelude = ".global .u64 mwc_rng_test_sums[{{ctx.threads}}];"

    def entry(self, ctx):
        """
        Return the ``(name, params, body)`` triple describing the test
        kernel, with the loop count filled in from `rounds`.
        """
        kernel_body = """
.reg .u64 sum, addl;
.reg .u32 addend;
mov.u64 sum, 0;
{
.reg .u32 loopct;
.reg .pred p;
mov.u32 loopct, %s;
loopstart:
{{ mwc_next_b32('addend') }}
cvt.u64.u32 addl, addend;
add.u64 sum, sum, addl;
sub.u32 loopct, loopct, 1;
setp.gt.u32 p, loopct, 0;
@p bra.uni loopstart;
}
{
.reg .u32 addr, offset;
{{ get_gtid('offset') }}
mov.u32 addr, mwc_rng_test_sums;
mad.lo.u32 addr, offset, 8, addr;
st.global.u64 [addr], sum;
}
""" % self.rounds
        return ('MWC_RNG_test', '', kernel_body)

    def call(self, ctx):
        """
        Run the comparison and return True when device and host agree on
        both the final generator states and the per-thread sums.
        """
        nthreads = ctx.threads
        low_word = 0xffffffff

        # Snapshot the device's multipliers and seeds before the kernel runs
        mult_ptr, mult_len = ctx.mod.get_global('mwc_rng_mults')
        mults = cuda.from_device(mult_ptr, nthreads, np.uint32)
        state_ptr, state_len = ctx.mod.get_global('mwc_rng_state')
        host_states = cuda.from_device(state_ptr, nthreads, np.uint64)
        host_sums = np.zeros(nthreads, np.uint64)

        print("Running states forward %d rounds" % self.rounds)
        host_time = time.time()
        for _ in range(self.rounds):
            # new = mult * low32(old) + high32(old); output is low32(new)
            low = host_states & low_word
            carry = host_states >> 32
            host_states = mults * low + carry
            host_sums = host_sums + (host_states & low_word)
        host_time = time.time() - host_time
        print("Done on host, took %g seconds" % host_time)

        kernel = ctx.mod.get_function('MWC_RNG_test')
        dev_time = kernel(block=ctx.block, grid=ctx.grid, time_kernel=True)
        print("Done on device, took %g seconds (%gx)"
              % (dev_time, host_time/dev_time))

        dev_states = cuda.from_device(state_ptr, nthreads, np.uint64)
        print("%s %s" % (dev_states, host_states))
        if (dev_states != host_states).any():
            print("State discrepancy")
            print(dev_states)
            print(host_states)
            return False

        sum_ptr, sum_len = ctx.mod.get_global('mwc_rng_test_sums')
        dev_sums = cuda.from_device(sum_ptr, nthreads, np.uint64)
        print("%s %s" % (dev_sums, host_sums))
        if (dev_sums != host_sums).any():
            print("Sum discrepancy")
            print(dev_sums)
            print(host_sums)
            return False
        return True
class CameraCoordTransform(PTXFragment):
    """
    Work-in-progress fragment for mapping IFS coordinates to sample offsets.

    PTX text is kept flush-left because whitespace inside the literal is part
    of the emitted code.
    """
    # This is here until I get the device stream packer going, or decide on
    # how to handle C struct addressing if we go for unpacked structures
    prelude = ".global .u32 camera_coords[8];"

    def _cam_coord_xf(self, x, y, dreg):
        """
        Intended contract: given `.f32 x, y`, a coordinate in IFS space,
        emit PTX that writes the integer offset from the start of the
        sampling lattice into `.u32 dreg`.

        The implementation is incomplete: it currently returns a fixed,
        partial PTX snippet and does not use `x`, `y` or `dreg`.
        """
        # TODO unfinished
        partial_ptx = """{
.pred is_badval;
// TODO: This will change when data streaming is done
.reg .u32 camera_coord_address;
mov.u32 camera_coord_address, camera_coords;
// TODO: see if preloading everything hurts register count
.reg .f32 width_scale, width_upper_bound, height_scale, height_upper_bound;
ldu.v4.f32 {width_scale, width_upper_bound,
height_scale, height_upper_bound},
[camera_coord_address+0];
.reg .f32 x_xf, y_xf;
mad.rz.f32 x_xf, x, width_scale"""
        return partial_ptx