import os import time import pycuda.driver as cuda import numpy as np from cuburnlib.ptx import PTXFragment, PTXEntryPoint, PTXTest class MWCRNG(PTXFragment): def __init__(self): self.threads_ready = 0 if not os.path.isfile('primes.bin'): raise EnvironmentError('primes.bin not found') prelude = (".global .u32 mwc_rng_mults[{{ctx.threads}}];\n" ".global .u64 mwc_rng_state[{{ctx.threads}}];") def _next_b32(self, dreg): # TODO: make sure PTX optimizes away superfluous move instrs return """ { // MWC next b32 .reg .u64 mwc_out; cvt.u64.u32 mwc_out, mwc_car; mad.wide.u32 mwc_out, mwc_st, mwc_mult, mwc_out; mov.b64 {mwc_st, mwc_car}, mwc_out; mov.u32 %s, mwc_st; } """ % dreg def subs(self, ctx): return {'mwc_next_b32': self._next_b32} entry_start = """ .reg .u32 mwc_st, mwc_mult, mwc_car; { // MWC load multipliers and RNG states .reg .u32 mwc_off, mwc_addr; {{ get_gtid('mwc_off') }} mov.u32 mwc_addr, mwc_rng_mults; mad.lo.u32 mwc_addr, mwc_off, 4, mwc_addr; ld.global.u32 mwc_mult, [mwc_addr]; mov.u32 mwc_addr, mwc_rng_state; mad.lo.u32 mwc_addr, mwc_off, 8, mwc_addr; ld.global.v2.u32 {mwc_st, mwc_car}, [mwc_addr]; } """ entry_end = """ { // MWC save states .reg .u32 mwc_addr, mwc_off; {{ get_gtid('mwc_off') }} mov.u32 mwc_addr, mwc_rng_state; mad.lo.u32 mwc_addr, mwc_off, 8, mwc_addr; st.global.v2.u32 [mwc_addr], {mwc_st, mwc_car}; } """ def set_up(self, ctx): if self.threads_ready >= ctx.threads: return # Load raw big-endian u32 multipliers from primes.bin. with open('primes.bin') as primefp: dt = np.dtype(np.uint32).newbyteorder('B') mults = np.frombuffer(primefp.read(), dtype=dt) stream = cuda.Stream() # Randomness in choosing multipliers is good, but larger multipliers # have longer periods, which is also good. This is a compromise. mults = np.array(mults[:ctx.threads*4]) ctx.rand.shuffle(mults) # Copy multipliers and seeds to the device multdp, multl = ctx.mod.get_global('mwc_rng_mults') cuda.memcpy_htod_async(multdp, mults.tostring()[:multl]) # Intentionally excludes both 0 and (2^32-1), as they can lead to # degenerate sequences of period 0 states = np.array(ctx.rand.randint(1, 0xffffffff, size=2*ctx.threads), dtype=np.uint32) statedp, statel = ctx.mod.get_global('mwc_rng_state') cuda.memcpy_htod_async(statedp, states.tostring()) self.threads_ready = ctx.threads def tests(self, ctx): return [MWCRNGTest] class MWCRNGTest(PTXTest): name = "MWC RNG sum-of-threads" deps = [MWCRNG] rounds = 10000 prelude = ".global .u64 mwc_rng_test_sums[{{ctx.threads}}];" def entry(self, ctx): return ('MWC_RNG_test', '', """ .reg .u64 sum, addl; .reg .u32 addend; mov.u64 sum, 0; { .reg .u32 loopct; .reg .pred p; mov.u32 loopct, %s; loopstart: {{ mwc_next_b32('addend') }} cvt.u64.u32 addl, addend; add.u64 sum, sum, addl; sub.u32 loopct, loopct, 1; setp.gt.u32 p, loopct, 0; @p bra.uni loopstart; } { .reg .u32 addr, offset; {{ get_gtid('offset') }} mov.u32 addr, mwc_rng_test_sums; mad.lo.u32 addr, offset, 8, addr; st.global.u64 [addr], sum; } """ % self.rounds) def call(self, ctx): # Get current multipliers and seeds from the device multdp, multl = ctx.mod.get_global('mwc_rng_mults') mults = cuda.from_device(multdp, ctx.threads, np.uint32) statedp, statel = ctx.mod.get_global('mwc_rng_state') fullstates = cuda.from_device(statedp, ctx.threads, np.uint64) sums = np.zeros(ctx.threads, np.uint64) print "Running states forward %d rounds" % self.rounds ctime = time.time() for i in range(self.rounds): states = fullstates & 0xffffffff carries = fullstates >> 32 fullstates = mults * states + carries sums = sums + (fullstates & 0xffffffff) ctime = time.time() - ctime print "Done on host, took %g seconds" % ctime func = ctx.mod.get_function('MWC_RNG_test') dtime = func(block=ctx.block, grid=ctx.grid, time_kernel=True) print "Done on device, took %g seconds (%gx)" % (dtime, ctime/dtime) dfullstates = cuda.from_device(statedp, ctx.threads, np.uint64) print dfullstates, fullstates if not (dfullstates == fullstates).all(): print "State discrepancy" print dfullstates print fullstates return False sumdp, suml = ctx.mod.get_global('mwc_rng_test_sums') dsums = cuda.from_device(sumdp, ctx.threads, np.uint64) print dsums, sums if not (dsums == sums).all(): print "Sum discrepancy" print dsums print sums return False return True class CameraCoordTransform(PTXFragment): # This is here until I get the device stream packer going, or decide on # how to handle C struct addressing if we go for unpacked structures prelude = ".global .u32 camera_coords[8];" def _cam_coord_xf(self, x, y, dreg): """ Given `.f32 x, y`, a coordinate in IFS space, writes the integer offset from the start of the sampling lattice into `.u32 dreg`. """ return """{ .pred is_badval; // TODO: This will change when data streaming is done .reg .u32 camera_coord_address; mov.u32 camera_coord_address, camera_coords; // TODO: see if preloading everything hurts register count .reg .f32 width_scale, width_upper_bound, height_scale, height_upper_bound; ldu.v4.f32 {width_scale, width_upper_bound, height_scale, height_upper_bound}, [camera_coord_address+0]; .reg .f32 x_xf, y_xf; mad.rz.f32 x_xf, x, width_scale""" # TODO unfinished