cuburn/cuburnlib/device_code.py

"""
Contains the PTX fragments which will drive the device.
"""

import os
import time
import struct

import pycuda.driver as cuda
import numpy as np

from cuburnlib.ptx import *

class IterThread(PTXTest):
    """PTX entry point 'iter_thread': skeleton of the per-thread iteration loop.

    As written, the kernel walks the control point (CP) array, runs the FUSE
    countdown, and only *counts* loop rounds and would-be writes per thread;
    no sample is stored yet. The host-side helpers upload the CP stream and
    print those counters after a launch.
    """
    entry_name = 'iter_thread'
    entry_params = []

    def __init__(self):
        # Set to True by upload_cp_stream(); call() checks it before launching.
        self.cps_uploaded = False

    def deps(self):
        """Return the fragment classes listed as dependencies of this entry."""
        return [MWCRNG, CPDataStream]

    @ptx_func
    def module_setup(self):
        """Declare the u32 globals used by the kernel and the host helpers."""
        # NOTE(review): entry() uses cp.stream_size as a byte stride into this
        # array, while the size here is given in u32 elements -- confirm which
        # unit stream_size is expressed in.
        mem.global_.u32('g_cp_array',
                        cp.stream_size*features.max_ntemporal_samples)
        mem.global_.u32('g_num_cps')
        # TODO move into debug statement
        # One counter slot per thread, read back and printed by call().
        mem.global_.u32('g_num_rounds', ctx.threads)
        mem.global_.u32('g_num_writes', ctx.threads)

    @ptx_func
    def entry(self):
        """Emit the body of the 'iter_thread' kernel.

        The statements below are emitted in order, so their sequence is the
        control flow of the generated PTX; see the num_samples state table
        further down before reordering anything.
        """
        reg.f32('x_coord y_coord color_coord alpha_coord')

        # TODO: temporary, for testing
        reg.u32('num_rounds num_writes')
        op.mov.u32(num_rounds, 0)
        op.mov.u32(num_writes, 0)

        # TODO: MWC float output types
        # Seed the point with four random floats from the 'mwc' fragment.
        mwc.next_f32_01(x_coord)
        mwc.next_f32_01(y_coord)
        mwc.next_f32_01(color_coord)
        mwc.next_f32_01(alpha_coord)

        # Registers are hard to come by. To avoid having to track both the count
        # of samples processed and the number of samples to generate,
        # 'num_samples' counts *down* from the CP's desired sample count.
        # When it hits 0, we move on to the next CP.
        #
        # FUSE complicates things. To track it, we store the *negative* number
        # of points we have left to fuse before we start to store the results.
        # When it hits -1, we're done fusing, and can move on to the real
        # thread. The execution flow between 'cp_loop_start',
        # 'fuse_loop_start', and 'iter_loop_start' is therefore tricky, and
        # bears close inspection.
        #
        # In summary:
        #   num_samples == 0: Load next CP, set num_samples from that
        #   num_samples >  0: Iterate, store the result, decrement num_samples
        #   num_samples < -1: Iterate, don't store, increment num_samples
        #   num_samples == -1: Done fusing, enter normal flow
        # TODO: move this to qlocal storage
        reg.s32('num_samples')
        # Start in the FUSE state: -(n+1) so that n increments land on -1.
        op.mov.s32(num_samples, -(features.num_fuse_samples+1))

        # TODO: Move cp_num to qlocal storage (or spill it, rarely accessed)
        reg.u32('cp_idx cpA')
        op.mov.u32(cp_idx, 0)

        label('cp_loop_start')
        op.bar.sync(0)

        with block('Check to see if this is the last CP'):
            reg.u32('num_cps')
            reg.pred('p_last_cp')
            op.ldu.u32(num_cps, addr(g_num_cps))
            op.setp.ge.u32(p_last_cp, cp_idx, num_cps)
            # The target is given as a string here, unlike the other branches;
            # presumably because 'all_cps_done' has not been declared with
            # label() yet at this point -- confirm against the DSL.
            op.bra.uni('all_cps_done', ifp=p_last_cp)

        with block('Load CP address'):
            # cpA = g_cp_array + cp_idx * cp.stream_size
            op.mov.u32(cpA, g_cp_array)
            op.mad.lo.u32(cpA, cp_idx, cp.stream_size, cpA)

        with block('Increment CP index, load num_samples (unless in fuse)'):
            reg.pred('p_not_in_fuse')
            # While num_samples is negative (FUSE), both the index bump and
            # the load below are predicated off, so the first CP is not
            # consumed until fusing is finished.
            op.setp.ge.s32(p_not_in_fuse, num_samples, 0)
            op.add.u32(cp_idx, cp_idx, 1, ifp=p_not_in_fuse)
            cp.get(cpA, num_samples, 'samples_per_thread',
                          ifp=p_not_in_fuse)

        label('fuse_loop_start')
        with block('FUSE-specific stuff'):
            reg.pred('p_fuse')
            comment('If num_samples == -1, set it to 0 and jump back up')
            comment('This will start the normal CP loading machinery')
            op.setp.eq.s32(p_fuse, num_samples, -1)
            op.mov.s32(num_samples, 0, ifp=p_fuse)
            op.bra.uni(cp_loop_start, ifp=p_fuse)

            comment('If num_samples < -1, still fusing, so increment')
            op.setp.lt.s32(p_fuse, num_samples, -1)
            op.add.s32(num_samples, num_samples, 1, ifp=p_fuse)

        label('iter_loop_start')

        comment('Do... well, most of everything')

        # Counted on every pass, fused or not.
        op.add.u32(num_rounds, num_rounds, 1)

        with block("Test if we're still in FUSE"):
            reg.pred('p_in_fuse')
            op.setp.lt.s32(p_in_fuse, num_samples, 0)
            op.bra.uni(fuse_loop_start, ifp=p_in_fuse)

        with block("Ordinarily, we'd write the result here"):
            # Stand-in for the real store: only reached outside FUSE.
            op.add.u32(num_writes, num_writes, 1)

        with block("Check to see if we're done with this CP"):
            reg.pred('p_cp_done')
            op.add.s32(num_samples, num_samples, -1)
            op.setp.eq.s32(p_cp_done, num_samples, 0)
            op.bra.uni(cp_loop_start, ifp=p_cp_done)

        op.bra.uni(iter_loop_start)

        label('all_cps_done')
        # TODO this is for testing, move it to a debug statement
        std.store_per_thread(g_num_rounds, num_rounds)
        std.store_per_thread(g_num_writes, num_writes)

    def upload_cp_stream(self, ctx, cp_stream, num_cps):
        """Copy a packed CP stream and the CP count into the module globals.

        ctx       -- object exposing the loaded module as ctx.mod
        cp_stream -- host buffer; its len() must not exceed the length
                     reported for 'g_cp_array'
        num_cps   -- number of control points, written to 'g_num_cps'

        Marks the instance as ready so that call() may launch the kernel.
        """
        cp_array_dp, cp_array_l = ctx.mod.get_global('g_cp_array')
        assert len(cp_stream) <= cp_array_l, "Stream too big!"
        cuda.memcpy_htod_async(cp_array_dp, cp_stream)
        num_cps_dp, num_cps_l = ctx.mod.get_global('g_num_cps')
        # NOTE(review): packed as a signed native int ('i') although the
        # global is declared u32 and compared unsigned in entry(); identical
        # bytes for non-negative counts.
        cuda.memcpy_htod_async(num_cps_dp, struct.pack('i', num_cps))
        self.cps_uploaded = True

    def call(self, ctx):
        """Launch 'iter_thread' and print the per-thread round/write counters.

        Raises if upload_cp_stream() has not been called on this instance.
        """
        if not self.cps_uploaded:
            # NOTE(review): 'Error' is not defined in this file; presumably it
            # is expected to come from 'from cuburnlib.ptx import *' -- confirm,
            # otherwise this line raises NameError instead.
            raise Error("Cannot call IterThread before uploading CPs")
        func = ctx.mod.get_function('iter_thread')
        # The value returned by the timed launch is not used here.
        dtime = func(block=ctx.block, grid=ctx.grid, time_kernel=True)

        num_rounds_dp, num_rounds_l = ctx.mod.get_global('g_num_rounds')
        num_writes_dp, num_writes_l = ctx.mod.get_global('g_num_writes')
        rounds = cuda.from_device(num_rounds_dp, ctx.threads, np.uint32)
        writes = cuda.from_device(num_writes_dp, ctx.threads, np.uint32)
        print "Rounds:", rounds
        print "Writes:", writes

class MWCRNG(PTXFragment):
    """PTX fragment implementing a per-thread multiply-with-carry RNG.

    Every thread owns one u32 multiplier (global 'mwc_rng_mults') and one
    64-bit packed state (global 'mwc_rng_state'). entry_setup() loads them
    into the 'mwc_mult', 'mwc_st' and 'mwc_car' registers and
    entry_teardown() writes the state back. Other fragments use the
    generator through the short name 'mwc'.
    """
    shortname = "mwc"

    def __init__(self):
        # numpy's global generator is used until set_seed() replaces it.
        self.rand = np.random
        # Thread count that device_init() has already uploaded data for.
        self.threads_ready = 0
        if not os.path.isfile('primes.bin'):
            raise EnvironmentError('primes.bin not found')

    def set_seed(self, seed):
        """Use a private RandomState seeded with 'seed' for host-side draws."""
        self.rand = np.random.mtrand.RandomState(seed)

    @ptx_func
    def module_setup(self):
        """Declare the per-thread multiplier and state arrays."""
        mem.global_.u32('mwc_rng_mults', ctx.threads)
        mem.global_.u64('mwc_rng_state', ctx.threads)

    @ptx_func
    def entry_setup(self):
        """Load this thread's multiplier and state into registers."""
        reg.u32('mwc_st mwc_mult mwc_car')
        with block('Load MWC multipliers and states'):
            reg.u32('mwc_off mwc_addr')
            std.get_gtid(mwc_off)
            # Multipliers are 4 bytes per thread ...
            op.mov.u32(mwc_addr, mwc_rng_mults)
            op.mad.lo.u32(mwc_addr, mwc_off, 4, mwc_addr)
            op.ld.global_.u32(mwc_mult, addr(mwc_addr))

            # ... states are 8 bytes per thread, read as a (st, car) pair.
            op.mov.u32(mwc_addr, mwc_rng_state)
            op.mad.lo.u32(mwc_addr, mwc_off, 8, mwc_addr)
            op.ld.global_.v2.u32(vec(mwc_st, mwc_car), addr(mwc_addr))

    @ptx_func
    def entry_teardown(self):
        """Store this thread's (st, car) pair back to 'mwc_rng_state'."""
        with block('Save MWC states'):
            reg.u32('mwc_off mwc_addr')
            std.get_gtid(mwc_off)
            op.mov.u32(mwc_addr, mwc_rng_state)
            op.mad.lo.u32(mwc_addr, mwc_off, 8, mwc_addr)
            op.st.global_.v2.u32(addr(mwc_addr), vec(mwc_st, mwc_car))

    @ptx_func
    def _next(self):
        """Advance the generator: (st, car) <- st * mult + car (64-bit)."""
        # Call from inside a block!
        reg.u64('mwc_out')
        op.cvt.u64.u32(mwc_out, mwc_car)
        op.mad.wide.u32(mwc_out, mwc_st, mwc_mult, mwc_out)
        # Split the 64-bit result back into the state and carry registers.
        op.mov.b64(vec(mwc_st, mwc_car), mwc_out)

    @ptx_func
    def next_b32(self, dst_reg):
        """Step the generator and copy the new 32-bit state into dst_reg."""
        with block('Load next random u32 into ' + dst_reg.name):
            self._next()
            op.mov.u32(dst_reg, mwc_st)

    @ptx_func
    def next_f32_01(self, dst_reg):
        """Step the generator and write state * 2^-32 (as f32) to dst_reg."""
        # TODO: verify that this is the fastest-performance method
        with block('Load random float [0,1] into ' + dst_reg.name):
            self._next()
            op.cvt.rn.f32.u32(dst_reg, mwc_st)
            # PTX '0f' immediates spell the IEEE-754 bit pattern most
            # significant digit first: 2^-32 is 0x2F800000. The previous
            # literal had its bytes reversed (0x0000802F, a denormal).
            op.mul.f32(dst_reg, dst_reg, '0f2F800000') # 1./(1<<32)

    @ptx_func
    def next_f32_11(self, dst_reg):
        """Step the generator and write (signed state) * 2^-31 to dst_reg."""
        with block('Load random float [-1,1) into ' + dst_reg.name):
            self._next()
            op.cvt.rn.f32.s32(dst_reg, mwc_st)
            # 2^-31 is 0x30000000 (the old literal was byte-reversed), and
            # '.lo' only exists for integer multiplies, so plain mul.f32.
            op.mul.f32(dst_reg, dst_reg, '0f30000000') # 1./(1<<31)

    def device_init(self, ctx):
        """Upload multipliers and seed states for ctx.threads threads.

        Returns immediately when an earlier call already initialised at
        least that many threads.
        """
        if self.threads_ready >= ctx.threads:
            # Already set up enough random states, don't push again
            return

        # Load raw big-endian u32 multipliers from primes.bin. The file holds
        # binary data, so open it in binary mode: text mode may translate
        # line endings or stop at an EOF marker on some platforms.
        with open('primes.bin', 'rb') as primefp:
            dt = np.dtype(np.uint32).newbyteorder('B')
            mults = np.frombuffer(primefp.read(), dtype=dt)
        # Randomness in choosing multipliers is good, but larger multipliers
        # have longer periods, which is also good. This is a compromise.
        # Converting to native-order uint32 here matters: tostring() dumps the
        # array's bytes as stored, so keeping the big-endian dtype would hand
        # byte-swapped multipliers to a little-endian device.
        mults = np.array(mults[:ctx.threads*4], dtype=np.uint32)
        self.rand.shuffle(mults)
        # Copy multipliers and seeds to the device
        multdp, multl = ctx.mod.get_global('mwc_rng_mults')
        cuda.memcpy_htod_async(multdp, mults.tostring()[:multl])
        # Intentionally excludes both 0 and (2^32-1), as they can lead to
        # degenerate sequences of period 0
        states = np.array(self.rand.randint(1, 0xffffffff, size=2*ctx.threads),
                          dtype=np.uint32)
        statedp, statel = ctx.mod.get_global('mwc_rng_state')
        cuda.memcpy_htod_async(statedp, states.tostring())
        self.threads_ready = ctx.threads

    def tests(self):
        """Return the test classes that exercise this fragment."""
        return [MWCRNGTest]

class MWCRNGTest(PTXTest):
    name = "MWC RNG sum-of-threads"
    rounds = 5000
    entry_name = 'MWC_RNG_test'
    entry_params = ''

    def deps(self):
        return [MWCRNG]

    @ptx_func
    def module_setup(self):
        mem.global_.u64('mwc_rng_test_sums', ctx.threads)

    @ptx_func
    def entry(self):
        reg.u64('sum addl')
        reg.u32('addend')
        op.mov.u64(sum, 0)
        with block('Sum next %d random numbers' % self.rounds):
            reg.u32('loopct')
            reg.pred('p')
            op.mov.u32(loopct, self.rounds)
            label('loopstart')
            mwc.next_b32(addend)
            op.cvt.u64.u32(addl, addend)
            op.add.u64(sum, sum, addl)
            op.sub.u32(loopct, loopct, 1)
            op.setp.gt.u32(p, loopct, 0)
            op.bra.uni(loopstart, ifp=p)

        with block('Store sum and state'):
            reg.u32('adr offset')
            std.get_gtid(offset)
            op.mov.u32(adr, mwc_rng_test_sums)
            op.mad.lo.u32(adr, offset, 8, adr)
            op.st.global_.u64(addr(adr), sum)

    def call(self, ctx):
        # Get current multipliers and seeds from the device
        multdp, multl = ctx.mod.get_global('mwc_rng_mults')
        mults = cuda.from_device(multdp, ctx.threads, np.uint32)
        statedp, statel = ctx.mod.get_global('mwc_rng_state')
        fullstates = cuda.from_device(statedp, ctx.threads, np.uint64)
        sums = np.zeros(ctx.threads, np.uint64)

        print "Running %d states forward %d rounds" % (len(mults), self.rounds)
        ctime = time.time()
        for i in range(self.rounds):
            states = fullstates & 0xffffffff
            carries = fullstates >> 32
            fullstates = mults * states + carries
            sums = sums + (fullstates & 0xffffffff)
        ctime = time.time() - ctime
        print "Done on host, took %g seconds" % ctime

        func = ctx.mod.get_function('MWC_RNG_test')
        dtime = func(block=ctx.block, grid=ctx.grid, time_kernel=True)
        print "Done on device, took %g seconds (%gx)" % (dtime, ctime/dtime)
        dfullstates = cuda.from_device(statedp, ctx.threads, np.uint64)
        if not (dfullstates == fullstates).all():
            print "State discrepancy"
            print dfullstates
            print fullstates
            return False

        sumdp, suml = ctx.mod.get_global('mwc_rng_test_sums')
        dsums = cuda.from_device(sumdp, ctx.threads, np.uint64)
        if not (dsums == sums).all():
            print "Sum discrepancy"
            print dsums
            print sums
            return False
        return True

class CameraCoordTransform(PTXFragment):
    """Placeholder fragment: declares the class but adds no behavior yet."""

class CPDataStream(DataStream):
    """Data stream holding the control points; exposed to PTX code as 'cp'."""
    shortname = "cp"
Known broken checkin to show algorias 2010-09-01 13:02:12 -04:00			`"""`
			`Contains the PTX fragments which will drive the device.`
			`"""`

Splitting things up a bit 2010-08-28 16:56:05 -04:00			`import os`
			`import time`
Finally runs again 2010-09-06 11:18:20 -04:00			`import struct`
Splitting things up a bit 2010-08-28 16:56:05 -04:00
			`import pycuda.driver as cuda`
			`import numpy as np`

PTX DSL working, at least well enough to pass MWCRNGTest 2010-09-01 21:09:40 -04:00			`from cuburnlib.ptx import *`
Splitting things up a bit 2010-08-28 16:56:05 -04:00
Time to go have nightmares about this code again (no really) 2010-09-03 00:52:27 -04:00			`class IterThread(PTXTest):`
			`entry_name = 'iter_thread'`
			`entry_params = []`
Finally runs again 2010-09-06 11:18:20 -04:00
			`def __init__(self):`
			`self.cps_uploaded = False`

Time to go have nightmares about this code again (no really) 2010-09-03 00:52:27 -04:00			`def deps(self):`
			`return [MWCRNG, CPDataStream]`

			`@ptx_func`
			`def module_setup(self):`
			`mem.global_.u32('g_cp_array',`
Switch from to_inject() to object insertion. One less kludge to deal with. 2010-09-06 16:09:37 -04:00			`cp.stream_size*features.max_ntemporal_samples)`
Time to go have nightmares about this code again (no really) 2010-09-03 00:52:27 -04:00			`mem.global_.u32('g_num_cps')`
			`# TODO move into debug statement`
			`mem.global_.u32('g_num_rounds', ctx.threads)`
			`mem.global_.u32('g_num_writes', ctx.threads)`

			`@ptx_func`
Finally runs again 2010-09-06 11:18:20 -04:00			`def entry(self):`
Time to go have nightmares about this code again (no really) 2010-09-03 00:52:27 -04:00			`reg.f32('x_coord y_coord color_coord alpha_coord')`

			`# TODO: temporary, for testing`
			`reg.u32('num_rounds num_writes')`
			`op.mov.u32(num_rounds, 0)`
			`op.mov.u32(num_writes, 0)`

			`# TODO: MWC float output types`
Switch from to_inject() to object insertion. One less kludge to deal with. 2010-09-06 16:09:37 -04:00			`mwc.next_f32_01(x_coord)`
			`mwc.next_f32_01(y_coord)`
			`mwc.next_f32_01(color_coord)`
			`mwc.next_f32_01(alpha_coord)`
Time to go have nightmares about this code again (no really) 2010-09-03 00:52:27 -04:00
			`# Registers are hard to come by. To avoid having to track both the count`
			`# of samples processed and the number of samples to generate,`
			`# 'num_samples' counts down from the CP's desired sample count.`
			`# When it hits 0, we move on to the next CP.`
			`#`
			`# FUSE complicates things. To track it, we store the negative number`
			`# of points we have left to fuse before we start to store the results.`
			`# When it hits -1, we're done fusing, and can move on to the real`
			`# thread. The execution flow between 'cp_loop', 'fuse_start', and`
			`# 'iter_loop_start' is therefore tricky, and bears close inspection.`
			`#`
			`# In summary:`
			`# num_samples == 0: Load next CP, set num_samples from that`
			`# num_samples > 0: Iterate, store the result, decrement num_samples`
			`# num_samples < -1: Iterate, don't store, increment num_samples`
			`# num_samples == -1: Done fusing, enter normal flow`
			`# TODO: move this to qlocal storage`
			`reg.s32('num_samples')`
			`op.mov.s32(num_samples, -(features.num_fuse_samples+1))`

			`# TODO: Move cp_num to qlocal storage (or spill it, rarely accessed)`
Finally runs again 2010-09-06 11:18:20 -04:00			`reg.u32('cp_idx cpA')`
			`op.mov.u32(cp_idx, 0)`
Time to go have nightmares about this code again (no really) 2010-09-03 00:52:27 -04:00
			`label('cp_loop_start')`
			`op.bar.sync(0)`

			`with block('Check to see if this is the last CP'):`
			`reg.u32('num_cps')`
			`reg.pred('p_last_cp')`
			`op.ldu.u32(num_cps, addr(g_num_cps))`
Finally runs again 2010-09-06 11:18:20 -04:00			`op.setp.ge.u32(p_last_cp, cp_idx, num_cps)`
Time to go have nightmares about this code again (no really) 2010-09-03 00:52:27 -04:00			`op.bra.uni('all_cps_done', ifp=p_last_cp)`

			`with block('Load CP address'):`
			`op.mov.u32(cpA, g_cp_array)`
Switch from to_inject() to object insertion. One less kludge to deal with. 2010-09-06 16:09:37 -04:00			`op.mad.lo.u32(cpA, cp_idx, cp.stream_size, cpA)`
Time to go have nightmares about this code again (no really) 2010-09-03 00:52:27 -04:00
Finally runs again 2010-09-06 11:18:20 -04:00			`with block('Increment CP index, load num_samples (unless in fuse)'):`
			`reg.pred('p_not_in_fuse')`
			`op.setp.ge.s32(p_not_in_fuse, num_samples, 0)`
			`op.add.u32(cp_idx, cp_idx, 1, ifp=p_not_in_fuse)`
Switch from to_inject() to object insertion. One less kludge to deal with. 2010-09-06 16:09:37 -04:00			`cp.get(cpA, num_samples, 'samples_per_thread',`
Finally runs again 2010-09-06 11:18:20 -04:00			`ifp=p_not_in_fuse)`
Time to go have nightmares about this code again (no really) 2010-09-03 00:52:27 -04:00
			`label('fuse_loop_start')`
			`with block('FUSE-specific stuff'):`
			`reg.pred('p_fuse')`
			`comment('If num_samples == -1, set it to 0 and jump back up')`
			`comment('This will start the normal CP loading machinery')`
			`op.setp.eq.s32(p_fuse, num_samples, -1)`
Finally runs again 2010-09-06 11:18:20 -04:00			`op.mov.s32(num_samples, 0, ifp=p_fuse)`
Time to go have nightmares about this code again (no really) 2010-09-03 00:52:27 -04:00			`op.bra.uni(cp_loop_start, ifp=p_fuse)`

			`comment('If num_samples < -1, still fusing, so increment')`
			`op.setp.lt.s32(p_fuse, num_samples, -1)`
			`op.add.s32(num_samples, num_samples, 1, ifp=p_fuse)`

			`label('iter_loop_start')`

			`comment('Do... well, most of everything')`

			`op.add.u32(num_rounds, num_rounds, 1)`

			`with block("Test if we're still in FUSE"):`
			`reg.pred('p_in_fuse')`
			`op.setp.lt.s32(p_in_fuse, num_samples, 0)`
Finally runs again 2010-09-06 11:18:20 -04:00			`op.bra.uni(fuse_loop_start, ifp=p_in_fuse)`
Time to go have nightmares about this code again (no really) 2010-09-03 00:52:27 -04:00
			`with block("Ordinarily, we'd write the result here"):`
			`op.add.u32(num_writes, num_writes, 1)`

			`with block("Check to see if we're done with this CP"):`
			`reg.pred('p_cp_done')`
Finally runs again 2010-09-06 11:18:20 -04:00			`op.add.s32(num_samples, num_samples, -1)`
Time to go have nightmares about this code again (no really) 2010-09-03 00:52:27 -04:00			`op.setp.eq.s32(p_cp_done, num_samples, 0)`
			`op.bra.uni(cp_loop_start, ifp=p_cp_done)`

			`op.bra.uni(iter_loop_start)`

Finally runs again 2010-09-06 11:18:20 -04:00			`label('all_cps_done')`
Time to go have nightmares about this code again (no really) 2010-09-03 00:52:27 -04:00			`# TODO this is for testing, move it to a debug statement`
Switch from to_inject() to object insertion. One less kludge to deal with. 2010-09-06 16:09:37 -04:00			`std.store_per_thread(g_num_rounds, num_rounds)`
			`std.store_per_thread(g_num_writes, num_writes)`
Time to go have nightmares about this code again (no really) 2010-09-03 00:52:27 -04:00
Finally runs again 2010-09-06 11:18:20 -04:00			`def upload_cp_stream(self, ctx, cp_stream, num_cps):`
			`cp_array_dp, cp_array_l = ctx.mod.get_global('g_cp_array')`
			`assert len(cp_stream) <= cp_array_l, "Stream too big!"`
			`cuda.memcpy_htod_async(cp_array_dp, cp_stream)`
			`num_cps_dp, num_cps_l = ctx.mod.get_global('g_num_cps')`
			`cuda.memcpy_htod_async(num_cps_dp, struct.pack('i', num_cps))`
			`self.cps_uploaded = True`

Time to go have nightmares about this code again (no really) 2010-09-03 00:52:27 -04:00			`def call(self, ctx):`
Finally runs again 2010-09-06 11:18:20 -04:00			`if not self.cps_uploaded:`
			`raise Error("Cannot call IterThread before uploading CPs")`
			`func = ctx.mod.get_function('iter_thread')`
			`dtime = func(block=ctx.block, grid=ctx.grid, time_kernel=True)`
Time to go have nightmares about this code again (no really) 2010-09-03 00:52:27 -04:00
Finally runs again 2010-09-06 11:18:20 -04:00			`num_rounds_dp, num_rounds_l = ctx.mod.get_global('g_num_rounds')`
			`num_writes_dp, num_writes_l = ctx.mod.get_global('g_num_writes')`
			`rounds = cuda.from_device(num_rounds_dp, ctx.threads, np.uint32)`
			`writes = cuda.from_device(num_writes_dp, ctx.threads, np.uint32)`
			`print "Rounds:", rounds`
			`print "Writes:", writes`
Time to go have nightmares about this code again (no really) 2010-09-03 00:52:27 -04:00
Splitting things up a bit 2010-08-28 16:56:05 -04:00			`class MWCRNG(PTXFragment):`
Switch from to_inject() to object insertion. One less kludge to deal with. 2010-09-06 16:09:37 -04:00			`shortname = "mwc"`

Splitting things up a bit 2010-08-28 16:56:05 -04:00			`def __init__(self):`
Finally runs again 2010-09-06 11:18:20 -04:00			`self.rand = np.random`
Before I rip out tempita and start a DSL 2010-08-30 14:45:44 -04:00			`self.threads_ready = 0`
Splitting things up a bit 2010-08-28 16:56:05 -04:00			`if not os.path.isfile('primes.bin'):`
			`raise EnvironmentError('primes.bin not found')`

Finally runs again 2010-09-06 11:18:20 -04:00			`def set_seed(self, seed):`
			`self.rand = np.random.mtrand.RandomState(seed)`

PTX DSL working, at least well enough to pass MWCRNGTest 2010-09-01 21:09:40 -04:00			`@ptx_func`
Known broken checkin to show algorias 2010-09-01 13:02:12 -04:00			`def module_setup(self):`
			`mem.global_.u32('mwc_rng_mults', ctx.threads)`
PTX DSL working, at least well enough to pass MWCRNGTest 2010-09-01 21:09:40 -04:00			`mem.global_.u64('mwc_rng_state', ctx.threads)`
Known broken checkin to show algorias 2010-09-01 13:02:12 -04:00
PTX DSL working, at least well enough to pass MWCRNGTest 2010-09-01 21:09:40 -04:00			`@ptx_func`
Known broken checkin to show algorias 2010-09-01 13:02:12 -04:00			`def entry_setup(self):`
			`reg.u32('mwc_st mwc_mult mwc_car')`
			`with block('Load MWC multipliers and states'):`
			`reg.u32('mwc_off mwc_addr')`
Switch from to_inject() to object insertion. One less kludge to deal with. 2010-09-06 16:09:37 -04:00			`std.get_gtid(mwc_off)`
Known broken checkin to show algorias 2010-09-01 13:02:12 -04:00			`op.mov.u32(mwc_addr, mwc_rng_mults)`
			`op.mad.lo.u32(mwc_addr, mwc_off, 4, mwc_addr)`
			`op.ld.global_.u32(mwc_mult, addr(mwc_addr))`

			`op.mov.u32(mwc_addr, mwc_rng_state)`
			`op.mad.lo.u32(mwc_addr, mwc_off, 8, mwc_addr)`
			`op.ld.global_.v2.u32(vec(mwc_st, mwc_car), addr(mwc_addr))`

PTX DSL working, at least well enough to pass MWCRNGTest 2010-09-01 21:09:40 -04:00			`@ptx_func`
Known broken checkin to show algorias 2010-09-01 13:02:12 -04:00			`def entry_teardown(self):`
			`with block('Save MWC states'):`
			`reg.u32('mwc_off mwc_addr')`
Switch from to_inject() to object insertion. One less kludge to deal with. 2010-09-06 16:09:37 -04:00			`std.get_gtid(mwc_off)`
Known broken checkin to show algorias 2010-09-01 13:02:12 -04:00			`op.mov.u32(mwc_addr, mwc_rng_state)`
			`op.mad.lo.u32(mwc_addr, mwc_off, 8, mwc_addr)`
			`op.st.global_.v2.u32(addr(mwc_addr), vec(mwc_st, mwc_car))`

Random floats (I think) 2010-09-06 14:19:06 -04:00			`@ptx_func`
			`def _next(self):`
			`# Call from inside a block!`
			`reg.u64('mwc_out')`
			`op.cvt.u64.u32(mwc_out, mwc_car)`
			`op.mad.wide.u32(mwc_out, mwc_st, mwc_mult, mwc_out)`
			`op.mov.b64(vec(mwc_st, mwc_car), mwc_out)`

PTX DSL working, at least well enough to pass MWCRNGTest 2010-09-01 21:09:40 -04:00			`@ptx_func`
Known broken checkin to show algorias 2010-09-01 13:02:12 -04:00			`def next_b32(self, dst_reg):`
Random floats (I think) 2010-09-06 14:19:06 -04:00			`with block('Load next random u32 into ' + dst_reg.name):`
			`self._next()`
PTX DSL working, at least well enough to pass MWCRNGTest 2010-09-01 21:09:40 -04:00			`op.mov.u32(dst_reg, mwc_st)`
Splitting things up a bit 2010-08-28 16:56:05 -04:00
Random floats (I think) 2010-09-06 14:19:06 -04:00			`@ptx_func`
			`def next_f32_01(self, dst_reg):`
			`# TODO: verify that this is the fastest-performance method`
			`# TODO: verify that this actually does what I think it does`
			`with block('Load random float [0,1] into ' + dst_reg.name):`
			`self._next()`
			`op.cvt.rn.f32.u32(dst_reg, mwc_st)`
			`op.mul.f32(dst_reg, dst_reg, '0f0000802F') # 1./(1<<32)`

			`@ptx_func`
			`def next_f32_11(self, dst_reg):`
			`with block('Load random float [-1,1) into ' + dst_reg.name):`
			`self._next()`
			`op.cvt.rn.f32.s32(dst_reg, mwc_st)`
			`op.mul.lo.f32(dst_reg, dst_reg, '0f00000030') # 1./(1<<31)`

PTX DSL working, at least well enough to pass MWCRNGTest 2010-09-01 21:09:40 -04:00			`def device_init(self, ctx):`
Before I rip out tempita and start a DSL 2010-08-30 14:45:44 -04:00			`if self.threads_ready >= ctx.threads:`
Known broken checkin to show algorias 2010-09-01 13:02:12 -04:00			`# Already set up enough random states, don't push again`
Before I rip out tempita and start a DSL 2010-08-30 14:45:44 -04:00			`return`
Known broken checkin to show algorias 2010-09-01 13:02:12 -04:00
Splitting things up a bit 2010-08-28 16:56:05 -04:00			`# Load raw big-endian u32 multipliers from primes.bin.`
			`with open('primes.bin') as primefp:`
			`dt = np.dtype(np.uint32).newbyteorder('B')`
			`mults = np.frombuffer(primefp.read(), dtype=dt)`
Before I rip out tempita and start a DSL 2010-08-30 14:45:44 -04:00			`stream = cuda.Stream()`
Splitting things up a bit 2010-08-28 16:56:05 -04:00			`# Randomness in choosing multipliers is good, but larger multipliers`
			`# have longer periods, which is also good. This is a compromise.`
Before I rip out tempita and start a DSL 2010-08-30 14:45:44 -04:00			`mults = np.array(mults[:ctx.threads*4])`
Finally runs again 2010-09-06 11:18:20 -04:00			`self.rand.shuffle(mults)`
Splitting things up a bit 2010-08-28 16:56:05 -04:00			`# Copy multipliers and seeds to the device`
			`multdp, multl = ctx.mod.get_global('mwc_rng_mults')`
Before I rip out tempita and start a DSL 2010-08-30 14:45:44 -04:00			`cuda.memcpy_htod_async(multdp, mults.tostring()[:multl])`
			`# Intentionally excludes both 0 and (2^32-1), as they can lead to`
			`# degenerate sequences of period 0`
Finally runs again 2010-09-06 11:18:20 -04:00			`states = np.array(self.rand.randint(1, 0xffffffff, size=2*ctx.threads),`
Before I rip out tempita and start a DSL 2010-08-30 14:45:44 -04:00			`dtype=np.uint32)`
Splitting things up a bit 2010-08-28 16:56:05 -04:00			`statedp, statel = ctx.mod.get_global('mwc_rng_state')`
Before I rip out tempita and start a DSL 2010-08-30 14:45:44 -04:00			`cuda.memcpy_htod_async(statedp, states.tostring())`
			`self.threads_ready = ctx.threads`
Splitting things up a bit 2010-08-28 16:56:05 -04:00
PTX DSL working, at least well enough to pass MWCRNGTest 2010-09-01 21:09:40 -04:00			`def tests(self):`
Splitting things up a bit 2010-08-28 16:56:05 -04:00			`return [MWCRNGTest]`

			`class MWCRNGTest(PTXTest):`
			`name = "MWC RNG sum-of-threads"`
Remove some dead code 2010-09-01 22:46:55 -04:00			`rounds = 5000`
Known broken checkin to show algorias 2010-09-01 13:02:12 -04:00			`entry_name = 'MWC_RNG_test'`
			`entry_params = ''`

PTX DSL working, at least well enough to pass MWCRNGTest 2010-09-01 21:09:40 -04:00			`def deps(self):`
			`return [MWCRNG]`

			`@ptx_func`
Known broken checkin to show algorias 2010-09-01 13:02:12 -04:00			`def module_setup(self):`
PTX DSL working, at least well enough to pass MWCRNGTest 2010-09-01 21:09:40 -04:00			`mem.global_.u64('mwc_rng_test_sums', ctx.threads)`
Known broken checkin to show algorias 2010-09-01 13:02:12 -04:00
			`@ptx_func`
			`def entry(self):`
			`reg.u64('sum addl')`
			`reg.u32('addend')`
			`op.mov.u64(sum, 0)`
			`with block('Sum next %d random numbers' % self.rounds):`
			`reg.u32('loopct')`
PTX DSL working, at least well enough to pass MWCRNGTest 2010-09-01 21:09:40 -04:00			`reg.pred('p')`
Known broken checkin to show algorias 2010-09-01 13:02:12 -04:00			`op.mov.u32(loopct, self.rounds)`
			`label('loopstart')`
Switch from to_inject() to object insertion. One less kludge to deal with. 2010-09-06 16:09:37 -04:00			`mwc.next_b32(addend)`
Known broken checkin to show algorias 2010-09-01 13:02:12 -04:00			`op.cvt.u64.u32(addl, addend)`
			`op.add.u64(sum, sum, addl)`
			`op.sub.u32(loopct, loopct, 1)`
			`op.setp.gt.u32(p, loopct, 0)`
			`op.bra.uni(loopstart, ifp=p)`
Splitting things up a bit 2010-08-28 16:56:05 -04:00
Known broken checkin to show algorias 2010-09-01 13:02:12 -04:00			`with block('Store sum and state'):`
			`reg.u32('adr offset')`
Switch from to_inject() to object insertion. One less kludge to deal with. 2010-09-06 16:09:37 -04:00			`std.get_gtid(offset)`
Known broken checkin to show algorias 2010-09-01 13:02:12 -04:00			`op.mov.u32(adr, mwc_rng_test_sums)`
			`op.mad.lo.u32(adr, offset, 8, adr)`
PTX DSL working, at least well enough to pass MWCRNGTest 2010-09-01 21:09:40 -04:00			`op.st.global_.u64(addr(adr), sum)`
Splitting things up a bit 2010-08-28 16:56:05 -04:00
			`def call(self, ctx):`
			`# Get current multipliers and seeds from the device`
			`multdp, multl = ctx.mod.get_global('mwc_rng_mults')`
			`mults = cuda.from_device(multdp, ctx.threads, np.uint32)`
			`statedp, statel = ctx.mod.get_global('mwc_rng_state')`
			`fullstates = cuda.from_device(statedp, ctx.threads, np.uint64)`
			`sums = np.zeros(ctx.threads, np.uint64)`

Remove some dead code 2010-09-01 22:46:55 -04:00			`print "Running %d states forward %d rounds" % (len(mults), self.rounds)`
Splitting things up a bit 2010-08-28 16:56:05 -04:00			`ctime = time.time()`
			`for i in range(self.rounds):`
			`states = fullstates & 0xffffffff`
			`carries = fullstates >> 32`
			`fullstates = mults * states + carries`
			`sums = sums + (fullstates & 0xffffffff)`
			`ctime = time.time() - ctime`
			`print "Done on host, took %g seconds" % ctime`

			`func = ctx.mod.get_function('MWC_RNG_test')`
			`dtime = func(block=ctx.block, grid=ctx.grid, time_kernel=True)`
			`print "Done on device, took %g seconds (%gx)" % (dtime, ctime/dtime)`
			`dfullstates = cuda.from_device(statedp, ctx.threads, np.uint64)`
			`if not (dfullstates == fullstates).all():`
			`print "State discrepancy"`
			`print dfullstates`
			`print fullstates`
			`return False`

			`sumdp, suml = ctx.mod.get_global('mwc_rng_test_sums')`
			`dsums = cuda.from_device(sumdp, ctx.threads, np.uint64)`
			`if not (dsums == sums).all():`
			`print "Sum discrepancy"`
			`print dsums`
			`print sums`
			`return False`
			`return True`

Before I rip out tempita and start a DSL 2010-08-30 14:45:44 -04:00			`class CameraCoordTransform(PTXFragment):`
Remove some dead code 2010-09-01 22:46:55 -04:00			`pass`
Before I rip out tempita and start a DSL 2010-08-30 14:45:44 -04:00
Finally runs again 2010-09-06 11:18:20 -04:00			`class CPDataStream(DataStream):`
Time to go have nightmares about this code again (no really) 2010-09-03 00:52:27 -04:00			`"""DataStream which stores the control points."""`
Switch from to_inject() to object insertion. One less kludge to deal with. 2010-09-06 16:09:37 -04:00			`shortname = 'cp'`
Splitting things up a bit 2010-08-28 16:56:05 -04:00