import os
import time

import pycuda.driver as cuda
import numpy as np

from cuburnlib.ptx import PTXFragment, PTXEntryPoint, PTXTest

class MWCRNG(PTXFragment):
    def __init__(self):
        self.threads_ready = 0
        if not os.path.isfile('primes.bin'):
            raise EnvironmentError('primes.bin not found')

    prelude = (".global .u32 mwc_rng_mults[{{ctx.threads}}];\n"
               ".global .u64 mwc_rng_state[{{ctx.threads}}];")

    def _next_b32(self, dreg):
        # TODO: make sure PTX optimizes away superfluous move instrs
        return """
        {
        // MWC next b32
        .reg .u64       mwc_out;
        cvt.u64.u32     mwc_out,    mwc_car;
        mad.wide.u32    mwc_out,    mwc_st,     mwc_mult,   mwc_out;
        mov.b64         {mwc_st,    mwc_car},   mwc_out;
        mov.u32         %s,         mwc_st;
        }
        """ % dreg

    def subs(self, ctx):
        return {'mwc_next_b32': self._next_b32}

    entry_start = """
    .reg .u32 mwc_st, mwc_mult, mwc_car;
    {
        // MWC load multipliers and RNG states
        .reg .u32       mwc_off, mwc_addr;
        {{ get_gtid('mwc_off') }}
        mov.u32         mwc_addr,   mwc_rng_mults;
        mad.lo.u32      mwc_addr,   mwc_off,    4,  mwc_addr;
        ld.global.u32   mwc_mult,   [mwc_addr];
        mov.u32         mwc_addr,   mwc_rng_state;
        mad.lo.u32      mwc_addr,   mwc_off,    8,  mwc_addr;
        ld.global.v2.u32 {mwc_st, mwc_car}, [mwc_addr];
    }
    """

    entry_end = """
    {
        // MWC save states
        .reg .u32       mwc_addr, mwc_off;
        {{ get_gtid('mwc_off') }}
        mov.u32         mwc_addr,   mwc_rng_state;
        mad.lo.u32      mwc_addr,   mwc_off,    8,      mwc_addr;
        st.global.v2.u32    [mwc_addr],     {mwc_st, mwc_car};
    }
    """

    def set_up(self, ctx):
        if self.threads_ready >= ctx.threads:
            return
        # Load raw big-endian u32 multipliers from primes.bin.
        with open('primes.bin') as primefp:
            dt = np.dtype(np.uint32).newbyteorder('B')
            mults = np.frombuffer(primefp.read(), dtype=dt)
        stream = cuda.Stream()
        # Randomness in choosing multipliers is good, but larger multipliers
        # have longer periods, which is also good. This is a compromise.
        mults = np.array(mults[:ctx.threads*4])
        ctx.rand.shuffle(mults)
        # Copy multipliers and seeds to the device
        multdp, multl = ctx.mod.get_global('mwc_rng_mults')
        cuda.memcpy_htod_async(multdp, mults.tostring()[:multl])
        # Intentionally excludes both 0 and (2^32-1), as they can lead to
        # degenerate sequences of period 0
        states = np.array(ctx.rand.randint(1, 0xffffffff, size=2*ctx.threads),
                          dtype=np.uint32)
        statedp, statel = ctx.mod.get_global('mwc_rng_state')
        cuda.memcpy_htod_async(statedp, states.tostring())
        self.threads_ready = ctx.threads

    def tests(self, ctx):
        return [MWCRNGTest]

class MWCRNGTest(PTXTest):
    name = "MWC RNG sum-of-threads"
    deps = [MWCRNG]
    rounds = 10000

    prelude = ".global .u64 mwc_rng_test_sums[{{ctx.threads}}];"

    def entry(self, ctx):
        return ('MWC_RNG_test', '', """
            .reg .u64   sum, addl;
            .reg .u32   addend;
            mov.u64     sum,    0;
            {
                .reg .u32   loopct;
                .reg .pred  p;
                mov.u32     loopct, %s;
loopstart:
                {{ mwc_next_b32('addend') }}
                cvt.u64.u32 addl,   addend;
                add.u64     sum,    sum,    addl;
                sub.u32     loopct, loopct, 1;
                setp.gt.u32 p,      loopct, 0;
            @p  bra.uni     loopstart;
            }
            {
                .reg .u32       addr, offset;
                {{ get_gtid('offset') }}
                mov.u32         addr,   mwc_rng_test_sums;
                mad.lo.u32      addr,   offset,     8,      addr;
                st.global.u64   [addr], sum;
            }
            """ % self.rounds)

    def call(self, ctx):
        # Get current multipliers and seeds from the device
        multdp, multl = ctx.mod.get_global('mwc_rng_mults')
        mults = cuda.from_device(multdp, ctx.threads, np.uint32)
        statedp, statel = ctx.mod.get_global('mwc_rng_state')
        fullstates = cuda.from_device(statedp, ctx.threads, np.uint64)
        sums = np.zeros(ctx.threads, np.uint64)

        print "Running states forward %d rounds" % self.rounds
        ctime = time.time()
        for i in range(self.rounds):
            states = fullstates & 0xffffffff
            carries = fullstates >> 32
            fullstates = mults * states + carries
            sums = sums + (fullstates & 0xffffffff)
        ctime = time.time() - ctime
        print "Done on host, took %g seconds" % ctime

        func = ctx.mod.get_function('MWC_RNG_test')
        dtime = func(block=ctx.block, grid=ctx.grid, time_kernel=True)
        print "Done on device, took %g seconds (%gx)" % (dtime, ctime/dtime)
        dfullstates = cuda.from_device(statedp, ctx.threads, np.uint64)
        print dfullstates, fullstates
        if not (dfullstates == fullstates).all():
            print "State discrepancy"
            print dfullstates
            print fullstates
            return False

        sumdp, suml = ctx.mod.get_global('mwc_rng_test_sums')
        dsums = cuda.from_device(sumdp, ctx.threads, np.uint64)
        print dsums, sums
        if not (dsums == sums).all():
            print "Sum discrepancy"
            print dsums
            print sums
            return False
        return True

class CameraCoordTransform(PTXFragment):
    # This is here until I get the device stream packer going, or decide on
    # how to handle C struct addressing if we go for unpacked structures
    prelude = ".global .u32 camera_coords[8];"

    def _cam_coord_xf(self, x, y, dreg):
        """
        Given `.f32 x, y`, a coordinate in IFS space, writes the integer
        offset from the start of the sampling lattice into `.u32 dreg`.
        """

        return """{
        .pred is_badval;
        // TODO: This will change when data streaming is done
        .reg .u32 camera_coord_address;
        mov.u32 camera_coord_address, camera_coords;
        // TODO: see if preloading everything hurts register count
        .reg .f32 width_scale, width_upper_bound, height_scale, height_upper_bound;
        ldu.v4.f32 {width_scale, width_upper_bound,
                    height_scale, height_upper_bound},
                   [camera_coord_address+0];
        .reg .f32 x_xf, y_xf;
        mad.rz.f32  x_xf,   x,  width_scale"""
        # TODO unfinished