#!/usr/bin/python
#
# flam3cuda, one of a surprisingly large number of ports of the fractal flame
# algorithm to NVIDIA GPUs.
#
# This one is copyright 2010 Steven Robertson <steven@strobe.cc>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 or later
# as published by the Free Software Foundation.

import os
import sys
import ctypes
import struct

import tempita

# These imports are order-sensitive!
import pyglet
import pyglet.gl as gl
gl.get_current_context()

import pycuda.driver as cuda
import pycuda.gl as cudagl
import pycuda.gl.autoinit
from pycuda.compiler import SourceModule

from multiprocessing import Process, Queue

import numpy as np

from fr0stlib import pyflam3

# PTX header and functions used for debugging.
#
# The module text below declares PTX ISA version 2.0 for target sm_20 and
# defines two symbols:
#
#   get_gtid        - device function returning ctaid.x * ntid.x + tid.x,
#                     i.e. the calling thread's index along x across the
#                     whole launch. The product uses mul24, so only the low
#                     24 bits of each operand take part in it.
#   write_to_buffer - entry point in which every thread stores that index as
#                     a 32-bit word at address bufbase + 4 * index.
#
# NOTE(review): bufbase is declared as .u32, so this presumably assumes
# 32-bit device pointers -- confirm before using it on a 64-bit build.
prelude = """
.version 2.0
.target sm_20

.func (.reg .u32 $ret) get_gtid ()
{
    .reg .u16 tmp;
    .reg .u32 cta, ncta, tid, gtid;

    mov.u16         tmp,    %ctaid.x;
    cvt.u32.u16     cta,    tmp;
    mov.u16         tmp,    %ntid.x;
    cvt.u32.u16     ncta,   tmp;
    mul24.lo.u32    gtid,   cta,    ncta;

    mov.u16         tmp,    %tid.x;
    cvt.u32.u16     tid,    tmp;
    add.u32         gtid,   gtid,   tid;
    mov.b32         $ret,   gtid;
    ret;
}

.entry write_to_buffer ( .param .u32 bufbase )
{
    .reg .u32 base, gtid, off;

    ld.param.u32    base,       [bufbase];
    call.uni        (off),      get_gtid,   ();
    mad24.lo.u32    base,       off,        4,          base;
    st.volatile.global.b32      [base],     off;
}
"""

class CUGenome(pyflam3.Genome):
    """Genome subclass that supplies its own ``_render`` implementation."""

    def _render(self, frame, trans):
        """
        Render ``frame`` with ``pyflam3.flam3_render`` into a fresh buffer.

        The output buffer holds ``3 + trans`` unsigned bytes for each of the
        ``self.width * self.height`` pixels (``trans`` is presumably 1 when
        a transparency channel is wanted and 0 otherwise -- confirm against
        the caller).

        Returns a ``(buffer, stats, frame)`` tuple, where ``stats`` is the
        ``pyflam3.RenderStats`` instance filled in by the render call.
        """
        channels = 3 + trans
        buffer_type = ctypes.c_ubyte * (channels * self.width * self.height)
        out_buffer = buffer_type()
        render_stats = pyflam3.RenderStats()
        pyflam3.flam3_render(
            ctypes.byref(frame),
            out_buffer,
            pyflam3.flam3_field_both,
            channels,
            trans,
            ctypes.byref(render_stats),
        )
        return out_buffer, render_stats, frame

class LaunchContext(object):
    """
    Launch state handed to PTX fragments as their ``ctx`` argument.

    Attributes:
        block, grid: launch dimensions as passed to ``set_size``; ``None``
            until ``set_size`` has been called.
        threads: product of every entry of ``block`` and ``grid``; ``None``
            until ``set_size`` has been called.
        stream: the ``cuda.Stream`` created for this context.
        rand: a ``numpy.random.RandomState`` seeded with ``seed``.

    NOTE(review): other code refers to values marked as "tuned" in this
    docstring; presumably ``block``, ``grid`` and ``threads`` are those
    values -- confirm and mark them here.
    """

    def __init__(self, seed=None):
        """Create the CUDA stream and the host-side random state.

        ``seed`` is forwarded to ``numpy.random.RandomState``; ``None``
        lets numpy choose the seed.
        """
        self.block, self.grid, self.threads = None, None, None
        self.stream = cuda.Stream()
        # The bare name ``mtrand`` is never imported by this file; the same
        # RandomState class is reachable through the imported numpy module.
        self.rand = np.random.RandomState(seed)

    def set_size(self, block, grid):
        """Record the launch dimensions and the resulting thread count.

        ``block`` and ``grid`` are sequences of integer dimensions. The
        thread count is the product of all of their entries (1 if both are
        empty).
        """
        self.block, self.grid = block, grid
        # Explicit loop instead of the builtin ``reduce``, which only exists
        # as a builtin on Python 2 and raises on an empty sequence.
        total = 1
        for dim in tuple(block) + tuple(grid):
            total *= dim
        self.threads = total

class PTXFragment(object):
    """
    Base class wrapping one section of template PTX.

    The PTX module may be regenerated and recompiled several times with
    different or incomplete launch context parameters, both to get the best
    optimization and to avoid a web of hard-coded parameters. Methods that
    take part in code generation should therefore not touch the GPU, and
    should not rely on context values that the LaunchContext docstring marks
    as "tuned" being available.

    On the final compilation pass every "tuned" value is guaranteed to hold
    its final value for the stream.

    Templates are expanded recursively, always with the same namespace,
    until no "{{" remains.

    Every method here is a no-op default for subclasses to override.
    """

    def deps(self, ctx):
        """
        List the PTXFragment objects this fragment needs in order to
        compile. Circular dependencies are forbidden.

        The default is a new empty list.
        """
        return list()

    def subs(self, ctx):
        """
        Give the entries this fragment adds to the template substitution
        namespace. The full namespace, including that of all dependencies,
        is assembled before any template is evaluated.

        The default is a new empty dict.
        """
        return dict()

    def prelude(self, ctx):
        """
        Give a template string of code (most likely variable declarations)
        to insert at module scope. Preludes of all deps are inserted above
        this one.

        The default is the empty string.
        """
        return ''

    def entryPrelude(self, ctx):
        """
        Give a template string to insert at the top of every entry point
        that depends on this fragment. Entry preludes of all deps are
        inserted above this one.

        The default is the empty string.
        """
        return ''

    def setUp(self, ctx):
        """
        Perform start-of-stream initialization, for example copying data to
        the device.

        The default does nothing and returns None.
        """
        return None

    def test(self, ctx):
        """
        Run device tests: return True on success, False on failure, or raise
        an exception.

        The default reports success.
        """
        return True

class PTXEntryPoint(PTXFragment):
    """A PTXFragment that additionally provides a PTX entry point."""

    def entry(self, ctx):
        """
        Give a template string for a PTX entry point.

        The default does nothing and returns None.
        """
        return None

    def call(self, ctx):
        """
        Invoke the entry point on the device. The details of this hook have
        not been worked out yet.

        The default does nothing and returns None.
        """
        return None


class DeviceHelpers(PTXFragment):
    """Included by default, so other fragments need not depend on it."""

    def subs(self, ctx):
        """
        Provide ``PTRT``: '.u64' when a host ``void *`` is 8 bytes wide,
        '.u32' otherwise.
        """
        pointer_is_64bit = ctypes.sizeof(ctypes.c_void_p) == 8
        pointer_type = '.u64' if pointer_is_64bit else '.u32'
        return {'PTRT': pointer_type}

class MWCRandGen(PTXFragment):
    """
    Multiply-with-carry random number generator fragment.

    Declares two module-scope ``.const`` pointers and, at stream start-up,
    uploads a table of multipliers read from ``primes.bin`` together with
    random seed bytes.

    NOTE(review): the PTX template (``_next_b32``), the device test and the
    launch hook were left unfinished in the original source. They raise
    NotImplementedError here instead of guessing at the missing code.
    """

    _prelude = """
    .const {{PTRT}} mwc_rng_mults_p;
    .const {{PTRT}} mwc_rng_seeds_p;
    """

    @staticmethod
    def _primes_path():
        """Return the path of ``primes.bin`` next to this source file."""
        here = os.path.dirname(os.path.abspath(__file__))
        return os.path.join(here, 'primes.bin')

    def __init__(self):
        """Raise EnvironmentError if ``primes.bin`` is not beside this file."""
        if not os.path.isfile(self._primes_path()):
            raise EnvironmentError('primes.bin not found')

    def prelude(self, ctx=None):
        """
        Return the module-scope declarations for this fragment.

        ``ctx`` is accepted to match ``PTXFragment.prelude``; it is optional
        so that calls without an argument keep working.
        """
        return self._prelude

    def setUp(self, ctx):
        """
        Copy multipliers and seeds to the device.

        Reads ``ctx.threads``, ``ctx.rand``, ``ctx.mod`` and ``ctx.stream``.

        NOTE(review): the globals looked up here ('mwc_rng_mults',
        'mwc_rng_seeds') do not match the names declared in ``_prelude``
        ('mwc_rng_mults_p', 'mwc_rng_seeds_p') -- confirm which is intended.
        """
        # Load raw big-endian u32 multipliers from primes.bin. The file is
        # binary, and is looked up where __init__ checked for it rather than
        # in the current working directory.
        with open(self._primes_path(), 'rb') as primefp:
            raw = primefp.read()
        # frombuffer() on a bytes object yields a read-only view; astype()
        # produces a writable copy in native byte order, so the values can
        # be shuffled in place and are uploaded as the numbers that were
        # read instead of as byte-swapped big-endian words.
        mults = np.frombuffer(raw, dtype=np.dtype('>u4')).astype(np.uint32)
        # Randomness in choosing multipliers is good, but larger multipliers
        # have longer periods, which is also good. This is a compromise.
        ctx.rand.shuffle(mults[:ctx.threads*4])
        # Copy multipliers and seeds to the device
        devmp, devml = ctx.mod.get_global('mwc_rng_mults')
        cuda.memcpy_htod_async(devmp, mults.tobytes()[:devml], ctx.stream)
        devsp, devsl = ctx.mod.get_global('mwc_rng_seeds')
        cuda.memcpy_htod_async(devsp, ctx.rand.bytes(devsl), ctx.stream)

    def _next_b32(self, dreg):
        """
        Return PTX that places the next 32 random bits in register ``dreg``.

        Not written yet: the original body stopped after an incomplete
        ``mul.wide.u32`` instruction.
        """
        raise NotImplementedError('MWCRandGen._next_b32 is not written yet')

    def templates(self, ctx):
        """Return a dict mapping template names to their generator callables."""
        return {'mwc_next_b32': self._next_b32}

    def test(self, ctx):
        """Device test for the generator; not written yet."""
        raise NotImplementedError('MWCRandGen.test is not written yet')

    def launch(self, ctx):
        """Launch hook for the generator; not written yet."""
        raise NotImplementedError('MWCRandGen.launch is not written yet')


def main(genome_path):
    """
    Program entry point; currently a placeholder that does nothing.

    ``genome_path`` is the genome file path given on the command line. It
    is not used yet.

    The commented-out code below is kept as a sketch: it appears to render
    the genome through ``CUGenome`` and show the result in a pyglet window.
    """

    #with open(genome_path) as fp:
        #genome = CUGenome.from_string(fp.read())[0]
    #genome.width, genome.height = 512, 512
    #genome.sample_density = 1000
    #obuf, stats, frame = genome.render(estimator=3)
    #gc.collect()

        ##q.put(str(obuf))
    ##p = Process(target=render, args=(q, genome_path))
    ##p.start()

    #window = pyglet.window.Window()
    #image = pyglet.image.ImageData(genome.width, genome.height, 'RGB', obuf)
    #tex = image.texture

    #@window.event
    #def on_draw():
        #window.clear()
        #tex.blit(0, 0)

    #pyglet.app.run()

if __name__ == "__main__":
    # The first command-line argument must name an existing genome file.
    if len(sys.argv) < 2 or not os.path.isfile(sys.argv[1]):
        # The parenthesised single-argument form prints the same text under
        # the Python 2 print statement and the Python 3 print function.
        print("First argument must be a path to a genome file")
        sys.exit(1)
    main(sys.argv[1])