From a23ebdcf5f783095b949a9e05fa5b016da9133d7 Mon Sep 17 00:00:00 2001
From: Steven Robertson <steven@strobe.cc>
Date: Fri, 27 Aug 2010 12:28:02 -0400
Subject: [PATCH] Initial commit.

---
 helpers/genprimes.c |  57 ++++++++++
 main.py             | 257 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 314 insertions(+)
 create mode 100644 helpers/genprimes.c
 create mode 100644 main.py

diff --git a/helpers/genprimes.c b/helpers/genprimes.c
new file mode 100644
index 0000000..71962d5
--- /dev/null
+++ b/helpers/genprimes.c
@@ -0,0 +1,57 @@
+/* Public domain, FWIW
+ * gcc -o genprimes -lgmp genprimes.c; ./genprimes > primes.bin
+ * see http://www.ast.cam.ac.uk/~stg20/cuda/random/index.html
+ */
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <gmp.h>
+
+/* To verify the primes against the linked URL, compile this instead:
+int main() {
+    FILE *fp = fopen("primes.bin", "r");
+    char stuff[5];
+    while (fread(stuff, 4, 1, fp) == 1) {
+        uint64_t i = *((uint32_t*)stuff);
+        i = (i>>24) + (1<<8)*((i>>16)&0xff) + (1<<16)*((i>>8)&0xff) + (1<<24)*(i&0xff);
+        uint64_t j = i * 4294967296L - 1;
+        uint64_t k = (j-1)/2;
+        printf("%lu %lu %lu\n", i, j, k);
+    }
+}
+*/
+
+int main(int argc, char* argv[]) {
+    fprintf(stderr, "Generating list of multipliers for mod(2^32) MWC RNG\n");
+    mpz_t candidate, twotothethirtytwo;
+    mpz_init(candidate);
+    mpz_init_set_d(twotothethirtytwo, (double) (4294967296L));
+
+    char bytes[5];
+    bytes[4] = 0;
+    unsigned int i, found=0;
+    for (i = 4294967295L; i > 2147483648; i--) {
+        mpz_set_ui(candidate, i);
+        mpz_mul(candidate, candidate, twotothethirtytwo);
+        mpz_sub_ui(candidate, candidate, 1);
+        if(mpz_probab_prime_p(candidate, 200)) {
+            mpz_sub_ui(candidate, candidate, 1);
+            mpz_tdiv_q_ui(candidate, candidate, 2);
+            if(mpz_probab_prime_p(candidate, 200)) {
+                bytes[0] = (i>>24)&0xff;
+                bytes[1] = (i>>16)&0xff;
+                bytes[2] = (i>>8)&0xff;
+                bytes[3] = i&0xff;
+                fwrite(bytes, 4, 1, stdout);
+                found++;
+                if (!(found&0xff)) fprintf(stderr, ".");
+            }
+        }
+    }
+    fprintf(stderr, "\nFound %d multipliers.\n", found);
+    mpz_clear(candidate);
+    mpz_clear(twotothethirtytwo);
+}
+
+
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..c2cdcef
--- /dev/null
+++ b/main.py
@@ -0,0 +1,257 @@
+#!/usr/bin/python
+#
+# flam3cuda, one of a surprisingly large number of ports of the fractal flame
+# algorithm to NVIDIA GPUs.
+#
+# This one is copyright 2010 Steven Robertson <steven@strobe.cc>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 or later
+# as published by the Free Software Foundation.
+
+import os
+import sys
+import ctypes
+import struct
+
+import tempita
+
+# These imports are order-sensitive!
+import pyglet
+import pyglet.gl as gl
+gl.get_current_context()
+
+import pycuda.driver as cuda
+import pycuda.gl as cudagl
+import pycuda.gl.autoinit
+from pycuda.compiler import SourceModule
+
+from multiprocessing import Process, Queue
+
+import numpy as np
+
+from fr0stlib import pyflam3
+
+# PTX header and functions used for debugging.
+prelude = """
+.version 2.0
+.target sm_20
+
+.func (.reg .u32 $ret) get_gtid ()
+{
+    .reg .u16 tmp;
+    .reg .u32 cta, ncta, tid, gtid;
+
+    mov.u16         tmp,    %ctaid.x;
+    cvt.u32.u16     cta,    tmp;
+    mov.u16         tmp,    %ntid.x;
+    cvt.u32.u16     ncta,   tmp;
+    mul24.lo.u32    gtid,   cta,    ncta;
+
+    mov.u16         tmp,    %tid.x;
+    cvt.u32.u16     tid,    tmp;
+    add.u32         gtid,   gtid,   tid;
+    mov.b32         $ret,   gtid;
+    ret;
+}
+
+.entry write_to_buffer ( .param .u32 bufbase )
+{
+    .reg .u32 base, gtid, off;
+
+    ld.param.u32    base,       [bufbase];
+    call.uni        (off),      get_gtid,   ();
+    mad24.lo.u32    base,       off,        4,          base;
+    st.volatile.global.b32      [base],     off;
+}
+"""
+
+class CUGenome(pyflam3.Genome):
+    def _render(self, frame, trans):
+        obuf = (ctypes.c_ubyte * ((3+trans)*self.width*self.height))()
+        stats = pyflam3.RenderStats()
+        pyflam3.flam3_render(ctypes.byref(frame), obuf, pyflam3.flam3_field_both,
+                     trans+3, trans, ctypes.byref(stats))
+        return obuf, stats, frame
+
+class LaunchContext(self):
+    def __init__(self, seed=None):
+        self.block, self.grid, self.threads = None, None, None
+        self.stream = cuda.Stream()
+        self.rand = mtrand.RandomState(seed)
+
+    def set_size(self, block, grid):
+        self.block, self.grid = block, grid
+        self.threads = reduce(lambda a, b: a*b, self.block + self.grid)
+
+class PTXFragment(object):
+    """
+    Wrapper for sections of template PTX.
+
+    In order to provide the best optimization, and avoid a web of hard-coded
+    parameters, the PTX module may be regenerated and recompiled several times
+    with different or incomplete launch context parameters. To this end, avoid
+    accessing the GPU in such functions, and do not depend on context values
+    which are marked as "tuned" in the LaunchContext docstring being available.
+
+    The final compilation pass is guaranteed to have all "tuned" values fixed
+    in their final values for the stream.
+
+    Template code will be processed recursively until all "{{" instances have
+    been replaced, using the same namespace each time.
+    """
+
+    def deps(self, ctx):
+        """
+        Returns a list of PTXFragment objects on which this object depends
+        for successful compilation. Circular dependencies are forbidden.
+        """
+        return []
+
+    def subs(self, ctx):
+        """
+        Returns a dict of items to add to the template substitution namespace.
+        The entire dict will be assembled, including all dependencies, before
+        any templates are evaluated.
+        """
+        return {}
+
+    def prelude(self, ctx):
+        """
+        Returns a template string containing any code (variable declarations,
+        probably) that should be inserted at module scope. The prelude of
+        all deps will be inserted above this prelude.
+        """
+        return ""
+
+    def entryPrelude(self, ctx):
+        """
+        Returns a template string that should be inserted at the top of any
+        entry point which depends on this method. The entry prelude of all
+        deps will be inserted above this entry prelude.
+        """
+        return ""
+
+    def setUp(self, ctx):
+        """
+        Do start-of-stream initialization, such as copying data to the device.
+        """
+        pass
+
+    def test(self, ctx):
+        """
+        Perform device tests. Returns True on success, False on failure,
+        or raises an exception.
+        """
+        return True
+
+class PTXEntryPoint(PTXFragment):
+    def entry(self, ctx):
+        """
+        Returns a template string corresponding to a PTX entry point.
+        """
+        pass
+
+    def call(self, ctx):
+        """
+        Calls the entry point on the device. Haven't worked out the details
+        of this one yet.
+        """
+        pass
+
+
+
+class DeviceHelpers(PTXFragment):
+    """This one's included by default, no need to depend on it"""
+    def subs(self, ctx):
+        return {
+            'PTRT': ctypes.sizeof(ctypes.c_void_p) == 8 and '.u64' or '.u32',
+            }
+
+class MWCRandGen(PTXFragment):
+
+    _prelude = """
+    .const {{PTRT}} mwc_rng_mults_p;
+    .const {{PTRT}} mwc_rng_seeds_p;
+    """
+
+    def __init__(self):
+        if not os.path.isfile(os.path.join(os.path.dirname(__FILE__),
+                                           'primes.bin')):
+            raise EnvironmentError('primes.bin not found')
+
+    def prelude(self):
+        return self._prelude
+
+    def setUp(self, ctx):
+        # Load raw big-endian u32 multipliers from primes.bin.
+        with open('primes.bin') as primefp:
+            dt = np.dtype(np.uint32).newbyteorder('B')
+            mults = np.frombuffer(primefp.read(), dtype=dt)
+        # Randomness in choosing multipliers is good, but larger multipliers
+        # have longer periods, which is also good. This is a compromise.
+        ctx.rand.shuffle(mults[:ctx.threads*4])
+        # Copy multipliers and seeds to the device
+        devmp, devml = ctx.mod.get_global('mwc_rng_mults')
+        cuda.memcpy_htod_async(devmp, mults.tostring()[:devml], ctx.stream)
+        devsp, devsl = ctx.mod.get_global('mwc_rng_seeds')
+        cuda.memcpy_htod_async(devsp, ctx.rand.bytes(devsl), ctx.stream)
+
+    def _next_b32(self, dreg):
+        return """
+    mul.wide.u32    mwc_rng_
+        mul.wide.u32
+
+
+
+    def templates(self, ctx):
+        return {'mwc_next_b32', self._next_b32}
+
+
+    def test(self, ctx):
+
+
+
+
+
+
+
+
+    def launch(self, ctx):
+        if self.mults
+
+
+
+def main(genome_path):
+
+
+
+
+    #with open(genome_path) as fp:
+        #genome = CUGenome.from_string(fp.read())[0]
+    #genome.width, genome.height = 512, 512
+    #genome.sample_density = 1000
+    #obuf, stats, frame = genome.render(estimator=3)
+    #gc.collect()
+
+        ##q.put(str(obuf))
+    ##p = Process(target=render, args=(q, genome_path))
+    ##p.start()
+
+    #window = pyglet.window.Window()
+    #image = pyglet.image.ImageData(genome.width, genome.height, 'RGB', obuf)
+    #tex = image.texture
+
+    #@window.event
+    #def on_draw():
+        #window.clear()
+        #tex.blit(0, 0)
+
+    #pyglet.app.run()
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2 or not os.path.isfile(sys.argv[1]):
+        print "First argument must be a path to a genome file"
+        sys.exit(1)
+    main(sys.argv[1])
+