Initial commit.

2026-06-29 16:43:01 -04:00 · 2010-08-27 12:28:02 -04:00
commit a23ebdcf5f
2 changed files with 314 additions and 0 deletions
@@ -0,0 +1,257 @@
+#!/usr/bin/python
+#
+# flam3cuda, one of a surprisingly large number of ports of the fractal flame
+# algorithm to NVIDIA GPUs.
+#
+# This one is copyright 2010 Steven Robertson <steven@strobe.cc>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 or later
+# as published by the Free Software Foundation.
+
+import os
+import sys
+import ctypes
+import struct
+
+import tempita
+
+# These imports are order-sensitive!
+import pyglet
+import pyglet.gl as gl
+gl.get_current_context()
+
+import pycuda.driver as cuda
+import pycuda.gl as cudagl
+import pycuda.gl.autoinit
+from pycuda.compiler import SourceModule
+
+from multiprocessing import Process, Queue
+
+import numpy as np
+
+from fr0stlib import pyflam3
+
+# PTX header and functions used for debugging.
+#
+# The string below is raw PTX (targeting sm_20), not Python:
+#   * get_gtid returns %ctaid.x * %ntid.x + %tid.x, the usual global thread
+#     index along x. Note that the register named `ncta` actually holds
+#     %ntid.x.
+#   * write_to_buffer has every thread store its own index as a 32-bit word
+#     at bufbase + 4 * index.
+# NOTE(review): bufbase is declared as a .u32 parameter, whereas
+# DeviceHelpers.subs selects '.u64' when host pointers are 8 bytes wide --
+# confirm this entry point is only used with 32-bit device addresses.
+prelude = """
+.version 2.0
+.target sm_20
+
+.func (.reg .u32 $ret) get_gtid ()
+{
+    .reg .u16 tmp;
+    .reg .u32 cta, ncta, tid, gtid;
+
+    mov.u16         tmp,    %ctaid.x;
+    cvt.u32.u16     cta,    tmp;
+    mov.u16         tmp,    %ntid.x;
+    cvt.u32.u16     ncta,   tmp;
+    mul24.lo.u32    gtid,   cta,    ncta;
+
+    mov.u16         tmp,    %tid.x;
+    cvt.u32.u16     tid,    tmp;
+    add.u32         gtid,   gtid,   tid;
+    mov.b32         $ret,   gtid;
+    ret;
+}
+
+.entry write_to_buffer ( .param .u32 bufbase )
+{
+    .reg .u32 base, gtid, off;
+
+    ld.param.u32    base,       [bufbase];
+    call.uni        (off),      get_gtid,   ();
+    mad24.lo.u32    base,       off,        4,          base;
+    st.volatile.global.b32      [base],     off;
+}
+"""
+
+class CUGenome(pyflam3.Genome):
+    """Genome subclass that renders through ``pyflam3.flam3_render``."""
+
+    def _render(self, frame, trans):
+        """
+        Render ``frame`` into a newly allocated byte buffer.
+
+        ``trans`` is added to 3 to obtain the number of bytes allocated per
+        pixel and is also passed to flam3_render unchanged (presumably a 0/1
+        transparency flag -- confirm against pyflam3).
+
+        Returns the tuple ``(buffer, stats, frame)``.
+        """
+        channels = 3 + trans
+        pixel_count = self.width * self.height
+        buffer_type = ctypes.c_ubyte * (channels * pixel_count)
+        out = buffer_type()
+        render_stats = pyflam3.RenderStats()
+        pyflam3.flam3_render(ctypes.byref(frame), out,
+                             pyflam3.flam3_field_both, channels, trans,
+                             ctypes.byref(render_stats))
+        return out, render_stats, frame
+
+class LaunchContext(object):
+    """
+    Parameters shared by every PTX fragment for one launch stream.
+
+    Attributes:
+        block, grid: launch dimensions; None until set_size() is called.
+        threads: product of every entry in block and grid; None until
+            set_size() is called.
+        stream: the pycuda Stream created for this context.
+        rand: numpy RandomState seeded with ``seed``.
+
+    NOTE(review): PTXFragment's docstring refers to values marked as "tuned"
+    here; presumably those are block, grid and threads -- confirm.
+    """
+
+    def __init__(self, seed=None):
+        """Create the stream and the random generator; sizes start unset."""
+        self.block, self.grid, self.threads = None, None, None
+        self.stream = cuda.Stream()
+        # `mtrand` is not imported in this module; np.random.RandomState is
+        # the same generator class, reached through the existing np import.
+        self.rand = np.random.RandomState(seed)
+
+    def set_size(self, block, grid):
+        """
+        Record the launch dimensions and derive the total thread count.
+
+        block and grid must be sequences of the same type (e.g. two tuples)
+        so that they can be concatenated.
+        """
+        self.block, self.grid = block, grid
+        self.threads = reduce(lambda a, b: a*b, self.block + self.grid)
+
+class PTXFragment(object):
+    """
+    A reusable piece of templated PTX.
+
+    The PTX module may be generated and compiled more than once, with launch
+    context parameters that differ or are not yet all known; this is what
+    allows good optimization without a tangle of hard-coded parameters.
+    Methods that can run during those passes should therefore stay off the
+    GPU and must not assume that any LaunchContext value designated as
+    "tuned" is available yet.
+
+    On the last compilation pass every "tuned" value is guaranteed to hold
+    the final value it will have for the stream.
+
+    Templates are expanded repeatedly, always against the same namespace,
+    until no "{{" remains.
+    """
+
+    def deps(self, ctx):
+        """
+        List the PTXFragment objects this fragment needs in order to compile.
+        Dependency cycles are not allowed.
+        """
+        return []
+
+    def subs(self, ctx):
+        """
+        Give the entries this fragment contributes to the template
+        substitution namespace, as a dict. The namespace is fully assembled,
+        dependencies included, before any template is evaluated.
+        """
+        return {}
+
+    def prelude(self, ctx):
+        """
+        Give the template text (most likely variable declarations) to place
+        at module scope. The preludes of all deps are emitted above it.
+        """
+        return ""
+
+    def entryPrelude(self, ctx):
+        """
+        Give the template text to place at the top of every entry point that
+        depends on this fragment. The entry preludes of all deps are emitted
+        above it.
+        """
+        return ""
+
+    def setUp(self, ctx):
+        """
+        Hook for start-of-stream initialization, e.g. copying data to the
+        device. The base implementation does nothing.
+        """
+
+    def test(self, ctx):
+        """
+        Run device tests. Return True for success and False for failure, or
+        raise an exception.
+        """
+        return True
+
+class PTXEntryPoint(PTXFragment):
+    """A PTXFragment that also provides an entry point."""
+
+    def entry(self, ctx):
+        """
+        Give the template string for this PTX entry point. The base
+        implementation returns None.
+        """
+
+    def call(self, ctx):
+        """
+        Invoke the entry point on the device. The details are not worked out
+        yet; the base implementation does nothing.
+        """
+
+
+
+class DeviceHelpers(PTXFragment):
+    """
+    Substitutions that are included by default, so other fragments do not
+    need to depend on this one.
+    """
+
+    def subs(self, ctx):
+        """
+        Supply 'PTRT': '.u64' when a host void pointer is 8 bytes wide,
+        otherwise '.u32'.
+        """
+        if ctypes.sizeof(ctypes.c_void_p) == 8:
+            pointer_type = '.u64'
+        else:
+            pointer_type = '.u32'
+        return {'PTRT': pointer_type}
+
+class MWCRandGen(PTXFragment):
+    """
+    PTX fragment for a random number generator (presumably
+    multiply-with-carry, going by the name -- confirm).
+
+    __init__ checks for primes.bin; setUp() copies multipliers read from
+    that file, plus random seed bytes, to the device.
+    """
+
+    # Module-scope declarations returned by prelude(). {{PTRT}} is a template
+    # placeholder; DeviceHelpers.subs supplies its value.
+    _prelude = """
+    .const {{PTRT}} mwc_rng_mults_p;
+    .const {{PTRT}} mwc_rng_seeds_p;
+    """
+
+    def __init__(self):
+        """
+        Verify that primes.bin sits next to this module.
+
+        Raises EnvironmentError if the file is missing.
+        """
+        # __file__ (lowercase) is the module's path; `__FILE__` is not
+        # defined in Python and raises NameError.
+        primes_path = os.path.join(os.path.dirname(__file__), 'primes.bin')
+        if not os.path.isfile(primes_path):
+            raise EnvironmentError('primes.bin not found')
+
+    def prelude(self, ctx=None):
+        """
+        Return the module-scope template stored in _prelude.
+
+        ctx is accepted (and ignored) so the signature matches
+        PTXFragment.prelude(self, ctx); it defaults to None so calls that
+        pass no argument keep working.
+        """
+        return self._prelude
+
+    def setUp(self, ctx):
+        """
+        Copy RNG multipliers and seeds to the device.
+
+        Reads ctx.rand, ctx.threads, ctx.mod and ctx.stream; both copies are
+        issued on ctx.stream.
+        """
+        # Load raw big-endian u32 multipliers from primes.bin. The file is
+        # opened in binary mode from the module's own directory, the same
+        # place __init__ checks. astype() yields a writable copy in native
+        # byte order: np.frombuffer on a string is read-only, which shuffle()
+        # cannot modify, and the bytes handed to the device should be in
+        # native order (assuming the device shares the host's byte order) so
+        # it sees the values parsed here.
+        primes_path = os.path.join(os.path.dirname(__file__), 'primes.bin')
+        with open(primes_path, 'rb') as primefp:
+            dt = np.dtype('>u4')
+            mults = np.frombuffer(primefp.read(), dtype=dt).astype(np.uint32)
+        # Randomness in choosing multipliers is good, but larger multipliers
+        # have longer periods, which is also good. This is a compromise.
+        ctx.rand.shuffle(mults[:ctx.threads*4])
+        # Copy multipliers and seeds to the device.
+        # NOTE(review): _prelude declares mwc_rng_mults_p / mwc_rng_seeds_p,
+        # while the symbols looked up here have no `_p` suffix -- confirm
+        # which names the compiled module really exports.
+        devmp, devml = ctx.mod.get_global('mwc_rng_mults')
+        cuda.memcpy_htod_async(devmp, mults.tostring()[:devml], ctx.stream)
+        devsp, devsl = ctx.mod.get_global('mwc_rng_seeds')
+        cuda.memcpy_htod_async(devsp, ctx.rand.bytes(devsl), ctx.stream)
+
+    # NOTE(review): unfinished -- the PTX template below breaks off in the
+    # middle of an instruction and its triple-quoted string is never closed,
+    # so the file cannot be parsed until it is completed. dreg is not used
+    # yet.
+    def _next_b32(self, dreg):
+        return """
+    mul.wide.u32    mwc_rng_
+        mul.wide.u32
+
+
+
+    def templates(self, ctx):
+        """
+        Return a dict mapping the template name 'mwc_next_b32' to the bound
+        method that generates it. ctx is unused.
+        """
+        # A colon, not a comma: `{a, b}` is a set literal (and a syntax error
+        # before Python 2.7), not the name -> generator mapping built here.
+        return {'mwc_next_b32': self._next_b32}
+
+
+    # NOTE(review): unfinished -- no body follows this header.
+    # PTXFragment.test documents True as success and False as failure.
+    def test(self, ctx):
+
+
+
+
+
+
+
+
+    # NOTE(review): unfinished -- the `if` below has no colon and no body,
+    # and self.mults is not assigned anywhere in this class as shown (setUp
+    # only uses a local `mults`).
+    def launch(self, ctx):
+        if self.mults
+
+
+
+def main(genome_path):
+    """
+    Script entry point; receives the genome file path taken from sys.argv.
+
+    Currently a stub: everything below is a commented-out experiment that
+    rendered a genome and displayed the result in a pyglet window.
+    """
+
+
+
+
+    #with open(genome_path) as fp:
+        #genome = CUGenome.from_string(fp.read())[0]
+    #genome.width, genome.height = 512, 512
+    #genome.sample_density = 1000
+    #obuf, stats, frame = genome.render(estimator=3)
+    #gc.collect()
+
+        ##q.put(str(obuf))
+    ##p = Process(target=render, args=(q, genome_path))
+    ##p.start()
+
+    #window = pyglet.window.Window()
+    #image = pyglet.image.ImageData(genome.width, genome.height, 'RGB', obuf)
+    #tex = image.texture
+
+    #@window.event
+    #def on_draw():
+        #window.clear()
+        #tex.blit(0, 0)
+
+    #pyglet.app.run()
+
+if __name__ == "__main__":
+    # Require an existing genome file as the first argument. sys.exit() with
+    # a string writes it to stderr and exits with status 1; unlike the print
+    # statement it is valid on both Python 2 and Python 3.
+    if len(sys.argv) < 2 or not os.path.isfile(sys.argv[1]):
+        sys.exit("First argument must be a path to a genome file")
+    main(sys.argv[1])
+