mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 11:40:04 -05:00
258 lines
7.0 KiB
Python
258 lines
7.0 KiB
Python
![]() |
#!/usr/bin/python
|
||
|
#
|
||
|
# flam3cuda, one of a surprisingly large number of ports of the fractal flame
|
||
|
# algorithm to NVIDIA GPUs.
|
||
|
#
|
||
|
# This one is copyright 2010 Steven Robertson <steven@strobe.cc>
|
||
|
#
|
||
|
# This program is free software; you can redistribute it and/or modify
|
||
|
# it under the terms of the GNU General Public License version 2 or later
|
||
|
# as published by the Free Software Foundation.
|
||
|
|
||
|
import os
|
||
|
import sys
|
||
|
import ctypes
|
||
|
import struct
|
||
|
|
||
|
import tempita
|
||
|
|
||
|
# These imports are order-sensitive!
|
||
|
import pyglet
|
||
|
import pyglet.gl as gl
|
||
|
gl.get_current_context()
|
||
|
|
||
|
import pycuda.driver as cuda
|
||
|
import pycuda.gl as cudagl
|
||
|
import pycuda.gl.autoinit
|
||
|
from pycuda.compiler import SourceModule
|
||
|
|
||
|
from multiprocessing import Process, Queue
|
||
|
|
||
|
import numpy as np
|
||
|
|
||
|
from fr0stlib import pyflam3
|
||
|
|
||
|
# PTX header and functions used for debugging.
|
||
|
prelude = """
|
||
|
.version 2.0
|
||
|
.target sm_20
|
||
|
|
||
|
.func (.reg .u32 $ret) get_gtid ()
|
||
|
{
|
||
|
.reg .u16 tmp;
|
||
|
.reg .u32 cta, ncta, tid, gtid;
|
||
|
|
||
|
mov.u16 tmp, %ctaid.x;
|
||
|
cvt.u32.u16 cta, tmp;
|
||
|
mov.u16 tmp, %ntid.x;
|
||
|
cvt.u32.u16 ncta, tmp;
|
||
|
mul24.lo.u32 gtid, cta, ncta;
|
||
|
|
||
|
mov.u16 tmp, %tid.x;
|
||
|
cvt.u32.u16 tid, tmp;
|
||
|
add.u32 gtid, gtid, tid;
|
||
|
mov.b32 $ret, gtid;
|
||
|
ret;
|
||
|
}
|
||
|
|
||
|
.entry write_to_buffer ( .param .u32 bufbase )
|
||
|
{
|
||
|
.reg .u32 base, gtid, off;
|
||
|
|
||
|
ld.param.u32 base, [bufbase];
|
||
|
call.uni (off), get_gtid, ();
|
||
|
mad24.lo.u32 base, off, 4, base;
|
||
|
st.volatile.global.b32 [base], off;
|
||
|
}
|
||
|
"""
|
||
|
|
||
|
class CUGenome(pyflam3.Genome):
|
||
|
def _render(self, frame, trans):
|
||
|
obuf = (ctypes.c_ubyte * ((3+trans)*self.width*self.height))()
|
||
|
stats = pyflam3.RenderStats()
|
||
|
pyflam3.flam3_render(ctypes.byref(frame), obuf, pyflam3.flam3_field_both,
|
||
|
trans+3, trans, ctypes.byref(stats))
|
||
|
return obuf, stats, frame
|
||
|
|
||
|
class LaunchContext(self):
|
||
|
def __init__(self, seed=None):
|
||
|
self.block, self.grid, self.threads = None, None, None
|
||
|
self.stream = cuda.Stream()
|
||
|
self.rand = mtrand.RandomState(seed)
|
||
|
|
||
|
def set_size(self, block, grid):
|
||
|
self.block, self.grid = block, grid
|
||
|
self.threads = reduce(lambda a, b: a*b, self.block + self.grid)
|
||
|
|
||
|
class PTXFragment(object):
|
||
|
"""
|
||
|
Wrapper for sections of template PTX.
|
||
|
|
||
|
In order to provide the best optimization, and avoid a web of hard-coded
|
||
|
parameters, the PTX module may be regenerated and recompiled several times
|
||
|
with different or incomplete launch context parameters. To this end, avoid
|
||
|
accessing the GPU in such functions, and do not depend on context values
|
||
|
which are marked as "tuned" in the LaunchContext docstring being available.
|
||
|
|
||
|
The final compilation pass is guaranteed to have all "tuned" values fixed
|
||
|
in their final values for the stream.
|
||
|
|
||
|
Template code will be processed recursively until all "{{" instances have
|
||
|
been replaced, using the same namespace each time.
|
||
|
"""
|
||
|
|
||
|
def deps(self, ctx):
|
||
|
"""
|
||
|
Returns a list of PTXFragment objects on which this object depends
|
||
|
for successful compilation. Circular dependencies are forbidden.
|
||
|
"""
|
||
|
return []
|
||
|
|
||
|
def subs(self, ctx):
|
||
|
"""
|
||
|
Returns a dict of items to add to the template substitution namespace.
|
||
|
The entire dict will be assembled, including all dependencies, before
|
||
|
any templates are evaluated.
|
||
|
"""
|
||
|
return {}
|
||
|
|
||
|
def prelude(self, ctx):
|
||
|
"""
|
||
|
Returns a template string containing any code (variable declarations,
|
||
|
probably) that should be inserted at module scope. The prelude of
|
||
|
all deps will be inserted above this prelude.
|
||
|
"""
|
||
|
return ""
|
||
|
|
||
|
def entryPrelude(self, ctx):
|
||
|
"""
|
||
|
Returns a template string that should be inserted at the top of any
|
||
|
entry point which depends on this method. The entry prelude of all
|
||
|
deps will be inserted above this entry prelude.
|
||
|
"""
|
||
|
return ""
|
||
|
|
||
|
def setUp(self, ctx):
|
||
|
"""
|
||
|
Do start-of-stream initialization, such as copying data to the device.
|
||
|
"""
|
||
|
pass
|
||
|
|
||
|
def test(self, ctx):
|
||
|
"""
|
||
|
Perform device tests. Returns True on success, False on failure,
|
||
|
or raises an exception.
|
||
|
"""
|
||
|
return True
|
||
|
|
||
|
class PTXEntryPoint(PTXFragment):
|
||
|
def entry(self, ctx):
|
||
|
"""
|
||
|
Returns a template string corresponding to a PTX entry point.
|
||
|
"""
|
||
|
pass
|
||
|
|
||
|
def call(self, ctx):
|
||
|
"""
|
||
|
Calls the entry point on the device. Haven't worked out the details
|
||
|
of this one yet.
|
||
|
"""
|
||
|
pass
|
||
|
|
||
|
|
||
|
|
||
|
class DeviceHelpers(PTXFragment):
|
||
|
"""This one's included by default, no need to depend on it"""
|
||
|
def subs(self, ctx):
|
||
|
return {
|
||
|
'PTRT': ctypes.sizeof(ctypes.c_void_p) == 8 and '.u64' or '.u32',
|
||
|
}
|
||
|
|
||
|
class MWCRandGen(PTXFragment):
|
||
|
|
||
|
_prelude = """
|
||
|
.const {{PTRT}} mwc_rng_mults_p;
|
||
|
.const {{PTRT}} mwc_rng_seeds_p;
|
||
|
"""
|
||
|
|
||
|
def __init__(self):
|
||
|
if not os.path.isfile(os.path.join(os.path.dirname(__FILE__),
|
||
|
'primes.bin')):
|
||
|
raise EnvironmentError('primes.bin not found')
|
||
|
|
||
|
def prelude(self):
|
||
|
return self._prelude
|
||
|
|
||
|
def setUp(self, ctx):
|
||
|
# Load raw big-endian u32 multipliers from primes.bin.
|
||
|
with open('primes.bin') as primefp:
|
||
|
dt = np.dtype(np.uint32).newbyteorder('B')
|
||
|
mults = np.frombuffer(primefp.read(), dtype=dt)
|
||
|
# Randomness in choosing multipliers is good, but larger multipliers
|
||
|
# have longer periods, which is also good. This is a compromise.
|
||
|
ctx.rand.shuffle(mults[:ctx.threads*4])
|
||
|
# Copy multipliers and seeds to the device
|
||
|
devmp, devml = ctx.mod.get_global('mwc_rng_mults')
|
||
|
cuda.memcpy_htod_async(devmp, mults.tostring()[:devml], ctx.stream)
|
||
|
devsp, devsl = ctx.mod.get_global('mwc_rng_seeds')
|
||
|
cuda.memcpy_htod_async(devsp, ctx.rand.bytes(devsl), ctx.stream)
|
||
|
|
||
|
def _next_b32(self, dreg):
|
||
|
return """
|
||
|
mul.wide.u32 mwc_rng_
|
||
|
mul.wide.u32
|
||
|
|
||
|
|
||
|
|
||
|
def templates(self, ctx):
|
||
|
return {'mwc_next_b32', self._next_b32}
|
||
|
|
||
|
|
||
|
def test(self, ctx):
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
def launch(self, ctx):
|
||
|
if self.mults
|
||
|
|
||
|
|
||
|
|
||
|
def main(genome_path):
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
#with open(genome_path) as fp:
|
||
|
#genome = CUGenome.from_string(fp.read())[0]
|
||
|
#genome.width, genome.height = 512, 512
|
||
|
#genome.sample_density = 1000
|
||
|
#obuf, stats, frame = genome.render(estimator=3)
|
||
|
#gc.collect()
|
||
|
|
||
|
##q.put(str(obuf))
|
||
|
##p = Process(target=render, args=(q, genome_path))
|
||
|
##p.start()
|
||
|
|
||
|
#window = pyglet.window.Window()
|
||
|
#image = pyglet.image.ImageData(genome.width, genome.height, 'RGB', obuf)
|
||
|
#tex = image.texture
|
||
|
|
||
|
#@window.event
|
||
|
#def on_draw():
|
||
|
#window.clear()
|
||
|
#tex.blit(0, 0)
|
||
|
|
||
|
#pyglet.app.run()
|
||
|
|
||
|
if __name__ == "__main__":
|
||
|
if len(sys.argv) < 2 or not os.path.isfile(sys.argv[1]):
|
||
|
print "First argument must be a path to a genome file"
|
||
|
sys.exit(1)
|
||
|
main(sys.argv[1])
|
||
|
|