cuburn/main.py

258 lines
7.0 KiB
Python
Raw Normal View History

2010-08-27 12:28:02 -04:00
#!/usr/bin/python
#
# flam3cuda, one of a surprisingly large number of ports of the fractal flame
# algorithm to NVIDIA GPUs.
#
# This one is copyright 2010 Steven Robertson <steven@strobe.cc>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 or later
# as published by the Free Software Foundation.
import os
import sys
import ctypes
import struct
import tempita
# These imports are order-sensitive!
import pyglet
import pyglet.gl as gl
gl.get_current_context()
import pycuda.driver as cuda
import pycuda.gl as cudagl
import pycuda.gl.autoinit
from pycuda.compiler import SourceModule
from multiprocessing import Process, Queue
import numpy as np
from fr0stlib import pyflam3
# PTX header and functions used for debugging.
#
# The module text below declares PTX ISA version 2.0 for target sm_20 and
# defines two symbols:
#
#   get_gtid         Device function returning %ctaid.x * %ntid.x + %tid.x,
#                    i.e. a flat thread index computed from the x dimension
#                    only. Note that the register named `ncta` actually holds
#                    %ntid.x (see the `mov.u16 tmp, %ntid.x` line).
#   write_to_buffer  Entry point. Each thread stores its get_gtid() value as
#                    a 32-bit word at address bufbase + 4 * gtid.
#
# NOTE(review): `bufbase` and the address arithmetic in write_to_buffer are
# declared .u32, so the buffer address is handled as a 32-bit value here --
# confirm that is intended on hosts with 64-bit device pointers.
prelude = """
.version 2.0
.target sm_20
.func (.reg .u32 $ret) get_gtid ()
{
.reg .u16 tmp;
.reg .u32 cta, ncta, tid, gtid;
mov.u16 tmp, %ctaid.x;
cvt.u32.u16 cta, tmp;
mov.u16 tmp, %ntid.x;
cvt.u32.u16 ncta, tmp;
mul24.lo.u32 gtid, cta, ncta;
mov.u16 tmp, %tid.x;
cvt.u32.u16 tid, tmp;
add.u32 gtid, gtid, tid;
mov.b32 $ret, gtid;
ret;
}
.entry write_to_buffer ( .param .u32 bufbase )
{
.reg .u32 base, gtid, off;
ld.param.u32 base, [bufbase];
call.uni (off), get_gtid, ();
mad24.lo.u32 base, off, 4, base;
st.volatile.global.b32 [base], off;
}
"""
class CUGenome(pyflam3.Genome):
    """
    A pyflam3 Genome whose ``_render`` returns the frame it rendered
    alongside the output buffer and render statistics.
    """

    def _render(self, frame, trans):
        """
        Render ``frame`` with ``pyflam3.flam3_render`` into a new buffer.

        ``trans`` is added to 3 to obtain the per-pixel byte count, and is
        also forwarded to flam3_render as its own argument (presumably 1
        when a transparency channel is wanted, 0 otherwise -- confirm
        against pyflam3).

        Returns the tuple ``(obuf, stats, frame)``: the unsigned-byte
        output buffer, the RenderStats filled in by the call, and the
        frame that was passed in.
        """
        channels = 3 + trans
        buffer_type = ctypes.c_ubyte * (channels * self.width * self.height)
        obuf = buffer_type()
        stats = pyflam3.RenderStats()
        pyflam3.flam3_render(
            ctypes.byref(frame),
            obuf,
            pyflam3.flam3_field_both,
            channels,
            trans,
            ctypes.byref(stats),
        )
        return obuf, stats, frame
class LaunchContext(object):
    """
    State shared with PTX fragments for one CUDA stream.

    Attributes:
        block, grid: launch dimensions as passed to set_size(); both are
            None until set_size() has been called.
        threads: product of every entry of ``block`` and ``grid``; None
            until set_size() has been called.
        stream: the cuda.Stream created for this context.
        rand: a numpy RandomState seeded with ``seed``.

    NOTE(review): PTXFragment's docstring refers to context values that
    are marked as "tuned" here. Presumably those are ``block``, ``grid``
    and ``threads``, since they are the only values left unset by
    __init__ -- confirm.
    """

    def __init__(self, seed=None):
        """
        Create the stream and the random state.

        ``seed`` is handed to numpy's RandomState unchanged; None lets
        numpy choose its own seed.
        """
        self.block, self.grid, self.threads = None, None, None
        self.stream = cuda.Stream()
        # The module only imports numpy as `np`; a bare `mtrand` name is
        # never brought into scope, so reach RandomState through np.random.
        self.rand = np.random.RandomState(seed)

    def set_size(self, block, grid):
        """
        Record the launch dimensions and the resulting total thread count.

        ``block`` and ``grid`` are sequences of integer dimensions;
        ``threads`` becomes the product of all of their entries.
        """
        self.block, self.grid = block, grid
        threads = 1
        for dim in tuple(block) + tuple(grid):
            threads *= dim
        self.threads = threads
class PTXFragment(object):
    """
    Base class wrapping one section of template PTX.

    To get the best optimization without a tangle of hard-coded
    parameters, the PTX module may be generated and compiled more than
    once, each time with different or incomplete launch context
    parameters. Methods that take part in that generation should
    therefore not touch the GPU, and should not rely on context values
    that the LaunchContext docstring marks as "tuned". Only the final
    compilation pass guarantees that every "tuned" value holds its final
    value for the stream.

    Templates are expanded recursively, with the same namespace on every
    pass, until no "{{" remains.
    """

    def deps(self, ctx):
        """
        List the PTXFragment objects this fragment needs in order to
        compile. Dependency cycles are not allowed.
        """
        return list()

    def subs(self, ctx):
        """
        Give the entries this fragment contributes to the template
        substitution namespace. The namespace is built in full, including
        all dependencies, before any template is evaluated.
        """
        return dict()

    def prelude(self, ctx):
        """
        Give a template string of module-scope code (most likely variable
        declarations). Preludes of all deps are placed above this one.
        """
        return ''

    def entryPrelude(self, ctx):
        """
        Give a template string to place at the top of every entry point
        that depends on this fragment. Entry preludes of all deps are
        placed above this one.
        """
        return ''

    def setUp(self, ctx):
        """
        Perform start-of-stream initialization, for example copying data
        to the device. The base implementation does nothing.
        """
        return None

    def test(self, ctx):
        """
        Run device tests. True means success and False means failure; an
        exception may be raised instead.
        """
        return True
class PTXEntryPoint(PTXFragment):
    """A PTXFragment that additionally describes a PTX entry point."""

    def entry(self, ctx):
        """
        Give the template string for this fragment's PTX entry point.
        This placeholder implementation yields None.
        """
        return None

    def call(self, ctx):
        """
        Invoke the entry point on the device. The details of how this
        should work have not been settled, so this placeholder does
        nothing and yields None.
        """
        return None
class DeviceHelpers(PTXFragment):
    """Always-included fragment; other fragments need not list it in deps."""

    def subs(self, ctx):
        """
        Provide ``PTRT``: '.u64' when a host ``void *`` is 8 bytes wide,
        '.u32' otherwise.
        """
        if ctypes.sizeof(ctypes.c_void_p) == 8:
            pointer_type = '.u64'
        else:
            pointer_type = '.u32'
        return {'PTRT': pointer_type}
class MWCRandGen(PTXFragment):
    """
    Random-number fragment driven by per-thread multipliers and seeds
    (presumably a multiply-with-carry generator, going by the name and the
    draft PTX -- confirm).

    This class is unfinished. The PTX for drawing a value, the device
    test, and the launch step were left incomplete in the original and
    are explicit NotImplementedError stubs here, so the module at least
    parses.

    NOTE(review): the prelude declares ``mwc_rng_mults_p`` and
    ``mwc_rng_seeds_p``, while setUp() looks up ``mwc_rng_mults`` and
    ``mwc_rng_seeds`` (no ``_p`` suffix). One side needs to change;
    confirm which is intended.
    """

    _prelude = """
.const {{PTRT}} mwc_rng_mults_p;
.const {{PTRT}} mwc_rng_seeds_p;
"""

    def __init__(self):
        """
        Locate ``primes.bin`` next to this module.

        Raises EnvironmentError if the file does not exist.
        """
        # `__file__` (lower case) is the module's path; `__FILE__` is not a
        # Python name and raised NameError.
        self._primes_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), 'primes.bin')
        if not os.path.isfile(self._primes_path):
            raise EnvironmentError('primes.bin not found')

    def prelude(self, ctx):
        """Return the module-scope declarations for the RNG state."""
        # Takes `ctx` so the signature matches PTXFragment.prelude.
        return self._prelude

    def setUp(self, ctx):
        """
        Upload multipliers and random seeds to the device.

        Reads ``ctx.threads``, ``ctx.rand``, ``ctx.stream`` and
        ``ctx.mod``. NOTE(review): ``mod`` is not set anywhere in the
        visible LaunchContext; confirm where it is attached.
        """
        # Load raw big-endian u32 multipliers from primes.bin. Open the same
        # path that __init__ verified, in binary mode.
        with open(self._primes_path, 'rb') as primefp:
            raw = primefp.read()
        big_endian_u32 = np.dtype(np.uint32).newbyteorder('B')
        # np.frombuffer over an immutable buffer yields a read-only array,
        # which cannot be shuffled in place. astype() makes a writable copy
        # and converts to native byte order, so the bytes sent to the device
        # encode the same values that were stored in the file.
        mults = np.frombuffer(raw, dtype=big_endian_u32).astype(np.uint32)
        # Randomness in choosing multipliers is good, but larger multipliers
        # have longer periods, which is also good. This is a compromise:
        # only the first ctx.threads*4 entries are shuffled among themselves.
        ctx.rand.shuffle(mults[:ctx.threads*4])
        # Copy multipliers and seeds to the device
        devmp, devml = ctx.mod.get_global('mwc_rng_mults')
        cuda.memcpy_htod_async(devmp, mults.tostring()[:devml], ctx.stream)
        devsp, devsl = ctx.mod.get_global('mwc_rng_seeds')
        cuda.memcpy_htod_async(devsp, ctx.rand.bytes(devsl), ctx.stream)

    def _next_b32(self, dreg):
        """
        Return PTX that places the next 32-bit random value in ``dreg``.

        Not implemented: the original body was cut off in the middle of
        its PTX template.
        """
        # Draft PTX from the unfinished original, kept for reference:
        #   mul.wide.u32 mwc_rng_
        #   mul.wide.u32
        raise NotImplementedError('MWCRandGen._next_b32 is not written yet')

    def templates(self, ctx):
        """Return a dict mapping template names to their generators."""
        # The original used `{name, func}`, which is a set literal, not a
        # name-to-function mapping.
        return {'mwc_next_b32': self._next_b32}

    def test(self, ctx):
        """
        Device test for the generator.

        Not implemented: the original had no body. Raising keeps the
        generator from being reported as passing when nothing was tested.
        """
        raise NotImplementedError('MWCRandGen.test is not written yet')

    def launch(self, ctx):
        """
        Launch step for the generator.

        Not implemented: the original stopped at an incomplete
        ``if self.mults`` statement.
        """
        # Draft from the unfinished original, kept for reference:
        #   if self.mults
        raise NotImplementedError('MWCRandGen.launch is not written yet')
def main(genome_path):
    """
    Entry point for the genome file at ``genome_path``.

    Currently a no-op that returns None: the render-and-display code is
    still disabled. The docstring also gives the function a body, which
    the comment-only original lacked (a syntax error).
    """
    # TODO: re-enable once the render path works. The draft below is kept
    # as written; it references `gc`, `q`, `render` and `p`, none of which
    # are imported or defined in the visible part of this module.
    #
    #with open(genome_path) as fp:
    #    genome = CUGenome.from_string(fp.read())[0]
    #genome.width, genome.height = 512, 512
    #genome.sample_density = 1000
    #obuf, stats, frame = genome.render(estimator=3)
    #gc.collect()
    #
    ##q.put(str(obuf))
    ##p = Process(target=render, args=(q, genome_path))
    ##p.start()
    #
    #window = pyglet.window.Window()
    #image = pyglet.image.ImageData(genome.width, genome.height, 'RGB', obuf)
    #tex = image.texture
    #
    #@window.event
    #def on_draw():
    #    window.clear()
    #    tex.blit(0, 0)
    #
    #pyglet.app.run()
if __name__ == "__main__":
    # The first command-line argument must name an existing genome file;
    # otherwise print a hint and exit with status 1.
    cli_args = sys.argv[1:]
    if not cli_args or not os.path.isfile(cli_args[0]):
        print("First argument must be a path to a genome file")
        sys.exit(1)
    main(cli_args[0])