mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-07-05 15:55:14 -04:00
Initial commit.
This commit is contained in:
257
main.py
Normal file
257
main.py
Normal file
@ -0,0 +1,257 @@
|
||||
#!/usr/bin/python
|
||||
#
|
||||
# flam3cuda, one of a surprisingly large number of ports of the fractal flame
|
||||
# algorithm to NVIDIA GPUs.
|
||||
#
|
||||
# This one is copyright 2010 Steven Robertson <steven@strobe.cc>
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 or later
|
||||
# as published by the Free Software Foundation.
|
||||
|
||||
import os
|
||||
import sys
|
||||
import ctypes
|
||||
import struct
|
||||
|
||||
import tempita
|
||||
|
||||
# These imports are order-sensitive!
|
||||
import pyglet
|
||||
import pyglet.gl as gl
|
||||
gl.get_current_context()
|
||||
|
||||
import pycuda.driver as cuda
|
||||
import pycuda.gl as cudagl
|
||||
import pycuda.gl.autoinit
|
||||
from pycuda.compiler import SourceModule
|
||||
|
||||
from multiprocessing import Process, Queue
|
||||
|
||||
import numpy as np
|
||||
|
||||
from fr0stlib import pyflam3
|
||||
|
||||
# PTX header and functions used for debugging.
#
# get_gtid returns ctaid.x * ntid.x + tid.x for the calling thread (the
# register holding ntid.x is named ``ncta``).  Each special register is read
# through a 16-bit mov and then widened to 32 bits.
#
# write_to_buffer stores that index as a 32-bit word at bufbase + 4 * index.
# NOTE(review): bufbase is declared .u32, so this presumably only works with
# 32-bit device pointers -- confirm before using on a 64-bit context.
prelude = """
.version 2.0
.target sm_20

.func (.reg .u32 $ret) get_gtid ()
{
    .reg .u16 tmp;
    .reg .u32 cta, ncta, tid, gtid;

    mov.u16 tmp, %ctaid.x;
    cvt.u32.u16 cta, tmp;
    mov.u16 tmp, %ntid.x;
    cvt.u32.u16 ncta, tmp;
    mul24.lo.u32 gtid, cta, ncta;

    mov.u16 tmp, %tid.x;
    cvt.u32.u16 tid, tmp;
    add.u32 gtid, gtid, tid;
    mov.b32 $ret, gtid;
    ret;
}

.entry write_to_buffer ( .param .u32 bufbase )
{
    .reg .u32 base, gtid, off;

    ld.param.u32 base, [bufbase];
    call.uni (off), get_gtid, ();
    mad24.lo.u32 base, off, 4, base;
    st.volatile.global.b32 [base], off;
}
"""
|
||||
|
||||
class CUGenome(pyflam3.Genome):
    """Genome subclass whose ``_render`` calls ``pyflam3.flam3_render``."""

    def _render(self, frame, trans):
        """
        Render ``frame`` into a newly allocated byte buffer.

        The buffer holds ``(3 + trans) * self.width * self.height`` unsigned
        bytes.  ``3 + trans`` is also passed to ``flam3_render`` as the
        channel count, followed by ``trans`` itself.
        NOTE(review): ``trans`` is presumably 0 or 1 (an optional alpha
        channel) -- confirm against the pyflam3 caller.

        Returns the tuple ``(obuf, stats, frame)``.
        """
        channels = 3 + trans
        obuf = (ctypes.c_ubyte * (channels * self.width * self.height))()
        stats = pyflam3.RenderStats()
        pyflam3.flam3_render(ctypes.byref(frame), obuf,
                             pyflam3.flam3_field_both,
                             channels, trans, ctypes.byref(stats))
        return obuf, stats, frame
|
||||
|
||||
class LaunchContext(object):
    """
    Per-stream launch state: a CUDA stream, a host-side random number
    generator, and the launch dimensions.

    ``block``, ``grid`` and ``threads`` are all ``None`` until ``set_size``
    is called.
    """

    def __init__(self, seed=None):
        """
        Create the CUDA stream and seed the host RNG.

        ``seed`` is passed straight to ``numpy.random.RandomState``; ``None``
        lets numpy choose its own seed.
        """
        self.block, self.grid, self.threads = None, None, None
        self.stream = cuda.Stream()
        # The bare name ``mtrand`` was never imported; reach the same class
        # through the numpy module this file already imports.
        self.rand = np.random.RandomState(seed)

    def set_size(self, block, grid):
        """
        Record the launch dimensions and the resulting thread count.

        ``block`` and ``grid`` are sequences of dimension sizes.  ``threads``
        becomes the product of every entry of both sequences.
        """
        self.block, self.grid = block, grid
        # An explicit loop avoids relying on ``reduce`` being a builtin,
        # which is only true on Python 2.
        threads = 1
        for dim in tuple(block) + tuple(grid):
            threads *= dim
        self.threads = threads
|
||||
|
||||
class PTXFragment(object):
    """
    Wrapper for sections of template PTX.

    In order to provide the best optimization, and avoid a web of hard-coded
    parameters, the PTX module may be regenerated and recompiled several times
    with different or incomplete launch context parameters. To this end, avoid
    accessing the GPU in such functions, and do not depend on context values
    which are marked as "tuned" in the LaunchContext docstring being available.

    The final compilation pass is guaranteed to have all "tuned" values fixed
    in their final values for the stream.

    Template code will be processed recursively until all "{{" instances have
    been replaced, using the same namespace each time.

    Every hook below takes the launch context as ``ctx``.  The defaults
    contribute nothing, so subclasses override only what they need.
    """

    def deps(self, ctx):
        """
        Returns a list of PTXFragment objects on which this object depends
        for successful compilation. Circular dependencies are forbidden.
        """
        return []

    def subs(self, ctx):
        """
        Returns a dict of items to add to the template substitution namespace.
        The entire dict will be assembled, including all dependencies, before
        any templates are evaluated.
        """
        return {}

    def prelude(self, ctx):
        """
        Returns a template string containing any code (variable declarations,
        probably) that should be inserted at module scope. The prelude of
        all deps will be inserted above this prelude.
        """
        return ""

    def entryPrelude(self, ctx):
        """
        Returns a template string that should be inserted at the top of any
        entry point which depends on this method. The entry prelude of all
        deps will be inserted above this entry prelude.
        """
        return ""

    def setUp(self, ctx):
        """
        Do start-of-stream initialization, such as copying data to the device.
        """
        pass

    def test(self, ctx):
        """
        Perform device tests. Returns True on success, False on failure,
        or raises an exception.
        """
        return True
|
||||
|
||||
class PTXEntryPoint(PTXFragment):
    """
    A PTXFragment that also contributes an entry point.

    Both hooks are placeholders that currently return ``None``.
    """

    def entry(self, ctx):
        """
        Returns a template string corresponding to a PTX entry point.
        """
        pass

    def call(self, ctx):
        """
        Calls the entry point on the device. Haven't worked out the details
        of this one yet.
        """
        pass
|
||||
|
||||
|
||||
|
||||
class DeviceHelpers(PTXFragment):
    """This one's included by default, no need to depend on it"""

    def subs(self, ctx):
        """
        Returns the substitutions shared by all fragments.

        ``PTRT`` is ``'.u64'`` when the host's ``void *`` is 8 bytes wide and
        ``'.u32'`` otherwise.
        """
        # A conditional expression replaces the fragile ``cond and a or b``.
        ptr_is_64bit = ctypes.sizeof(ctypes.c_void_p) == 8
        return {
            'PTRT': '.u64' if ptr_is_64bit else '.u32',
        }
|
||||
|
||||
class MWCRandGen(PTXFragment):
    """
    Random number generator fragment seeded from ``primes.bin``
    (work in progress).

    ``setUp`` uploads per-thread multipliers and random seed bytes to the
    device.  The PTX generator body, the device test and the launch hook were
    left unfinished in the original commit and raise ``NotImplementedError``.
    """

    _prelude = """
    .const {{PTRT}} mwc_rng_mults_p;
    .const {{PTRT}} mwc_rng_seeds_p;
    """

    @staticmethod
    def _primes_path():
        """Returns the path of primes.bin, which lives next to this module."""
        return os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'primes.bin')

    def __init__(self):
        """Raises EnvironmentError if primes.bin is missing."""
        # ``__FILE__`` does not exist in Python; the module path is
        # ``__file__``.
        if not os.path.isfile(self._primes_path()):
            raise EnvironmentError('primes.bin not found')

    def prelude(self, ctx=None):
        """
        Returns the module-scope declarations for this fragment.

        ``ctx`` is accepted so the method matches the ``prelude(ctx)`` hook
        of PTXFragment.  It defaults to None so the original zero-argument
        call form keeps working.
        """
        return self._prelude

    def setUp(self, ctx):
        """
        Copies shuffled multipliers and random seed bytes to the device.

        Reads ``ctx.threads``, ``ctx.rand``, ``ctx.mod`` and ``ctx.stream``.
        NOTE(review): ``ctx.mod`` is not set anywhere in this file --
        presumably the compiled module is attached to the context elsewhere.
        NOTE(review): the prelude declares ``mwc_rng_mults_p`` and
        ``mwc_rng_seeds_p``, but the lookups below use the names without the
        ``_p`` suffix.  One side has to change before this can run; the
        intended design is not visible here.
        """
        # Load raw big-endian u32 multipliers from primes.bin.  Open the same
        # path __init__ checked, in binary mode.
        with open(self._primes_path(), 'rb') as primefp:
            raw = np.frombuffer(primefp.read(), dtype=np.dtype('>u4'))
        # frombuffer gives a read-only big-endian view.  astype makes a
        # writable native-order copy, so the shuffle below can work in place
        # and the device receives values in host byte order.
        mults = raw.astype(np.uint32)
        # Randomness in choosing multipliers is good, but larger multipliers
        # have longer periods, which is also good. This is a compromise.
        ctx.rand.shuffle(mults[:ctx.threads*4])
        # Copy multipliers and seeds to the device
        devmp, devml = ctx.mod.get_global('mwc_rng_mults')
        cuda.memcpy_htod_async(devmp, mults[:devml // mults.itemsize],
                               ctx.stream)
        devsp, devsl = ctx.mod.get_global('mwc_rng_seeds')
        cuda.memcpy_htod_async(devsp, ctx.rand.bytes(devsl), ctx.stream)

    def _next_b32(self, dreg):
        """
        Template for PTX that produces the next 32 random bits in ``dreg``.

        Unfinished: the original body was an unterminated string holding only
        the fragment preserved in the comment below.
        """
        # Original fragment:
        #     mul.wide.u32 mwc_rng_
        #     mul.wide.u32
        raise NotImplementedError('mwc_next_b32 PTX has not been written yet')

    def templates(self, ctx):
        """
        Returns a dict mapping template names to their generator callables.
        """
        # The original ``{name, func}`` literal built a set, not a mapping.
        return {'mwc_next_b32': self._next_b32}

    def test(self, ctx):
        """Device test; the original commit left this without a body."""
        raise NotImplementedError('MWCRandGen.test has not been written yet')

    def launch(self, ctx):
        """
        Launch hook; unfinished in the original commit.

        The original body stopped at ``if self.mults``.  No ``mults``
        attribute is assigned anywhere in this class.
        """
        raise NotImplementedError('MWCRandGen.launch has not been written yet')
|
||||
|
||||
|
||||
|
||||
def main(genome_path):
    """
    Program entry point; currently a placeholder that does nothing.

    ``genome_path`` is accepted but unused.  The render-and-display code
    below is commented out, so this docstring is the only function body.
    """
    #with open(genome_path) as fp:
    #    genome = CUGenome.from_string(fp.read())[0]
    #genome.width, genome.height = 512, 512
    #genome.sample_density = 1000
    #obuf, stats, frame = genome.render(estimator=3)
    #gc.collect()

    ##q.put(str(obuf))
    ##p = Process(target=render, args=(q, genome_path))
    ##p.start()

    #window = pyglet.window.Window()
    #image = pyglet.image.ImageData(genome.width, genome.height, 'RGB', obuf)
    #tex = image.texture

    #@window.event
    #def on_draw():
    #    window.clear()
    #    tex.blit(0, 0)

    #pyglet.app.run()
|
||||
|
||||
if __name__ == "__main__":
    # The first argument must name an existing genome file.
    if len(sys.argv) < 2 or not os.path.isfile(sys.argv[1]):
        # sys.exit with a string writes it to stderr and exits with status 1.
        # This keeps the original exit code, moves the usage error off
        # stdout, and parses on both Python 2 and Python 3.
        sys.exit("First argument must be a path to a genome file")
    main(sys.argv[1])
|
||||
|
Reference in New Issue
Block a user