Refactor API

--HG--
rename : cuburn/code/filter.py => cuburn/code/filtering.py
Steven Robertson 2011-06-11 15:59:10 -04:00
parent 6f3c27007a
commit e79df46c66
6 changed files with 362 additions and 142 deletions

View File

@@ -14,6 +14,8 @@ from ctypes import *
 from fr0stlib.pyflam3 import constants
 from fr0stlib.pyflam3._flam3 import *

+from cuburn import render
+
 flam3_nvariations = constants.flam3_nvariations = 99

 BaseXForm._fields_ = [('var', c_double * flam3_nvariations)

View File

@@ -223,24 +223,22 @@ void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf,
        """)

-    def invoke(self, mod, abufd, obufd, dbufd):
+    def invoke(self, mod, abufd, obufd, dbufd, stream=None):
         # TODO: add no-est version
         # TODO: come up with a general way to average these parameters
         k1 = self.cp.brightness * 268 / 256
         area = self.features.acc_width * self.features.acc_height / self.cp.ppu ** 2
         k2 = 1 / (area * self.cp.adj_density)
-        print k1, k2, area

         if self.cp.estimator == 0:
             fun = mod.get_function("logscale")
             t = fun(abufd, obufd, np.float32(k1), np.float32(k2),
                     block=(self.features.acc_width, 1, 1),
-                    grid=(self.features.acc_height, 1), time_kernel=True)
+                    grid=(self.features.acc_height, 1), stream=stream)
         else:
             fun = mod.get_function("density_est")
-            t = fun(abufd, obufd, dbufd, np.float32(k1), np.float32(k2),
+            fun(abufd, obufd, dbufd, np.float32(k1), np.float32(k2),
                 block=(32, 32, 1), grid=(self.features.acc_width/32, 1),
-                time_kernel=True)
-            print "Density estimation: %g" % t
+                stream=stream)

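The key change here is the new stream argument: invoke() now queues its kernels on a caller-supplied CUDA stream instead of timing them synchronously. A minimal caller sketch, with the DensityEst instance, module, and buffer names assumed (they mirror how render.py uses it later in this commit):

    import pycuda.driver as cuda

    stream = cuda.Stream()
    # de is a DensityEst instance; mod is the compiled module; the three
    # buffers are device allocations for accumulation, output, and density.
    de.invoke(mod, d_accum, d_out, d_den, stream=stream)
    # The work is only queued at this point; force completion when needed.
    stream.synchronize()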
View File

@@ -2,20 +2,13 @@
 The main iteration loop.
 """
-from ctypes import byref, memset, sizeof
-import pycuda.driver as cuda
-from pycuda.driver import In, Out, InOut
-from pycuda.compiler import SourceModule
-import numpy as np
-from scipy import ndimage
-from fr0stlib.pyflam3 import flam3_interpolate
-from cuburn.code import mwc, variations, filter
+from cuburn.code import mwc, variations
 from cuburn.code.util import *
-from cuburn.render import Genome

 class IterCode(HunkOCode):
+    # The number of threads per block
+    NTHREADS = 512
+
     def __init__(self, features):
         self.features = features
         self.packer = DataPacker('iter_info')
@@ -69,14 +62,14 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
    iter_info *info_glob = &(infos[blockIdx.x]);

    // load info to shared memory cooperatively
-    for (int i = threadIdx.y * 32 + threadIdx.x;
+    for (int i = threadIdx.y * blockDim.x + threadIdx.x;
         i * 4 < sizeof(iter_info); i += blockDim.x * blockDim.y)
        reinterpret_cast<float*>(&info)[i] =
            reinterpret_cast<float*>(info_glob)[i];

    int consec_bad = -{{features.fuse}};
-    // TODO: make nsteps adjustable via genome
-    int nsamps = {{packer.get('cp.width * cp.height / 512000. * cp.adj_density')}};
+    // TODO: remove '512' constant
+    int nsamps = {{packer.get('cp.width * cp.height / (cp.ntemporal_samples * 512.) * cp.adj_density')}};

    float x, y, color;
    x = mwc_next_11(&rctx);
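A note on the nsamps change above: the old literal 512000. quietly baked in 512 threads per block times the 1000 time steps hard-coded in the render() helper removed in the next hunk; the new expression derives the divisor from cp.ntemporal_samples, so the per-thread sample count follows the genome. A quick sanity check with assumed values:

    width, height, adj_density = 1024, 768, 2000.0
    ntemporal_samples = 1000    # the value the old constant implicitly assumed

    nsamps_old = width * height / 512000. * adj_density
    nsamps_new = width * height / (ntemporal_samples * 512.) * adj_density
    assert nsamps_old == nsamps_new   # identical only when ntemporal_samples == 1000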
@@ -157,86 +150,3 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
                packer = self.packer.view('info'),
                **globals())

-def render(features, cps):
-    # TODO: make this adjustable via genome
-    nsteps = 1000
-    abuf = np.zeros((features.acc_height, features.acc_stride, 4), dtype=np.float32)
-    dbuf = np.zeros((features.acc_height, features.acc_stride), dtype=np.float32)
-    seeds = mwc.MWC.make_seeds(512 * nsteps)
-
-    iter = IterCode(features)
-    de = filter.DensityEst(features, cps[0])
-    code = assemble_code(BaseCode, mwc.MWC, iter.packer, iter,
-                         filter.ColorClip, de)
-
-    for lno, line in enumerate(code.split('\n')):
-        print '%3d %s' % (lno, line)
-    mod = SourceModule(code,
-            options=['-use_fast_math', '-maxrregcount', '32'])
-
-    cps_as_array = (Genome * len(cps))()
-    for i, cp in enumerate(cps):
-        cps_as_array[i] = cp
-
-    infos = []
-    pal = np.empty((16, 256, 4), dtype=np.uint8)
-
-    # TODO: move this into a common function
-    if len(cps) > 1:
-        cp = Genome()
-        memset(byref(cp), 0, sizeof(cp))
-
-        sampAt = [int(i/15.*(nsteps-1)) for i in range(16)]
-        for n in range(nsteps):
-            flam3_interpolate(cps_as_array, 2, float(n)/nsteps - 0.5,
-                              0, byref(cp))
-            cp._init()
-            if n in sampAt:
-                pidx = sampAt.index(n)
-                for i, e in enumerate(cp.palette.entries):
-                    pal[pidx][i] = np.uint8(np.array(e.color) * 255.0)
-            infos.append(iter.packer.pack(cp=cp, cp_step_frac=float(n)/nsteps))
-    else:
-        for i, e in enumerate(cps[0].palette.entries):
-            pal[0][i] = np.uint8(np.array(e.color) * 255.0)
-        pal[1:] = pal[0]
-        infos.append(iter.packer.pack(cp=cps[0], cp_step_frac=0))
-        infos *= nsteps
-
-    infos = np.concatenate(infos)
-
-    dpal = cuda.make_multichannel_2d_array(pal, 'C')
-    tref = mod.get_texref('palTex')
-    tref.set_array(dpal)
-    tref.set_format(cuda.array_format.UNSIGNED_INT8, 4)
-    tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
-    tref.set_filter_mode(cuda.filter_mode.LINEAR)
-
-    abufd = cuda.to_device(abuf)
-    dbufd = cuda.to_device(dbuf)
-
-    fun = mod.get_function("iter")
-    fun.set_cache_config(cuda.func_cache.PREFER_L1)
-    t = fun(InOut(seeds), InOut(infos), abufd, dbufd,
-            block=(32,16,1), grid=(nsteps,1), time_kernel=True)
-    print "Completed render in %g seconds" % t
-
-    f = np.float32
-    npix = features.acc_width * features.acc_height
-
-    # TODO: just allocate
-    obufd = cuda.to_device(abuf)
-
-    dbuf = cuda.from_device_like(dbufd, dbuf)
-    dbuf = ndimage.filters.gaussian_filter(dbuf, 0.6)
-    dbufd = cuda.to_device(dbuf)
-
-    de.invoke(mod, abufd, obufd, dbufd)
-
-    fun = mod.get_function("colorclip")
-    t = fun(obufd, f(1 / cp.gamma), f(cp.vibrancy), f(cp.highlight_power),
-            block=(256,1,1), grid=(npix/256,1), time_kernel=True)
-    print "Completed color filtering in %g seconds" % t
-
-    abuf = cuda.from_device_like(obufd, abuf)
-
-    return abuf, dbuf

View File

@@ -66,8 +66,26 @@ int trunca(float f) {
     asm("cvt.rni.s32.f32 %0, %1;" : "=r"(ret) : "f"(f));
     return ret;
 }
+
+__global__
+void zero_dptr(float* dptr, int size) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < size) {
+        dptr[i] = 0.0f;
+    }
+}
 """
+
+    @staticmethod
+    def zero_dptr(mod, dptr, size, stream=None):
+        """
+        A memory zeroer which can be embedded in a stream. Size is the
+        number of 4-byte words in the pointer.
+        """
+        zero = mod.get_function("zero_dptr")
+        zero(dptr, np.int32(size), stream=stream,
+             block=(1024, 1, 1), grid=(size/1024+1, 1))

 class DataPackerView(object):
     """
     View of a data packer. Intended to be initialized using DataPacker.view().

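A sketch of how the new helper is meant to be used; the Features instance, module, and buffer names are assumed here, following the calls added to render.py below. Because the launch is issued on a stream, the zeroing is ordered ahead of later asynchronous work on that stream:

    import pycuda.driver as cuda

    nbins = features.acc_height * features.acc_stride
    d_den = cuda.mem_alloc(4 * nbins)              # one float per bin
    stream = cuda.Stream()
    BaseCode.zero_dptr(mod, d_den, nbins, stream)  # queued; returns immediately
    # ... enqueue kernels that read or write d_den on the same stream ...
    stream.synchronize()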
View File

@@ -1,44 +1,57 @@
 import sys
 import math
 import re
+from itertools import cycle, repeat, chain, izip
 from ctypes import *
 from cStringIO import StringIO

 import numpy as np
+from scipy import ndimage

 from fr0stlib import pyflam3
 from fr0stlib.pyflam3._flam3 import *
 from fr0stlib.pyflam3.constants import *
+import pycuda.compiler
+import pycuda.driver as cuda

 from cuburn import affine
-from cuburn.variations import Variations
+from cuburn.code import util, mwc, iter, filtering

-class Genome(pyflam3.Genome):
-    @classmethod
-    def from_string(cls, *args, **kwargs):
-        gnms = super(Genome, cls).from_string(*args, **kwargs)
-        for g in gnms: g._init()
-        return gnms
-
-    def _init(self):
+def _chunk(l, cs):
+    """
+    Yield the contents of list ``l`` in chunks of size no more than ``cs``.
+    """
+    for i in range(0, len(l), cs):
+        yield l[i:i+cs]
+
+class Genome(object):
+    """
+    Normalizes and precalculates some properties of a Genome. Assumes that
+    Genome argument passed in will not change.
+    """
+    # Fix the ctypes ugliness since switching to __getattribute__ in 2.7.
+    # There are more elegant ways to do this, but I can't be bothered.
+    def __getattr__(self, name):
+        return getattr(self.cp, name)
+
+    def __init__(self, ctypes_genome):
+        self.cp = ctypes_genome
         self.xforms = [self.xform[i] for i in range(self.num_xforms)]
         dens = np.array([x.density for i, x in enumerate(self.xforms)
                          if i != self.final_xform_index])
         dens /= np.sum(dens)
         self.norm_density = [np.sum(dens[:i+1]) for i in range(len(dens))]
+        self.camera_transform = self.calc_camera_transform()

     scale = property(lambda cp: 2.0 ** cp.zoom)
     adj_density = property(lambda cp: cp.sample_density * (cp.scale ** 2))
     ppu = property(lambda cp: cp.pixels_per_unit * cp.scale)

-    @property
-    def camera_transform(cp):
+    def calc_camera_transform(cp):
         """
         An affine matrix which will transform IFS coordinates to image width
         and height. Assumes that width and height are constant.
         """
-        # TODO: when reading as a property during packing, this may be
-        # calculated 6 times instead of 1
-        # TODO: also requires knowing gutter width
         g = Features.gutter
         return ( affine.translate(0.5 * cp.width + g, 0.5 * cp.height + g)
                  * affine.scale(cp.ppu, cp.ppu)
@@ -65,13 +78,294 @@ class Animation(object):
     In other words, it's best to use exactly one Animation for each
     interpolated sequence between one or two genomes.
     """
-    def __init__(self, genomes, ngenomes = None):
-        self.features = Features(genomes)
+    def __init__(self, ctypes_genome_array):
+        self._g_arr = ctypes_genome_array
+        self.genomes = map(Genome, ctypes_genome_array)
+        self.features = Features(self.genomes)
+        self._iter = self._de = self.src = self.cubin = self.mod = None

-    def compile(self):
-        pass
-    def render_frame(self, time=0):
-        pass
+    def compile(self, keep=False,
+                cmp_options=('-use_fast_math', '-maxrregcount', '32')):
+        """
+        Compile a kernel capable of rendering every frame in this animation.
+        The resulting compiled kernel is stored in the ``cubin`` property;
+        the source is available as ``src``, and is also returned for
+        inspection and display.
+
+        This operation is idempotent, and has no side effects outside of
+        setting properties on this instance (unless there's a compiler error,
+        which is a bug); it should therefore be threadsafe as well.
+        It is, however, rather slow.
+        """
+        self._iter = iter.IterCode(self.features)
+        self._de = filtering.DensityEst(self.features, self.genomes[0])
+        # TODO: make choice of filtering explicit
+        # TODO: autoload dependent modules?
+        self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer,
+                                      self._iter, filtering.ColorClip, self._de)
+        self.cubin = pycuda.compiler.compile(self.src, keep=False,
+                                             options=list(cmp_options))
+        return self.src
+
+    def copy(self):
+        """
+        Return a copy of this animation without any references to the current
+        CUDA context. This can be used to load an animation in multiple CUDA
+        contexts without recompiling, so that rendering can proceed across
+        multiple devices - but managing that is up to you.
+        """
+        import copy
+        new = copy.copy(self)
+        new.mod = None
+        return new
+
+    def load(self, jit_options=[]):
+        """
+        Replace the currently loaded CUDA module in the active CUDA context
+        with the compiled code's module. A reference is kept to the module,
+        meaning that rendering should henceforth only be called from the
+        thread and context in which this function was called.
+        """
+        if self.cubin is None:
+            self.compile()
+        self.mod = cuda.module_from_buffer(self.cubin, jit_options)
+
+    def render_frames(self, times=None):
+        """
+        Render a flame for each genome in the iterable value 'genomes'.
+        Returns a Python generator object which will yield one NumPy array
+        for each rendered image.
+
+        This method produces a considerable amount of side effects, and should
+        not be used lightly. Things may go poorly for you if this method is not
+        allowed to run until completion (by exhausting all items in the
+        generator object).
+
+        A performance note: while any ready tasks will be scheduled on the GPU
+        before yielding a result, spending a lot of time before returning
+        control to this function can allow the GPU to become idle. It's best
+        to hand the resulting array to another thread after grabbing it from
+        the renderer for handling.
+
+        ``times`` is a sequence of center times at which to render, or ``None``
+        to render one frame for each genome used to create the animation.
+        """
+        # Don't see this changing, but empirical tests could prove me wrong
+        NRENDERERS = 2
+        # TODO: under a slightly modified sequencing, certain buffers can be
+        # shared (though this may be unimportant if a good AA technique which
+        # doesn't require full SS can be found)
+        rdrs = [_AnimRenderer(self) for i in range(NRENDERERS)]
+
+        # Zip up each genome with an alternating renderer, plus enough empty
+        # genomes at the end to flush all pending tasks
+        times = times or [cp.time for cp in self.genomes]
+        exttimes = chain(times, repeat(None, NRENDERERS))
+        for rdr, time in izip(cycle(rdrs), exttimes):
+            if rdr.wait():
+                yield rdr.get_result()
+            if time is not None:
+                rdr.render(time)
+
+    def _interp(self, time, cp):
+        flam3_interpolate(self._g_arr, len(self._g_arr), time, 0, byref(cp))
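The scheduling above alternates between two renderers so one frame can be read back while the next is still being computed, then pads the time list with None entries so the last pending frames get flushed. A self-contained, CUDA-free mock of that control flow (FakeRenderer is invented here purely to show the ordering):

    from itertools import chain, cycle, izip, repeat

    class FakeRenderer(object):
        def __init__(self):
            self.pending = None
        def wait(self):
            return self.pending is not None
        def get_result(self):
            done, self.pending = self.pending, None
            return 'frame(%g)' % done
        def render(self, time):
            self.pending = time

    NRENDERERS = 2
    rdrs = [FakeRenderer() for i in range(NRENDERERS)]
    times = [0.0, 1.0, 2.0, 3.0]
    exttimes = chain(times, repeat(None, NRENDERERS))
    for rdr, time in izip(cycle(rdrs), exttimes):
        if rdr.wait():
            print rdr.get_result()    # results come back in submission order
        if time is not None:
            rdr.render(time)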
+
+class _AnimRenderer(object):
+    # Large launches lock the display for a considerable period and may be
+    # killed due to a device timeout; small launches are harder to load-balance
+    # on the GPU and incur overhead. This empirical value is multiplied by the
+    # number of SMs on the device to determine how many blocks should be in
+    # each launch. Extremely high quality, high resolution renders may still
+    # encounter a device timeout, and no workaround is in place for that yet.
+    SM_FACTOR = 8
+
+    # Currently, palette interpolation is done independently of animation
+    # interpolation, so that the process is not biased and so we only need to
+    # mess about with one texture per renderer. This many steps will always be
+    # used, no matter the number of time steps.
+    PAL_HEIGHT = 16
+
+    def __init__(self, anim):
+        self.anim = anim
+        self.pending = False
+        self.stream = cuda.Stream()
+
+        self._nsms = cuda.Context.get_device().multiprocessor_count
+        self.cps_per_block = self._nsms * self.SM_FACTOR
+        self.ncps = anim.features.max_cps
+        self.nblocks = int(math.ceil(self.ncps / float(self.cps_per_block)))
+
+        # These are stored to avoid leaks, not to be stateful in method calls
+        # TODO: ensure proper cleanup is done
+        self._dst_cp = pyflam3.Genome()
+        memset(byref(self._dst_cp), 0, sizeof(self._dst_cp))
+        self._cen_cp = pyflam3.Genome()
+        memset(byref(self._cen_cp), 0, sizeof(self._cen_cp))
+
+        self.nbins = anim.features.acc_height * anim.features.acc_stride
+        self.d_den = cuda.mem_alloc(4 * self.nbins)
+        self.d_accum = cuda.mem_alloc(16 * self.nbins)
+        self.d_out = cuda.mem_alloc(16 * self.nbins)
+        self.d_infos = cuda.mem_alloc(anim._iter.packer.align * self.ncps)
+        # Defer allocation until first needed
+        self.d_seeds = [None] * self.nblocks
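For scale, the allocations above amount to one float per bin for the density buffer and one float4 per bin for the accumulation and output buffers; illustrative numbers (frame size assumed, not taken from the commit):

    width, height, gutter = 640, 480, 16
    acc_height = height + 2 * gutter                       # 512
    acc_stride = 32 * ((width + 2 * gutter + 31) // 32)    # 672
    nbins = acc_height * acc_stride                        # 344064
    den_bytes = 4 * nbins                                  # ~1.4 MB (float per bin)
    accum_bytes = out_bytes = 16 * nbins                   # ~5.5 MB each (float4 per bin)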
+
+    def render(self, cen_time):
+        assert not self.pending, "Tried to render with results pending!"
+        self.pending = True
+        a = self.anim
+
+        cen_cp = self._cen_cp
+        a._interp(cen_time, cen_cp)
+        palette = self._interp_colors(cen_time, cen_cp)
+
+        util.BaseCode.zero_dptr(a.mod, self.d_den, self.nbins,
+                                self.stream)
+        util.BaseCode.zero_dptr(a.mod, self.d_accum, 4 * self.nbins,
+                                self.stream)
+
+        # ------------------------------------------------------------
+        # TODO WARNING TODO WARNING TODO WARNING TODO WARNING TODO
+        # This will replace the palette while it's in use by the other
+        # rendering function. Need to pass palette texref in function
+        # invocation.
+        # ------------------------------------------------------------
+        dpal = cuda.make_multichannel_2d_array(palette, 'C')
+        tref = a.mod.get_texref('palTex')
+        tref.set_array(dpal)
+        tref.set_format(cuda.array_format.UNSIGNED_INT8, 4)
+        tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
+        tref.set_filter_mode(cuda.filter_mode.LINEAR)
+
+        cp = self._dst_cp
+        packer = a._iter.packer
+
+        iter_fun = a.mod.get_function("iter")
+        iter_fun.set_cache_config(cuda.func_cache.PREFER_L1)
+
+        # Must be accumulated over all CPs
+        gam, vib, hipow = 0, 0, 0
+
+        # This is gross, but there are a lot of fiddly corner cases with any
+        # index-based iteration scheme.
+        times = list(enumerate(self._mk_dts(cen_time, cen_cp, self.ncps)))
+        for b, block_times in enumerate(_chunk(times, self.cps_per_block)):
+            infos = []
+            if len(a.genomes) > 1:
+                for n, t in block_times:
+                    a._interp(t, cp)
+                    frac = float(n) / cen_cp.ntemporal_samples
+                    info = packer.pack(cp=Genome(cp), cp_step_frac=frac)
+                    infos.append(info)
+                    gam += cp.gamma
+                    vib += cp.vibrancy
+                    hipow += cp.highlight_power
+            else:
+                # Can't interpolate normally; just pack copies
+                # TODO: this still packs the genome 20 times or so instead of
+                # once
+                packed = packer.pack(cp=a.genomes[0], cp_step_frac=0)
+                infos = [packed] * len(block_times)
+                gam += a.genomes[0].gamma * len(block_times)
+                vib += a.genomes[0].vibrancy * len(block_times)
+                hipow += a.genomes[0].highlight_power * len(block_times)
+
+            infos = np.concatenate(infos)
+            offset = b * packer.align * self.cps_per_block
+            # TODO: portable across 32/64-bit arches?
+            d_info_off = int(self.d_infos) + offset
+            cuda.memcpy_htod(d_info_off, infos)
+
+            if not self.d_seeds[b]:
+                seeds = mwc.MWC.make_seeds(iter.IterCode.NTHREADS *
+                                           self.cps_per_block)
+                self.d_seeds[b] = cuda.to_device(seeds)
+
+            # TODO: get block config from IterCode
+            # TODO: print timing information
+            iter_fun(self.d_seeds[b], np.uint64(d_info_off),
+                     self.d_accum, self.d_den,
+                     block=(32, 16, 1), grid=(len(block_times), 1),
+                     stream=self.stream)
+
+        # MAJOR TODO: for now, we kill almost all parallelism by forcing the
+        # stream here. Later, once we've decided on a density-buffer prefilter,
+        # we will move it to the GPU, allowing it to be embedded in the stream
+        # and letting the remaining code be asynchronous.
+        self.stream.synchronize()
+        dbuf_dim = (a.features.acc_height, a.features.acc_stride)
+        dbuf = cuda.from_device(self.d_den, dbuf_dim, np.float32)
+        dbuf = ndimage.filters.gaussian_filter(dbuf, 0.6)
+        cuda.memcpy_htod(self.d_den, dbuf)
+
+        util.BaseCode.zero_dptr(a.mod, self.d_out, 4 * self.nbins,
+                                self.stream)
+        self.stream.synchronize()
+        a._de.invoke(a.mod, self.d_accum, self.d_out, self.d_den,
+                     self.stream)
+        self.stream.synchronize()
+
+        n = np.float32(self.ncps)
+        gam = np.float32(n / gam)
+        vib = np.float32(vib / n)
+        hipow = np.float32(hipow / n)
+
+        # TODO: get block size from colorclip class? It actually does not
+        # depend on that being the case
+        color_fun = a.mod.get_function("colorclip")
+        color_fun(self.d_out, gam, vib, hipow,
+                  block=(256, 1, 1), grid=(self.nbins / 256, 1),
+                  stream=self.stream)
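Note that colorclip takes the reciprocal of gamma (the old call site removed from iter.py passed 1 / cp.gamma), which is why the accumulated sums above are reduced as n / sum for gamma but sum / n for vibrancy and highlight power:

    gammas = [2.2, 2.4, 2.3]          # per-temporal-sample values, assumed
    n = float(len(gammas))
    inv_mean_gamma = n / sum(gammas)  # same as 1.0 / (sum(gammas) / n)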
+
+    def _interp_colors(self, cen_time, cen_cp):
+        # TODO: any visible difference between uint8 and richer formats?
+        pal = np.empty((self.PAL_HEIGHT, 256, 4), dtype=np.uint8)
+        a = self.anim
+
+        if len(a.genomes) > 1:
+            # The typical case; applying real motion blur
+            cp = self._dst_cp
+            times = self._mk_dts(cen_time, cen_cp, self.PAL_HEIGHT)
+            for n, t in enumerate(times):
+                a._interp(t, cp)
+                for i, e in enumerate(cp.palette.entries):
+                    pal[n][i] = np.uint8(np.array(e.color) * 255.0)
+        else:
+            # Cannot call any interp functions on a single genome; rather than
+            # have alternate code-paths, just copy the same colors everywhere
+            for i, e in enumerate(a.genomes[0].palette.entries):
+                # TODO: This triggers a RuntimeWarning
+                pal[0][i] = np.uint8(np.array(e.color) * 255.0)
+            pal[1:] = pal[0]
+
+        return pal
+
+    def wait(self):
+        if self.pending:
+            self.stream.synchronize()
+            self.pending = False
+            return True
+        return False
+
+    def get_result(self):
+        a = self.anim
+        g = a.features.gutter
+        obuf_dim = (a.features.acc_height, a.features.acc_stride, 4)
+        out = cuda.from_device(self.d_out, obuf_dim, np.float32)
+        # TODO: performance?
+        out = np.delete(out, np.s_[:16], axis=0)
+        out = np.delete(out, np.s_[:16], axis=1)
+        out = np.delete(out, np.s_[-16:], axis=0)
+        out = np.delete(out, np.s_[-16:], axis=1)
+        return out
+
+    @staticmethod
+    def _mk_dts(cen_time, cen_cp, ncps):
+        w = cen_cp.temporal_filter_width
+        return [w * (t / (ncps - 1.0) - 0.5) for t in range(ncps)]
+
 class Features(object):
     """
@@ -93,7 +387,8 @@ class Features(object):
     palette_height = 16

     # Maximum width of DE and other spatial filters, and thus in turn the
-    # amount of padding applied
+    # amount of padding applied. Note that, for now, this must not be changed!
+    # The filtering code makes deep assumptions about this value.
     gutter = 16

     def __init__(self, genomes):
@@ -116,11 +411,13 @@ class Features(object):
         else:
             self.final_xform_index = None

+        self.max_cps = max([cp.ntemporal_samples for cp in genomes])
+
         self.width = genomes[0].width
         self.height = genomes[0].height
         self.acc_width = genomes[0].width + 2 * self.gutter
         self.acc_height = genomes[0].height + 2 * self.gutter
-        self.acc_stride = genomes[0].width + 2 * self.gutter
+        self.acc_stride = 32 * int(math.ceil(self.acc_width / 32.))

 class XFormFeatures(object):
     def __init__(self, xforms, xform_id):
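The new acc_stride rounds the padded width up to the next multiple of 32, presumably to keep accumulation-buffer rows aligned; previously the stride simply equalled the padded width. For example, with assumed frame widths:

    import math
    gutter = 16
    for width in (640, 650):
        acc_width = width + 2 * gutter
        acc_stride = 32 * int(math.ceil(acc_width / 32.))
        print width, acc_width, acc_stride    # 640 -> 672, 672;  650 -> 682, 704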

main.py (29 lines changed)
View File

@@ -22,13 +22,10 @@ import scipy
 import pyglet
 import pycuda.autoinit

-from fr0stlib.pyflam3 import *
-from fr0stlib.pyflam3._flam3 import *
 import cuburn._pyflam3_hacks
+from fr0stlib import pyflam3
 from cuburn.render import *
 from cuburn.code.mwc import MWCTest
-from cuburn.code.iter import render, membench

 # Required on my system; CUDA doesn't yet work with GCC 4.5
 os.environ['PATH'] = ('/usr/x86_64-pc-linux-gnu/gcc-bin/4.4.5:'
@@ -37,24 +34,22 @@ os.environ['PATH'] = ('/usr/x86_64-pc-linux-gnu/gcc-bin/4.4.5:'
 def main(args):
     if '-t' in args:
         MWCTest.test_mwc()
-        membench()

     with open(args[1]) as fp:
-        genomes = Genome.from_string(fp.read())
+        genome_ptr, ngenomes = pyflam3.Genome.from_string(fp.read())
+        genomes = cast(genome_ptr, POINTER(pyflam3.Genome*ngenomes)).contents

     anim = Animation(genomes)
-    accum, den = render(anim.features, genomes)
-
-    accum = np.delete(accum, np.s_[:16], axis=0)
-    accum = np.delete(accum, np.s_[:16], axis=1)
-    accum = np.delete(accum, np.s_[-16:], axis=0)
-    accum = np.delete(accum, np.s_[-16:], axis=1)
-
-    noalpha = np.delete(accum, 3, axis=2)
-    scipy.misc.imsave('rendered.png', noalpha)
-    scipy.misc.imsave('rendered.jpg', noalpha)
+    anim.compile()
+    anim.load()
+    for n, out in enumerate(anim.render_frames()):
+        noalpha = np.delete(out, 3, axis=2)
+        scipy.misc.imsave('rendered_%03d.png' % n, noalpha)
+        scipy.misc.imsave('rendered_%03d.jpg' % n, noalpha)
+    return

-    if '-g' not in args:
-        return
+    #if '-g' not in args:
+    #    return

     window = pyglet.window.Window(anim.features.width, anim.features.height)
     imgbuf = (np.minimum(accum * 255, 255)).astype(np.uint8)