mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-04-21 00:51:31 -04:00
Refactor API
--HG-- rename : cuburn/code/filter.py => cuburn/code/filtering.py
This commit is contained in:
parent
6f3c27007a
commit
e79df46c66
@ -14,6 +14,8 @@ from ctypes import *
|
||||
from fr0stlib.pyflam3 import constants
|
||||
from fr0stlib.pyflam3._flam3 import *
|
||||
|
||||
from cuburn import render
|
||||
|
||||
flam3_nvariations = constants.flam3_nvariations = 99
|
||||
|
||||
BaseXForm._fields_ = [('var', c_double * flam3_nvariations)
|
||||
|
@ -223,24 +223,22 @@ void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf,
|
||||
|
||||
""")
|
||||
|
||||
def invoke(self, mod, abufd, obufd, dbufd):
|
||||
def invoke(self, mod, abufd, obufd, dbufd, stream=None):
|
||||
# TODO: add no-est version
|
||||
# TODO: come up with a general way to average these parameters
|
||||
|
||||
k1 = self.cp.brightness * 268 / 256
|
||||
area = self.features.acc_width * self.features.acc_height / self.cp.ppu ** 2
|
||||
k2 = 1 / (area * self.cp.adj_density)
|
||||
print k1, k2, area
|
||||
|
||||
if self.cp.estimator == 0:
|
||||
fun = mod.get_function("logscale")
|
||||
t = fun(abufd, obufd, np.float32(k1), np.float32(k2),
|
||||
block=(self.features.acc_width, 1, 1),
|
||||
grid=(self.features.acc_height, 1), time_kernel=True)
|
||||
grid=(self.features.acc_height, 1), stream=stream)
|
||||
else:
|
||||
fun = mod.get_function("density_est")
|
||||
t = fun(abufd, obufd, dbufd, np.float32(k1), np.float32(k2),
|
||||
block=(32, 32, 1), grid=(self.features.acc_width/32, 1),
|
||||
time_kernel=True)
|
||||
print "Density estimation: %g" % t
|
||||
fun(abufd, obufd, dbufd, np.float32(k1), np.float32(k2),
|
||||
block=(32, 32, 1), grid=(self.features.acc_width/32, 1),
|
||||
stream=stream)
|
||||
|
@ -2,20 +2,13 @@
|
||||
The main iteration loop.
|
||||
"""
|
||||
|
||||
from ctypes import byref, memset, sizeof
|
||||
|
||||
import pycuda.driver as cuda
|
||||
from pycuda.driver import In, Out, InOut
|
||||
from pycuda.compiler import SourceModule
|
||||
import numpy as np
|
||||
from scipy import ndimage
|
||||
|
||||
from fr0stlib.pyflam3 import flam3_interpolate
|
||||
from cuburn.code import mwc, variations, filter
|
||||
from cuburn.code import mwc, variations
|
||||
from cuburn.code.util import *
|
||||
from cuburn.render import Genome
|
||||
|
||||
class IterCode(HunkOCode):
|
||||
# The number of threads per block
|
||||
NTHREADS = 512
|
||||
|
||||
def __init__(self, features):
|
||||
self.features = features
|
||||
self.packer = DataPacker('iter_info')
|
||||
@ -69,14 +62,14 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
|
||||
iter_info *info_glob = &(infos[blockIdx.x]);
|
||||
|
||||
// load info to shared memory cooperatively
|
||||
for (int i = threadIdx.y * 32 + threadIdx.x;
|
||||
for (int i = threadIdx.y * blockDim.x + threadIdx.x;
|
||||
i * 4 < sizeof(iter_info); i += blockDim.x * blockDim.y)
|
||||
reinterpret_cast<float*>(&info)[i] =
|
||||
reinterpret_cast<float*>(info_glob)[i];
|
||||
|
||||
int consec_bad = -{{features.fuse}};
|
||||
// TODO: make nsteps adjustable via genome
|
||||
int nsamps = {{packer.get('cp.width * cp.height / 512000. * cp.adj_density')}};
|
||||
// TODO: remove '512' constant
|
||||
int nsamps = {{packer.get('cp.width * cp.height / (cp.ntemporal_samples * 512.) * cp.adj_density')}};
|
||||
|
||||
float x, y, color;
|
||||
x = mwc_next_11(&rctx);
|
||||
@ -157,86 +150,3 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
|
||||
packer = self.packer.view('info'),
|
||||
**globals())
|
||||
|
||||
def render(features, cps):
|
||||
# TODO: make this adjustable via genome
|
||||
nsteps = 1000
|
||||
abuf = np.zeros((features.acc_height, features.acc_stride, 4), dtype=np.float32)
|
||||
dbuf = np.zeros((features.acc_height, features.acc_stride), dtype=np.float32)
|
||||
seeds = mwc.MWC.make_seeds(512 * nsteps)
|
||||
|
||||
iter = IterCode(features)
|
||||
de = filter.DensityEst(features, cps[0])
|
||||
code = assemble_code(BaseCode, mwc.MWC, iter.packer, iter,
|
||||
filter.ColorClip, de)
|
||||
|
||||
for lno, line in enumerate(code.split('\n')):
|
||||
print '%3d %s' % (lno, line)
|
||||
mod = SourceModule(code,
|
||||
options=['-use_fast_math', '-maxrregcount', '32'])
|
||||
|
||||
cps_as_array = (Genome * len(cps))()
|
||||
for i, cp in enumerate(cps):
|
||||
cps_as_array[i] = cp
|
||||
|
||||
infos = []
|
||||
pal = np.empty((16, 256, 4), dtype=np.uint8)
|
||||
|
||||
# TODO: move this into a common function
|
||||
if len(cps) > 1:
|
||||
cp = Genome()
|
||||
memset(byref(cp), 0, sizeof(cp))
|
||||
|
||||
sampAt = [int(i/15.*(nsteps-1)) for i in range(16)]
|
||||
for n in range(nsteps):
|
||||
flam3_interpolate(cps_as_array, 2, float(n)/nsteps - 0.5,
|
||||
0, byref(cp))
|
||||
cp._init()
|
||||
if n in sampAt:
|
||||
pidx = sampAt.index(n)
|
||||
for i, e in enumerate(cp.palette.entries):
|
||||
pal[pidx][i] = np.uint8(np.array(e.color) * 255.0)
|
||||
infos.append(iter.packer.pack(cp=cp, cp_step_frac=float(n)/nsteps))
|
||||
else:
|
||||
for i, e in enumerate(cps[0].palette.entries):
|
||||
pal[0][i] = np.uint8(np.array(e.color) * 255.0)
|
||||
pal[1:] = pal[0]
|
||||
infos.append(iter.packer.pack(cp=cps[0], cp_step_frac=0))
|
||||
infos *= nsteps
|
||||
|
||||
infos = np.concatenate(infos)
|
||||
|
||||
dpal = cuda.make_multichannel_2d_array(pal, 'C')
|
||||
tref = mod.get_texref('palTex')
|
||||
tref.set_array(dpal)
|
||||
tref.set_format(cuda.array_format.UNSIGNED_INT8, 4)
|
||||
tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
|
||||
tref.set_filter_mode(cuda.filter_mode.LINEAR)
|
||||
|
||||
abufd = cuda.to_device(abuf)
|
||||
dbufd = cuda.to_device(dbuf)
|
||||
|
||||
fun = mod.get_function("iter")
|
||||
fun.set_cache_config(cuda.func_cache.PREFER_L1)
|
||||
t = fun(InOut(seeds), InOut(infos), abufd, dbufd,
|
||||
block=(32,16,1), grid=(nsteps,1), time_kernel=True)
|
||||
print "Completed render in %g seconds" % t
|
||||
|
||||
f = np.float32
|
||||
|
||||
npix = features.acc_width * features.acc_height
|
||||
|
||||
# TODO: just allocate
|
||||
obufd = cuda.to_device(abuf)
|
||||
dbuf = cuda.from_device_like(dbufd, dbuf)
|
||||
dbuf = ndimage.filters.gaussian_filter(dbuf, 0.6)
|
||||
dbufd = cuda.to_device(dbuf)
|
||||
de.invoke(mod, abufd, obufd, dbufd)
|
||||
|
||||
fun = mod.get_function("colorclip")
|
||||
t = fun(obufd, f(1 / cp.gamma), f(cp.vibrancy), f(cp.highlight_power),
|
||||
block=(256,1,1), grid=(npix/256,1), time_kernel=True)
|
||||
print "Completed color filtering in %g seconds" % t
|
||||
|
||||
abuf = cuda.from_device_like(obufd, abuf)
|
||||
return abuf, dbuf
|
||||
|
||||
|
@ -66,8 +66,26 @@ int trunca(float f) {
|
||||
asm("cvt.rni.s32.f32 %0, %1;" : "=r"(ret) : "f"(f));
|
||||
return ret;
|
||||
}
|
||||
|
||||
__global__
|
||||
void zero_dptr(float* dptr, int size) {
|
||||
int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (i < size) {
|
||||
dptr[i] = 0.0f;
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
@staticmethod
def zero_dptr(mod, dptr, size, stream=None):
    """
    A memory zeroer which can be embedded in a stream.

    ``mod`` is the loaded module providing the ``zero_dptr`` kernel,
    ``dptr`` is the device pointer to clear, and ``size`` is the number
    of 4-byte words in the pointer. When ``stream`` is given, the launch
    is issued on that stream.
    """
    threads = 1024
    # Ceiling division written with the floor operator, so the grid size is
    # an integer under either division semantics and no redundant block is
    # launched when ``size`` is an exact multiple of the block width. As
    # before, at least one block is always launched; the kernel itself
    # ignores indices at or beyond ``size``.
    nblocks = max(1, (int(size) + threads - 1) // threads)
    zero = mod.get_function("zero_dptr")
    zero(dptr, np.int32(size), stream=stream,
         block=(threads, 1, 1), grid=(nblocks, 1))
|
||||
|
||||
class DataPackerView(object):
|
||||
"""
|
||||
View of a data packer. Intended to be initialized using DataPacker.view().
|
||||
|
339
cuburn/render.py
339
cuburn/render.py
@ -1,44 +1,57 @@
|
||||
import sys
|
||||
import math
|
||||
import re
|
||||
from itertools import cycle, repeat, chain, izip
|
||||
from ctypes import *
|
||||
from cStringIO import StringIO
|
||||
import numpy as np
|
||||
from scipy import ndimage
|
||||
|
||||
from fr0stlib import pyflam3
|
||||
from fr0stlib.pyflam3._flam3 import *
|
||||
from fr0stlib.pyflam3.constants import *
|
||||
|
||||
import pycuda.compiler
|
||||
import pycuda.driver as cuda
|
||||
|
||||
from cuburn import affine
|
||||
from cuburn.variations import Variations
|
||||
from cuburn.code import util, mwc, iter, filtering
|
||||
|
||||
class Genome(pyflam3.Genome):
|
||||
@classmethod
|
||||
def from_string(cls, *args, **kwargs):
|
||||
gnms = super(Genome, cls).from_string(*args, **kwargs)
|
||||
for g in gnms: g._init()
|
||||
return gnms
|
||||
def _chunk(l, cs):
|
||||
"""
|
||||
Yield the contents of list ``l`` in chunks of size no more than ``cs``.
|
||||
"""
|
||||
for i in range(0, len(l), cs):
|
||||
yield l[i:i+cs]
|
||||
|
||||
def _init(self):
|
||||
class Genome(object):
|
||||
"""
|
||||
Normalizes and precalculates some properties of a Genome. Assumes that
|
||||
Genome argument passed in will not change.
|
||||
"""
|
||||
# Fix the ctypes ugliness since switching to __getattribute__ in 2.7.
|
||||
# There are more elegant ways to do this, but I can't be bothered.
|
||||
def __getattr__(self, name):
|
||||
return getattr(self.cp, name)
|
||||
|
||||
def __init__(self, ctypes_genome):
|
||||
self.cp = ctypes_genome
|
||||
self.xforms = [self.xform[i] for i in range(self.num_xforms)]
|
||||
dens = np.array([x.density for i, x in enumerate(self.xforms)
|
||||
if i != self.final_xform_index])
|
||||
dens /= np.sum(dens)
|
||||
self.norm_density = [np.sum(dens[:i+1]) for i in range(len(dens))]
|
||||
self.camera_transform = self.calc_camera_transform()
|
||||
|
||||
scale = property(lambda cp: 2.0 ** cp.zoom)
|
||||
adj_density = property(lambda cp: cp.sample_density * (cp.scale ** 2))
|
||||
ppu = property(lambda cp: cp.pixels_per_unit * cp.scale)
|
||||
|
||||
@property
|
||||
def camera_transform(cp):
|
||||
def calc_camera_transform(cp):
|
||||
"""
|
||||
An affine matrix which will transform IFS coordinates to image width
|
||||
and height. Assumes that width and height are constant.
|
||||
"""
|
||||
# TODO: when reading as a property during packing, this may be
|
||||
# calculated 6 times instead of 1
|
||||
# TODO: also requires knowing gutter width
|
||||
g = Features.gutter
|
||||
return ( affine.translate(0.5 * cp.width + g, 0.5 * cp.height + g)
|
||||
* affine.scale(cp.ppu, cp.ppu)
|
||||
@ -65,13 +78,294 @@ class Animation(object):
|
||||
In other words, it's best to use exactly one Animation for each
|
||||
interpolated sequence between one or two genomes.
|
||||
"""
|
||||
def __init__(self, genomes, ngenomes = None):
|
||||
self.features = Features(genomes)
|
||||
def __init__(self, ctypes_genome_array):
    """
    Wrap every entry of ``ctypes_genome_array`` in a ``Genome`` and derive
    the ``Features`` shared by the whole animation. The raw ctypes array
    is retained as well, since interpolation operates on it directly.
    """
    # Nothing has been compiled or loaded yet; compile() and load() fill
    # these in later.
    self.mod = None
    self.cubin = None
    self.src = None
    self._de = None
    self._iter = None

    self._g_arr = ctypes_genome_array
    self.genomes = [Genome(g) for g in ctypes_genome_array]
    self.features = Features(self.genomes)
|
||||
|
||||
def compile(self):
|
||||
pass
|
||||
def render_frame(self, time=0):
|
||||
pass
|
||||
def compile(self, keep=False,
            cmp_options=('-use_fast_math', '-maxrregcount', '32')):
    """
    Compile a kernel capable of rendering every frame in this animation.
    The resulting compiled kernel is stored in the ``cubin`` property;
    the source is available as ``src``, and is also returned for
    inspection and display.

    ``keep`` is forwarded to the compiler's ``keep`` argument, and
    ``cmp_options`` is the sequence of command-line options passed to the
    compiler.

    This operation is idempotent, and has no side effects outside of
    setting properties on this instance (unless there's a compiler error,
    which is a bug, or ``keep`` is set); it should therefore be threadsafe
    as well. It is, however, rather slow.
    """
    self._iter = iter.IterCode(self.features)
    self._de = filtering.DensityEst(self.features, self.genomes[0])
    # TODO: make choice of filtering explicit
    # TODO: autoload dependent modules?
    self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer,
                                  self._iter, filtering.ColorClip, self._de)
    # Honor the caller's ``keep`` request; it used to be silently
    # overridden with False.
    self.cubin = pycuda.compiler.compile(self.src, keep=keep,
                                         options=list(cmp_options))
    return self.src
|
||||
|
||||
def copy(self):
|
||||
"""
|
||||
Return a copy of this animation without any references to the current
|
||||
CUDA context. This can be used to load an animation in multiple CUDA
|
||||
contexts without recompiling, so that rendering can proceed across
|
||||
multiple devices - but managing that is up to you.
|
||||
"""
|
||||
import copy
|
||||
new = copy.copy(self)
|
||||
new.mod = None
|
||||
return new
|
||||
|
||||
def load(self, jit_options=None):
    """
    Replace the currently loaded CUDA module in the active CUDA context
    with the compiled code's module. A reference is kept to the module,
    meaning that rendering should henceforth only be called from the
    thread and context in which this function was called.

    If the animation has not been compiled yet, it is compiled first with
    the default options. ``jit_options`` is passed through to the module
    loader; ``None`` (the default) means no options.
    """
    if self.cubin is None:
        self.compile()
    # Build a fresh list per call rather than sharing one mutable default
    # object between every invocation.
    options = [] if jit_options is None else jit_options
    self.mod = cuda.module_from_buffer(self.cubin, options)
|
||||
|
||||
def render_frames(self, times=None):
    """
    Render one frame for each center time in ``times``.
    Returns a Python generator object which will yield one NumPy array
    for each rendered image.

    This method produces a considerable amount of side effects, and should
    not be used lightly. Things may go poorly for you if this method is not
    allowed to run until completion (by exhausting all items in the
    generator object).

    A performance note: while any ready tasks will be scheduled on the GPU
    before yielding a result, spending a lot of time before returning
    control to this function can allow the GPU to become idle. It's best
    to hand the resulting array to another thread after grabbing it from
    the renderer for handling.

    ``times`` is a sequence of center times at which to render, or ``None``
    to render one frame for each genome used to create the animation. An
    empty sequence renders nothing.
    """
    # Don't see this changing, but empirical tests could prove me wrong
    NRENDERERS = 2
    # TODO: under a slightly modified sequencing, certain buffers can be
    # shared (though this may be unimportant if a good AA technique which
    # doesn't require full SS can be found)
    rdrs = [_AnimRenderer(self) for i in range(NRENDERERS)]

    # Only ``None`` selects the default. Testing truthiness instead would
    # turn an explicitly empty sequence into "render every genome" and
    # would fail outright on array-like inputs.
    if times is None:
        times = [cp.time for cp in self.genomes]

    # Zip up each time with an alternating renderer, plus enough empty
    # entries at the end to flush all pending tasks
    exttimes = chain(times, repeat(None, NRENDERERS))
    for rdr, time in izip(cycle(rdrs), exttimes):
        # Collect whatever this renderer finished on its previous turn
        # before handing it new work.
        if rdr.wait():
            yield rdr.get_result()
        if time is not None:
            rdr.render(time)
|
||||
|
||||
def _interp(self, time, cp):
    """
    Interpolate this animation's genome array at ``time``; ``cp`` is the
    ctypes genome passed by reference to receive the result.
    """
    src = self._g_arr
    count = len(src)
    flam3_interpolate(src, count, time, 0, byref(cp))
|
||||
|
||||
|
||||
|
||||
class _AnimRenderer(object):
|
||||
# Large launches lock the display for a considerable period and may be
|
||||
# killed due to a device timeout; small launches are harder to load-balance
|
||||
# on the GPU and incur overhead. This empirical value is multiplied by the
|
||||
# number of SMs on the device to determine how many blocks should be in
|
||||
# each launch. Extremely high quality, high resolution renders may still
|
||||
# encounter a device timeout, and no workaround is in place for that yet.
|
||||
SM_FACTOR = 8
|
||||
|
||||
# Currently, palette interpolation is done independently of animation
|
||||
# interpolation, so that the process is not biased and so we only need to
|
||||
# mess about with one texture per renderer. This many steps will always be
|
||||
# used, no matter the number of time steps.
|
||||
PAL_HEIGHT = 16
|
||||
|
||||
|
||||
def __init__(self, anim):
|
||||
self.anim = anim
|
||||
self.pending = False
|
||||
self.stream = cuda.Stream()
|
||||
|
||||
self._nsms = cuda.Context.get_device().multiprocessor_count
|
||||
self.cps_per_block = self._nsms * self.SM_FACTOR
|
||||
self.ncps = anim.features.max_cps
|
||||
self.nblocks = int(math.ceil(self.ncps / float(self.cps_per_block)))
|
||||
|
||||
# These are stored to avoid leaks, not to be stateful in method calls
|
||||
# TODO: ensure proper cleanup is done
|
||||
self._dst_cp = pyflam3.Genome()
|
||||
memset(byref(self._dst_cp), 0, sizeof(self._dst_cp))
|
||||
self._cen_cp = pyflam3.Genome()
|
||||
memset(byref(self._cen_cp), 0, sizeof(self._cen_cp))
|
||||
|
||||
self.nbins = anim.features.acc_height * anim.features.acc_stride
|
||||
self.d_den = cuda.mem_alloc(4 * self.nbins)
|
||||
self.d_accum = cuda.mem_alloc(16 * self.nbins)
|
||||
self.d_out = cuda.mem_alloc(16 * self.nbins)
|
||||
self.d_infos = cuda.mem_alloc(anim._iter.packer.align * self.ncps)
|
||||
# Defer allocation until first needed
|
||||
self.d_seeds = [None] * self.nblocks
|
||||
|
||||
def render(self, cen_time):
|
||||
assert not self.pending, "Tried to render with results pending!"
|
||||
self.pending = True
|
||||
a = self.anim
|
||||
|
||||
cen_cp = self._cen_cp
|
||||
a._interp(cen_time, cen_cp)
|
||||
palette = self._interp_colors(cen_time, cen_cp)
|
||||
|
||||
util.BaseCode.zero_dptr(a.mod, self.d_den, self.nbins,
|
||||
self.stream)
|
||||
util.BaseCode.zero_dptr(a.mod, self.d_accum, 4 * self.nbins,
|
||||
self.stream)
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# TODO WARNING TODO WARNING TODO WARNING TODO WARNING TODO
|
||||
# This will replace the palette while it's in use by the other
|
||||
# rendering function. Need to pass palette texref in function
|
||||
# invocation.
|
||||
# ------------------------------------------------------------
|
||||
dpal = cuda.make_multichannel_2d_array(palette, 'C')
|
||||
tref = a.mod.get_texref('palTex')
|
||||
tref.set_array(dpal)
|
||||
tref.set_format(cuda.array_format.UNSIGNED_INT8, 4)
|
||||
tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
|
||||
tref.set_filter_mode(cuda.filter_mode.LINEAR)
|
||||
|
||||
cp = self._dst_cp
|
||||
packer = a._iter.packer
|
||||
|
||||
iter_fun = a.mod.get_function("iter")
|
||||
iter_fun.set_cache_config(cuda.func_cache.PREFER_L1)
|
||||
|
||||
# Must be accumulated over all CPs
|
||||
gam, vib, hipow = 0, 0, 0
|
||||
|
||||
# This is gross, but there are a lot of fiddly corner cases with any
|
||||
# index-based iteration scheme.
|
||||
times = list(enumerate(self._mk_dts(cen_time, cen_cp, self.ncps)))
|
||||
for b, block_times in enumerate(_chunk(times, self.cps_per_block)):
|
||||
infos = []
|
||||
if len(a.genomes) > 1:
|
||||
for n, t in block_times:
|
||||
a._interp(t, cp)
|
||||
frac = float(n) / cen_cp.ntemporal_samples
|
||||
info = packer.pack(cp=Genome(cp), cp_step_frac=frac)
|
||||
infos.append(info)
|
||||
gam += cp.gamma
|
||||
vib += cp.vibrancy
|
||||
hipow += cp.highlight_power
|
||||
else:
|
||||
# Can't interpolate normally; just pack copies
|
||||
# TODO: this still packs the genome 20 times or so instead of
|
||||
# once
|
||||
packed = packer.pack(cp=a.genomes[0], cp_step_frac=0)
|
||||
infos = [packed] * len(block_times)
|
||||
gam += a.genomes[0].gamma * len(block_times)
|
||||
vib += a.genomes[0].vibrancy * len(block_times)
|
||||
hipow += a.genomes[0].highlight_power * len(block_times)
|
||||
|
||||
infos = np.concatenate(infos)
|
||||
offset = b * packer.align * self.cps_per_block
|
||||
# TODO: portable across 32/64-bit arches?
|
||||
d_info_off = int(self.d_infos) + offset
|
||||
cuda.memcpy_htod(d_info_off, infos)
|
||||
|
||||
if not self.d_seeds[b]:
|
||||
seeds = mwc.MWC.make_seeds(iter.IterCode.NTHREADS *
|
||||
self.cps_per_block)
|
||||
self.d_seeds[b] = cuda.to_device(seeds)
|
||||
|
||||
# TODO: get block config from IterCode
|
||||
# TODO: print timing information
|
||||
iter_fun(self.d_seeds[b], np.uint64(d_info_off),
|
||||
self.d_accum, self.d_den,
|
||||
block=(32, 16, 1), grid=(len(block_times), 1),
|
||||
stream=self.stream)
|
||||
|
||||
# MAJOR TODO: for now, we kill almost all parallelism by forcing the
|
||||
# stream here. Later, once we've decided on a density-buffer prefilter,
|
||||
# we will move it to the GPU, allowing it to be embedded in the stream
|
||||
# and letting the remaining code be asynchronous.
|
||||
self.stream.synchronize()
|
||||
dbuf_dim = (a.features.acc_height, a.features.acc_stride)
|
||||
dbuf = cuda.from_device(self.d_den, dbuf_dim, np.float32)
|
||||
dbuf = ndimage.filters.gaussian_filter(dbuf, 0.6)
|
||||
cuda.memcpy_htod(self.d_den, dbuf)
|
||||
|
||||
util.BaseCode.zero_dptr(a.mod, self.d_out, 4 * self.nbins,
|
||||
self.stream)
|
||||
self.stream.synchronize()
|
||||
a._de.invoke(a.mod, self.d_accum, self.d_out, self.d_den,
|
||||
self.stream)
|
||||
self.stream.synchronize()
|
||||
|
||||
|
||||
n = np.float32(self.ncps)
|
||||
gam = np.float32(n / gam)
|
||||
vib = np.float32(vib / n)
|
||||
hipow = np.float32(hipow / n)
|
||||
|
||||
# TODO: get block size from colorclip class? It actually does not
|
||||
# depend on that being the case
|
||||
color_fun = a.mod.get_function("colorclip")
|
||||
color_fun(self.d_out, gam, vib, hipow,
|
||||
block=(256, 1, 1), grid=(self.nbins / 256, 1),
|
||||
stream=self.stream)
|
||||
|
||||
def _interp_colors(self, cen_time, cen_cp):
|
||||
# TODO: any visible difference between uint8 and richer formats?
|
||||
pal = np.empty((self.PAL_HEIGHT, 256, 4), dtype=np.uint8)
|
||||
a = self.anim
|
||||
|
||||
if len(a.genomes) > 1:
|
||||
# The typical case; applying real motion blur
|
||||
cp = self._dst_cp
|
||||
times = self._mk_dts(cen_time, cen_cp, self.PAL_HEIGHT)
|
||||
for n, t in enumerate(times):
|
||||
a._interp(t, cp)
|
||||
for i, e in enumerate(cp.palette.entries):
|
||||
pal[n][i] = np.uint8(np.array(e.color) * 255.0)
|
||||
else:
|
||||
# Cannot call any interp functions on a single genome; rather than
|
||||
# have alternate code-paths, just copy the same colors everywhere
|
||||
for i, e in enumerate(a.genomes[0].palette.entries):
|
||||
# TODO: This triggers a RuntimeWarning
|
||||
pal[0][i] = np.uint8(np.array(e.color) * 255.0)
|
||||
pal[1:] = pal[0]
|
||||
return pal
|
||||
|
||||
def wait(self):
|
||||
if self.pending:
|
||||
self.stream.synchronize()
|
||||
self.pending = False
|
||||
return True
|
||||
return False
|
||||
|
||||
def get_result(self):
|
||||
a = self.anim
|
||||
g = a.features.gutter
|
||||
obuf_dim = (a.features.acc_height, a.features.acc_stride, 4)
|
||||
out = cuda.from_device(self.d_out, obuf_dim, np.float32)
|
||||
# TODO: performance?
|
||||
out = np.delete(out, np.s_[:16], axis=0)
|
||||
out = np.delete(out, np.s_[:16], axis=1)
|
||||
out = np.delete(out, np.s_[-16:], axis=0)
|
||||
out = np.delete(out, np.s_[-16:], axis=1)
|
||||
return out
|
||||
|
||||
@staticmethod
|
||||
def _mk_dts(cen_time, cen_cp, ncps):
|
||||
w = cen_cp.temporal_filter_width
|
||||
return [w * (t / (ncps - 1.0) - 0.5) for t in range(ncps)]
|
||||
|
||||
class Features(object):
|
||||
"""
|
||||
@ -93,7 +387,8 @@ class Features(object):
|
||||
palette_height = 16
|
||||
|
||||
# Maximum width of DE and other spatial filters, and thus in turn the
|
||||
# amount of padding applied
|
||||
# amount of padding applied. Note that, for now, this must not be changed!
|
||||
# The filtering code makes deep assumptions about this value.
|
||||
gutter = 16
|
||||
|
||||
def __init__(self, genomes):
|
||||
@ -116,11 +411,13 @@ class Features(object):
|
||||
else:
|
||||
self.final_xform_index = None
|
||||
|
||||
self.max_cps = max([cp.ntemporal_samples for cp in genomes])
|
||||
|
||||
self.width = genomes[0].width
|
||||
self.height = genomes[0].height
|
||||
self.acc_width = genomes[0].width + 2 * self.gutter
|
||||
self.acc_height = genomes[0].height + 2 * self.gutter
|
||||
self.acc_stride = genomes[0].width + 2 * self.gutter
|
||||
self.acc_stride = 32 * int(math.ceil(self.acc_width / 32.))
|
||||
|
||||
class XFormFeatures(object):
|
||||
def __init__(self, xforms, xform_id):
|
||||
|
29
main.py
29
main.py
@ -22,13 +22,10 @@ import scipy
|
||||
import pyglet
|
||||
import pycuda.autoinit
|
||||
|
||||
from fr0stlib.pyflam3 import *
|
||||
from fr0stlib.pyflam3._flam3 import *
|
||||
|
||||
import cuburn._pyflam3_hacks
|
||||
from fr0stlib import pyflam3
|
||||
from cuburn.render import *
|
||||
from cuburn.code.mwc import MWCTest
|
||||
from cuburn.code.iter import render, membench
|
||||
|
||||
# Required on my system; CUDA doesn't yet work with GCC 4.5
|
||||
os.environ['PATH'] = ('/usr/x86_64-pc-linux-gnu/gcc-bin/4.4.5:'
|
||||
@ -37,24 +34,22 @@ os.environ['PATH'] = ('/usr/x86_64-pc-linux-gnu/gcc-bin/4.4.5:'
|
||||
def main(args):
|
||||
if '-t' in args:
|
||||
MWCTest.test_mwc()
|
||||
membench()
|
||||
|
||||
|
||||
with open(args[1]) as fp:
|
||||
genomes = Genome.from_string(fp.read())
|
||||
genome_ptr, ngenomes = pyflam3.Genome.from_string(fp.read())
|
||||
genomes = cast(genome_ptr, POINTER(pyflam3.Genome*ngenomes)).contents
|
||||
anim = Animation(genomes)
|
||||
accum, den = render(anim.features, genomes)
|
||||
accum = np.delete(accum, np.s_[:16], axis=0)
|
||||
accum = np.delete(accum, np.s_[:16], axis=1)
|
||||
accum = np.delete(accum, np.s_[-16:], axis=0)
|
||||
accum = np.delete(accum, np.s_[-16:], axis=1)
|
||||
anim.compile()
|
||||
anim.load()
|
||||
for n, out in enumerate(anim.render_frames()):
|
||||
noalpha = np.delete(out, 3, axis=2)
|
||||
scipy.misc.imsave('rendered_%03d.png' % n, noalpha)
|
||||
scipy.misc.imsave('rendered_%03d.jpg' % n, noalpha)
|
||||
|
||||
noalpha = np.delete(accum, 3, axis=2)
|
||||
scipy.misc.imsave('rendered.png', noalpha)
|
||||
scipy.misc.imsave('rendered.jpg', noalpha)
|
||||
return
|
||||
|
||||
if '-g' not in args:
|
||||
return
|
||||
#if '-g' not in args:
|
||||
# return
|
||||
|
||||
window = pyglet.window.Window(anim.features.width, anim.features.height)
|
||||
imgbuf = (np.minimum(accum * 255, 255)).astype(np.uint8)
|
||||
|
Loading…
Reference in New Issue
Block a user