mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-04-21 00:51:31 -04:00
Refactor API
--HG-- rename : cuburn/code/filter.py => cuburn/code/filtering.py
This commit is contained in:
parent
6f3c27007a
commit
e79df46c66
@ -14,6 +14,8 @@ from ctypes import *
|
|||||||
from fr0stlib.pyflam3 import constants
|
from fr0stlib.pyflam3 import constants
|
||||||
from fr0stlib.pyflam3._flam3 import *
|
from fr0stlib.pyflam3._flam3 import *
|
||||||
|
|
||||||
|
from cuburn import render
|
||||||
|
|
||||||
flam3_nvariations = constants.flam3_nvariations = 99
|
flam3_nvariations = constants.flam3_nvariations = 99
|
||||||
|
|
||||||
BaseXForm._fields_ = [('var', c_double * flam3_nvariations)
|
BaseXForm._fields_ = [('var', c_double * flam3_nvariations)
|
||||||
|
@ -223,24 +223,22 @@ void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf,
|
|||||||
|
|
||||||
""")
|
""")
|
||||||
|
|
||||||
def invoke(self, mod, abufd, obufd, dbufd, stream=None):
    """
    Launch the log-scale or density-estimation kernel from ``mod``.

    ``abufd``, ``obufd`` and ``dbufd`` are passed straight through as
    kernel arguments (``dbufd`` only to ``density_est``). ``stream`` is
    forwarded to the kernel launch; with a stream the launch returns
    nothing useful, so no return value is kept.
    """
    # TODO: add no-est version
    # TODO: come up with a general way to average these parameters

    k1 = self.cp.brightness * 268 / 256
    area = self.features.acc_width * self.features.acc_height / self.cp.ppu ** 2
    k2 = 1 / (area * self.cp.adj_density)

    if self.cp.estimator == 0:
        fun = mod.get_function("logscale")
        # NOTE(review): one thread per accumulator column; presumably this
        # limits acc_width to the device's max block size -- confirm.
        fun(abufd, obufd, np.float32(k1), np.float32(k2),
            block=(self.features.acc_width, 1, 1),
            grid=(self.features.acc_height, 1), stream=stream)
    else:
        fun = mod.get_function("density_est")
        # Explicit floor division: the grid dimension must be an integer.
        fun(abufd, obufd, dbufd, np.float32(k1), np.float32(k2),
            block=(32, 32, 1), grid=(self.features.acc_width // 32, 1),
            stream=stream)
|
|
||||||
|
|
@ -2,20 +2,13 @@
|
|||||||
The main iteration loop.
|
The main iteration loop.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from ctypes import byref, memset, sizeof
|
from cuburn.code import mwc, variations
|
||||||
|
|
||||||
import pycuda.driver as cuda
|
|
||||||
from pycuda.driver import In, Out, InOut
|
|
||||||
from pycuda.compiler import SourceModule
|
|
||||||
import numpy as np
|
|
||||||
from scipy import ndimage
|
|
||||||
|
|
||||||
from fr0stlib.pyflam3 import flam3_interpolate
|
|
||||||
from cuburn.code import mwc, variations, filter
|
|
||||||
from cuburn.code.util import *
|
from cuburn.code.util import *
|
||||||
from cuburn.render import Genome
|
|
||||||
|
|
||||||
class IterCode(HunkOCode):
|
class IterCode(HunkOCode):
|
||||||
|
# The number of threads per block
|
||||||
|
NTHREADS = 512
|
||||||
|
|
||||||
def __init__(self, features):
|
def __init__(self, features):
|
||||||
self.features = features
|
self.features = features
|
||||||
self.packer = DataPacker('iter_info')
|
self.packer = DataPacker('iter_info')
|
||||||
@ -69,14 +62,14 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
|
|||||||
iter_info *info_glob = &(infos[blockIdx.x]);
|
iter_info *info_glob = &(infos[blockIdx.x]);
|
||||||
|
|
||||||
// load info to shared memory cooperatively
|
// load info to shared memory cooperatively
|
||||||
for (int i = threadIdx.y * 32 + threadIdx.x;
|
for (int i = threadIdx.y * blockDim.x + threadIdx.x;
|
||||||
i * 4 < sizeof(iter_info); i += blockDim.x * blockDim.y)
|
i * 4 < sizeof(iter_info); i += blockDim.x * blockDim.y)
|
||||||
reinterpret_cast<float*>(&info)[i] =
|
reinterpret_cast<float*>(&info)[i] =
|
||||||
reinterpret_cast<float*>(info_glob)[i];
|
reinterpret_cast<float*>(info_glob)[i];
|
||||||
|
|
||||||
int consec_bad = -{{features.fuse}};
|
int consec_bad = -{{features.fuse}};
|
||||||
// TODO: make nsteps adjustable via genome
|
// TODO: remove '512' constant
|
||||||
int nsamps = {{packer.get('cp.width * cp.height / 512000. * cp.adj_density')}};
|
int nsamps = {{packer.get('cp.width * cp.height / (cp.ntemporal_samples * 512.) * cp.adj_density')}};
|
||||||
|
|
||||||
float x, y, color;
|
float x, y, color;
|
||||||
x = mwc_next_11(&rctx);
|
x = mwc_next_11(&rctx);
|
||||||
@ -157,86 +150,3 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
|
|||||||
packer = self.packer.view('info'),
|
packer = self.packer.view('info'),
|
||||||
**globals())
|
**globals())
|
||||||
|
|
||||||
def render(features, cps):
|
|
||||||
# TODO: make this adjustable via genome
|
|
||||||
nsteps = 1000
|
|
||||||
abuf = np.zeros((features.acc_height, features.acc_stride, 4), dtype=np.float32)
|
|
||||||
dbuf = np.zeros((features.acc_height, features.acc_stride), dtype=np.float32)
|
|
||||||
seeds = mwc.MWC.make_seeds(512 * nsteps)
|
|
||||||
|
|
||||||
iter = IterCode(features)
|
|
||||||
de = filter.DensityEst(features, cps[0])
|
|
||||||
code = assemble_code(BaseCode, mwc.MWC, iter.packer, iter,
|
|
||||||
filter.ColorClip, de)
|
|
||||||
|
|
||||||
for lno, line in enumerate(code.split('\n')):
|
|
||||||
print '%3d %s' % (lno, line)
|
|
||||||
mod = SourceModule(code,
|
|
||||||
options=['-use_fast_math', '-maxrregcount', '32'])
|
|
||||||
|
|
||||||
cps_as_array = (Genome * len(cps))()
|
|
||||||
for i, cp in enumerate(cps):
|
|
||||||
cps_as_array[i] = cp
|
|
||||||
|
|
||||||
infos = []
|
|
||||||
pal = np.empty((16, 256, 4), dtype=np.uint8)
|
|
||||||
|
|
||||||
# TODO: move this into a common function
|
|
||||||
if len(cps) > 1:
|
|
||||||
cp = Genome()
|
|
||||||
memset(byref(cp), 0, sizeof(cp))
|
|
||||||
|
|
||||||
sampAt = [int(i/15.*(nsteps-1)) for i in range(16)]
|
|
||||||
for n in range(nsteps):
|
|
||||||
flam3_interpolate(cps_as_array, 2, float(n)/nsteps - 0.5,
|
|
||||||
0, byref(cp))
|
|
||||||
cp._init()
|
|
||||||
if n in sampAt:
|
|
||||||
pidx = sampAt.index(n)
|
|
||||||
for i, e in enumerate(cp.palette.entries):
|
|
||||||
pal[pidx][i] = np.uint8(np.array(e.color) * 255.0)
|
|
||||||
infos.append(iter.packer.pack(cp=cp, cp_step_frac=float(n)/nsteps))
|
|
||||||
else:
|
|
||||||
for i, e in enumerate(cps[0].palette.entries):
|
|
||||||
pal[0][i] = np.uint8(np.array(e.color) * 255.0)
|
|
||||||
pal[1:] = pal[0]
|
|
||||||
infos.append(iter.packer.pack(cp=cps[0], cp_step_frac=0))
|
|
||||||
infos *= nsteps
|
|
||||||
|
|
||||||
infos = np.concatenate(infos)
|
|
||||||
|
|
||||||
dpal = cuda.make_multichannel_2d_array(pal, 'C')
|
|
||||||
tref = mod.get_texref('palTex')
|
|
||||||
tref.set_array(dpal)
|
|
||||||
tref.set_format(cuda.array_format.UNSIGNED_INT8, 4)
|
|
||||||
tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
|
|
||||||
tref.set_filter_mode(cuda.filter_mode.LINEAR)
|
|
||||||
|
|
||||||
abufd = cuda.to_device(abuf)
|
|
||||||
dbufd = cuda.to_device(dbuf)
|
|
||||||
|
|
||||||
fun = mod.get_function("iter")
|
|
||||||
fun.set_cache_config(cuda.func_cache.PREFER_L1)
|
|
||||||
t = fun(InOut(seeds), InOut(infos), abufd, dbufd,
|
|
||||||
block=(32,16,1), grid=(nsteps,1), time_kernel=True)
|
|
||||||
print "Completed render in %g seconds" % t
|
|
||||||
|
|
||||||
f = np.float32
|
|
||||||
|
|
||||||
npix = features.acc_width * features.acc_height
|
|
||||||
|
|
||||||
# TODO: just allocate
|
|
||||||
obufd = cuda.to_device(abuf)
|
|
||||||
dbuf = cuda.from_device_like(dbufd, dbuf)
|
|
||||||
dbuf = ndimage.filters.gaussian_filter(dbuf, 0.6)
|
|
||||||
dbufd = cuda.to_device(dbuf)
|
|
||||||
de.invoke(mod, abufd, obufd, dbufd)
|
|
||||||
|
|
||||||
fun = mod.get_function("colorclip")
|
|
||||||
t = fun(obufd, f(1 / cp.gamma), f(cp.vibrancy), f(cp.highlight_power),
|
|
||||||
block=(256,1,1), grid=(npix/256,1), time_kernel=True)
|
|
||||||
print "Completed color filtering in %g seconds" % t
|
|
||||||
|
|
||||||
abuf = cuda.from_device_like(obufd, abuf)
|
|
||||||
return abuf, dbuf
|
|
||||||
|
|
||||||
|
@ -66,8 +66,26 @@ int trunca(float f) {
|
|||||||
asm("cvt.rni.s32.f32 %0, %1;" : "=r"(ret) : "f"(f));
|
asm("cvt.rni.s32.f32 %0, %1;" : "=r"(ret) : "f"(f));
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__global__
|
||||||
|
void zero_dptr(float* dptr, int size) {
|
||||||
|
int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
if (i < size) {
|
||||||
|
dptr[i] = 0.0f;
|
||||||
|
}
|
||||||
|
}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@staticmethod
def zero_dptr(mod, dptr, size, stream=None):
    """
    Enqueue the ``zero_dptr`` kernel from ``mod`` to clear ``dptr``.

    ``size`` is the number of 4-byte words to zero. The launch is issued
    on ``stream`` so it can be ordered with other work on that stream.
    """
    threads_per_block = 1024
    # One extra block covers any remainder; the kernel bounds-checks
    # against ``size``, so surplus threads do nothing.
    nblocks = size / threads_per_block + 1
    kernel = mod.get_function("zero_dptr")
    kernel(dptr, np.int32(size), stream=stream,
           block=(threads_per_block, 1, 1), grid=(nblocks, 1))
|
||||||
|
|
||||||
class DataPackerView(object):
|
class DataPackerView(object):
|
||||||
"""
|
"""
|
||||||
View of a data packer. Intended to be initialized using DataPacker.view().
|
View of a data packer. Intended to be initialized using DataPacker.view().
|
||||||
|
339
cuburn/render.py
339
cuburn/render.py
@ -1,44 +1,57 @@
|
|||||||
import sys
|
import sys
|
||||||
import math
|
import math
|
||||||
import re
|
import re
|
||||||
|
from itertools import cycle, repeat, chain, izip
|
||||||
from ctypes import *
|
from ctypes import *
|
||||||
from cStringIO import StringIO
|
from cStringIO import StringIO
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
from scipy import ndimage
|
||||||
|
|
||||||
from fr0stlib import pyflam3
|
from fr0stlib import pyflam3
|
||||||
from fr0stlib.pyflam3._flam3 import *
|
from fr0stlib.pyflam3._flam3 import *
|
||||||
from fr0stlib.pyflam3.constants import *
|
from fr0stlib.pyflam3.constants import *
|
||||||
|
|
||||||
|
import pycuda.compiler
|
||||||
|
import pycuda.driver as cuda
|
||||||
|
|
||||||
from cuburn import affine
|
from cuburn import affine
|
||||||
from cuburn.variations import Variations
|
from cuburn.code import util, mwc, iter, filtering
|
||||||
|
|
||||||
class Genome(pyflam3.Genome):
|
def _chunk(l, cs):
    """
    Generate consecutive slices of the sequence ``l``, each holding at most
    ``cs`` items; only the final slice may be shorter.
    """
    total = len(l)
    for start in range(0, total, cs):
        stop = start + cs
        yield l[start:stop]
|
||||||
|
|
||||||
def _init(self):
|
class Genome(object):
|
||||||
|
"""
|
||||||
|
Normalizes and precalculates some properties of a Genome. Assumes that
|
||||||
|
Genome argument passed in will not change.
|
||||||
|
"""
|
||||||
|
# Fix the ctypes ugliness since switching to __getattribute__ in 2.7.
|
||||||
|
# There are more elegant ways to do this, but I can't be bothered.
|
||||||
|
def __getattr__(self, name):
|
||||||
|
return getattr(self.cp, name)
|
||||||
|
|
||||||
|
def __init__(self, ctypes_genome):
|
||||||
|
self.cp = ctypes_genome
|
||||||
self.xforms = [self.xform[i] for i in range(self.num_xforms)]
|
self.xforms = [self.xform[i] for i in range(self.num_xforms)]
|
||||||
dens = np.array([x.density for i, x in enumerate(self.xforms)
|
dens = np.array([x.density for i, x in enumerate(self.xforms)
|
||||||
if i != self.final_xform_index])
|
if i != self.final_xform_index])
|
||||||
dens /= np.sum(dens)
|
dens /= np.sum(dens)
|
||||||
self.norm_density = [np.sum(dens[:i+1]) for i in range(len(dens))]
|
self.norm_density = [np.sum(dens[:i+1]) for i in range(len(dens))]
|
||||||
|
self.camera_transform = self.calc_camera_transform()
|
||||||
|
|
||||||
scale = property(lambda cp: 2.0 ** cp.zoom)
|
scale = property(lambda cp: 2.0 ** cp.zoom)
|
||||||
adj_density = property(lambda cp: cp.sample_density * (cp.scale ** 2))
|
adj_density = property(lambda cp: cp.sample_density * (cp.scale ** 2))
|
||||||
ppu = property(lambda cp: cp.pixels_per_unit * cp.scale)
|
ppu = property(lambda cp: cp.pixels_per_unit * cp.scale)
|
||||||
|
|
||||||
@property
|
def calc_camera_transform(cp):
|
||||||
def camera_transform(cp):
|
|
||||||
"""
|
"""
|
||||||
An affine matrix which will transform IFS coordinates to image width
|
An affine matrix which will transform IFS coordinates to image width
|
||||||
and height. Assumes that width and height are constant.
|
and height. Assumes that width and height are constant.
|
||||||
"""
|
"""
|
||||||
# TODO: when reading as a property during packing, this may be
|
|
||||||
# calculated 6 times instead of 1
|
|
||||||
# TODO: also requires knowing gutter width
|
|
||||||
g = Features.gutter
|
g = Features.gutter
|
||||||
return ( affine.translate(0.5 * cp.width + g, 0.5 * cp.height + g)
|
return ( affine.translate(0.5 * cp.width + g, 0.5 * cp.height + g)
|
||||||
* affine.scale(cp.ppu, cp.ppu)
|
* affine.scale(cp.ppu, cp.ppu)
|
||||||
@ -65,13 +78,294 @@ class Animation(object):
|
|||||||
In other words, it's best to use exactly one Animation for each
|
In other words, it's best to use exactly one Animation for each
|
||||||
interpolated sequence between one or two genomes.
|
interpolated sequence between one or two genomes.
|
||||||
"""
|
"""
|
||||||
def __init__(self, ctypes_genome_array):
    """
    Wrap each entry of ``ctypes_genome_array`` in a ``Genome`` and derive
    the shared ``Features``. The raw array is kept for interpolation;
    all compilation products start out unset.
    """
    self._g_arr = ctypes_genome_array
    self.genomes = [Genome(raw) for raw in ctypes_genome_array]
    self.features = Features(self.genomes)

    # Populated later by compile() and load()
    self._iter = None
    self._de = None
    self.src = None
    self.cubin = None
    self.mod = None
|
||||||
|
|
||||||
def compile(self, keep=False,
            cmp_options=('-use_fast_math', '-maxrregcount', '32')):
    """
    Compile a kernel capable of rendering every frame in this animation.
    The resulting compiled kernel is stored in the ``cubin`` property;
    the source is available as ``src``, and is also returned for
    inspection and display.

    ``keep`` is forwarded to ``pycuda.compiler.compile`` (keep the
    compiler's intermediate files); ``cmp_options`` is the sequence of
    extra compiler options.

    This operation is idempotent, and has no side effects outside of
    setting properties on this instance (unless there's a compiler error,
    which is a bug); it should therefore be threadsafe as well.
    It is, however, rather slow.
    """
    self._iter = iter.IterCode(self.features)
    self._de = filtering.DensityEst(self.features, self.genomes[0])
    # TODO: make choice of filtering explicit
    # TODO: autoload dependent modules?
    self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer,
                                  self._iter, filtering.ColorClip, self._de)
    # Honour the caller's ``keep`` flag instead of always passing False.
    self.cubin = pycuda.compiler.compile(self.src, keep=keep,
                                         options=list(cmp_options))
    return self.src
|
||||||
|
|
||||||
|
def copy(self):
|
||||||
|
"""
|
||||||
|
Return a copy of this animation without any references to the current
|
||||||
|
CUDA context. This can be used to load an animation in multiple CUDA
|
||||||
|
contexts without recompiling, so that rendering can proceed across
|
||||||
|
multiple devices - but managing that is up to you.
|
||||||
|
"""
|
||||||
|
import copy
|
||||||
|
new = copy.copy(self)
|
||||||
|
new.mod = None
|
||||||
|
return new
|
||||||
|
|
||||||
|
def load(self, jit_options=None):
    """
    Replace the currently loaded CUDA module in the active CUDA context
    with the compiled code's module. A reference is kept to the module,
    meaning that rendering should henceforth only be called from the
    thread and context in which this function was called.

    ``jit_options`` is an optional sequence of options handed to
    ``cuda.module_from_buffer``; ``None`` (the default) means no options.
    Compiles first if no cubin is available yet.
    """
    # A fresh list per call avoids sharing one mutable default object
    # between invocations.
    options = [] if jit_options is None else list(jit_options)
    if self.cubin is None:
        self.compile()
    self.mod = cuda.module_from_buffer(self.cubin, options)
|
||||||
|
|
||||||
|
def render_frames(self, times=None):
    """
    Render one frame for each center time in ``times``.
    Returns a Python generator object which will yield one NumPy array
    for each rendered image.

    This method produces a considerable amount of side effects, and should
    not be used lightly. Things may go poorly for you if this method is not
    allowed to run until completion (by exhausting all items in the
    generator object).

    A performance note: while any ready tasks will be scheduled on the GPU
    before yielding a result, spending a lot of time before returning
    control to this function can allow the GPU to become idle. It's best
    to hand the resulting array to another thread after grabbing it from
    the renderer for handling.

    ``times`` is a sequence of center times at which to render, or ``None``
    to render one frame for each genome used to create the animation.
    An empty sequence renders nothing.
    """
    # Don't see this changing, but empirical tests could prove me wrong
    NRENDERERS = 2
    # TODO: under a slightly modified sequencing, certain buffers can be
    # shared (though this may be unimportant if a good AA technique which
    # doesn't require full SS can be found)
    rdrs = [_AnimRenderer(self) for i in range(NRENDERERS)]

    # Only ``None`` selects the default; testing truthiness would also
    # override an explicitly empty sequence and fails on NumPy arrays.
    if times is None:
        times = [cp.time for cp in self.genomes]

    # Zip up each genome with an alternating renderer, plus enough empty
    # genomes at the end to flush all pending tasks
    exttimes = chain(times, repeat(None, NRENDERERS))
    for rdr, time in izip(cycle(rdrs), exttimes):
        # Collect the previous frame from this renderer before reusing it
        if rdr.wait():
            yield rdr.get_result()
        if time is not None:
            rdr.render(time)
|
||||||
|
|
||||||
|
def _interp(self, time, cp):
    """
    Call ``flam3_interpolate`` over this animation's genome array at
    ``time``, passing ``cp`` by reference (presumably as the destination
    genome -- confirm against the flam3 API).
    """
    source = self._g_arr
    count = len(source)
    flam3_interpolate(source, count, time, 0, byref(cp))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class _AnimRenderer(object):
    """
    Rendering state for one in-flight frame of an ``Animation``, bound to
    its own CUDA stream.

    An instance allocates its device buffers once (density, accumulator,
    output, packed control-point infos, and lazily the RNG seeds) and is
    then driven as ``render(time)`` -> ``wait()`` -> ``get_result()``.
    """

    # Large launches lock the display for a considerable period and may be
    # killed due to a device timeout; small launches are harder to load-balance
    # on the GPU and incur overhead. This empirical value is multiplied by the
    # number of SMs on the device to determine how many blocks should be in
    # each launch. Extremely high quality, high resolution renders may still
    # encounter a device timeout, and no workaround is in place for that yet.
    SM_FACTOR = 8

    # Currently, palette interpolation is done independently of animation
    # interpolation, so that the process is not biased and so we only need to
    # mess about with one texture per renderer. This many steps will always be
    # used, no matter the number of time steps.
    PAL_HEIGHT = 16

    def __init__(self, anim):
        """
        Allocate the stream, scratch genomes and device buffers needed to
        render frames of ``anim`` (which must already be compiled, since
        the packer alignment is read from ``anim._iter``).
        """
        self.anim = anim
        self.pending = False
        self.stream = cuda.Stream()

        self._nsms = cuda.Context.get_device().multiprocessor_count
        self.cps_per_block = self._nsms * self.SM_FACTOR
        self.ncps = anim.features.max_cps
        self.nblocks = int(math.ceil(self.ncps / float(self.cps_per_block)))

        # These are stored to avoid leaks, not to be stateful in method calls
        # TODO: ensure proper cleanup is done
        self._dst_cp = pyflam3.Genome()
        memset(byref(self._dst_cp), 0, sizeof(self._dst_cp))
        self._cen_cp = pyflam3.Genome()
        memset(byref(self._cen_cp), 0, sizeof(self._cen_cp))

        # One float per bin for density, four floats per bin for colour
        self.nbins = anim.features.acc_height * anim.features.acc_stride
        self.d_den = cuda.mem_alloc(4 * self.nbins)
        self.d_accum = cuda.mem_alloc(16 * self.nbins)
        self.d_out = cuda.mem_alloc(16 * self.nbins)
        self.d_infos = cuda.mem_alloc(anim._iter.packer.align * self.ncps)
        # Defer allocation until first needed
        self.d_seeds = [None] * self.nblocks

    def render(self, cen_time):
        """
        Queue the rendering of the frame centered on ``cen_time``.

        Must not be called again until ``wait()`` has reported completion.
        The result is left in device memory for ``get_result()``.
        """
        assert not self.pending, "Tried to render with results pending!"
        self.pending = True
        a = self.anim

        cen_cp = self._cen_cp
        a._interp(cen_time, cen_cp)
        palette = self._interp_colors(cen_time, cen_cp)

        util.BaseCode.zero_dptr(a.mod, self.d_den, self.nbins,
                                self.stream)
        util.BaseCode.zero_dptr(a.mod, self.d_accum, 4 * self.nbins,
                                self.stream)

        # ------------------------------------------------------------
        # TODO WARNING TODO WARNING TODO WARNING TODO WARNING TODO
        # This will replace the palette while it's in use by the other
        # rendering function. Need to pass palette texref in function
        # invocation.
        # ------------------------------------------------------------
        dpal = cuda.make_multichannel_2d_array(palette, 'C')
        tref = a.mod.get_texref('palTex')
        tref.set_array(dpal)
        tref.set_format(cuda.array_format.UNSIGNED_INT8, 4)
        tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
        tref.set_filter_mode(cuda.filter_mode.LINEAR)

        cp = self._dst_cp
        packer = a._iter.packer

        iter_fun = a.mod.get_function("iter")
        iter_fun.set_cache_config(cuda.func_cache.PREFER_L1)

        # Must be accumulated over all CPs
        gam, vib, hipow = 0, 0, 0

        # This is gross, but there are a lot of fiddly corner cases with any
        # index-based iteration scheme.
        times = list(enumerate(self._mk_dts(cen_time, cen_cp, self.ncps)))
        for b, block_times in enumerate(_chunk(times, self.cps_per_block)):
            infos = []
            if len(a.genomes) > 1:
                for n, t in block_times:
                    a._interp(t, cp)
                    frac = float(n) / cen_cp.ntemporal_samples
                    info = packer.pack(cp=Genome(cp), cp_step_frac=frac)
                    infos.append(info)
                    gam += cp.gamma
                    vib += cp.vibrancy
                    hipow += cp.highlight_power
            else:
                # Can't interpolate normally; just pack copies
                # TODO: this still packs the genome 20 times or so instead of
                # once
                packed = packer.pack(cp=a.genomes[0], cp_step_frac=0)
                infos = [packed] * len(block_times)
                gam += a.genomes[0].gamma * len(block_times)
                vib += a.genomes[0].vibrancy * len(block_times)
                hipow += a.genomes[0].highlight_power * len(block_times)

            infos = np.concatenate(infos)
            offset = b * packer.align * self.cps_per_block
            # TODO: portable across 32/64-bit arches?
            d_info_off = int(self.d_infos) + offset
            cuda.memcpy_htod(d_info_off, infos)

            # Compare against None explicitly: the slot holds either None
            # or a device allocation, whose truthiness is not a contract.
            if self.d_seeds[b] is None:
                seeds = mwc.MWC.make_seeds(iter.IterCode.NTHREADS *
                                           self.cps_per_block)
                self.d_seeds[b] = cuda.to_device(seeds)

            # TODO: get block config from IterCode
            # TODO: print timing information
            iter_fun(self.d_seeds[b], np.uint64(d_info_off),
                     self.d_accum, self.d_den,
                     block=(32, 16, 1), grid=(len(block_times), 1),
                     stream=self.stream)

        # MAJOR TODO: for now, we kill almost all parallelism by forcing the
        # stream here. Later, once we've decided on a density-buffer prefilter,
        # we will move it to the GPU, allowing it to be embedded in the stream
        # and letting the remaining code be asynchronous.
        self.stream.synchronize()
        dbuf_dim = (a.features.acc_height, a.features.acc_stride)
        dbuf = cuda.from_device(self.d_den, dbuf_dim, np.float32)
        dbuf = ndimage.filters.gaussian_filter(dbuf, 0.6)
        cuda.memcpy_htod(self.d_den, dbuf)

        util.BaseCode.zero_dptr(a.mod, self.d_out, 4 * self.nbins,
                                self.stream)
        self.stream.synchronize()
        a._de.invoke(a.mod, self.d_accum, self.d_out, self.d_den,
                     self.stream)
        self.stream.synchronize()

        # Gamma is passed as the reciprocal of its mean; the other two as
        # plain means over all control points.
        n = np.float32(self.ncps)
        gam = np.float32(n / gam)
        vib = np.float32(vib / n)
        hipow = np.float32(hipow / n)

        # TODO: get block size from colorclip class? It actually does not
        # depend on that being the case
        # NOTE(review): floor division leaves up to 255 trailing bins
        # unprocessed when nbins is not a multiple of 256; rounding up is
        # only safe if the colorclip kernel bounds-checks -- confirm.
        color_fun = a.mod.get_function("colorclip")
        color_fun(self.d_out, gam, vib, hipow,
                  block=(256, 1, 1), grid=(self.nbins // 256, 1),
                  stream=self.stream)

    def _interp_colors(self, cen_time, cen_cp):
        """
        Build the ``PAL_HEIGHT`` x 256 x 4 uint8 palette array for the frame
        centered on ``cen_time``, one row per palette time step.
        """
        # TODO: any visible difference between uint8 and richer formats?
        pal = np.empty((self.PAL_HEIGHT, 256, 4), dtype=np.uint8)
        a = self.anim

        if len(a.genomes) > 1:
            # The typical case; applying real motion blur
            cp = self._dst_cp
            times = self._mk_dts(cen_time, cen_cp, self.PAL_HEIGHT)
            for n, t in enumerate(times):
                a._interp(t, cp)
                for i, e in enumerate(cp.palette.entries):
                    pal[n][i] = np.uint8(np.array(e.color) * 255.0)
        else:
            # Cannot call any interp functions on a single genome; rather than
            # have alternate code-paths, just copy the same colors everywhere
            for i, e in enumerate(a.genomes[0].palette.entries):
                # TODO: This triggers a RuntimeWarning
                pal[0][i] = np.uint8(np.array(e.color) * 255.0)
            pal[1:] = pal[0]
        return pal

    def wait(self):
        """
        Block until any pending render on this stream has finished.

        Returns True if a render was pending (a result is now available),
        False if there was nothing to wait for.
        """
        if self.pending:
            self.stream.synchronize()
            self.pending = False
            return True
        return False

    def get_result(self):
        """
        Copy the output buffer back from the device and return it with the
        gutter removed, as a float32 array of four channels per pixel.
        """
        f = self.anim.features
        g = f.gutter
        obuf_dim = (f.acc_height, f.acc_stride, 4)
        out = cuda.from_device(self.d_out, obuf_dim, np.float32)
        # Rows are acc_stride wide but only the first acc_width columns hold
        # image data, so crop against acc_width (not the padded stride) and
        # use the configured gutter rather than a literal 16.
        # TODO: performance?
        return out[g:f.acc_height - g, g:f.acc_width - g].copy()

    @staticmethod
    def _mk_dts(cen_time, cen_cp, ncps):
        """
        Return ``ncps`` sample times spread evenly across the temporal
        filter width of ``cen_cp``, centered on ``cen_time``.
        """
        w = cen_cp.temporal_filter_width
        if ncps == 1:
            # A single sample sits at the center; also avoids dividing by
            # (ncps - 1) == 0.
            return [cen_time]
        return [cen_time + w * (t / (ncps - 1.0) - 0.5) for t in range(ncps)]
|
||||||
|
|
||||||
class Features(object):
|
class Features(object):
|
||||||
"""
|
"""
|
||||||
@ -93,7 +387,8 @@ class Features(object):
|
|||||||
palette_height = 16
|
palette_height = 16
|
||||||
|
|
||||||
# Maximum width of DE and other spatial filters, and thus in turn the
|
# Maximum width of DE and other spatial filters, and thus in turn the
|
||||||
# amount of padding applied
|
# amount of padding applied. Note that, for now, this must not be changed!
|
||||||
|
# The filtering code makes deep assumptions about this value.
|
||||||
gutter = 16
|
gutter = 16
|
||||||
|
|
||||||
def __init__(self, genomes):
|
def __init__(self, genomes):
|
||||||
@ -116,11 +411,13 @@ class Features(object):
|
|||||||
else:
|
else:
|
||||||
self.final_xform_index = None
|
self.final_xform_index = None
|
||||||
|
|
||||||
|
self.max_cps = max([cp.ntemporal_samples for cp in genomes])
|
||||||
|
|
||||||
self.width = genomes[0].width
|
self.width = genomes[0].width
|
||||||
self.height = genomes[0].height
|
self.height = genomes[0].height
|
||||||
self.acc_width = genomes[0].width + 2 * self.gutter
|
self.acc_width = genomes[0].width + 2 * self.gutter
|
||||||
self.acc_height = genomes[0].height + 2 * self.gutter
|
self.acc_height = genomes[0].height + 2 * self.gutter
|
||||||
self.acc_stride = genomes[0].width + 2 * self.gutter
|
self.acc_stride = 32 * int(math.ceil(self.acc_width / 32.))
|
||||||
|
|
||||||
class XFormFeatures(object):
|
class XFormFeatures(object):
|
||||||
def __init__(self, xforms, xform_id):
|
def __init__(self, xforms, xform_id):
|
||||||
|
29
main.py
29
main.py
@ -22,13 +22,10 @@ import scipy
|
|||||||
import pyglet
|
import pyglet
|
||||||
import pycuda.autoinit
|
import pycuda.autoinit
|
||||||
|
|
||||||
from fr0stlib.pyflam3 import *
|
|
||||||
from fr0stlib.pyflam3._flam3 import *
|
|
||||||
|
|
||||||
import cuburn._pyflam3_hacks
|
import cuburn._pyflam3_hacks
|
||||||
|
from fr0stlib import pyflam3
|
||||||
from cuburn.render import *
|
from cuburn.render import *
|
||||||
from cuburn.code.mwc import MWCTest
|
from cuburn.code.mwc import MWCTest
|
||||||
from cuburn.code.iter import render, membench
|
|
||||||
|
|
||||||
# Required on my system; CUDA doesn't yet work with GCC 4.5
|
# Required on my system; CUDA doesn't yet work with GCC 4.5
|
||||||
os.environ['PATH'] = ('/usr/x86_64-pc-linux-gnu/gcc-bin/4.4.5:'
|
os.environ['PATH'] = ('/usr/x86_64-pc-linux-gnu/gcc-bin/4.4.5:'
|
||||||
@ -37,24 +34,22 @@ os.environ['PATH'] = ('/usr/x86_64-pc-linux-gnu/gcc-bin/4.4.5:'
|
|||||||
def main(args):
|
def main(args):
|
||||||
if '-t' in args:
|
if '-t' in args:
|
||||||
MWCTest.test_mwc()
|
MWCTest.test_mwc()
|
||||||
membench()
|
|
||||||
|
|
||||||
|
|
||||||
with open(args[1]) as fp:
|
with open(args[1]) as fp:
|
||||||
genomes = Genome.from_string(fp.read())
|
genome_ptr, ngenomes = pyflam3.Genome.from_string(fp.read())
|
||||||
|
genomes = cast(genome_ptr, POINTER(pyflam3.Genome*ngenomes)).contents
|
||||||
anim = Animation(genomes)
|
anim = Animation(genomes)
|
||||||
accum, den = render(anim.features, genomes)
|
anim.compile()
|
||||||
accum = np.delete(accum, np.s_[:16], axis=0)
|
anim.load()
|
||||||
accum = np.delete(accum, np.s_[:16], axis=1)
|
for n, out in enumerate(anim.render_frames()):
|
||||||
accum = np.delete(accum, np.s_[-16:], axis=0)
|
noalpha = np.delete(out, 3, axis=2)
|
||||||
accum = np.delete(accum, np.s_[-16:], axis=1)
|
scipy.misc.imsave('rendered_%03d.png' % n, noalpha)
|
||||||
|
scipy.misc.imsave('rendered_%03d.jpg' % n, noalpha)
|
||||||
|
|
||||||
noalpha = np.delete(accum, 3, axis=2)
|
return
|
||||||
scipy.misc.imsave('rendered.png', noalpha)
|
|
||||||
scipy.misc.imsave('rendered.jpg', noalpha)
|
|
||||||
|
|
||||||
if '-g' not in args:
|
#if '-g' not in args:
|
||||||
return
|
# return
|
||||||
|
|
||||||
window = pyglet.window.Window(anim.features.width, anim.features.height)
|
window = pyglet.window.Window(anim.features.width, anim.features.height)
|
||||||
imgbuf = (np.minimum(accum * 255, 255)).astype(np.uint8)
|
imgbuf = (np.minimum(accum * 255, 255)).astype(np.uint8)
|
||||||
|
Loading…
Reference in New Issue
Block a user