Mirror of https://github.com/stevenrobertson/cuburn.git (synced 2025-02-05 11:40:04 -05:00)

Commit b43481e374 (parent 12655b8611): New genome format to support flockutil
@@ -1,5 +1,7 @@
 import numpy as np
+from numpy import float32 as f32, int32 as i32
 import pycuda.compiler
+from pycuda.gpuarray import vec
@@ -10,8 +12,8 @@ _CODE = '''
 __global__
 void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
-               float linrange, float lingam, float3 bkgd, int fbsize,
-               int alpha_output_channel) {
+               float linrange, float lingam, float3 bkgd,
+               int fbsize, int blend_background_color) {
     int i = threadIdx.x + blockDim.x * (blockIdx.x + gridDim.x * blockIdx.y);
     if (i >= fbsize) return;
@@ -30,8 +32,17 @@ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
         alpha = (1.0f - frac) * pix.w * lingam + frac * alpha;
     }

-    float ls = vibrancy * alpha / pix.w;
+    if (!blend_background_color) {
+        float ls = alpha / pix.w;
+        pix.x *= ls;
+        pix.y *= ls;
+        pix.z *= ls;
+        pix.w = alpha;
+        pixbuf[i] = pix;
+        return;
+    }
+
+    float ls = vibrancy * alpha / pix.w;
     alpha = fminf(1.0f, fmaxf(0.0f, alpha));

     float maxc = fmaxf(pix.x, fmaxf(pix.y, pix.z));
@@ -64,22 +75,14 @@ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
     pix.y += (1.0f - vibrancy) * powf(opix.y, gamma);
     pix.z += (1.0f - vibrancy) * powf(opix.z, gamma);

-    if (alpha_output_channel) {
-        float one_alpha = 1.0f / alpha;
-        pix.x *= one_alpha;
-        pix.y *= one_alpha;
-        pix.z *= one_alpha;
-    } else {
-        pix.x += (1.0f - alpha) * bkgd.x;
-        pix.y += (1.0f - alpha) * bkgd.y;
-        pix.z += (1.0f - alpha) * bkgd.z;
-    }
-    pix.w = alpha;
+    pix.x += (1.0f - alpha) * bkgd.x;
+    pix.y += (1.0f - alpha) * bkgd.y;
+    pix.z += (1.0f - alpha) * bkgd.z;

     // Clamp values. I think this is superfluous, but I'm not certain.
     pix.x = fminf(1.0f, pix.x);
     pix.y = fminf(1.0f, pix.y);
     pix.z = fminf(1.0f, pix.z);
     pix.w = alpha;

     pixbuf[i] = pix;
 }
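The kernel change above replaces the old alpha_output_channel switch with a blend_background_color flag: when blending is disabled, colorclip rescales to the computed alpha and writes premultiplied RGBA without applying vibrancy, highlight power, or the background. A rough host-side sketch of the two output paths, under the assumption that the gamma/linrange "elbow" has already produced alpha (names here are illustrative, not taken from the kernel):

import numpy as np

def colorclip_output(pix, alpha, bkgd, blend_background_color):
    # 'pix' is a premultiplied RGBA float4 after log filtering; 'alpha' is the
    # value computed by the gamma/linrange elbow earlier in the kernel (not
    # reproduced here).
    r, g, b, w = [np.float32(x) for x in pix]
    if not blend_background_color:
        # New path: rescale to the computed alpha and hand back premultiplied
        # RGBA, skipping vibrancy, highlight power and background blending.
        ls = alpha / w
        return np.float32([r * ls, g * ls, b * ls, alpha])
    # Existing path (abridged): after the vibrancy/gamma treatment the kernel
    # composites over the background color and clamps to [0, 1].
    alpha = np.clip(alpha, 0.0, 1.0)
    rgb = np.float32([r, g, b]) + (1.0 - alpha) * np.float32(bkgd)
    return np.append(np.minimum(rgb, 1.0), np.float32(alpha))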
@@ -288,52 +291,45 @@ class Filtering(object):

     def __init__(self):
         self.init_mod()

-    def de(self, ddst, dsrc, info, start, stop, stream=None):
-        # TODO: use integration to obtain parameter values
-        t = (start + stop) / 2
-        cp = info.genome
-
-        k1 = np.float32(cp.color.brightness(t) * 268 / 256)
+    def de(self, ddst, dsrc, gnm, dim, tc, stream=None):
+        k1 = f32(gnm.color.brightness(tc) * 268 / 256)
         # Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
         # s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
-        area = info.height / (cp.camera.scale(t) ** 2 * info.width)
-        k2 = np.float32(1 / (area * info.density))
+        area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w)
+        k2 = f32(1 / (area * gnm.spp(tc)))

-        if cp.de.radius == 0:
-            nbins = info.acc_height * info.acc_stride
+        if gnm.de.radius == 0:
+            nbins = dim.ah * dim.astride
             fun = self.mod.get_function("logscale")
             t = fun(dsrc, ddst, k1, k2,
                     block=(512, 1, 1), grid=(nbins/512, 1), stream=stream)
         else:
-            scale_coeff = np.float32(-(1 + cp.de.radius(t)) ** -2.0)
-            est_curve = np.float32(2 * cp.de.curve(t))
+            scale_coeff = f32(-(1 + gnm.de.radius(tc)) ** -2.0)
+            est_curve = f32(2 * gnm.de.curve(tc))
             # TODO: experiment with this
-            edge_clamp = np.float32(2.0)
+            edge_clamp = f32(1.2)
             fun = self.mod.get_function("density_est")
             fun(dsrc, ddst, scale_coeff, est_curve, edge_clamp, k1, k2,
-                np.int32(info.acc_height), np.int32(info.acc_stride),
-                block=(32, 32, 1), grid=(info.acc_width/32, 1), stream=stream)
+                i32(dim.ah), i32(dim.astride), block=(32, 32, 1),
+                grid=(dim.aw/32, 1), stream=stream)

-    def colorclip(self, dbuf, info, start, stop, stream=None):
-        f32 = np.float32
-        t = (start + stop) / 2
-        cp = info.genome
-        nbins = info.acc_height * info.acc_stride
+    def colorclip(self, dbuf, gnm, dim, tc, blend, stream=None):
+        nbins = dim.ah * dim.astride

         # TODO: implement integration over cubic splines?
-        gam = f32(1 / cp.color.gamma(t))
-        vib = f32(cp.color.vibrancy(t))
-        hipow = f32(cp.color.highlight_power(t))
-        lin = f32(cp.color.gamma_threshold(t))
+        gam = f32(1 / gnm.color.gamma(tc))
+        vib = f32(gnm.color.vibrancy(tc))
+        hipow = f32(gnm.color.highlight_power(tc))
+        lin = f32(gnm.color.gamma_threshold(tc))
         lingam = f32(lin ** (gam-1.0) if lin > 0 else 0)
         bkgd = vec.make_float3(
-            cp.color.background.r(t),
-            cp.color.background.g(t),
-            cp.color.background.b(t))
+            gnm.color.background.r(tc),
+            gnm.color.background.g(tc),
+            gnm.color.background.b(tc))

         color_fun = self.mod.get_function("colorclip")
         blocks = int(np.ceil(np.sqrt(nbins / 256)))
-        color_fun(dbuf, gam, vib, hipow, lin, lingam, bkgd, np.int32(nbins),
-                np.int32(0),
-                block=(256, 1, 1), grid=(blocks, blocks), stream=stream)
+        color_fun(dbuf, gam, vib, hipow, lin, lingam, bkgd, i32(nbins),
+                i32(blend), block=(256, 1, 1), grid=(blocks, blocks),
+                stream=stream)
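The host-side change above makes the filtering constants depend on the new Genome interface: brightness, camera scale and samples-per-pixel are splines evaluated at the frame's central time tc, and image geometry comes from the Dimensions tuple (defined in render.py below) instead of the old RenderInfo. A small sketch of that arithmetic with made-up numbers (the concrete values and this particular dim are hypothetical):

from collections import namedtuple
import numpy as np

Dimensions = namedtuple('Dimensions', 'w h aw ah astride')
dim = Dimensions(w=640, h=360, aw=670, ah=390, astride=672)   # hypothetical frame

# Pretend spline evaluations at the frame's central time tc; in the real code
# these are SplEval objects on the Genome, e.g. gnm.color.brightness(tc).
brightness, scale, spp = 4.0, 1.0 / 640, 2000.0

k1 = np.float32(brightness * 268 / 256)
# New scale is s/w, so area works out to h / (s*s*w) in the new units.
area = dim.h / (scale ** 2 * dim.w)
k2 = np.float32(1 / (area * spp))
nbins = dim.ah * dim.astride        # one accumulator bin per padded pixel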
@@ -226,8 +226,11 @@ class GenomePacker(HunkOCode):
     _defs = Template(r"""

 __global__
-void interp_{{tname}}({{tname}}* out, float *times, float *knots,
-        float tstart, float tstep, mwc_st *rctxes, int maxid) {
+void interp_{{tname}}(
+        {{tname}}* out, mwc_st *rctxes,
+        const float *times, const float *knots,
+        float tstart, float tstep, int maxid
+) {
     int id = gtid();
     if (id >= maxid) return;
     out = &out[id];
@@ -53,7 +53,7 @@ def precalc_chaos(pcp, std_xforms):

     """).substitute(locals()))

-def precalc_camera(info, pcam):
+def precalc_camera(pcam):
     pre_cam = pcam._precalc()

     # Maxima code to check my logic:
@@ -68,7 +68,7 @@ def precalc_camera(info, pcam):
     float rot = {{pre_cam.rotation}} * M_PI / 180.0f;
     float rotsin = sin(rot), rotcos = cos(rot);
     float cenx = {{pre_cam.center.x}}, ceny = {{pre_cam.center.y}};
-    float scale = {{pre_cam.scale}} * {{info.width}};
+    float scale = {{pre_cam.scale}} * acc_size.width;

     float ditherwidth = {{pre_cam.dither_width}} * 0.33f;
     float u0 = mwc_next_01(rctx);
@@ -81,12 +81,12 @@ def precalc_camera(info, pcam):
     {{pre_cam._set('xx')}} = scale * rotcos;
     {{pre_cam._set('xy')}} = scale * -rotsin;
     {{pre_cam._set('xo')}} = scale * (rotsin * ceny - rotcos * cenx)
-            + {{0.5 * (info.width + info.gutter + 1)}} + ditherx;
+            + 0.5f * acc_size.awidth + ditherx;

     {{pre_cam._set('yx')}} = scale * rotsin;
     {{pre_cam._set('yy')}} = scale * rotcos;
     {{pre_cam._set('yo')}} = scale * -(rotsin * cenx + rotcos * ceny)
-            + {{0.5 * (info.height + info.gutter + 1)}} + dithery;
+            + 0.5f * acc_size.aheight + dithery;

     """).substitute(locals()))
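The camera precalc above now sources its pixel scale and centering offsets from the acc_size constant (filled in at runtime) rather than from compile-time info values. A NumPy sketch of the affine it builds, with dither terms omitted and made-up numbers:

import numpy as np

def camera_affine(scale_norm, rotation_deg, center, acc_awidth, acc_aheight,
                  acc_width):
    # Mirrors the template math: 'scale_norm' is the new normalized scale
    # (s / w), which gets multiplied back up by the accumulator width.
    scale = scale_norm * acc_width
    rot = np.radians(rotation_deg)
    rotsin, rotcos = np.sin(rot), np.cos(rot)
    cenx, ceny = center
    xx, xy = scale * rotcos, scale * -rotsin
    yx, yy = scale * rotsin, scale * rotcos
    # The offsets place the camera center in the middle of the padded buffer.
    xo = scale * (rotsin * ceny - rotcos * cenx) + 0.5 * acc_awidth
    yo = scale * -(rotsin * cenx + rotcos * ceny) + 0.5 * acc_aheight
    return np.array([[xx, xy, xo], [yx, yy, yo]])

# e.g. a 640x360 frame padded by a 15-pixel gutter on each side:
A = camera_affine(1.0, 0.0, (0.0, 0.0), 670, 390, 640)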
@@ -113,13 +113,12 @@ class IterCode(HunkOCode):
     # The number of threads per block
     NTHREADS = 256

-    def __init__(self, info):
-        self.info = info
+    def __init__(self, info, genome):
         self.packer = interp.GenomePacker('iter_params')
-        self.pcp = self.packer.view('params', self.info.genome, 'cp')
+        self.pcp = self.packer.view('params', genome, 'cp')

-        iterbody = self._iterbody()
-        bodies = [self._xfbody(i,x) for i,x in sorted(info.genome.xforms.items())]
+        iterbody = self._iterbody(info, genome)
+        bodies = [self._xfbody(i,x) for i,x in sorted(genome.xforms.items())]
         bodies.append(iterbody)
         self.defs = '\n'.join(bodies)
@@ -132,7 +131,9 @@ __device__ int rb_head, rb_tail, rb_size;
 typedef struct {
     uint32_t width;
     uint32_t height;
-    uint32_t stride;
+    uint32_t awidth;
+    uint32_t aheight;
+    uint32_t astride;
 } acc_size_t;
 __constant__ acc_size_t acc_size;
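The widened acc_size_t struct is mirrored on the host by the Dimensions namedtuple introduced in render.py below; the whole tuple is uploaded to the __constant__ symbol as five uint32 values. A minimal sketch of that pairing (the device pointer and stream in the comment are stand-ins):

from collections import namedtuple
import numpy as np

Dimensions = namedtuple('Dimensions', 'w h aw ah astride')

def make_dims(width, height, gutter=15):
    # Same arithmetic as render.py: pad by the filter gutter on every side
    # and round the padded width up to a multiple of 32 for the row stride.
    aw = width + 2 * gutter
    ah = height + 2 * gutter
    astride = 32 * int(np.ceil(aw / 32.))
    return Dimensions(width, height, aw, ah, astride)

dim = make_dims(640, 360)
packed = np.uint32(list(dim))      # field order matches acc_size_t
# In the real code this lands in the constant with something like:
#   d_acc_size = mod.get_global('acc_size')[0]
#   cuda.memcpy_htod_async(d_acc_size, packed, stream)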
@@ -174,7 +175,7 @@ void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
         g.update(locals())
         return tmpl.substitute(g)

-    def _iterbody(self):
+    def _iterbody(self, info, genome):
         tmpl = Template(r'''

 __global__ void reset_rb(int size) {
@@ -331,7 +332,7 @@ void iter(

     float cx, cy, cc;

-    {{precalc_camera(info, pcp.camera)}}
+    {{precalc_camera(pcp.camera)}}

     {{if 'final' in cp.xforms}}
     {{apply_affine('fx', 'fy', 'cx', 'cy', pcp.camera)}}
@@ -343,14 +344,14 @@ void iter(

     uint32_t ix = trunca(cx), iy = trunca(cy);

-    if (ix >= acc_size.width || iy >= acc_size.height) {
+    if (ix >= acc_size.awidth || iy >= acc_size.aheight) {
         {{if info.acc_mode == 'deferred'}}
             *log = 0xffffffff;
         {{endif}}
         continue;
     }

-    uint32_t i = iy * acc_size.stride + ix;
+    uint32_t i = iy * acc_size.astride + ix;
     {{if info.acc_mode == 'atomic'}}
     asm volatile ({{crep("""
 {
@@ -626,12 +627,11 @@ oflow_write_end:
     {{endif}}
 ''', 'iter_kern')
         return tmpl.substitute(
-                info = self.info,
-                cp = self.info.genome,
+                info = info,
+                cp = genome,
                 pcp = self.pcp,
                 NTHREADS = self.NTHREADS,
                 NWARPS = self.NTHREADS / 32,
-                std_xforms = [n for n in sorted(self.info.genome.xforms)
-                              if n != 'final'],
+                std_xforms = [n for n in sorted(genome.xforms) if n != 'final'],
                 **globals())
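The bounds test and accumulator index above now use the padded awidth/aheight/astride fields of acc_size. A tiny sketch of the indexing, reusing the same made-up 640x360 frame geometry:

# astride is the padded row pitch (rounded up to a multiple of 32), so a point
# landing at padded-buffer coordinates (ix, iy) accumulates into bin
# iy * astride + ix.
aw, ah, astride = 670, 390, 672

def bin_index(ix, iy):
    if ix >= aw or iy >= ah:
        return None          # the kernel skips the point (or, in deferred
                             # mode, logs a dummy record) when it lands
                             # outside the padded buffer
    return iy * astride + ix

assert bin_index(10, 2) == 2 * 672 + 10
assert bin_index(700, 2) is None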
cuburn/genome.py (175 changed lines)
@@ -79,115 +79,86 @@ class SplEval(object):
             return self.knots[1][0]
         return list(self.knots.T.flat)

-    @classmethod
-    def wrap(cls, obj):
-        """
-        Given a dict 'obj' representing, for instance, a Genome object, walk
-        through the object recursively and in-place, turning any number or
-        list of numbers into an SplEval.
-        """
-        for k, v in obj.items():
-            if (isinstance(v, (float, int)) or
-                    (isinstance(v, list) and isinstance(v[1], (float, int)))):
-                obj[k] = cls(v)
-            elif isinstance(v, dict):
-                cls.wrap(v)
-
-class RenderInfo(object):
-    """
-    Determine features and constants required to render a particular set of
-    genomes. The values of this class are fixed before compilation begins.
-    """
-    # Number of iterations to iterate without write after generating a new
-    # point. This number is currently fixed pretty deeply in the set of magic
-    # constants which govern buffer sizes; changing the value here won't
-    # actually change the code on the device to do something different.
-    fuse = 256
-
-    # Height of the texture pallete which gets uploaded to the GPU (assuming
-    # that palette-from-texture is enabled). For most genomes, this doesn't
-    # need to be very large at all. However, since only an easily-cached
-    # fraction of this will be accessed per SM, larger values shouldn't hurt
-    # performance too much. When using deferred accumulation, increasing this
-    # value increases the number of uniquely-dithered samples, which is nice.
-    # Power-of-two, please.
-    palette_height = 64
-
-    # Maximum width of DE and other spatial filters, and thus in turn the
-    # amount of padding applied. Note that, for now, this must not be changed!
-    # The filtering code makes deep assumptions about this value.
-    gutter = 15
-
-    # TODO: for now, we always throw away the alpha channel before writing.
-    # All code is in place to not do this, we just need to find a way to expose
-    # this preference via the API (or push alpha blending entirely on the client,
-    # which I'm not opposed to)
-    alpha_output_channel = False
-
-    # There are three settings for this somewhat ersatz paramater. 'global'
-    # uses unsynchronized global writes to accumulate sample points, 'atomic'
-    # uses atomic global writes, and 'deferred' stores color and position in a
-    # sample log, sorts the log by position, and uses shared memory to
-    # perform the accumulation. Deferred has the accuracy of 'atomic' and
-    # the speed of 'global' (it's actually faster!), but packs color and
-    # position into a single 32-bit int for now, which limits resolution to
-    # 1080p when xform opacity is respected, so the other two modes will hang
-    # around until that can be extended to be memory-limited again.
-    acc_mode = 'atomic'
-
-    # TODO: fix this
-    chaos_used = False
-
-    def __init__(self, db, **kwargs):
-        self.db = db
-        # Copy all args into this object's namespace
-        self.__dict__.update(kwargs)
-
-        self.acc_width = self.width + 2 * self.gutter
-        self.acc_height = self.height + 2 * self.gutter
-        self.acc_stride = 32 * int(np.ceil(self.acc_width / 32.))
-        self.density = self.quality
-
-        # Deref genome
-        self.genome = self.db.genomes[self.genome]
-
-        for k, v in self.db.palettes.items():
-            pal = np.fromstring(base64.b64decode(v), np.uint8)
-            pal = np.reshape(pal, (256, 3))
-            pal_a = np.ones((256, 4), np.float32)
-            pal_a[:,:3] = pal / 255.0
-            self.db.palettes[k] = pal_a
+class Palette(object):
+    """Wafer-thin wrapper around palettes. For the future!"""
+    def __init__(self, datastr, fmt='rgb8'):
+        if fmt != 'rgb8':
+            raise NotImplementedError
+        if len(datastr) != 768:
+            raise ValueError("Unsupported palette width")
+        self.width = 256
+        pal = np.reshape(np.fromstring(datastr, np.uint8), (256, 3))
+        self.data = np.ones((256, 4), np.float32)
+        self.data[:,:3] = pal / 255.0

 class _AttrDict(dict):
     def __getattr__(self, name):
         return self[name]

-def load_info(contents):
-    result = json.loads(contents, object_hook=_AttrDict)
-    SplEval.wrap(result.genomes)
-
-    # A Job object will have more details or something
-    result = RenderInfo(result, **result.renders.values()[0])
-    return result
+    @classmethod
+    def _wrap(cls, dct):
+        for k, v in dct.items():
+            if (isinstance(v, (float, int)) or
+                    (isinstance(v, list) and isinstance(v[1], (float, int)))):
+                dct[k] = SplEval(v)
+            elif isinstance(v, dict):
+                dct[k] = cls._wrap(cls(v))
+        return dct
+
+class Genome(_AttrDict):
+    # For now, we base the Genome class on an _AttrDict, letting its structure
+    # be defined implicitly by the way it is used in device code. More formal
+    # unpacking will happen soon.
+    def __init__(self, gnm, base_den):
+        super(Genome, self).__init__(gnm)
+        for k, v in self.items():
+            v = _AttrDict(v)
+            if k not in ('info', 'time'):
+                _AttrDict._wrap(v)
+            self[k] = v
+        # TODO: this is a hack, figure out how to solve it more elegantly
+        self.spp = SplEval(self.camera.density.knotlist)
+        self.spp.knots[1] *= base_den
+        # TODO: decide how to handle palettes. For now, it's the caller's
+        # responsibility to replace this list with actual palettes.
+        pal = self.color.palette
+        if isinstance(pal, basestring):
+            self.color.palette = [(0.0, pal), (1.0, pal)]
+        elif isinstance(pal, list):
+            self.color.palette = zip(pal[::2], pal[1::2])
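This is the core of the new genome format: the raw JSON dict is wrapped into attribute-accessible _AttrDicts, every scalar or knot list becomes an SplEval, and per-frame quality becomes the spp spline scaled by base_den. A toy sketch of the wrapping behavior; the stub spline below stands in for SplEval (knot lists are [time, value, time, value, ...]), and everything here is illustrative rather than the module's real implementation:

class AttrDict(dict):
    # Minimal stand-in for cuburn.genome._AttrDict.
    def __getattr__(self, name):
        return self[name]

class StubSpline(object):
    # Stand-in for SplEval: holds knots as [t0, v0, t1, v1, ...] and, for this
    # sketch, just interpolates linearly when called with a time.
    def __init__(self, v):
        self.knots = [0.0, v, 1.0, v] if isinstance(v, (int, float)) else list(v)
    def __call__(self, t):
        ts, vs = self.knots[::2], self.knots[1::2]
        for (t0, v0), (t1, v1) in zip(zip(ts, vs), zip(ts[1:], vs[1:])):
            if t0 <= t <= t1:
                f = 0.0 if t1 == t0 else (t - t0) / (t1 - t0)
                return v0 + f * (v1 - v0)
        return vs[-1]

def wrap(dct):
    # Same shape as the new _AttrDict._wrap: scalars and knot lists become
    # splines, nested dicts are wrapped recursively.
    for k, v in dct.items():
        if isinstance(v, (int, float)) or (
                isinstance(v, list) and isinstance(v[1], (int, float))):
            dct[k] = StubSpline(v)
        elif isinstance(v, dict):
            dct[k] = wrap(AttrDict(v))
    return dct

gnm = wrap(AttrDict({'camera': {'scale': [0.0, 2.0, 1.0, 4.0]},
                     'color': {'brightness': 4.0}}))
print gnm.camera.scale(0.5)       # -> 3.0
print gnm.color.brightness(0.25)  # -> 4.0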
-class HacketyGenome(object):
-    """
-    Holdover class to postpone a very deep refactoring as long as possible.
-    Converts property accesses into interpolations over predetermined times.
-    """
-    def __init__(self, referent, times):
-        # Times can be singular
-        self.referent, self.times = referent, times
-    def __getattr__(self, name):
-        r = getattr(self.referent, str(name))
-        if isinstance(r, _AttrDict):
-            return HacketyGenome(r, self.times)
-        elif isinstance(r, SplEval):
-            return r(self.times)
-        return r
-    __getitem__ = __getattr__
-
-if __name__ == "__main__":
-    import sys
-    import pprint
-    pprint.pprint(read_genome(sys.stdin))
+        # TODO: caller also needs to call set_timing()
+        self.adj_frame_width = None
+        self.canonical_right = (not self.get('link') or not self.link == 'self'
+                                or not self.link.get('right'))
+
+    def set_timing(self, base_dur, fps, offset=0.0, err_spread=True):
+        """
+        Set frame timing. Must be called at least once prior to rendering.
+        """
+        # TODO: test!
+        dur = self.time.duration
+        if isinstance(dur, basestring):
+            clock = float(dur[:-1]) + offset
+        else:
+            clock = dur * base_dur + offset
+        if self.canonical_right:
+            nframes = int(np.floor(clock * fps))
+        else:
+            nframes = int(np.ceil(clock * fps))
+        err = (clock - nframes / fps) / clock
+
+        fw = self.time.frame_width
+        if not isinstance(fw, list):
+            fw = [0, fw, 1, fw]
+        fw = [float(f[:-1]) * fps if isinstance(f, basestring)
+              else float(f) / (clock * fps) for f in fw]
+        self.adj_frame_width = SplEval(fw)
+
+        times = np.linspace(offset, 1 - err, nframes + 1)
+        # Move each time to a center time, and discard the last value
+        times = times[:-1] + 0.5 * (times[1] - times[0])
+        if err_spread:
+            epts = np.linspace(-2*np.pi, 2*np.pi, nframes)
+            times = times + 0.5 * err * (np.tanh(epts) + 1)
+        return err, times
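set_timing converts the genome's duration (either seconds, written as a string like '3s', or a multiple of a base duration) into a frame count, an accumulated rounding error, and a list of per-frame center times in spline-time units. The same arithmetic, spelled out with concrete made-up numbers (no error spreading, zero offset):

import numpy as np

base_dur, fps, offset = 2.0, 24.0, 0.0
dur = 1.25                       # genome duration as a multiple of base_dur
clock = dur * base_dur + offset  # 2.5 seconds of wall-clock time
nframes = int(np.floor(clock * fps))    # 60 frames (canonical right edge)
err = (clock - nframes / fps) / clock   # 0.0 here; nonzero whenever
                                        # clock * fps is not an integer

times = np.linspace(offset, 1 - err, nframes + 1)
# Shift to frame-center times and drop the trailing edge, as in the code above.
times = times[:-1] + 0.5 * (times[1] - times[0])
assert len(times) == nframes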
cuburn/render.py (264 changed lines)
@@ -4,11 +4,11 @@ import re
 import time as timemod
 import tempfile
 from collections import namedtuple
-from itertools import cycle, repeat, chain, izip
+from itertools import cycle, repeat, chain, izip, imap, ifilter
 from ctypes import *
 from cStringIO import StringIO
 import numpy as np
-from numpy import int32 as i32, uint64 as u64
+from numpy import float32 as f32, int32 as i32, uint32 as u32, uint64 as u64
 from scipy import ndimage

 from fr0stlib import pyflam3
@@ -24,41 +24,55 @@ from cuburn import affine
 from cuburn.code import util, mwc, iter, filtering, sort

 RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time')
+Dimensions = namedtuple('Dimensions', 'w h aw ah astride')

 def _sync_stream(dst, src):
     dst.wait_for_event(cuda.Event(cuda.event_flags.DISABLE_TIMING).record(src))

+def argset(obj, **kwargs):
+    for k, v in kwargs.items():
+        setattr(obj, k, v)
+    return obj
+
 class Renderer(object):
     """
     Control structure for rendering a series of frames.

     Each animation will dynamically generate a kernel that includes only the
     code necessary to render the genomes provided. The process of generating
     and uploading the kernel takes a small but finite amount of time. In
     general, the kernel generated for all genomes resulting from interpolating
     between two control points will have identical performance, so it is
     wasteful to create more than one animation for any interpolated sequence.

     However, genome sequences interpolated from three or more control points
     with different features enabled will have the code needed to render all
     genomes enabled for every frame. Doing this can hurt performance.

     In other words, it's best to use exactly one Animation for each
     interpolated sequence between one or two genomes.
     """

+    # Number of iterations to iterate without write after generating a new
+    # point. This number is currently fixed pretty deeply in the set of magic
+    # constants which govern buffer sizes; changing the value here won't
+    # actually change the code on the device to do something different.
+    fuse = 256
+
+    # The palette texture/surface covers the color coordinate from [0,1] with
+    # (for now, a fixed 256) equidistant horizontal samples, and spans the
+    # temporal range of the frame linearly with this many rows. Increasing
+    # this value increases the number of uniquely-dithered samples when using
+    # pre-dithered surfaces.
+    palette_height = 64
+
+    # Maximum width of DE and other spatial filters, and thus in turn the
+    # amount of padding applied. Note that, for now, this must not be changed!
+    # The filtering code makes deep assumptions about this value.
+    gutter = 15
+
+    # Accumulation mode. Leave it at 'atomic' for now.
+    acc_mode = 'atomic'
+
+    # TODO
+    chaos_used = False
+
     cmp_options = ('-use_fast_math', '-maxrregcount', '42')
     keep = False

-    def __init__(self, info):
-        self.info = info
+    def __init__(self):
         self._iter = self.src = self.cubin = self.mod = None
         self.packed_genome = None

         # Ensure class options don't get contaminated on an instance
         self.cmp_options = list(self.cmp_options)

-    def compile(self, keep=None, cmp_options=None, jit_options=[]):
+    def compile(self, genome, keep=None, cmp_options=None):
         """
         Compile a kernel capable of rendering every frame in this animation.
         The resulting compiled kernel is stored in the ``cubin`` property;
@@ -73,7 +87,7 @@ class Renderer(object):
         keep = self.keep if keep is None else keep
         cmp_options = self.cmp_options if cmp_options is None else cmp_options

-        self._iter = iter.IterCode(self.info)
+        self._iter = iter.IterCode(self, genome)
         self._iter.packer.finalize()
         self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer,
                                       self._iter)
@@ -82,41 +96,81 @@ class Renderer(object):
         self.cubin = pycuda.compiler.compile(
                 self.src, keep=keep, options=cmp_options,
                 cache_dir=False if keep else None)

+    def load(self, genome, jit_options=[]):
+        if not self.cubin:
+            self.compile(genome)
         self.mod = cuda.module_from_buffer(self.cubin, jit_options)
         with open('/tmp/iter_kern.cubin', 'wb') as fp:
             fp.write(self.cubin)
         return self.src
-    def render(self, times):
+    def render(self, genome, times, width, height, blend=True):
         """
-        Render a flame for each genome in the iterable value 'genomes'.
-        Returns a RenderedImage object with the rendered buffer in the
-        requested format (3D RGBA ndarray only for now).
+        Render a frame for each timestamp in the iterable value ``times``. This
+        function returns a generator that will yield a RenderedImage object
+        containing a shared reference to the output buffer for each specified
+        frame.

-        This method produces a considerable amount of side effects, and should
-        not be used lightly. Things may go poorly for you if this method is not
-        allowed to run until completion (by exhausting all items in the
-        generator object).
+        The returned buffer is page-locked host memory. Between the time a
+        buffer is yielded and the time the next frame's results are requested,
+        the buffer will not be modified. Thereafter, however, it will be
+        overwritten by an asynchronous DMA operation coming from the CUDA
+        device. If you hang on to it for longer than one frame, copy it.

-        ``times`` is a sequence of (idx, start, stop) times, where index is
-        the logical frame number (though it can be any value) and 'start' and
-        'stop' together define the time range to be rendered for each frame.
+        ``genome`` is the genome to be rendered. Successive calls to the
+        `render()` method on one ``Renderer`` object must use genomes which
+        produce identical compiled code, and this will not be verified by the
+        renderer. In practice, this means you can alter genome parameter
+        values, but the full set of keys must remain identical between runs on
+        the same renderer.
+
+        ``times`` is a list of (idx, cen_time) tuples, where ``idx`` is passed
+        unmodified in the RenderedImage return value and ``cen_time`` is the
+        central time of the current frame in spline-time units. (Any
+        clock-time or frame-time units in the genome should be preconverted.)
+
+        If ``blend`` is False, the output buffer will contain unclipped,
+        premultiplied RGBA data, without vibrancy, highlight power, or the
+        alpha elbow applied.
         """
+        if times == []:
+            return []
+        r = self.render_gen(genome, width, height, blend=blend)
+        next(r)
+        return ifilter(None, imap(r.send, chain(times, [None])))
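Taken together with the genome changes, the new entry point works out to roughly the call sequence below. Treat this as an illustration of the intended API, not a tested snippet: the file path, sizes, and save_frame helper are hypothetical, and the palette-substitution step mentioned in genome.py's TODO is omitted.

import json
from cuburn import genome, render

gnm = genome.Genome(json.load(open('flame.json')), 2000)  # hypothetical file
# (Per the TODO in genome.py, gnm.color.palette still needs to be replaced
# with actual Palette objects before rendering.)
err, times = gnm.set_timing(base_dur=2.0, fps=24)

renderer = render.Renderer()
for out in renderer.render(gnm, list(enumerate(times)), 640, 360):
    # out.buf is page-locked host memory that will be overwritten once the
    # next frame is requested, so copy it if it must outlive this iteration.
    frame = out.buf.copy()
    save_frame(frame, out.idx)   # placeholder for whatever writes the image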
+    def render_gen(self, genome, width, height, blend=True):
+        """
+        Render frames. This method is wrapped by the ``render()`` method; see
+        its docstring for warnings and details.
+
+        Instead of passing frame times as an iterable, they are passed
+        individually via the ``generator.send()`` method. There is an
+        internal pipeline latency of one frame, so the first call to the
+        ``send()`` method will return None, the second call will return the
+        first frame's result, and so on. To retrieve the last frame in a
+        sequence, send ``None``.
+
+        Direct use of this method is useful for implementing render servers.
+        """
+
+        last_idx = None
+        next_frame = yield
+        if next_frame is None:
+            return
+
+        if not self.mod:
+            self.load(genome)
         filt = filtering.Filtering()

         reset_rb_fun = self.mod.get_function("reset_rb")
         packer_fun = self.mod.get_function("interp_iter_params")
         iter_fun = self.mod.get_function("iter")

-        info = self.info
-
         # The synchronization model is messy. See helpers/task_model.svg.
         iter_stream = cuda.Stream()
         filt_stream = cuda.Stream()
-        if info.acc_mode == 'deferred':
+        if self.acc_mode == 'deferred':
             write_stream = cuda.Stream()
             write_fun = self.mod.get_function("write_shmem")
         else:
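For render servers that want to drive frames one at a time, the generator protocol described in the render_gen docstring works out to roughly this (the times and sizes are made up):

gen = renderer.render_gen(gnm, 640, 360)
next(gen)                        # prime the generator
first = gen.send((0, 0.25))      # returns None: one frame of pipeline latency
second = gen.send((1, 0.75))     # returns the RenderedImage for frame 0
last = gen.send(None)            # flushes and returns the final frame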
@@ -128,19 +182,30 @@ class Renderer(object):
             event_a = cuda.Event().record(filt_stream)
             event_b = None

-        nbins = info.acc_height * info.acc_stride
+        awidth = width + 2 * self.gutter
+        aheight = height + 2 * self.gutter
+        astride = 32 * int(np.ceil(awidth / 32.))
+        dim = Dimensions(width, height, awidth, aheight, astride)
+        d_acc_size = self.mod.get_global('acc_size')[0]
+        cuda.memcpy_htod_async(d_acc_size, u32(list(dim)), write_stream)
+
+        nbins = awidth * aheight
         # Extra padding in accum helps with write_shmem overruns
         d_accum = cuda.mem_alloc(16 * nbins + (1<<16))
         d_out = cuda.mem_alloc(16 * nbins)
-        if info.acc_mode == 'atomic':
+        if self.acc_mode == 'atomic':
             d_atom = cuda.mem_alloc(8 * nbins)
             flush_fun = self.mod.get_function("flush_atom")

-        acc_size = np.array([info.acc_width, info.acc_height, info.acc_stride])
-        d_acc_size = self.mod.get_global('acc_size')[0]
-        cuda.memcpy_htod_async(d_acc_size, np.uint32(acc_size), write_stream)
+        obuf_copy = argset(cuda.Memcpy2D(),
+                src_y=self.gutter, src_x_in_bytes=16*self.gutter,
+                src_pitch=16*astride, dst_pitch=16*width,
+                width_in_bytes=16*width, height=height)
+        obuf_copy.set_src_device(d_out)
+        h_out_a = cuda.pagelocked_empty((height, width, 4), f32)
+        h_out_b = cuda.pagelocked_empty((height, width, 4), f32)

-        if info.acc_mode == 'deferred':
+        if self.acc_mode == 'deferred':
             # Having a fixed, power-of-two log size makes things much easier
             log_size = 64 << 20
             d_log = cuda.mem_alloc(log_size * 4)
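The old _trim method (removed at the end of this file) cropped the gutter out of an accumulation-sized host buffer with a NumPy slice; the obuf_copy Memcpy2D above performs the equivalent crop during the device-to-host transfer, so the host buffers can be allocated at the final width x height. The two are related like this (sizes made up, gutter 15):

import numpy as np

gutter, width, height = 15, 640, 360
astride = 32 * int(np.ceil((width + 2 * gutter) / 32.))
aheight = height + 2 * gutter

# Old approach: copy the whole padded buffer, then slice on the host.
padded = np.zeros((aheight, astride, 4), np.float32)
trimmed = padded[gutter:-gutter, gutter:gutter + width]

# New approach: the 2D copy reads 16*width bytes per row starting at
# (src_y=gutter, src_x_in_bytes=16*gutter) with src_pitch=16*astride and
# writes rows of dst_pitch=16*width, which lands the same region directly
# in a (height, width, 4) float32 destination.
assert trimmed.shape == (height, width, 4)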
@@ -153,9 +218,8 @@ class Renderer(object):
         # Calculate 'nslots', the number of simultaneous running threads that
         # can be active on the GPU during iteration (and thus the number of
         # slots for loading and storing RNG and point context that will be
-        # prepared on the device), 'rb_size' (the number of blocks in
-        # 'nslots'), and determine a number of temporal samples
-        # likely to load-balance effectively
+        # prepared on the device), and derive 'rb_size', the number of blocks in
+        # 'nslots'.
         iter_threads_per_block = 256
         dev_data = pycuda.tools.DeviceData()
         occupancy = pycuda.tools.OccupancyRecord(
@@ -169,14 +233,16 @@ class Renderer(object):
         reset_rb_fun(np.int32(rb_size), block=(1,1,1))

         d_points = cuda.mem_alloc(nslots * 16)
-        # We may add extra seeds to simplify palette dithering.
-        seeds = mwc.MWC.make_seeds(max(nslots, 256 * info.palette_height))
+        # This statement may add extra seeds to simplify palette dithering.
+        seeds = mwc.MWC.make_seeds(max(nslots, 256 * self.palette_height))
         d_seeds = cuda.to_device(seeds)

         # We used to auto-calculate this to a multiple of the number of SMs on
         # the device, but since we now use shorter launches and, to a certain
         # extent, allow simultaneous occupancy, that's not as important. The
-        # 1024 is a magic constant, though: FUSE
+        # 1024 is a magic constant to ensure reasonable and power-of-two log
+        # size for deferred: 256MB / (4B * FUSE * NTHREADS). Enhancements to
+        # the sort engine are needed to make this more flexible.
         ntemporal_samples = 1024
         genome_times, genome_knots = self._iter.packer.pack()
         d_genome_times = cuda.to_device(genome_times)
@@ -184,37 +250,31 @@ class Renderer(object):
         info_size = 4 * len(self._iter.packer) * ntemporal_samples
         d_infos = cuda.mem_alloc(info_size)

-        pals = info.genome.color.palette
-        if isinstance(pals, basestring):
-            pals = [0.0, pals, 1.0, pals]
-        palint_times = np.empty(len(genome_times[0]), np.float32)
+        pals = genome.color.palette
+        palint_times = np.empty(len(genome_times[0]), f32)
         palint_times.fill(100.0)
-        palint_times[:len(pals)/2] = pals[::2]
+        palint_times[:len(pals)] = [p[0] for p in pals]
         d_palint_times = cuda.to_device(palint_times)
         d_palint_vals = cuda.to_device(
-                np.concatenate(map(info.db.palettes.get, pals[1::2])))
+                np.concatenate([p[1].data for p in pals]))

-        if info.acc_mode in ('deferred', 'atomic'):
+        if self.acc_mode in ('deferred', 'atomic'):
             palette_fun = self.mod.get_function("interp_palette_hsv_flat")
-            dsc = cuda.ArrayDescriptor3D()
-            dsc.height = info.palette_height
-            dsc.width = 256
-            dsc.depth = 0
-            dsc.format = cuda.array_format.SIGNED_INT32
-            dsc.num_channels = 2
-            dsc.flags = cuda.array3d_flags.SURFACE_LDST
+            dsc = argset(cuda.ArrayDescriptor3D(), height=self.palette_height,
+                         width=256, depth=0, format=cuda.array_format.SIGNED_INT32,
+                         num_channels=2, flags=cuda.array3d_flags.SURFACE_LDST)
             palarray = cuda.Array(dsc)

             tref = self.mod.get_surfref('flatpal')
             tref.set_array(palarray, 0)
         else:
             palette_fun = self.mod.get_function("interp_palette_hsv")
-            dsc = cuda.ArrayDescriptor()
-            dsc.height = info.palette_height
-            dsc.width = 256
-            dsc.format = cuda.array_format.UNSIGNED_INT8
-            dsc.num_channels = 4
-            d_palmem = cuda.mem_alloc(256 * info.palette_height * 4)
+            dsc = argset(cuda.ArrayDescriptor(), height=self.palette_height,
+                         width=256, format=cuda.array_format.UNSIGNED_INT8,
+                         num_channels=4)
+            d_palmem = cuda.mem_alloc(256 * self.palette_height * 4)

             tref = self.mod.get_texref('palTex')
             tref.set_address_2d(d_palmem, dsc, 1024)
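With the new genome format, color.palette is a list of (time, palette) pairs: Genome.__init__ builds it from a single name or a flat [time, name, time, name, ...] list, and the caller is expected to swap each name for a Palette object before rendering. The interpolation tables above are then derived like this (the pals list and helper name are made up for illustration):

import numpy as np

# After Genome.__init__ plus the caller's substitution step, e.g.:
#   pals = [(0.0, Palette(data_a)), (0.5, Palette(data_a)), (1.0, Palette(data_b))]
# where each Palette.data is a (256, 4) float32 array.
def palette_tables(pals, n_knots):
    palint_times = np.empty(n_knots, np.float32)
    palint_times.fill(100.0)                      # sentinel past the last knot
    palint_times[:len(pals)] = [p[0] for p in pals]
    palint_vals = np.concatenate([p[1].data for p in pals])
    return palint_times, palint_vals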
@@ -222,49 +282,46 @@ class Renderer(object):
             tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
             tref.set_filter_mode(cuda.filter_mode.LINEAR)

-        h_out_a = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4),
-                np.float32)
-        h_out_b = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4),
-                np.float32)
+        last_idx = None
+        while next_frame is not None:
+            # tc, td, ts, te: central, delta, start, end times
+            idx, tc = next_frame
+            td = genome.adj_frame_width(tc)
+            ts, te = tc - 0.5 * td, tc + 0.5 * td

-        for idx, start, stop in times:
-            twidth = np.float32((stop-start) / info.palette_height)
-            if info.acc_mode in ('deferred', 'atomic'):
-                palette_fun(d_seeds, d_palint_times, d_palint_vals,
-                        np.float32(start), twidth,
-                        block=(256,1,1), grid=(info.palette_height,1),
-                        stream=write_stream)
+            if self.acc_mode in ('deferred', 'atomic'):
+                # In this mode, the palette writes to a surface reference, but
+                # requires dithering, so we pass it the seeds instead
+                arg0 = d_seeds
             else:
-                palette_fun(d_palmem, d_palint_times, d_palint_vals,
-                        np.float32(start), twidth,
-                        block=(256,1,1), grid=(info.palette_height,1),
-                        stream=write_stream)
+                arg0 = d_palmem
+            palette_fun(arg0, d_palint_times, d_palint_vals,
+                    f32(ts), f32(td / self.palette_height),
+                    block=(256,1,1), grid=(self.palette_height,1),
+                    stream=write_stream)

-            width = np.float32((stop-start) / ntemporal_samples)
-            packer_fun(d_infos, d_genome_times, d_genome_knots,
-                    np.float32(start), width, d_seeds,
-                    np.int32(ntemporal_samples), block=(256,1,1),
+            packer_fun(d_infos, d_seeds, d_genome_times, d_genome_knots,
+                    f32(ts), f32(td / ntemporal_samples),
+                    i32(ntemporal_samples), block=(256,1,1),
                     grid=(int(np.ceil(ntemporal_samples/256.)),1),
                     stream=iter_stream)

             # Reset points so that they will be FUSEd
             util.BaseCode.fill_dptr(self.mod, d_points, 4 * nslots,
-                    iter_stream, np.float32(np.nan))
+                    iter_stream, f32(np.nan))

             # Get interpolated control points for debugging
             #iter_stream.synchronize()
             #d_temp = cuda.from_device(d_infos,
-                    #(ntemporal_samples, len(self._iter.packer)), np.float32)
+                    #(ntemporal_samples, len(self._iter.packer)), f32)
             #for i, n in zip(d_temp[5], self._iter.packer.packed):
                 #print '%60s %g' % ('_'.join(n), i)

             util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, write_stream)
-            if info.acc_mode == 'atomic':
+            if self.acc_mode == 'atomic':
                 util.BaseCode.fill_dptr(self.mod, d_atom, 2 * nbins, write_stream)
-            nrounds = ( (info.density * info.width * info.height)
+            nrounds = int( (genome.spp(tc) * width * height)
                          / (ntemporal_samples * 256 * 256) ) + 1
-            if info.acc_mode == 'deferred':
+            if self.acc_mode == 'deferred':
                 for i in range(nrounds):
                     iter_fun(np.uint64(d_log), d_seeds, d_points, d_infos,
                             block=(32, self._iter.NTHREADS/32, 1),
@@ -272,18 +329,17 @@ class Renderer(object):
                     _sync_stream(write_stream, iter_stream)
                     sorter.sort(d_log_sorted, d_log, log_size, 3, True,
                                 stream=write_stream)
                     #print cuda.from_device(sorter.dglobal, (256,), np.uint32)
                     _sync_stream(iter_stream, write_stream)
                     write_fun(d_accum, d_log_sorted, sorter.dglobal, i32(nbins),
                               block=(1024, 1, 1), grid=(nwriteblocks, 1),
                               stream=write_stream)
             else:
                 args = [u64(d_accum), d_seeds, d_points, d_infos]
-                if info.acc_mode == 'atomic':
+                if self.acc_mode == 'atomic':
                     args.append(u64(d_atom))
                 iter_fun(*args, block=(32, self._iter.NTHREADS/32, 1),
                         grid=(ntemporal_samples, nrounds), stream=iter_stream)
-                if info.acc_mode == 'atomic':
+                if self.acc_mode == 'atomic':
                     nblocks = int(np.ceil(np.sqrt(nbins/float(512))))
                     flush_fun(u64(d_accum), u64(d_atom), i32(nbins),
                               block=(512, 1, 1), grid=(nblocks, nblocks),
@@ -291,27 +347,29 @@ class Renderer(object):

             util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
             _sync_stream(filt_stream, write_stream)
-            filt.de(d_out, d_accum, info, start, stop, filt_stream)
+            filt.de(d_out, d_accum, genome, dim, tc, stream=filt_stream)
             _sync_stream(write_stream, filt_stream)
-            filt.colorclip(d_out, info, start, stop, filt_stream)
-            cuda.memcpy_dtoh_async(h_out_a, d_out, filt_stream)
+            filt.colorclip(d_out, genome, dim, tc, blend, stream=filt_stream)
+            obuf_copy.set_dst_host(h_out_a)
+            obuf_copy(filt_stream)

             if event_b:
                 while not event_a.query():
                     timemod.sleep(0.01)
                 gpu_time = event_a.time_since(event_b)
-                yield RenderedImage(self._trim(h_out_b), last_idx, gpu_time)
-            last_idx = idx
+                result = RenderedImage(h_out_b, last_idx, gpu_time)
+            else:
+                result = None

             event_a, event_b = cuda.Event().record(filt_stream), event_a
             h_out_a, h_out_b = h_out_b, h_out_a
+            last_idx = idx
+
+            # TODO: add ability to flush a frame without breaking the pipe
+            next_frame = yield result

         while not event_a.query():
             timemod.sleep(0.001)
         gpu_time = event_a.time_since(event_b)
-        yield RenderedImage(self._trim(h_out_b), last_idx, gpu_time)
-
-    def _trim(self, result):
-        g = self.info.gutter
-        return result[g:-g,g:g+self.info.width].copy()
+        yield RenderedImage(h_out_b, last_idx, gpu_time)