New genome format to support flockutil

This commit is contained in:
Steven Robertson 2011-12-15 11:11:05 -05:00
parent 12655b8611
commit b43481e374
5 changed files with 303 additions and 275 deletions

View File

@ -1,5 +1,7 @@
import numpy as np
from numpy import float32 as f32, int32 as i32
import pycuda.compiler
from pycuda.gpuarray import vec
@ -10,8 +12,8 @@ _CODE = '''
__global__
void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
float linrange, float lingam, float3 bkgd, int fbsize,
int alpha_output_channel) {
float linrange, float lingam, float3 bkgd,
int fbsize, int blend_background_color) {
int i = threadIdx.x + blockDim.x * (blockIdx.x + gridDim.x * blockIdx.y);
if (i >= fbsize) return;
@ -30,8 +32,17 @@ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
alpha = (1.0f - frac) * pix.w * lingam + frac * alpha;
}
float ls = vibrancy * alpha / pix.w;
if (!blend_background_color) {
float ls = alpha / pix.w;
pix.x *= ls;
pix.y *= ls;
pix.z *= ls;
pix.w = alpha;
pixbuf[i] = pix;
return;
}
float ls = vibrancy * alpha / pix.w;
alpha = fminf(1.0f, fmaxf(0.0f, alpha));
float maxc = fmaxf(pix.x, fmaxf(pix.y, pix.z));
@ -64,22 +75,14 @@ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
pix.y += (1.0f - vibrancy) * powf(opix.y, gamma);
pix.z += (1.0f - vibrancy) * powf(opix.z, gamma);
if (alpha_output_channel) {
float one_alpha = 1.0f / alpha;
pix.x *= one_alpha;
pix.y *= one_alpha;
pix.z *= one_alpha;
} else {
pix.x += (1.0f - alpha) * bkgd.x;
pix.y += (1.0f - alpha) * bkgd.y;
pix.z += (1.0f - alpha) * bkgd.z;
}
pix.w = alpha;
// Clamp values. I think this is superfluous, but I'm not certain.
pix.x = fminf(1.0f, pix.x);
pix.y = fminf(1.0f, pix.y);
pix.z = fminf(1.0f, pix.z);
pix.w = alpha;
pixbuf[i] = pix;
}
@ -288,52 +291,45 @@ class Filtering(object):
def __init__(self):
self.init_mod()
def de(self, ddst, dsrc, info, start, stop, stream=None):
# TODO: use integration to obtain parameter values
t = (start + stop) / 2
cp = info.genome
k1 = np.float32(cp.color.brightness(t) * 268 / 256)
def de(self, ddst, dsrc, gnm, dim, tc, stream=None):
k1 = f32(gnm.color.brightness(tc) * 268 / 256)
# Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
# s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
area = info.height / (cp.camera.scale(t) ** 2 * info.width)
k2 = np.float32(1 / (area * info.density))
area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w)
k2 = f32(1 / (area * gnm.spp(tc)))
if cp.de.radius == 0:
nbins = info.acc_height * info.acc_stride
if gnm.de.radius == 0:
nbins = dim.ah * dim.astride
fun = self.mod.get_function("logscale")
t = fun(dsrc, ddst, k1, k2,
block=(512, 1, 1), grid=(nbins/512, 1), stream=stream)
else:
scale_coeff = np.float32(-(1 + cp.de.radius(t)) ** -2.0)
est_curve = np.float32(2 * cp.de.curve(t))
scale_coeff = f32(-(1 + gnm.de.radius(tc)) ** -2.0)
est_curve = f32(2 * gnm.de.curve(tc))
# TODO: experiment with this
edge_clamp = np.float32(2.0)
edge_clamp = f32(1.2)
fun = self.mod.get_function("density_est")
fun(dsrc, ddst, scale_coeff, est_curve, edge_clamp, k1, k2,
np.int32(info.acc_height), np.int32(info.acc_stride),
block=(32, 32, 1), grid=(info.acc_width/32, 1), stream=stream)
i32(dim.ah), i32(dim.astride), block=(32, 32, 1),
grid=(dim.aw/32, 1), stream=stream)
def colorclip(self, dbuf, info, start, stop, stream=None):
f32 = np.float32
t = (start + stop) / 2
cp = info.genome
nbins = info.acc_height * info.acc_stride
def colorclip(self, dbuf, gnm, dim, tc, blend, stream=None):
nbins = dim.ah * dim.astride
# TODO: implement integration over cubic splines?
gam = f32(1 / cp.color.gamma(t))
vib = f32(cp.color.vibrancy(t))
hipow = f32(cp.color.highlight_power(t))
lin = f32(cp.color.gamma_threshold(t))
gam = f32(1 / gnm.color.gamma(tc))
vib = f32(gnm.color.vibrancy(tc))
hipow = f32(gnm.color.highlight_power(tc))
lin = f32(gnm.color.gamma_threshold(tc))
lingam = f32(lin ** (gam-1.0) if lin > 0 else 0)
bkgd = vec.make_float3(
cp.color.background.r(t),
cp.color.background.g(t),
cp.color.background.b(t))
gnm.color.background.r(tc),
gnm.color.background.g(tc),
gnm.color.background.b(tc))
color_fun = self.mod.get_function("colorclip")
blocks = int(np.ceil(np.sqrt(nbins / 256)))
color_fun(dbuf, gam, vib, hipow, lin, lingam, bkgd, np.int32(nbins),
np.int32(0),
block=(256, 1, 1), grid=(blocks, blocks), stream=stream)
color_fun(dbuf, gam, vib, hipow, lin, lingam, bkgd, i32(nbins),
i32(blend), block=(256, 1, 1), grid=(blocks, blocks),
stream=stream)

View File

@ -226,8 +226,11 @@ class GenomePacker(HunkOCode):
_defs = Template(r"""
__global__
void interp_{{tname}}({{tname}}* out, float *times, float *knots,
float tstart, float tstep, mwc_st *rctxes, int maxid) {
void interp_{{tname}}(
{{tname}}* out, mwc_st *rctxes,
const float *times, const float *knots,
float tstart, float tstep, int maxid
) {
int id = gtid();
if (id >= maxid) return;
out = &out[id];

View File

@ -53,7 +53,7 @@ def precalc_chaos(pcp, std_xforms):
""").substitute(locals()))
def precalc_camera(info, pcam):
def precalc_camera(pcam):
pre_cam = pcam._precalc()
# Maxima code to check my logic:
@ -68,7 +68,7 @@ def precalc_camera(info, pcam):
float rot = {{pre_cam.rotation}} * M_PI / 180.0f;
float rotsin = sin(rot), rotcos = cos(rot);
float cenx = {{pre_cam.center.x}}, ceny = {{pre_cam.center.y}};
float scale = {{pre_cam.scale}} * {{info.width}};
float scale = {{pre_cam.scale}} * acc_size.width;
float ditherwidth = {{pre_cam.dither_width}} * 0.33f;
float u0 = mwc_next_01(rctx);
@ -81,12 +81,12 @@ def precalc_camera(info, pcam):
{{pre_cam._set('xx')}} = scale * rotcos;
{{pre_cam._set('xy')}} = scale * -rotsin;
{{pre_cam._set('xo')}} = scale * (rotsin * ceny - rotcos * cenx)
+ {{0.5 * (info.width + info.gutter + 1)}} + ditherx;
+ 0.5f * acc_size.awidth + ditherx;
{{pre_cam._set('yx')}} = scale * rotsin;
{{pre_cam._set('yy')}} = scale * rotcos;
{{pre_cam._set('yo')}} = scale * -(rotsin * cenx + rotcos * ceny)
+ {{0.5 * (info.height + info.gutter + 1)}} + dithery;
+ 0.5f * acc_size.aheight + dithery;
""").substitute(locals()))
@ -113,13 +113,12 @@ class IterCode(HunkOCode):
# The number of threads per block
NTHREADS = 256
def __init__(self, info):
self.info = info
def __init__(self, info, genome):
self.packer = interp.GenomePacker('iter_params')
self.pcp = self.packer.view('params', self.info.genome, 'cp')
self.pcp = self.packer.view('params', genome, 'cp')
iterbody = self._iterbody()
bodies = [self._xfbody(i,x) for i,x in sorted(info.genome.xforms.items())]
iterbody = self._iterbody(info, genome)
bodies = [self._xfbody(i,x) for i,x in sorted(genome.xforms.items())]
bodies.append(iterbody)
self.defs = '\n'.join(bodies)
@ -132,7 +131,9 @@ __device__ int rb_head, rb_tail, rb_size;
typedef struct {
uint32_t width;
uint32_t height;
uint32_t stride;
uint32_t awidth;
uint32_t aheight;
uint32_t astride;
} acc_size_t;
__constant__ acc_size_t acc_size;
@ -174,7 +175,7 @@ void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
g.update(locals())
return tmpl.substitute(g)
def _iterbody(self):
def _iterbody(self, info, genome):
tmpl = Template(r'''
__global__ void reset_rb(int size) {
@ -331,7 +332,7 @@ void iter(
float cx, cy, cc;
{{precalc_camera(info, pcp.camera)}}
{{precalc_camera(pcp.camera)}}
{{if 'final' in cp.xforms}}
{{apply_affine('fx', 'fy', 'cx', 'cy', pcp.camera)}}
@ -343,14 +344,14 @@ void iter(
uint32_t ix = trunca(cx), iy = trunca(cy);
if (ix >= acc_size.width || iy >= acc_size.height) {
if (ix >= acc_size.awidth || iy >= acc_size.aheight) {
{{if info.acc_mode == 'deferred'}}
*log = 0xffffffff;
{{endif}}
continue;
}
uint32_t i = iy * acc_size.stride + ix;
uint32_t i = iy * acc_size.astride + ix;
{{if info.acc_mode == 'atomic'}}
asm volatile ({{crep("""
{
@ -626,12 +627,11 @@ oflow_write_end:
{{endif}}
''', 'iter_kern')
return tmpl.substitute(
info = self.info,
cp = self.info.genome,
info = info,
cp = genome,
pcp = self.pcp,
NTHREADS = self.NTHREADS,
NWARPS = self.NTHREADS / 32,
std_xforms = [n for n in sorted(self.info.genome.xforms)
if n != 'final'],
std_xforms = [n for n in sorted(genome.xforms) if n != 'final'],
**globals())

View File

@ -79,115 +79,86 @@ class SplEval(object):
return self.knots[1][0]
return list(self.knots.T.flat)
@classmethod
def wrap(cls, obj):
"""
Given a dict 'obj' representing, for instance, a Genome object, walk
through the object recursively and in-place, turning any number or
list of numbers into an SplEval.
"""
for k, v in obj.items():
if (isinstance(v, (float, int)) or
(isinstance(v, list) and isinstance(v[1], (float, int)))):
obj[k] = cls(v)
elif isinstance(v, dict):
cls.wrap(v)
class RenderInfo(object):
"""
Determine features and constants required to render a particular set of
genomes. The values of this class are fixed before compilation begins.
"""
# Number of iterations to iterate without write after generating a new
# point. This number is currently fixed pretty deeply in the set of magic
# constants which govern buffer sizes; changing the value here won't
# actually change the code on the device to do something different.
fuse = 256
# Height of the texture pallete which gets uploaded to the GPU (assuming
# that palette-from-texture is enabled). For most genomes, this doesn't
# need to be very large at all. However, since only an easily-cached
# fraction of this will be accessed per SM, larger values shouldn't hurt
# performance too much. When using deferred accumulation, increasing this
# value increases the number of uniquely-dithered samples, which is nice.
# Power-of-two, please.
palette_height = 64
# Maximum width of DE and other spatial filters, and thus in turn the
# amount of padding applied. Note that, for now, this must not be changed!
# The filtering code makes deep assumptions about this value.
gutter = 15
# TODO: for now, we always throw away the alpha channel before writing.
# All code is in place to not do this, we just need to find a way to expose
# this preference via the API (or push alpha blending entirely on the client,
# which I'm not opposed to)
alpha_output_channel = False
# There are three settings for this somewhat ersatz paramater. 'global'
# uses unsynchronized global writes to accumulate sample points, 'atomic'
# uses atomic global writes, and 'deferred' stores color and position in a
# sample log, sorts the log by position, and uses shared memory to
# perform the accumulation. Deferred has the accuracy of 'atomic' and
# the speed of 'global' (it's actually faster!), but packs color and
# position into a single 32-bit int for now, which limits resolution to
# 1080p when xform opacity is respected, so the other two modes will hang
# around until that can be extended to be memory-limited again.
acc_mode = 'atomic'
# TODO: fix this
chaos_used = False
def __init__(self, db, **kwargs):
self.db = db
# Copy all args into this object's namespace
self.__dict__.update(kwargs)
self.acc_width = self.width + 2 * self.gutter
self.acc_height = self.height + 2 * self.gutter
self.acc_stride = 32 * int(np.ceil(self.acc_width / 32.))
self.density = self.quality
# Deref genome
self.genome = self.db.genomes[self.genome]
for k, v in self.db.palettes.items():
pal = np.fromstring(base64.b64decode(v), np.uint8)
pal = np.reshape(pal, (256, 3))
pal_a = np.ones((256, 4), np.float32)
pal_a[:,:3] = pal / 255.0
self.db.palettes[k] = pal_a
class Palette(object):
"""Wafer-thin wrapper around palettes. For the future!"""
def __init__(self, datastr, fmt='rgb8'):
if fmt != 'rgb8':
raise NotImplementedError
if len(datastr) != 768:
raise ValueError("Unsupported palette width")
self.width = 256
pal = np.reshape(np.fromstring(datastr, np.uint8), (256, 3))
self.data = np.ones((256, 4), np.float32)
self.data[:,:3] = pal / 255.0
class _AttrDict(dict):
def __getattr__(self, name):
return self[name]
def load_info(contents):
result = json.loads(contents, object_hook=_AttrDict)
SplEval.wrap(result.genomes)
@classmethod
def _wrap(cls, dct):
for k, v in dct.items():
if (isinstance(v, (float, int)) or
(isinstance(v, list) and isinstance(v[1], (float, int)))):
dct[k] = SplEval(v)
elif isinstance(v, dict):
dct[k] = cls._wrap(cls(v))
return dct
# A Job object will have more details or something
result = RenderInfo(result, **result.renders.values()[0])
return result
class Genome(_AttrDict):
# For now, we base the Genome class on an _AttrDict, letting its structure
# be defined implicitly by the way it is used in device code. More formal
# unpacking will happen soon.
def __init__(self, gnm, base_den):
super(Genome, self).__init__(gnm)
for k, v in self.items():
v = _AttrDict(v)
if k not in ('info', 'time'):
_AttrDict._wrap(v)
self[k] = v
# TODO: this is a hack, figure out how to solve it more elegantly
self.spp = SplEval(self.camera.density.knotlist)
self.spp.knots[1] *= base_den
# TODO: decide how to handle palettes. For now, it's the caller's
# responsibility to replace this list with actual palettes.
pal = self.color.palette
if isinstance(pal, basestring):
self.color.palette = [(0.0, pal), (1.0, pal)]
elif isinstance(pal, list):
self.color.palette = zip(pal[::2], pal[1::2])
class HacketyGenome(object):
# TODO: caller also needs to call set_timing()
self.adj_frame_width = None
self.canonical_right = (not self.get('link') or not self.link == 'self'
or not self.link.get('right'))
def set_timing(self, base_dur, fps, offset=0.0, err_spread=True):
"""
Holdover class to postpone a very deep refactoring as long as possible.
Converts property accesses into interpolations over predetermined times.
Set frame timing. Must be called at least once prior to rendering.
"""
def __init__(self, referent, times):
# Times can be singular
self.referent, self.times = referent, times
def __getattr__(self, name):
r = getattr(self.referent, str(name))
if isinstance(r, _AttrDict):
return HacketyGenome(r, self.times)
elif isinstance(r, SplEval):
return r(self.times)
return r
__getitem__ = __getattr__
# TODO: test!
dur = self.time.duration
if isinstance(dur, basestring):
clock = float(dur[:-1]) + offset
else:
clock = dur * base_dur + offset
if self.canonical_right:
nframes = int(np.floor(clock * fps))
else:
nframes = int(np.ceil(clock * fps))
err = (clock - nframes / fps) / clock
if __name__ == "__main__":
import sys
import pprint
pprint.pprint(read_genome(sys.stdin))
fw = self.time.frame_width
if not isinstance(fw, list):
fw = [0, fw, 1, fw]
fw = [float(f[:-1]) * fps if isinstance(f, basestring)
else float(f) / (clock * fps) for f in fw]
self.adj_frame_width = SplEval(fw)
times = np.linspace(offset, 1 - err, nframes + 1)
# Move each time to a center time, and discard the last value
times = times[:-1] + 0.5 * (times[1] - times[0])
if err_spread:
epts = np.linspace(-2*np.pi, 2*np.pi, nframes)
times = times + 0.5 * err * (np.tanh(epts) + 1)
return err, times

View File

@ -4,11 +4,11 @@ import re
import time as timemod
import tempfile
from collections import namedtuple
from itertools import cycle, repeat, chain, izip
from itertools import cycle, repeat, chain, izip, imap, ifilter
from ctypes import *
from cStringIO import StringIO
import numpy as np
from numpy import int32 as i32, uint64 as u64
from numpy import float32 as f32, int32 as i32, uint32 as u32, uint64 as u64
from scipy import ndimage
from fr0stlib import pyflam3
@ -24,41 +24,55 @@ from cuburn import affine
from cuburn.code import util, mwc, iter, filtering, sort
RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time')
Dimensions = namedtuple('Dimensions', 'w h aw ah astride')
def _sync_stream(dst, src):
dst.wait_for_event(cuda.Event(cuda.event_flags.DISABLE_TIMING).record(src))
def argset(obj, **kwargs):
for k, v in kwargs.items():
setattr(obj, k, v)
return obj
class Renderer(object):
"""
Control structure for rendering a series of frames.
Each animation will dynamically generate a kernel that includes only the
code necessary to render the genomes provided. The process of generating
and uploading the kernel takes a small but finite amount of time. In
general, the kernel generated for all genomes resulting from interpolating
between two control points will have identical performance, so it is
wasteful to create more than one animation for any interpolated sequence.
However, genome sequences interpolated from three or more control points
with different features enabled will have the code needed to render all
genomes enabled for every frame. Doing this can hurt performance.
In other words, it's best to use exactly one Animation for each
interpolated sequence between one or two genomes.
"""
# Number of iterations to iterate without write after generating a new
# point. This number is currently fixed pretty deeply in the set of magic
# constants which govern buffer sizes; changing the value here won't
# actually change the code on the device to do something different.
fuse = 256
# The palette texture/surface covers the color coordinate from [0,1] with
# (for now, a fixed 256) equidistant horizontal samples, and spans the
# temporal range of the frame linearly with this many rows. Increasing
# this value increases the number of uniquely-dithered samples when using
# pre-dithered surfaces.
palette_height = 64
# Maximum width of DE and other spatial filters, and thus in turn the
# amount of padding applied. Note that, for now, this must not be changed!
# The filtering code makes deep assumptions about this value.
gutter = 15
# Accumulation mode. Leave it at 'atomic' for now.
acc_mode = 'atomic'
# TODO
chaos_used = False
cmp_options = ('-use_fast_math', '-maxrregcount', '42')
keep = False
def __init__(self, info):
self.info = info
def __init__(self):
self._iter = self.src = self.cubin = self.mod = None
self.packed_genome = None
# Ensure class options don't get contaminated on an instance
self.cmp_options = list(self.cmp_options)
def compile(self, keep=None, cmp_options=None, jit_options=[]):
def compile(self, genome, keep=None, cmp_options=None):
"""
Compile a kernel capable of rendering every frame in this animation.
The resulting compiled kernel is stored in the ``cubin`` property;
@ -73,7 +87,7 @@ class Renderer(object):
keep = self.keep if keep is None else keep
cmp_options = self.cmp_options if cmp_options is None else cmp_options
self._iter = iter.IterCode(self.info)
self._iter = iter.IterCode(self, genome)
self._iter.packer.finalize()
self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer,
self._iter)
@ -82,41 +96,81 @@ class Renderer(object):
self.cubin = pycuda.compiler.compile(
self.src, keep=keep, options=cmp_options,
cache_dir=False if keep else None)
def load(self, genome, jit_options=[]):
if not self.cubin:
self.compile(genome)
self.mod = cuda.module_from_buffer(self.cubin, jit_options)
with open('/tmp/iter_kern.cubin', 'wb') as fp:
fp.write(self.cubin)
return self.src
def render(self, times):
def render(self, genome, times, width, height, blend=True):
"""
Render a flame for each genome in the iterable value 'genomes'.
Returns a RenderedImage object with the rendered buffer in the
requested format (3D RGBA ndarray only for now).
Render a frame for each timestamp in the iterable value ``times``. This
function returns a generator that will yield a RenderedImage object
containing a shared reference to the output buffer for each specified
frame.
This method produces a considerable amount of side effects, and should
not be used lightly. Things may go poorly for you if this method is not
allowed to run until completion (by exhausting all items in the
generator object).
The returned buffer is page-locked host memory. Between the time a
buffer is yielded and the time the next frame's results are requested,
the buffer will not be modified. Thereafter, however, it will be
overwritten by an asynchronous DMA operation coming from the CUDA
device. If you hang on to it for longer than one frame, copy it.
``times`` is a sequence of (idx, start, stop) times, where index is
the logical frame number (though it can be any value) and 'start' and
'stop' together define the time range to be rendered for each frame.
``genome`` is the genome to be rendered. Successive calls to the
`render()` method on one ``Renderer`` object must use genomes which
produce identical compiled code, and this will not be verified by the
renderer. In practice, this means you can alter genome parameter
values, but the full set of keys must remain identical between runs on
the same renderer.
``times`` is a list of (idx, cen_time) tuples, where ``idx`` is passed
unmodified in the RenderedImage return value and ``cen_time`` is the
central time of the current frame in spline-time units. (Any
clock-time or frame-time units in the genome should be preconverted.)
If ``blend`` is False, the output buffer will contain unclipped,
premultiplied RGBA data, without vibrancy, highlight power, or the
alpha elbow applied.
"""
if times == []:
r = self.render_gen(genome, width, height, blend=blend)
next(r)
return ifilter(None, imap(r.send, chain(times, [None])))
def render_gen(self, genome, width, height, blend=True):
"""
Render frames. This method is wrapped by the ``render()`` method; see
its docstring for warnings and details.
Instead of passing frame times as an iterable, they are passed
individually via the ``generator.send()`` method. There is an
internal pipeline latency of one frame, so the first call to the
``send()`` method will return None, the second call will return the
first frame's result, and so on. To retrieve the last frame in a
sequence, send ``None``.
Direct use of this method is useful for implementing render servers.
"""
last_idx = None
next_frame = yield
if next_frame is None:
return
if not self.mod:
self.load(genome)
filt = filtering.Filtering()
reset_rb_fun = self.mod.get_function("reset_rb")
packer_fun = self.mod.get_function("interp_iter_params")
iter_fun = self.mod.get_function("iter")
info = self.info
# The synchronization model is messy. See helpers/task_model.svg.
iter_stream = cuda.Stream()
filt_stream = cuda.Stream()
if info.acc_mode == 'deferred':
if self.acc_mode == 'deferred':
write_stream = cuda.Stream()
write_fun = self.mod.get_function("write_shmem")
else:
@ -128,19 +182,30 @@ class Renderer(object):
event_a = cuda.Event().record(filt_stream)
event_b = None
nbins = info.acc_height * info.acc_stride
awidth = width + 2 * self.gutter
aheight = height + 2 * self.gutter
astride = 32 * int(np.ceil(awidth / 32.))
dim = Dimensions(width, height, awidth, aheight, astride)
d_acc_size = self.mod.get_global('acc_size')[0]
cuda.memcpy_htod_async(d_acc_size, u32(list(dim)), write_stream)
nbins = awidth * aheight
# Extra padding in accum helps with write_shmem overruns
d_accum = cuda.mem_alloc(16 * nbins + (1<<16))
d_out = cuda.mem_alloc(16 * nbins)
if info.acc_mode == 'atomic':
if self.acc_mode == 'atomic':
d_atom = cuda.mem_alloc(8 * nbins)
flush_fun = self.mod.get_function("flush_atom")
acc_size = np.array([info.acc_width, info.acc_height, info.acc_stride])
d_acc_size = self.mod.get_global('acc_size')[0]
cuda.memcpy_htod_async(d_acc_size, np.uint32(acc_size), write_stream)
obuf_copy = argset(cuda.Memcpy2D(),
src_y=self.gutter, src_x_in_bytes=16*self.gutter,
src_pitch=16*astride, dst_pitch=16*width,
width_in_bytes=16*width, height=height)
obuf_copy.set_src_device(d_out)
h_out_a = cuda.pagelocked_empty((height, width, 4), f32)
h_out_b = cuda.pagelocked_empty((height, width, 4), f32)
if info.acc_mode == 'deferred':
if self.acc_mode == 'deferred':
# Having a fixed, power-of-two log size makes things much easier
log_size = 64 << 20
d_log = cuda.mem_alloc(log_size * 4)
@ -153,9 +218,8 @@ class Renderer(object):
# Calculate 'nslots', the number of simultaneous running threads that
# can be active on the GPU during iteration (and thus the number of
# slots for loading and storing RNG and point context that will be
# prepared on the device), 'rb_size' (the number of blocks in
# 'nslots'), and determine a number of temporal samples
# likely to load-balance effectively
# prepared on the device), and derive 'rb_size', the number of blocks in
# 'nslots'.
iter_threads_per_block = 256
dev_data = pycuda.tools.DeviceData()
occupancy = pycuda.tools.OccupancyRecord(
@ -169,14 +233,16 @@ class Renderer(object):
reset_rb_fun(np.int32(rb_size), block=(1,1,1))
d_points = cuda.mem_alloc(nslots * 16)
# We may add extra seeds to simplify palette dithering.
seeds = mwc.MWC.make_seeds(max(nslots, 256 * info.palette_height))
# This statement may add extra seeds to simplify palette dithering.
seeds = mwc.MWC.make_seeds(max(nslots, 256 * self.palette_height))
d_seeds = cuda.to_device(seeds)
# We used to auto-calculate this to a multiple of the number of SMs on
# the device, but since we now use shorter launches and, to a certain
# extent, allow simultaneous occupancy, that's not as important. The
# 1024 is a magic constant, though: FUSE
# 1024 is a magic constant to ensure reasonable and power-of-two log
# size for deferred: 256MB / (4B * FUSE * NTHREADS). Enhancements to
# the sort engine are needed to make this more flexible.
ntemporal_samples = 1024
genome_times, genome_knots = self._iter.packer.pack()
d_genome_times = cuda.to_device(genome_times)
@ -184,37 +250,31 @@ class Renderer(object):
info_size = 4 * len(self._iter.packer) * ntemporal_samples
d_infos = cuda.mem_alloc(info_size)
pals = info.genome.color.palette
pals = genome.color.palette
if isinstance(pals, basestring):
pals = [0.0, pals, 1.0, pals]
palint_times = np.empty(len(genome_times[0]), np.float32)
palint_times = np.empty(len(genome_times[0]), f32)
palint_times.fill(100.0)
palint_times[:len(pals)/2] = pals[::2]
palint_times[:len(pals)] = [p[0] for p in pals]
d_palint_times = cuda.to_device(palint_times)
d_palint_vals = cuda.to_device(
np.concatenate(map(info.db.palettes.get, pals[1::2])))
np.concatenate([p[1].data for p in pals]))
if info.acc_mode in ('deferred', 'atomic'):
if self.acc_mode in ('deferred', 'atomic'):
palette_fun = self.mod.get_function("interp_palette_hsv_flat")
dsc = cuda.ArrayDescriptor3D()
dsc.height = info.palette_height
dsc.width = 256
dsc.depth = 0
dsc.format = cuda.array_format.SIGNED_INT32
dsc.num_channels = 2
dsc.flags = cuda.array3d_flags.SURFACE_LDST
dsc = argset(cuda.ArrayDescriptor3D(), height=self.palette_height,
width=256, depth=0, format=cuda.array_format.SIGNED_INT32,
num_channels=2, flags=cuda.array3d_flags.SURFACE_LDST)
palarray = cuda.Array(dsc)
tref = self.mod.get_surfref('flatpal')
tref.set_array(palarray, 0)
else:
palette_fun = self.mod.get_function("interp_palette_hsv")
dsc = cuda.ArrayDescriptor()
dsc.height = info.palette_height
dsc.width = 256
dsc.format = cuda.array_format.UNSIGNED_INT8
dsc.num_channels = 4
d_palmem = cuda.mem_alloc(256 * info.palette_height * 4)
dsc = argset(cuda.ArrayDescriptor(), height=self.palette_height,
width=256, format=cuda.array_format.UNSIGNED_INT8,
num_channels=4)
d_palmem = cuda.mem_alloc(256 * self.palette_height * 4)
tref = self.mod.get_texref('palTex')
tref.set_address_2d(d_palmem, dsc, 1024)
@ -222,49 +282,46 @@ class Renderer(object):
tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
tref.set_filter_mode(cuda.filter_mode.LINEAR)
h_out_a = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4),
np.float32)
h_out_b = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4),
np.float32)
last_idx = None
while next_frame is not None:
# tc, td, ts, te: central, delta, start, end times
idx, tc = next_frame
td = genome.adj_frame_width(tc)
ts, te = tc - 0.5 * td, tc + 0.5 * td
for idx, start, stop in times:
twidth = np.float32((stop-start) / info.palette_height)
if info.acc_mode in ('deferred', 'atomic'):
palette_fun(d_seeds, d_palint_times, d_palint_vals,
np.float32(start), twidth,
block=(256,1,1), grid=(info.palette_height,1),
stream=write_stream)
if self.acc_mode in ('deferred', 'atomic'):
# In this mode, the palette writes to a surface reference, but
# requires dithering, so we pass it the seeds instead
arg0 = d_seeds
else:
palette_fun(d_palmem, d_palint_times, d_palint_vals,
np.float32(start), twidth,
block=(256,1,1), grid=(info.palette_height,1),
arg0 = d_palmem
palette_fun(arg0, d_palint_times, d_palint_vals,
f32(ts), f32(td / self.palette_height),
block=(256,1,1), grid=(self.palette_height,1),
stream=write_stream)
width = np.float32((stop-start) / ntemporal_samples)
packer_fun(d_infos, d_genome_times, d_genome_knots,
np.float32(start), width, d_seeds,
np.int32(ntemporal_samples), block=(256,1,1),
packer_fun(d_infos, d_seeds, d_genome_times, d_genome_knots,
f32(ts), f32(td / ntemporal_samples),
i32(ntemporal_samples), block=(256,1,1),
grid=(int(np.ceil(ntemporal_samples/256.)),1),
stream=iter_stream)
# Reset points so that they will be FUSEd
util.BaseCode.fill_dptr(self.mod, d_points, 4 * nslots,
iter_stream, np.float32(np.nan))
iter_stream, f32(np.nan))
# Get interpolated control points for debugging
#iter_stream.synchronize()
#d_temp = cuda.from_device(d_infos,
#(ntemporal_samples, len(self._iter.packer)), np.float32)
#(ntemporal_samples, len(self._iter.packer)), f32)
#for i, n in zip(d_temp[5], self._iter.packer.packed):
#print '%60s %g' % ('_'.join(n), i)
util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, write_stream)
if info.acc_mode == 'atomic':
if self.acc_mode == 'atomic':
util.BaseCode.fill_dptr(self.mod, d_atom, 2 * nbins, write_stream)
nrounds = ( (info.density * info.width * info.height)
nrounds = int( (genome.spp(tc) * width * height)
/ (ntemporal_samples * 256 * 256) ) + 1
if info.acc_mode == 'deferred':
if self.acc_mode == 'deferred':
for i in range(nrounds):
iter_fun(np.uint64(d_log), d_seeds, d_points, d_infos,
block=(32, self._iter.NTHREADS/32, 1),
@ -272,18 +329,17 @@ class Renderer(object):
_sync_stream(write_stream, iter_stream)
sorter.sort(d_log_sorted, d_log, log_size, 3, True,
stream=write_stream)
#print cuda.from_device(sorter.dglobal, (256,), np.uint32)
_sync_stream(iter_stream, write_stream)
write_fun(d_accum, d_log_sorted, sorter.dglobal, i32(nbins),
block=(1024, 1, 1), grid=(nwriteblocks, 1),
stream=write_stream)
else:
args = [u64(d_accum), d_seeds, d_points, d_infos]
if info.acc_mode == 'atomic':
if self.acc_mode == 'atomic':
args.append(u64(d_atom))
iter_fun(*args, block=(32, self._iter.NTHREADS/32, 1),
grid=(ntemporal_samples, nrounds), stream=iter_stream)
if info.acc_mode == 'atomic':
if self.acc_mode == 'atomic':
nblocks = int(np.ceil(np.sqrt(nbins/float(512))))
flush_fun(u64(d_accum), u64(d_atom), i32(nbins),
block=(512, 1, 1), grid=(nblocks, nblocks),
@ -291,27 +347,29 @@ class Renderer(object):
util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
_sync_stream(filt_stream, write_stream)
filt.de(d_out, d_accum, info, start, stop, filt_stream)
filt.de(d_out, d_accum, genome, dim, tc, stream=filt_stream)
_sync_stream(write_stream, filt_stream)
filt.colorclip(d_out, info, start, stop, filt_stream)
cuda.memcpy_dtoh_async(h_out_a, d_out, filt_stream)
filt.colorclip(d_out, genome, dim, tc, blend, stream=filt_stream)
obuf_copy.set_dst_host(h_out_a)
obuf_copy(filt_stream)
if event_b:
while not event_a.query():
timemod.sleep(0.01)
gpu_time = event_a.time_since(event_b)
yield RenderedImage(self._trim(h_out_b), last_idx, gpu_time)
result = RenderedImage(h_out_b, last_idx, gpu_time)
else:
result = None
last_idx = idx
event_a, event_b = cuda.Event().record(filt_stream), event_a
h_out_a, h_out_b = h_out_b, h_out_a
last_idx = idx
# TODO: add ability to flush a frame without breaking the pipe
next_frame = yield result
while not event_a.query():
timemod.sleep(0.001)
gpu_time = event_a.time_since(event_b)
yield RenderedImage(self._trim(h_out_b), last_idx, gpu_time)
def _trim(self, result):
g = self.info.gutter
return result[g:-g,g:g+self.info.width].copy()
yield RenderedImage(h_out_b, last_idx, gpu_time)