New genome format to support flockutil

This commit is contained in:
Steven Robertson
2011-12-15 11:11:05 -05:00
parent 12655b8611
commit b43481e374
5 changed files with 303 additions and 275 deletions

View File

@ -4,11 +4,11 @@ import re
import time as timemod
import tempfile
from collections import namedtuple
from itertools import cycle, repeat, chain, izip
from itertools import cycle, repeat, chain, izip, imap, ifilter
from ctypes import *
from cStringIO import StringIO
import numpy as np
from numpy import int32 as i32, uint64 as u64
from numpy import float32 as f32, int32 as i32, uint32 as u32, uint64 as u64
from scipy import ndimage
from fr0stlib import pyflam3
@ -24,41 +24,55 @@ from cuburn import affine
from cuburn.code import util, mwc, iter, filtering, sort
RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time')
Dimensions = namedtuple('Dimensions', 'w h aw ah astride')
def _sync_stream(dst, src):
    """
    Make the ``dst`` stream wait on an event recorded in the ``src`` stream.

    A new event is created with the ``DISABLE_TIMING`` flag, recorded on
    ``src``, and then handed to ``dst.wait_for_event()``.
    """
    marker = cuda.Event(cuda.event_flags.DISABLE_TIMING).record(src)
    dst.wait_for_event(marker)
def argset(obj, **kwargs):
    """
    Set every keyword argument as an attribute on ``obj`` and return ``obj``.

    This allows an object to be constructed and configured in a single
    expression, e.g. ``argset(Thing(), width=256, depth=0)``.
    """
    for name in kwargs:
        setattr(obj, name, kwargs[name])
    return obj
class Renderer(object):
"""
Control structure for rendering a series of frames.
Each animation will dynamically generate a kernel that includes only the
code necessary to render the genomes provided. The process of generating
and uploading the kernel takes a small but finite amount of time. In
general, the kernel generated for all genomes resulting from interpolating
between two control points will have identical performance, so it is
wasteful to create more than one animation for any interpolated sequence.
However, genome sequences interpolated from three or more control points
with different features enabled will have the code needed to render all
genomes enabled for every frame. Doing this can hurt performance.
In other words, it's best to use exactly one Animation for each
interpolated sequence between one or two genomes.
"""
# Number of iterations to run without writing after generating a new
# point. This number is currently fixed pretty deeply in the set of magic
# constants which govern buffer sizes; changing the value here won't
# actually change the code on the device to do something different.
fuse = 256
# The palette texture/surface covers the color coordinate from [0,1] with
# (for now, a fixed 256) equidistant horizontal samples, and spans the
# temporal range of the frame linearly with this many rows. Increasing
# this value increases the number of uniquely-dithered samples when using
# pre-dithered surfaces.
palette_height = 64
# Maximum width of DE and other spatial filters, and thus in turn the
# amount of padding applied. Note that, for now, this must not be changed!
# The filtering code makes deep assumptions about this value.
gutter = 15
# Accumulation mode. Leave it at 'atomic' for now.
acc_mode = 'atomic'
# TODO
chaos_used = False
cmp_options = ('-use_fast_math', '-maxrregcount', '42')
keep = False
def __init__(self, info):
self.info = info
def __init__(self):
self._iter = self.src = self.cubin = self.mod = None
self.packed_genome = None
# Ensure class options don't get contaminated on an instance
self.cmp_options = list(self.cmp_options)
def compile(self, keep=None, cmp_options=None, jit_options=[]):
def compile(self, genome, keep=None, cmp_options=None):
"""
Compile a kernel capable of rendering every frame in this animation.
The resulting compiled kernel is stored in the ``cubin`` property;
@ -73,7 +87,7 @@ class Renderer(object):
keep = self.keep if keep is None else keep
cmp_options = self.cmp_options if cmp_options is None else cmp_options
self._iter = iter.IterCode(self.info)
self._iter = iter.IterCode(self, genome)
self._iter.packer.finalize()
self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer,
self._iter)
@ -82,41 +96,81 @@ class Renderer(object):
self.cubin = pycuda.compiler.compile(
self.src, keep=keep, options=cmp_options,
cache_dir=False if keep else None)
def load(self, genome, jit_options=None):
    """
    Load the compiled kernel as a CUDA module, compiling it first if needed.

    ``genome`` is only consulted when no cubin is present yet, in which case
    it is passed to ``compile()``.

    ``jit_options`` is forwarded to ``cuda.module_from_buffer()``. When it is
    omitted (or None), a new empty list is used for each call, so no list
    object is shared between calls.

    As a side effect, the cubin is written to ``/tmp/iter_kern.cubin``.

    Returns the value of ``self.src``.
    """
    if jit_options is None:
        jit_options = []
    if not self.cubin:
        self.compile(genome)
    self.mod = cuda.module_from_buffer(self.cubin, jit_options)
    # Keep a copy of the binary on disk (presumably for offline inspection;
    # the path is fixed, so concurrent renderers will overwrite each other).
    with open('/tmp/iter_kern.cubin', 'wb') as fp:
        fp.write(self.cubin)
    return self.src
def render(self, times):
def render(self, genome, times, width, height, blend=True):
"""
Render a flame for each genome in the iterable value 'genomes'.
Returns a RenderedImage object with the rendered buffer in the
requested format (3D RGBA ndarray only for now).
Render a frame for each timestamp in the iterable value ``times``. This
function returns a generator that will yield a RenderedImage object
containing a shared reference to the output buffer for each specified
frame.
This method produces a considerable amount of side effects, and should
not be used lightly. Things may go poorly for you if this method is not
allowed to run until completion (by exhausting all items in the
generator object).
The returned buffer is page-locked host memory. Between the time a
buffer is yielded and the time the next frame's results are requested,
the buffer will not be modified. Thereafter, however, it will be
overwritten by an asynchronous DMA operation coming from the CUDA
device. If you hang on to it for longer than one frame, copy it.
``times`` is a sequence of (idx, start, stop) times, where index is
the logical frame number (though it can be any value) and 'start' and
'stop' together define the time range to be rendered for each frame.
``genome`` is the genome to be rendered. Successive calls to the
`render()` method on one ``Renderer`` object must use genomes which
produce identical compiled code, and this will not be verified by the
renderer. In practice, this means you can alter genome parameter
values, but the full set of keys must remain identical between runs on
the same renderer.
``times`` is a list of (idx, cen_time) tuples, where ``idx`` is passed
unmodified in the RenderedImage return value and ``cen_time`` is the
central time of the current frame in spline-time units. (Any
clock-time or frame-time units in the genome should be preconverted.)
If ``blend`` is False, the output buffer will contain unclipped,
premultiplied RGBA data, without vibrancy, highlight power, or the
alpha elbow applied.
"""
if times == []:
r = self.render_gen(genome, width, height, blend=blend)
next(r)
return ifilter(None, imap(r.send, chain(times, [None])))
def render_gen(self, genome, width, height, blend=True):
"""
Render frames. This method is wrapped by the ``render()`` method; see
its docstring for warnings and details.
Instead of passing frame times as an iterable, they are passed
individually via the ``generator.send()`` method. There is an
internal pipeline latency of one frame, so the first call to the
``send()`` method will return None, the second call will return the
first frame's result, and so on. To retrieve the last frame in a
sequence, send ``None``.
Direct use of this method is useful for implementing render servers.
"""
last_idx = None
next_frame = yield
if next_frame is None:
return
if not self.mod:
self.load(genome)
filt = filtering.Filtering()
reset_rb_fun = self.mod.get_function("reset_rb")
packer_fun = self.mod.get_function("interp_iter_params")
iter_fun = self.mod.get_function("iter")
info = self.info
# The synchronization model is messy. See helpers/task_model.svg.
iter_stream = cuda.Stream()
filt_stream = cuda.Stream()
if info.acc_mode == 'deferred':
if self.acc_mode == 'deferred':
write_stream = cuda.Stream()
write_fun = self.mod.get_function("write_shmem")
else:
@ -128,19 +182,30 @@ class Renderer(object):
event_a = cuda.Event().record(filt_stream)
event_b = None
nbins = info.acc_height * info.acc_stride
awidth = width + 2 * self.gutter
aheight = height + 2 * self.gutter
astride = 32 * int(np.ceil(awidth / 32.))
dim = Dimensions(width, height, awidth, aheight, astride)
d_acc_size = self.mod.get_global('acc_size')[0]
cuda.memcpy_htod_async(d_acc_size, u32(list(dim)), write_stream)
nbins = awidth * aheight
# Extra padding in accum helps with write_shmem overruns
d_accum = cuda.mem_alloc(16 * nbins + (1<<16))
d_out = cuda.mem_alloc(16 * nbins)
if info.acc_mode == 'atomic':
if self.acc_mode == 'atomic':
d_atom = cuda.mem_alloc(8 * nbins)
flush_fun = self.mod.get_function("flush_atom")
acc_size = np.array([info.acc_width, info.acc_height, info.acc_stride])
d_acc_size = self.mod.get_global('acc_size')[0]
cuda.memcpy_htod_async(d_acc_size, np.uint32(acc_size), write_stream)
obuf_copy = argset(cuda.Memcpy2D(),
src_y=self.gutter, src_x_in_bytes=16*self.gutter,
src_pitch=16*astride, dst_pitch=16*width,
width_in_bytes=16*width, height=height)
obuf_copy.set_src_device(d_out)
h_out_a = cuda.pagelocked_empty((height, width, 4), f32)
h_out_b = cuda.pagelocked_empty((height, width, 4), f32)
if info.acc_mode == 'deferred':
if self.acc_mode == 'deferred':
# Having a fixed, power-of-two log size makes things much easier
log_size = 64 << 20
d_log = cuda.mem_alloc(log_size * 4)
@ -153,9 +218,8 @@ class Renderer(object):
# Calculate 'nslots', the number of simultaneous running threads that
# can be active on the GPU during iteration (and thus the number of
# slots for loading and storing RNG and point context that will be
# prepared on the device), 'rb_size' (the number of blocks in
# 'nslots'), and determine a number of temporal samples
# likely to load-balance effectively
# prepared on the device), and derive 'rb_size', the number of blocks in
# 'nslots'.
iter_threads_per_block = 256
dev_data = pycuda.tools.DeviceData()
occupancy = pycuda.tools.OccupancyRecord(
@ -169,14 +233,16 @@ class Renderer(object):
reset_rb_fun(np.int32(rb_size), block=(1,1,1))
d_points = cuda.mem_alloc(nslots * 16)
# We may add extra seeds to simplify palette dithering.
seeds = mwc.MWC.make_seeds(max(nslots, 256 * info.palette_height))
# This statement may add extra seeds to simplify palette dithering.
seeds = mwc.MWC.make_seeds(max(nslots, 256 * self.palette_height))
d_seeds = cuda.to_device(seeds)
# We used to auto-calculate this to a multiple of the number of SMs on
# the device, but since we now use shorter launches and, to a certain
# extent, allow simultaneous occupancy, that's not as important. The
# 1024 is a magic constant, though: FUSE
# 1024 is a magic constant to ensure reasonable and power-of-two log
# size for deferred: 256MB / (4B * FUSE * NTHREADS). Enhancements to
# the sort engine are needed to make this more flexible.
ntemporal_samples = 1024
genome_times, genome_knots = self._iter.packer.pack()
d_genome_times = cuda.to_device(genome_times)
@ -184,37 +250,31 @@ class Renderer(object):
info_size = 4 * len(self._iter.packer) * ntemporal_samples
d_infos = cuda.mem_alloc(info_size)
pals = info.genome.color.palette
pals = genome.color.palette
if isinstance(pals, basestring):
pals = [0.0, pals, 1.0, pals]
palint_times = np.empty(len(genome_times[0]), np.float32)
palint_times = np.empty(len(genome_times[0]), f32)
palint_times.fill(100.0)
palint_times[:len(pals)/2] = pals[::2]
palint_times[:len(pals)] = [p[0] for p in pals]
d_palint_times = cuda.to_device(palint_times)
d_palint_vals = cuda.to_device(
np.concatenate(map(info.db.palettes.get, pals[1::2])))
np.concatenate([p[1].data for p in pals]))
if info.acc_mode in ('deferred', 'atomic'):
if self.acc_mode in ('deferred', 'atomic'):
palette_fun = self.mod.get_function("interp_palette_hsv_flat")
dsc = cuda.ArrayDescriptor3D()
dsc.height = info.palette_height
dsc.width = 256
dsc.depth = 0
dsc.format = cuda.array_format.SIGNED_INT32
dsc.num_channels = 2
dsc.flags = cuda.array3d_flags.SURFACE_LDST
dsc = argset(cuda.ArrayDescriptor3D(), height=self.palette_height,
width=256, depth=0, format=cuda.array_format.SIGNED_INT32,
num_channels=2, flags=cuda.array3d_flags.SURFACE_LDST)
palarray = cuda.Array(dsc)
tref = self.mod.get_surfref('flatpal')
tref.set_array(palarray, 0)
else:
palette_fun = self.mod.get_function("interp_palette_hsv")
dsc = cuda.ArrayDescriptor()
dsc.height = info.palette_height
dsc.width = 256
dsc.format = cuda.array_format.UNSIGNED_INT8
dsc.num_channels = 4
d_palmem = cuda.mem_alloc(256 * info.palette_height * 4)
dsc = argset(cuda.ArrayDescriptor(), height=self.palette_height,
width=256, format=cuda.array_format.UNSIGNED_INT8,
num_channels=4)
d_palmem = cuda.mem_alloc(256 * self.palette_height * 4)
tref = self.mod.get_texref('palTex')
tref.set_address_2d(d_palmem, dsc, 1024)
@ -222,49 +282,46 @@ class Renderer(object):
tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
tref.set_filter_mode(cuda.filter_mode.LINEAR)
h_out_a = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4),
np.float32)
h_out_b = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4),
np.float32)
last_idx = None
while next_frame is not None:
# tc, td, ts, te: central, delta, start, end times
idx, tc = next_frame
td = genome.adj_frame_width(tc)
ts, te = tc - 0.5 * td, tc + 0.5 * td
for idx, start, stop in times:
twidth = np.float32((stop-start) / info.palette_height)
if info.acc_mode in ('deferred', 'atomic'):
palette_fun(d_seeds, d_palint_times, d_palint_vals,
np.float32(start), twidth,
block=(256,1,1), grid=(info.palette_height,1),
stream=write_stream)
if self.acc_mode in ('deferred', 'atomic'):
# In this mode, the palette writes to a surface reference, but
# requires dithering, so we pass it the seeds instead
arg0 = d_seeds
else:
palette_fun(d_palmem, d_palint_times, d_palint_vals,
np.float32(start), twidth,
block=(256,1,1), grid=(info.palette_height,1),
stream=write_stream)
arg0 = d_palmem
palette_fun(arg0, d_palint_times, d_palint_vals,
f32(ts), f32(td / self.palette_height),
block=(256,1,1), grid=(self.palette_height,1),
stream=write_stream)
width = np.float32((stop-start) / ntemporal_samples)
packer_fun(d_infos, d_genome_times, d_genome_knots,
np.float32(start), width, d_seeds,
np.int32(ntemporal_samples), block=(256,1,1),
packer_fun(d_infos, d_seeds, d_genome_times, d_genome_knots,
f32(ts), f32(td / ntemporal_samples),
i32(ntemporal_samples), block=(256,1,1),
grid=(int(np.ceil(ntemporal_samples/256.)),1),
stream=iter_stream)
# Reset points so that they will be FUSEd
util.BaseCode.fill_dptr(self.mod, d_points, 4 * nslots,
iter_stream, np.float32(np.nan))
iter_stream, f32(np.nan))
# Get interpolated control points for debugging
#iter_stream.synchronize()
#d_temp = cuda.from_device(d_infos,
#(ntemporal_samples, len(self._iter.packer)), np.float32)
#(ntemporal_samples, len(self._iter.packer)), f32)
#for i, n in zip(d_temp[5], self._iter.packer.packed):
#print '%60s %g' % ('_'.join(n), i)
util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, write_stream)
if info.acc_mode == 'atomic':
if self.acc_mode == 'atomic':
util.BaseCode.fill_dptr(self.mod, d_atom, 2 * nbins, write_stream)
nrounds = ( (info.density * info.width * info.height)
/ (ntemporal_samples * 256 * 256) ) + 1
if info.acc_mode == 'deferred':
nrounds = int( (genome.spp(tc) * width * height)
/ (ntemporal_samples * 256 * 256) ) + 1
if self.acc_mode == 'deferred':
for i in range(nrounds):
iter_fun(np.uint64(d_log), d_seeds, d_points, d_infos,
block=(32, self._iter.NTHREADS/32, 1),
@ -272,18 +329,17 @@ class Renderer(object):
_sync_stream(write_stream, iter_stream)
sorter.sort(d_log_sorted, d_log, log_size, 3, True,
stream=write_stream)
#print cuda.from_device(sorter.dglobal, (256,), np.uint32)
_sync_stream(iter_stream, write_stream)
write_fun(d_accum, d_log_sorted, sorter.dglobal, i32(nbins),
block=(1024, 1, 1), grid=(nwriteblocks, 1),
stream=write_stream)
else:
args = [u64(d_accum), d_seeds, d_points, d_infos]
if info.acc_mode == 'atomic':
if self.acc_mode == 'atomic':
args.append(u64(d_atom))
iter_fun(*args, block=(32, self._iter.NTHREADS/32, 1),
grid=(ntemporal_samples, nrounds), stream=iter_stream)
if info.acc_mode == 'atomic':
if self.acc_mode == 'atomic':
nblocks = int(np.ceil(np.sqrt(nbins/float(512))))
flush_fun(u64(d_accum), u64(d_atom), i32(nbins),
block=(512, 1, 1), grid=(nblocks, nblocks),
@ -291,27 +347,29 @@ class Renderer(object):
util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
_sync_stream(filt_stream, write_stream)
filt.de(d_out, d_accum, info, start, stop, filt_stream)
filt.de(d_out, d_accum, genome, dim, tc, stream=filt_stream)
_sync_stream(write_stream, filt_stream)
filt.colorclip(d_out, info, start, stop, filt_stream)
cuda.memcpy_dtoh_async(h_out_a, d_out, filt_stream)
filt.colorclip(d_out, genome, dim, tc, blend, stream=filt_stream)
obuf_copy.set_dst_host(h_out_a)
obuf_copy(filt_stream)
if event_b:
while not event_a.query():
timemod.sleep(0.01)
gpu_time = event_a.time_since(event_b)
yield RenderedImage(self._trim(h_out_b), last_idx, gpu_time)
result = RenderedImage(h_out_b, last_idx, gpu_time)
else:
result = None
last_idx = idx
event_a, event_b = cuda.Event().record(filt_stream), event_a
h_out_a, h_out_b = h_out_b, h_out_a
last_idx = idx
# TODO: add ability to flush a frame without breaking the pipe
next_frame = yield result
while not event_a.query():
timemod.sleep(0.001)
gpu_time = event_a.time_since(event_b)
yield RenderedImage(self._trim(h_out_b), last_idx, gpu_time)
def _trim(self, result):
g = self.info.gutter
return result[g:-g,g:g+self.info.width].copy()
yield RenderedImage(h_out_b, last_idx, gpu_time)