# cuburn/cuburn/render.py
import os
import sys
import re
import time as timemod
import tempfile
from collections import namedtuple
from itertools import cycle, repeat, chain, izip, imap, ifilter
from ctypes import *
from cStringIO import StringIO
import numpy as np
from numpy import float32 as f32, int32 as i32, uint32 as u32, uint64 as u64
from scipy import ndimage
from fr0stlib import pyflam3
from fr0stlib.pyflam3._flam3 import *
from fr0stlib.pyflam3.constants import *
import pycuda.compiler
import pycuda.driver as cuda
import pycuda.tools
import cuburn.genome
from cuburn import affine
from cuburn.code import util, mwc, iter, filtering, sort
RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time')
Dimensions = namedtuple('Dimensions', 'w h aw ah astride')
def _sync_stream(dst, src):
dst.wait_for_event(cuda.Event(cuda.event_flags.DISABLE_TIMING).record(src))
def argset(obj, **kwargs):
for k, v in kwargs.items():
setattr(obj, k, v)
return obj
class Renderer(object):
"""
Control structure for rendering a series of frames.
"""
    # Number of iterations to run without writing after generating a new
    # point. This value is currently baked into the set of magic constants
    # which govern buffer sizes; changing it here won't actually change what
    # the device code does.
    fuse = 256
    # The palette texture/surface covers the color coordinate range [0,1]
    # with (for now) a fixed 256 equidistant horizontal samples, and spans the
    # temporal range of the frame linearly with this many rows. Increasing
    # this value increases the number of uniquely-dithered samples when using
    # pre-dithered surfaces.
    palette_height = 64
# Maximum width of DE and other spatial filters, and thus in turn the
# amount of padding applied. Note that, for now, this must not be changed!
# The filtering code makes deep assumptions about this value.
gutter = 15
# Accumulation mode. Leave it at 'atomic' for now.
acc_mode = 'atomic'
# TODO
chaos_used = False
cmp_options = ('-use_fast_math', '-maxrregcount', '42')
keep = False
def __init__(self):
self._iter = self.src = self.cubin = self.mod = None
# Ensure class options don't get contaminated on an instance
self.cmp_options = list(self.cmp_options)
def compile(self, genome, keep=None, cmp_options=None):
"""
Compile a kernel capable of rendering every frame in this animation.
The resulting compiled kernel is stored in the ``cubin`` property;
the source is available as ``src``, and is also returned for
inspection and display.
This operation is idempotent, and has no side effects outside of
setting properties on this instance (unless there's a compiler error,
which is a bug); it should therefore be threadsafe as well.
It is, however, rather slow.
"""
keep = self.keep if keep is None else keep
cmp_options = self.cmp_options if cmp_options is None else cmp_options
self._iter = iter.IterCode(self, genome)
self._iter.packer.finalize()
self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer,
self._iter)
with open(os.path.join(tempfile.gettempdir(), 'kernel.cu'), 'w') as fp:
fp.write(self.src)
        self.cubin = pycuda.compiler.compile(
                self.src, keep=keep, options=cmp_options,
                cache_dir=False if keep else None)
        # The docstring promises the source is returned for inspection
        return self.src
def load(self, genome, jit_options=[]):
if not self.cubin:
self.compile(genome)
self.mod = cuda.module_from_buffer(self.cubin, jit_options)
with open('/tmp/iter_kern.cubin', 'wb') as fp:
fp.write(self.cubin)
return self.src
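    # A minimal usage sketch of compile() and load(), not part of the module;
    # the genome object `gnm` is assumed to have been built elsewhere via
    # cuburn.genome:
    #
    #   renderer = Renderer()
    #   renderer.compile(gnm)   # slow; also dumps kernel.cu to the temp dir
    #   renderer.load(gnm)      # JIT-links the cubin into a CUDA module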
def render(self, genome, times, width, height, blend=True):
"""
Render a frame for each timestamp in the iterable value ``times``. This
function returns a generator that will yield a RenderedImage object
containing a shared reference to the output buffer for each specified
frame.
The returned buffer is page-locked host memory. Between the time a
buffer is yielded and the time the next frame's results are requested,
the buffer will not be modified. Thereafter, however, it will be
overwritten by an asynchronous DMA operation coming from the CUDA
device. If you hang on to it for longer than one frame, copy it.
``genome`` is the genome to be rendered. Successive calls to the
`render()` method on one ``Renderer`` object must use genomes which
produce identical compiled code, and this will not be verified by the
renderer. In practice, this means you can alter genome parameter
values, but the full set of keys must remain identical between runs on
the same renderer.
``times`` is a list of (idx, cen_time) tuples, where ``idx`` is passed
unmodified in the RenderedImage return value and ``cen_time`` is the
central time of the current frame in spline-time units. (Any
clock-time or frame-time units in the genome should be preconverted.)
If ``blend`` is False, the output buffer will contain unclipped,
premultiplied RGBA data, without vibrancy, highlight power, or the
alpha elbow applied.
"""
r = self.render_gen(genome, width, height, blend=blend)
next(r)
return ifilter(None, imap(r.send, chain(times, [None])))
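    # A hedged usage sketch of render(); `gnm` and the `save_frame` helper are
    # hypothetical. Buffers are copied because the yielded page-locked memory
    # is recycled on the next iteration:
    #
    #   times = [(i, i / 24.0) for i in range(96)]
    #   for img in renderer.render(gnm, times, 1280, 720):
    #       save_frame(img.idx, np.array(img.buf, copy=True))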
def render_gen(self, genome, width, height, blend=True):
"""
Render frames. This method is wrapped by the ``render()`` method; see
its docstring for warnings and details.
Instead of passing frame times as an iterable, they are passed
individually via the ``generator.send()`` method. There is an
internal pipeline latency of one frame, so the first call to the
``send()`` method will return None, the second call will return the
first frame's result, and so on. To retrieve the last frame in a
sequence, send ``None``.
Direct use of this method is useful for implementing render servers.
"""
last_idx = None
next_frame = yield
if next_frame is None:
return
if not self.mod:
self.load(genome)
filt = filtering.Filtering()
reset_rb_fun = self.mod.get_function("reset_rb")
packer_fun = self.mod.get_function("interp_iter_params")
iter_fun = self.mod.get_function("iter")
# The synchronization model is messy. See helpers/task_model.svg.
iter_stream = cuda.Stream()
filt_stream = cuda.Stream()
if self.acc_mode == 'deferred':
write_stream = cuda.Stream()
write_fun = self.mod.get_function("write_shmem")
else:
write_stream = iter_stream
# These events fire when the corresponding buffer is available for
# reading on the host (i.e. the copy is done). On the first pass, 'a'
# will be ignored, and subsequently moved to 'b'.
event_a = cuda.Event().record(filt_stream)
event_b = None
awidth = width + 2 * self.gutter
aheight = height + 2 * self.gutter
astride = 32 * int(np.ceil(awidth / 32.))
dim = Dimensions(width, height, awidth, aheight, astride)
d_acc_size = self.mod.get_global('acc_size')[0]
cuda.memcpy_htod_async(d_acc_size, u32(list(dim)), write_stream)
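        # Worked example of the padding (illustrative numbers only): for
        # width=1280, height=720 and gutter=15, awidth=1310 and aheight=750;
        # astride rounds awidth up to the next multiple of 32, i.e. 1312.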
nbins = awidth * aheight
# Extra padding in accum helps with write_shmem overruns
d_accum = cuda.mem_alloc(16 * nbins + (1<<16))
d_out = cuda.mem_alloc(16 * nbins)
if self.acc_mode == 'atomic':
d_atom = cuda.mem_alloc(8 * nbins)
flush_fun = self.mod.get_function("flush_atom")
obuf_copy = argset(cuda.Memcpy2D(),
src_y=self.gutter, src_x_in_bytes=16*self.gutter,
src_pitch=16*astride, dst_pitch=16*width,
width_in_bytes=16*width, height=height)
obuf_copy.set_src_device(d_out)
h_out_a = cuda.pagelocked_empty((height, width, 4), f32)
h_out_b = cuda.pagelocked_empty((height, width, 4), f32)
if self.acc_mode == 'deferred':
# Having a fixed, power-of-two log size makes things much easier
log_size = 64 << 20
d_log = cuda.mem_alloc(log_size * 4)
d_log_sorted = cuda.mem_alloc(log_size * 4)
sorter = sort.Sorter(log_size)
            # We need to cover each unique tag - address bits 20-23 - with one
            # write block per sort bin. Or something like that.
nwriteblocks = int(np.ceil(nbins / float(1<<20))) * 256
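            # Illustrative arithmetic: a 1310x750 padded buffer has
            # nbins = 982500 < 2**20, so a single tag bucket suffices and
            # nwriteblocks = 1 * 256.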
# Calculate 'nslots', the number of simultaneous running threads that
# can be active on the GPU during iteration (and thus the number of
# slots for loading and storing RNG and point context that will be
# prepared on the device), and derive 'rb_size', the number of blocks in
# 'nslots'.
iter_threads_per_block = 256
dev_data = pycuda.tools.DeviceData()
occupancy = pycuda.tools.OccupancyRecord(
dev_data, iter_threads_per_block,
iter_fun.shared_size_bytes, iter_fun.num_regs)
nsms = cuda.Context.get_device().multiprocessor_count
rb_size = occupancy.warps_per_mp * nsms / (iter_threads_per_block / 32)
nslots = iter_threads_per_block * rb_size
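        # Illustrative example only (the real numbers depend on the GPU and
        # the compiled kernel): with 48 resident warps per SM, 14 SMs, and
        # 256 threads (8 warps) per block, rb_size = 48 * 14 / 8 = 84 blocks
        # and nslots = 256 * 84 = 21504.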
# Reset the ringbuffer info for the slots
reset_rb_fun(np.int32(rb_size), block=(1,1,1))
d_points = cuda.mem_alloc(nslots * 16)
# This statement may add extra seeds to simplify palette dithering.
seeds = mwc.MWC.make_seeds(max(nslots, 256 * self.palette_height))
d_seeds = cuda.to_device(seeds)
        # We used to auto-calculate this as a multiple of the number of SMs on
        # the device, but since we now use shorter launches and, to a certain
        # extent, allow simultaneous occupancy, that's not as important. The
        # 1024 is a magic constant chosen to give a reasonable, power-of-two
        # log size for deferred mode: 256MB / (4B * FUSE * NTHREADS).
        # Enhancements to the sort engine are needed to make this more
        # flexible.
ntemporal_samples = 1024
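        # Sanity check of that constant (illustrative): 4 B/entry * 256 (fuse)
        # * 256 (threads per block) * 1024 temporal samples = 256 MiB, which
        # matches the 64M-entry, 4-byte log allocated for deferred mode above.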
genome_times, genome_knots = self._iter.packer.pack()
d_genome_times = cuda.to_device(genome_times)
d_genome_knots = cuda.to_device(genome_knots)
info_size = 4 * len(self._iter.packer) * ntemporal_samples
d_infos = cuda.mem_alloc(info_size)
pals = genome.color.palette
if isinstance(pals, basestring):
pals = [0.0, pals, 1.0, pals]
palint_times = np.empty(len(genome_times[0]), f32)
palint_times.fill(100.0)
palint_times[:len(pals)] = [p[0] for p in pals]
d_palint_times = cuda.to_device(palint_times)
d_palint_vals = cuda.to_device(
np.concatenate([p[1].data for p in pals]))
if self.acc_mode in ('deferred', 'atomic'):
palette_fun = self.mod.get_function("interp_palette_hsv_flat")
dsc = argset(cuda.ArrayDescriptor3D(), height=self.palette_height,
width=256, depth=0, format=cuda.array_format.SIGNED_INT32,
num_channels=2, flags=cuda.array3d_flags.SURFACE_LDST)
palarray = cuda.Array(dsc)
tref = self.mod.get_surfref('flatpal')
tref.set_array(palarray, 0)
else:
palette_fun = self.mod.get_function("interp_palette_hsv")
dsc = argset(cuda.ArrayDescriptor(), height=self.palette_height,
width=256, format=cuda.array_format.UNSIGNED_INT8,
num_channels=4)
d_palmem = cuda.mem_alloc(256 * self.palette_height * 4)
tref = self.mod.get_texref('palTex')
tref.set_address_2d(d_palmem, dsc, 1024)
tref.set_format(cuda.array_format.UNSIGNED_INT8, 4)
tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
tref.set_filter_mode(cuda.filter_mode.LINEAR)
while next_frame is not None:
# tc, td, ts, te: central, delta, start, end times
idx, tc = next_frame
td = genome.adj_frame_width(tc)
ts, te = tc - 0.5 * td, tc + 0.5 * td
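            # Numeric example (illustrative): tc=0.5 with a frame width of
            # td=1/24 gives ts ~= 0.4792 and te ~= 0.5208.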
if self.acc_mode in ('deferred', 'atomic'):
                # In these modes, the palette kernel writes to a surface
                # reference but requires dithering, so we pass it the seeds
                # instead of an output pointer
arg0 = d_seeds
else:
arg0 = d_palmem
palette_fun(arg0, d_palint_times, d_palint_vals,
f32(ts), f32(td / self.palette_height),
block=(256,1,1), grid=(self.palette_height,1),
stream=write_stream)
packer_fun(d_infos, d_seeds, d_genome_times, d_genome_knots,
f32(ts), f32(td / ntemporal_samples),
i32(ntemporal_samples), block=(256,1,1),
grid=(int(np.ceil(ntemporal_samples/256.)),1),
stream=iter_stream)
# Reset points so that they will be FUSEd
util.BaseCode.fill_dptr(self.mod, d_points, 4 * nslots,
iter_stream, f32(np.nan))
# Get interpolated control points for debugging
#iter_stream.synchronize()
#d_temp = cuda.from_device(d_infos,
#(ntemporal_samples, len(self._iter.packer)), f32)
#for i, n in zip(d_temp[5], self._iter.packer.packed):
#print '%60s %g' % ('_'.join(n), i)
util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, write_stream)
if self.acc_mode == 'atomic':
util.BaseCode.fill_dptr(self.mod, d_atom, 2 * nbins, write_stream)
nrounds = int( (genome.spp(tc) * width * height)
/ (ntemporal_samples * 256 * 256) ) + 1
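            # Illustrative arithmetic: at 2000 samples per pixel on a 1280x720
            # frame, nrounds = int(2000 * 1280 * 720 / (1024 * 256 * 256)) + 1
            # = 27 + 1 = 28 kernel launches.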
if self.acc_mode == 'deferred':
for i in range(nrounds):
iter_fun(np.uint64(d_log), d_seeds, d_points, d_infos,
block=(32, self._iter.NTHREADS/32, 1),
grid=(ntemporal_samples, 1), stream=iter_stream)
_sync_stream(write_stream, iter_stream)
sorter.sort(d_log_sorted, d_log, log_size, 3, True,
stream=write_stream)
_sync_stream(iter_stream, write_stream)
write_fun(d_accum, d_log_sorted, sorter.dglobal, i32(nbins),
block=(1024, 1, 1), grid=(nwriteblocks, 1),
stream=write_stream)
else:
args = [u64(d_accum), d_seeds, d_points, d_infos]
if self.acc_mode == 'atomic':
args.append(u64(d_atom))
iter_fun(*args, block=(32, self._iter.NTHREADS/32, 1),
grid=(ntemporal_samples, nrounds), stream=iter_stream)
if self.acc_mode == 'atomic':
nblocks = int(np.ceil(np.sqrt(nbins/float(512))))
flush_fun(u64(d_accum), u64(d_atom), i32(nbins),
block=(512, 1, 1), grid=(nblocks, nblocks),
stream=iter_stream)
util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
_sync_stream(filt_stream, write_stream)
filt.de(d_out, d_accum, genome, dim, tc, stream=filt_stream)
_sync_stream(write_stream, filt_stream)
filt.colorclip(d_out, genome, dim, tc, blend, stream=filt_stream)
obuf_copy.set_dst_host(h_out_a)
obuf_copy(filt_stream)
if event_b:
while not event_a.query():
timemod.sleep(0.01)
gpu_time = event_a.time_since(event_b)
result = RenderedImage(h_out_b, last_idx, gpu_time)
else:
result = None
last_idx = idx
event_a, event_b = cuda.Event().record(filt_stream), event_a
h_out_a, h_out_b = h_out_b, h_out_a
# TODO: add ability to flush a frame without breaking the pipe
next_frame = yield result
while not event_a.query():
timemod.sleep(0.001)
gpu_time = event_a.time_since(event_b)
yield RenderedImage(h_out_b, last_idx, gpu_time)