# cuburn/cuburn/render.py

"""
Resources and tools to perform rendering.
"""
import os
import sys
import re
import time
import tempfile
from collections import namedtuple
import numpy as np
from numpy import float32 as f32, int32 as i32, uint32 as u32, uint64 as u64
import pycuda.driver as cuda
import pycuda.tools
import filters
import output
from code import util, mwc, iter, interp, sort
from code.util import ClsMod, devlib, filldptrlib, assemble_code, launch
from cuburn.genome.util import palette_decode

RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time')
Dimensions = namedtuple('Dimensions', 'w h aw ah astride')

class Framebuffers(object):
"""
The largest memory allocations, and a stream to serialize their use.
``d_front`` and ``d_back`` are separate buffers, each large enough to hold
four float32 components per pixel (including any gutter pixels added for
alignment or padding). ``d_side`` is another buffer large enough to hold
two float32 components per pixel.
Every user of this set of buffers may use and overwrite the buffers in
any way, as long as the output for the next stage winds up in the front
buffer. The front and back buffers can be exchanged by the ``flip()``
method (which simply exchanges the pointers); while no similar method
exists for the side buffer, you're free to do the same by taking local
copies of the references and exchanging them yourself.
There's one spot in the stream interleaving where the behavior is
different: the ``Output.convert`` call must store its output to the back
buffer, which will remain untouched until the dtoh copy of the converted
buffer is finished. This happens while the ``iter`` kernel of the next
frame writes to the front and side buffers, which means in practice that
there's essentially no waiting on any buffers.
If an output module decides to get all krazy and actually replaces the
references to the buffers on this object - to, say, implement a temporally
aware tone-mapping or denoising pass - that's probably okay, but just make
sure it appears like it's expected to.
"""
# Minimum extension of accumulation buffer beyond output region. Used to
# alleviate edge effects during filtering. Actual gutter may be larger to
    # accommodate alignment requirements; when it is, that extension will be
# applied to the lower-right corner of the buffer. This is asymmetrical,
# but simplifies trimming logic when it's time for that.
gutter = 10
@classmethod
def calc_dim(cls, width, height):
"""
Given a width and height, return a valid set of dimensions which
include at least enough gutter to exceed the minimum, and where
(acc_width % 32) == 0 and (acc_height % 8) == 0.
"""
awidth = width + 2 * cls.gutter
aheight = 8 * int(np.ceil((height + 2 * cls.gutter) / 8.))
astride = 32 * int(np.ceil(awidth / 32.))
return Dimensions(width, height, awidth, aheight, astride)
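
    # A worked example (not from the original source): calc_dim(1920, 1080)
    # gives awidth = 1940, aheight = 8 * ceil(1100 / 8.) = 1104, and
    # astride = 32 * ceil(1940 / 32.) = 1952, i.e.
    # Dimensions(1920, 1080, 1940, 1104, 1952).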
def __init__(self):
self.stream = cuda.Stream()
self.pool = pycuda.tools.PageLockedMemoryPool()
self._clear()
# These resources rely on the slots/ringbuffer mechanism for sharing,
# and so can be shared across any number of launches, genomes, and
# render kernels. Notably, seeds are self-synchronizing, so they're not
# attached to either stream object.
self.d_rb = cuda.to_device(np.array([0, 0], dtype=u32))
seeds = mwc.make_seeds(util.DEFAULT_RB_SIZE * 256)
self.d_seeds = cuda.to_device(seeds)
self._len_d_points = util.DEFAULT_RB_SIZE * 256 * 16
self.d_points = cuda.mem_alloc(self._len_d_points)
def _clear(self):
self.nbins = self.d_front = self.d_back = self.d_side = None
def free(self, stream=None):
if stream is not None:
stream.synchronize()
else:
cuda.Context.synchronize()
for p in (self.d_front, self.d_back, self.d_side):
if p is not None:
p.free()
self._clear()
def alloc(self, dim, stream=None):
"""
        Ensure that this object's framebuffers are large enough to handle the
        given dimensions, allocating new ones if not.

        If ``stream`` is not None and a reallocation is necessary, the stream
        will be synchronized before the old buffers are deallocated.
"""
nbins = dim.ah * dim.astride
        if self.nbins is not None:
            if self.nbins >= nbins:
                return
            self.free()
try:
self.d_front = cuda.mem_alloc(16 * nbins)
self.d_back = cuda.mem_alloc(16 * nbins)
self.d_side = cuda.mem_alloc(8 * nbins)
self.nbins = nbins
        except cuda.MemoryError:
# If a frame that's too large sneaks by the task distributor, we
# don't want to kill the server, but we also don't want to leave
# it stuck without any free memory to complete the next alloc.
# TODO: measure free mem and only take tasks that fit (but that
# should be done elsewhere)
self.free(stream)
            raise
def set_dim(self, width, height, stream=None):
"""
        Compute padded dimensions for given width and height, ensure that the
        buffers are large enough (and reallocate if not), and return the
        calculated dimensions.

        Note that the returned dimensions are always the same for a given
        width, height, and minimum gutter, even if the underlying buffers are
        larger due to a previous allocation.
"""
dim = self.calc_dim(width, height)
self.alloc(dim, stream)
return dim
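
    # Typical use, as a sketch: ``dim = fb.set_dim(w, h, stream)`` once per
    # frame; the returned ``dim`` is then passed along to iteration and
    # filtering so they can account for the gutter and stride.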
def flip(self):
"""Flip the front and back buffers."""
self.d_front, self.d_back = self.d_back, self.d_front

class DevSrc(object):
"""
The buffers which represent a genome on-device, in the formats needed to
serve as a source for interpolating temporal samples.
"""
# Maximum number of knots per parameter. This also covers the maximum
# number of palettes allowed.
max_knots = 1 << util.DEFAULT_SEARCH_ROUNDS
# Maximum number of parameters per genome. This number is exceedingly
# high, and should never even come close to being hit.
max_params = 1024
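    # For reference, the allocations below: ``d_times`` and ``d_knots`` are
    # float32 arrays of ``max_knots * max_params`` elements each,
    # ``d_ptimes`` holds one float32 time per palette knot, and ``d_pals``
    # holds up to ``max_knots`` palettes of 256 four-component float32
    # entries.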
def __init__(self):
self.d_times = cuda.mem_alloc(4 * self.max_knots * self.max_params)
self.d_knots = cuda.mem_alloc(4 * self.max_knots * self.max_params)
self.d_ptimes = cuda.mem_alloc(4 * self.max_knots)
self.d_pals = cuda.mem_alloc(4 * 4 * 256 * self.max_knots)

class DevInfo(object):
"""
The buffers which hold temporal samples on-device, as used by iter.
"""
# The palette texture/surface covers the color coordinate from [0,1] with
# equidistant horizontal samples, and spans the temporal range of the
# frame linearly with this many rows. Increasing these values increases the
# number of uniquely-dithered samples when using pre-dithered surfaces, as
# is done in 'atomic' accumulation.
palette_width = 256 # TODO: make this setting be respected
palette_height = 64
    # This used to be determined automagically, but simultaneous occupancy
    # and a much smaller block size simplify this.
ntemporal_samples = 1024
# Number of iterations to iterate without write after generating a new
# point. This number is currently fixed pretty deeply in the set of magic
# constants which govern buffer sizes; changing the value here won't
# actually change the code on the device to do something different.
# It's here just for documentation purposes.
fuse = 256
def __init__(self):
self.d_params = cuda.mem_alloc(
self.ntemporal_samples * DevSrc.max_params * 4)
self.palette_surf_dsc = util.argset(cuda.ArrayDescriptor3D(),
height=self.palette_height, width=self.palette_width, depth=0,
format=cuda.array_format.SIGNED_INT32,
num_channels=2, flags=cuda.array3d_flags.SURFACE_LDST)
self.d_pal_array = cuda.Array(self.palette_surf_dsc)

class Renderer(object):
# Unloading a module triggers a context sync. To keep the renderer
# asynchronous, and avoid expensive CPU polling, this hangs on to
# a number of (relatively small) CUDA modules and flushes them together.
MAX_MODREFS = 20
_modrefs = []
def __init__(self, gnm, gprof):
self.packer, self.lib = iter.mkiterlib(gnm)
cubin = util.compile('iter', assemble_code(self.lib))
self.mod = cuda.module_from_buffer(cubin)
if len(self._modrefs) > self.MAX_MODREFS:
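            # Dropping every reference at once lets the modules be unloaded
            # (and the implied context syncs paid) in a single batch, rather
            # than once per new renderer.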
del self._modrefs[:]
self._modrefs.append(self.mod)
self.filts = filters.create(gprof)
self.out = output.PILOutput()

class RenderManager(ClsMod):
lib = devlib(deps=[interp.palintlib, filldptrlib, iter.flushatomlib])
def __init__(self):
super(RenderManager, self).__init__()
self.fb = Framebuffers()
self.src_a, self.src_b = DevSrc(), DevSrc()
self.info_a, self.info_b = DevInfo(), DevInfo()
self.stream_a, self.stream_b = cuda.Stream(), cuda.Stream()
self.filt_evt = self.copy_evt = None
def _copy(self, rdr, gnm):
"""
        Queue a copy of a host genome into a set of device interpolation
        source buffers.

        Note that for now, this is broken! It ignores ``gnm``, and only
        packs the genome that was used when creating the renderer.
"""
times, knots = rdr.packer.pack(gnm, self.fb.pool)
cuda.memcpy_htod_async(self.src_a.d_times, times, self.stream_a)
cuda.memcpy_htod_async(self.src_a.d_knots, knots, self.stream_a)
palsrc = dict([(v[0], palette_decode(v[1:])) for v in gnm['palette']])
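        # Each entry of ``gnm['palette']`` is a sequence whose first element
        # is the knot time; the remainder decodes to a 256x4 float32 palette.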
ptimes, pvals = zip(*sorted(palsrc.items()))
palettes = self.fb.pool.allocate((len(palsrc), 256, 4), f32)
palettes[:] = pvals
palette_times = self.fb.pool.allocate((self.src_a.max_knots,), f32)
palette_times.fill(1e9)
palette_times[:len(ptimes)] = ptimes
cuda.memcpy_htod_async(self.src_a.d_pals, palettes, self.stream_a)
cuda.memcpy_htod_async(self.src_a.d_ptimes, palette_times,
self.stream_a)
# TODO: use bilerp tex as src for palette interp
def _interp(self, rdr, gnm, dim, ts, td):
d_acc_size = rdr.mod.get_global('acc_size')[0]
p_dim = self.fb.pool.allocate((len(dim),), u32)
p_dim[:] = dim
cuda.memcpy_htod_async(d_acc_size, p_dim, self.stream_a)
tref = self.mod.get_surfref('flatpal')
tref.set_array(self.info_a.d_pal_array, 0)
launch('interp_palette_flat', self.mod, self.stream_a,
256, self.info_a.palette_height,
self.fb.d_rb, self.fb.d_seeds,
self.src_a.d_ptimes, self.src_a.d_pals,
f32(ts), f32(td / self.info_a.palette_height))
nts = self.info_a.ntemporal_samples
launch('interp_iter_params', rdr.mod, self.stream_a,
256, np.ceil(nts / 256.),
self.info_a.d_params, self.src_a.d_times, self.src_a.d_knots,
f32(ts), f32(td / nts), i32(nts))
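        # With the default nts of 1024, that's four blocks of 256 threads,
        # presumably one temporal sample per thread, spaced td / nts apart.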
#self._print_interp_knots(rdr)
def _print_interp_knots(self, rdr, tsidx=5):
infos = cuda.from_device(self.info_a.d_params,
(tsidx + 1, len(rdr.packer)), f32)
for i, n in zip(infos[-1], rdr.packer.packed):
print '%60s %g' % ('_'.join(n), i)
def _iter(self, rdr, gnm, gprof, dim, tc):
tref = rdr.mod.get_surfref('flatpal')
tref.set_array(self.info_a.d_pal_array, 0)
nbins = dim.ah * dim.astride
fill = lambda b, s, v=i32(0): util.fill_dptr(
self.mod, b, s, stream=self.stream_a, value=v)
fill(self.fb.d_front, 4 * nbins)
fill(self.fb.d_side, 2 * nbins)
fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan))
nts = self.info_a.ntemporal_samples
nsamps = (gprof.spp(tc) * dim.w * dim.h)
nrounds = int(nsamps / (nts * 256. * 256)) + 1
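        # Illustrative numbers only: at 1920x1080 with 2000 samples per
        # pixel, nsamps is ~4.15e9; each round covers nts * 256 * 256
        # (~6.7e7) samples, so nrounds comes out to 62.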
def launch_iter(n):
if n == 0: return
launch('iter', rdr.mod, self.stream_a, (32, 8, 1), (nts, n),
self.fb.d_front, self.fb.d_side,
self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
self.info_a.d_params)
# Split the launch into multiple rounds, possibly (slightly) reducing
# work overlap but avoiding stalls when working on a device with an
# active X session. TODO: characterize performance impact, autodetect
BLOCK_SIZE = 4
for i in range(BLOCK_SIZE-1, nrounds, BLOCK_SIZE):
launch_iter(BLOCK_SIZE)
launch_iter(nrounds%BLOCK_SIZE)
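        # (The loop above issues floor(nrounds / BLOCK_SIZE) batches of
        # BLOCK_SIZE rounds; the final call picks up the remainder, and is a
        # no-op when nrounds divides evenly.)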
nblocks = int(np.ceil(np.sqrt(dim.ah*dim.astride/256.)))
launch('flush_atom', self.mod, self.stream_a,
256, (nblocks, nblocks),
u64(self.fb.d_front), u64(self.fb.d_side), i32(nbins))
def queue_frame(self, rdr, gnm, gprof, tc, copy=True):
"""
Queue one frame for rendering.
``rdr`` is a compiled Renderer module. Caller must ensure that the
module is compatible with the genome data provided.
``gnm`` is the genome to be rendered.
``tc`` is the center time at which to render.
``w``, ``h`` are the width and height of the desired output in px.
If ``copy`` is False, the genome data will not be recopied for each
new genome. This function must be called with ``copy=True`` the first
time a new genome is used, and may be called in that manner
subsequently without harm. I suspect the performance impact is low, so
leave ``copy`` to True every time for now.
The return value is a 2-tuple ``(evt, h_out)``, where ``evt`` is a
CUDA event and ``h_out`` is the return value of the output module's
``copy`` function. In the typical case, ``h_out`` will be a host
allocation containing data in an appropriate format for the output
module's file writer, and ``evt`` indicates when the asynchronous
DMA copy which will populate ``h_out`` is complete. This can vary
depending on the output module in use, though.
"""
# Note: we synchronize on the previous stream if buffers need to be
# reallocated, which implicitly also syncs the current stream.
dim = self.fb.set_dim(gprof.width, gprof.height, self.stream_b)
# TODO: calculate this externally somewhere?
td = gprof.frame_width(tc) / round(gprof.fps * gprof.duration)
ts, te = tc - 0.5 * td, tc + 0.5 * td
# The stream interleaving here is nontrivial.
# TODO: update diagram and link to it here
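        # Roughly: the next frame's ``iter`` (front/side buffers) must wait
        # until the previous frame's ``convert`` has finished reading them,
        # and the filters (which may scratch over the back buffer) must wait
        # until the previous frame's dtoh copy out of it has completed. The
        # two event waits below enforce exactly those dependencies.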
if copy:
self.src_a, self.src_b = self.src_b, self.src_a
self._copy(rdr, gnm)
self._interp(rdr, gnm, dim, ts, td)
if self.filt_evt:
self.stream_a.wait_for_event(self.filt_evt)
self._iter(rdr, gnm, gprof, dim, tc)
if self.copy_evt:
self.stream_a.wait_for_event(self.copy_evt)
for filt, name in zip(rdr.filts, gprof.filter_order):
params = getattr(gprof.filters, name)
filt.apply(self.fb, gprof, params, dim, tc, self.stream_a)
rdr.out.convert(self.fb, gprof, dim, self.stream_a)
self.filt_evt = cuda.Event().record(self.stream_a)
h_out = rdr.out.copy(self.fb, dim, self.fb.pool, self.stream_a)
self.copy_evt = cuda.Event().record(self.stream_a)
self.info_a, self.info_b = self.info_b, self.info_a
self.stream_a, self.stream_b = self.stream_b, self.stream_a
return self.copy_evt, h_out
def render(self, gnm, gprof, times):
"""
A port of the old rendering function, retained for backwards
compatibility. Some of this will be pulled into as-yet-undecided
methods for more DRY.
"""
rdr = Renderer(gnm, gprof)
last_evt = cuda.Event().record(self.stream_a)
last_idx = None
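        # Frames are pipelined one deep: frame N is queued before the host
        # waits on frame N-1, so CPU-side polling overlaps device work.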
def wait(): # Times like these where you wish for a macro
while not last_evt.query():
time.sleep(0.01)
gpu_time = last_evt.time_since(two_evts_ago)
return RenderedImage(last_buf, last_idx, gpu_time)
for idx, tc in times:
evt, h_buf = self.queue_frame(rdr, gnm, gprof, tc, last_idx is None)
            if last_idx is not None:
yield wait()
two_evts_ago, last_evt = last_evt, evt
last_buf, last_idx = h_buf, idx
        if last_idx is not None:
yield wait()
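
# A sketch of typical use (everything other than the names defined in this
# module is assumed):
#
#   mgr = RenderManager()
#   times = enumerate(np.linspace(0, 1, 100))
#   for img in mgr.render(gnm, gprof, times):
#       write_frame(img.buf, img.idx)   # `write_frame` is hypothetical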