# cuburn/cuburn/render.py

"""
Resources and tools to perform rendering.
"""
import os
import sys
import re
import time
import tempfile
from collections import namedtuple
import numpy as np
from numpy import float32 as f32, int32 as i32, uint32 as u32, uint64 as u64
import pycuda.driver as cuda
import pycuda.tools
import filters
import output
from code import util, mwc, iter, interp, sort
from code.util import ClsMod, devlib, filldptrlib, assemble_code, launch
from cuburn.genome.util import palette_decode

RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time')
Dimensions = namedtuple('Dimensions', 'w h aw ah astride')

class Framebuffers(object):
"""
The largest memory allocations, and a stream to serialize their use.
``d_front`` and ``d_back`` are separate buffers, each large enough to hold
four float32 components per pixel (including any gutter pixels added for
alignment or padding). ``d_side`` is another buffer large enough to hold
two float32 components per pixel.
Every user of this set of buffers may use and overwrite the buffers in
any way, as long as the output for the next stage winds up in the front
buffer. The front and back buffers can be exchanged by the ``flip()``
method (which simply exchanges the pointers); while no similar method
exists for the side buffer, you're free to do the same by taking local
copies of the references and exchanging them yourself.
There's one spot in the stream interleaving where the behavior is
different: the ``Output.convert`` call must store its output to the back
buffer, which will remain untouched until the dtoh copy of the converted
buffer is finished. This happens while the ``iter`` kernel of the next
frame writes to the front and side buffers, which means in practice that
there's essentially no waiting on any buffers.
If an output module decides to get all krazy and actually replaces the
references to the buffers on this object - to, say, implement a temporally
aware tone-mapping or denoising pass - that's probably okay, but just make
sure it appears like it's expected to.
"""
# Minimum extension of accumulation buffer beyond output region. Used to
# alleviate edge effects during filtering. Actual gutter may be larger to
    # accommodate alignment requirements; when it is, that extension will be
# applied to the lower-right corner of the buffer. This is asymmetrical,
# but simplifies trimming logic when it's time for that.
gutter = 10
@classmethod
def calc_dim(cls, width, height):
"""
Given a width and height, return a valid set of dimensions which
include at least enough gutter to exceed the minimum, and where
(acc_width % 32) == 0 and (acc_height % 8) == 0.
"""
awidth = width + 2 * cls.gutter
aheight = 8 * int(np.ceil((height + 2 * cls.gutter) / 8.))
astride = 32 * int(np.ceil(awidth / 32.))
return Dimensions(width, height, awidth, aheight, astride)
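
    # A worked example (not from the original source): calc_dim(1920, 1080)
    # gives awidth = 1940, aheight = 8 * ceil(1100 / 8.) = 1104, and
    # astride = 32 * ceil(1940 / 32.) = 1952, i.e.
    # Dimensions(1920, 1080, 1940, 1104, 1952).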
def __init__(self):
self.stream = cuda.Stream()
self.pool = pycuda.tools.PageLockedMemoryPool()
self._clear()
# These resources rely on the slots/ringbuffer mechanism for sharing,
# and so can be shared across any number of launches, genomes, and
# render kernels. Notably, seeds are self-synchronizing, so they're not
# attached to either stream object.
self.d_rb = cuda.to_device(np.array([0, 0], dtype=u32))
seeds = mwc.make_seeds(util.DEFAULT_RB_SIZE * 256)
self.d_seeds = cuda.to_device(seeds)
self._len_d_points = util.DEFAULT_RB_SIZE * 256 * 16
self.d_points = cuda.mem_alloc(self._len_d_points)
def _clear(self):
self.nbins = self.d_front = self.d_back = self.d_side = None
def free(self, stream=None):
if stream is not None:
stream.synchronize()
else:
cuda.Context.synchronize()
for p in (self.d_front, self.d_back, self.d_side):
if p is not None:
p.free()
self._clear()
def alloc(self, dim, stream=None):
"""
        Ensure that this object's framebuffers are large enough to handle the
        given dimensions, allocating new ones if not.

        If ``stream`` is not None and a reallocation is necessary, the stream
        will be synchronized before the old buffers are deallocated.
"""
nbins = dim.ah * dim.astride
        if self.nbins is not None:
            if self.nbins >= nbins:
                return
            self.free()
try:
self.d_front = cuda.mem_alloc(16 * nbins)
self.d_back = cuda.mem_alloc(16 * nbins)
self.d_side = cuda.mem_alloc(8 * nbins)
self.nbins = nbins
        except cuda.MemoryError:
# If a frame that's too large sneaks by the task distributor, we
# don't want to kill the server, but we also don't want to leave
# it stuck without any free memory to complete the next alloc.
# TODO: measure free mem and only take tasks that fit (but that
# should be done elsewhere)
self.free(stream)
            raise
def set_dim(self, width, height, stream=None):
"""
        Compute padded dimensions for given width and height, ensure that the
        buffers are large enough (and reallocate if not), and return the
        calculated dimensions.

        Note that the returned dimensions are always the same for a given
        width, height, and minimum gutter, even if the underlying buffers are
        larger due to a previous allocation.
"""
dim = self.calc_dim(width, height)
self.alloc(dim, stream)
return dim
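
    # Typical use, as a sketch: ``dim = fb.set_dim(w, h, stream)`` once per
    # frame; the returned ``dim`` is then passed along to iteration and
    # filtering so they can account for the gutter and stride.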
def flip(self):
"""Flip the front and back buffers."""
self.d_front, self.d_back = self.d_back, self.d_front

class DevSrc(object):
"""
The buffers which represent a genome on-device, in the formats needed to
serve as a source for interpolating temporal samples.
"""
# Maximum number of knots per parameter. This also covers the maximum
# number of palettes allowed.
max_knots = 1 << util.DEFAULT_SEARCH_ROUNDS
# Maximum number of parameters per genome. This number is exceedingly
# high, and should never even come close to being hit.
max_params = 1024
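    # For reference, the allocations below: ``d_times`` and ``d_knots`` are
    # float32 arrays of ``max_knots * max_params`` elements each,
    # ``d_ptimes`` holds one float32 time per palette knot, and ``d_pals``
    # holds up to ``max_knots`` palettes of 256 four-component float32
    # entries.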
def __init__(self):
self.d_times = cuda.mem_alloc(4 * self.max_knots * self.max_params)
self.d_knots = cuda.mem_alloc(4 * self.max_knots * self.max_params)
self.d_ptimes = cuda.mem_alloc(4 * self.max_knots)
self.d_pals = cuda.mem_alloc(4 * 4 * 256 * self.max_knots)

class DevInfo(object):
"""
The buffers which hold temporal samples on-device, as used by iter.
"""
# The palette texture/surface covers the color coordinate from [0,1] with
# equidistant horizontal samples, and spans the temporal range of the
# frame linearly with this many rows. Increasing these values increases the
# number of uniquely-dithered samples when using pre-dithered surfaces, as
# is done in 'atomic' accumulation.
palette_width = 256 # TODO: make this setting be respected
palette_height = 64
    # This used to be determined automagically, but simultaneous occupancy
    # and a much smaller block size simplify this.
ntemporal_samples = 1024
# Number of iterations to iterate without write after generating a new
# point. This number is currently fixed pretty deeply in the set of magic
# constants which govern buffer sizes; changing the value here won't
# actually change the code on the device to do something different.
# It's here just for documentation purposes.
fuse = 256
def __init__(self):
self.d_params = cuda.mem_alloc(
self.ntemporal_samples * DevSrc.max_params * 4)
self.palette_surf_dsc = util.argset(cuda.ArrayDescriptor3D(),
height=self.palette_height, width=self.palette_width, depth=0,
format=cuda.array_format.SIGNED_INT32,
num_channels=2, flags=cuda.array3d_flags.SURFACE_LDST)
self.d_pal_array = cuda.Array(self.palette_surf_dsc)

class Renderer(object):
# Unloading a module triggers a context sync. To keep the renderer
# asynchronous, and avoid expensive CPU polling, this hangs on to
# a number of (relatively small) CUDA modules and flushes them together.
MAX_MODREFS = 20
_modrefs = []
def __init__(self, gnm, gprof):
self.packer, self.lib = iter.mkiterlib(gnm)
cubin = util.compile('iter', assemble_code(self.lib))
self.mod = cuda.module_from_buffer(cubin)
if len(self._modrefs) > self.MAX_MODREFS:
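            # Dropping every reference at once lets the modules be unloaded
            # (and the implied context syncs paid) in a single batch, rather
            # than once per new renderer.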
del self._modrefs[:]
self._modrefs.append(self.mod)
self.filts = filters.create(gprof)
self.out = output.PILOutput()

class RenderManager(ClsMod):
lib = devlib(deps=[interp.palintlib, filldptrlib, iter.flushatomlib])
def __init__(self):
super(RenderManager, self).__init__()
self.fb = Framebuffers()
self.src_a, self.src_b = DevSrc(), DevSrc()
self.info_a, self.info_b = DevInfo(), DevInfo()
self.stream_a, self.stream_b = cuda.Stream(), cuda.Stream()
self.filt_evt = self.copy_evt = None
def _copy(self, rdr, gnm):
"""
        Queue a copy of a host genome into a set of device interpolation
        source buffers.

        Note that for now, this is broken! It ignores ``gnm``, and only
        packs the genome that was used when creating the renderer.
"""
times, knots = rdr.packer.pack(gnm, self.fb.pool)
cuda.memcpy_htod_async(self.src_a.d_times, times, self.stream_a)
cuda.memcpy_htod_async(self.src_a.d_knots, knots, self.stream_a)
palsrc = dict([(v[0], palette_decode(v[1:])) for v in gnm['palette']])
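        # Each entry of ``gnm['palette']`` is a sequence whose first element
        # is the knot time; the remainder decodes to a 256x4 float32 palette.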
ptimes, pvals = zip(*sorted(palsrc.items()))
palettes = self.fb.pool.allocate((len(palsrc), 256, 4), f32)
palettes[:] = pvals
palette_times = self.fb.pool.allocate((self.src_a.max_knots,), f32)
palette_times.fill(1e9)
palette_times[:len(ptimes)] = ptimes
cuda.memcpy_htod_async(self.src_a.d_pals, palettes, self.stream_a)
cuda.memcpy_htod_async(self.src_a.d_ptimes, palette_times,
self.stream_a)
# TODO: use bilerp tex as src for palette interp
def _interp(self, rdr, gnm, dim, ts, td):
d_acc_size = rdr.mod.get_global('acc_size')[0]
p_dim = self.fb.pool.allocate((len(dim),), u32)
p_dim[:] = dim
cuda.memcpy_htod_async(d_acc_size, p_dim, self.stream_a)
tref = self.mod.get_surfref('flatpal')
tref.set_array(self.info_a.d_pal_array, 0)
launch('interp_palette_flat', self.mod, self.stream_a,
256, self.info_a.palette_height,
self.fb.d_rb, self.fb.d_seeds,
self.src_a.d_ptimes, self.src_a.d_pals,
f32(ts), f32(td / self.info_a.palette_height))
nts = self.info_a.ntemporal_samples
launch('interp_iter_params', rdr.mod, self.stream_a,
256, np.ceil(nts / 256.),
self.info_a.d_params, self.src_a.d_times, self.src_a.d_knots,
f32(ts), f32(td / nts), i32(nts))
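        # With the default nts of 1024, that's four blocks of 256 threads,
        # presumably one temporal sample per thread, spaced td / nts apart.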
#self._print_interp_knots(rdr)
def _print_interp_knots(self, rdr, tsidx=5):
infos = cuda.from_device(self.info_a.d_params,
(tsidx + 1, len(rdr.packer)), f32)
for i, n in zip(infos[-1], rdr.packer.packed):
print '%60s %g' % ('_'.join(n), i)
def _iter(self, rdr, gnm, gprof, dim, tc):
tref = rdr.mod.get_surfref('flatpal')
tref.set_array(self.info_a.d_pal_array, 0)
nbins = dim.ah * dim.astride
fill = lambda b, s, v=i32(0): util.fill_dptr(
self.mod, b, s, stream=self.stream_a, value=v)
fill(self.fb.d_front, 4 * nbins)
fill(self.fb.d_side, 2 * nbins)
fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan))
nts = self.info_a.ntemporal_samples
nsamps = (gprof.spp(tc) * dim.w * dim.h)
nrounds = int(nsamps / (nts * 256. * 256)) + 1
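        # Illustrative numbers only: at 1920x1080 with 2000 samples per
        # pixel, nsamps is ~4.15e9; each round covers nts * 256 * 256
        # (~6.7e7) samples, so nrounds comes out to 62.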
def launch_iter(n):
if n == 0: return
launch('iter', rdr.mod, self.stream_a, (32, 8, 1), (nts, n),
self.fb.d_front, self.fb.d_side,
self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
self.info_a.d_params)
# Split the launch into multiple rounds, possibly (slightly) reducing
# work overlap but avoiding stalls when working on a device with an
# active X session. TODO: characterize performance impact, autodetect
BLOCK_SIZE = 4
for i in range(BLOCK_SIZE-1, nrounds, BLOCK_SIZE):
launch_iter(BLOCK_SIZE)
launch_iter(nrounds%BLOCK_SIZE)
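        # (The loop above issues floor(nrounds / BLOCK_SIZE) batches of
        # BLOCK_SIZE rounds; the final call picks up the remainder, and is a
        # no-op when nrounds divides evenly.)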
nblocks = int(np.ceil(np.sqrt(dim.ah*dim.astride/256.)))
launch('flush_atom', self.mod, self.stream_a,
256, (nblocks, nblocks),
u64(self.fb.d_front), u64(self.fb.d_side), i32(nbins))
def queue_frame(self, rdr, gnm, gprof, tc, copy=True):
"""
Queue one frame for rendering.
``rdr`` is a compiled Renderer module. Caller must ensure that the
module is compatible with the genome data provided.
``gnm`` is the genome to be rendered.
``tc`` is the center time at which to render.
``w``, ``h`` are the width and height of the desired output in px.
If ``copy`` is False, the genome data will not be recopied for each
new genome. This function must be called with ``copy=True`` the first
time a new genome is used, and may be called in that manner
subsequently without harm. I suspect the performance impact is low, so
leave ``copy`` to True every time for now.
The return value is a 2-tuple ``(evt, h_out)``, where ``evt`` is a
CUDA event and ``h_out`` is the return value of the output module's
``copy`` function. In the typical case, ``h_out`` will be a host
allocation containing data in an appropriate format for the output
module's file writer, and ``evt`` indicates when the asynchronous
DMA copy which will populate ``h_out`` is complete. This can vary
depending on the output module in use, though.
"""
# Note: we synchronize on the previous stream if buffers need to be
# reallocated, which implicitly also syncs the current stream.
dim = self.fb.set_dim(gprof.width, gprof.height, self.stream_b)
# TODO: calculate this externally somewhere?
td = gprof.frame_width(tc) / round(gprof.fps * gprof.duration)
ts, te = tc - 0.5 * td, tc + 0.5 * td
# The stream interleaving here is nontrivial.
# TODO: update diagram and link to it here
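        # Roughly: the next frame's ``iter`` (front/side buffers) must wait
        # until the previous frame's ``convert`` has finished reading them,
        # and the filters (which may scratch over the back buffer) must wait
        # until the previous frame's dtoh copy out of it has completed. The
        # two event waits below enforce exactly those dependencies.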
if copy:
self.src_a, self.src_b = self.src_b, self.src_a
self._copy(rdr, gnm)
self._interp(rdr, gnm, dim, ts, td)
if self.filt_evt:
self.stream_a.wait_for_event(self.filt_evt)
self._iter(rdr, gnm, gprof, dim, tc)
if self.copy_evt:
self.stream_a.wait_for_event(self.copy_evt)
for filt, name in zip(rdr.filts, gprof.filter_order):
params = getattr(gprof.filters, name)
filt.apply(self.fb, gprof, params, dim, tc, self.stream_a)
rdr.out.convert(self.fb, gprof, dim, self.stream_a)
self.filt_evt = cuda.Event().record(self.stream_a)
h_out = rdr.out.copy(self.fb, dim, self.fb.pool, self.stream_a)
self.copy_evt = cuda.Event().record(self.stream_a)
self.info_a, self.info_b = self.info_b, self.info_a
self.stream_a, self.stream_b = self.stream_b, self.stream_a
return self.copy_evt, h_out
def render(self, gnm, gprof, times):
"""
A port of the old rendering function, retained for backwards
compatibility. Some of this will be pulled into as-yet-undecided
methods for more DRY.
"""
rdr = Renderer(gnm, gprof)
last_evt = cuda.Event().record(self.stream_a)
last_idx = None
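        # Frames are pipelined one deep: frame N is queued before the host
        # waits on frame N-1, so CPU-side polling overlaps device work.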
def wait(): # Times like these where you wish for a macro
while not last_evt.query():
time.sleep(0.01)
gpu_time = last_evt.time_since(two_evts_ago)
return RenderedImage(last_buf, last_idx, gpu_time)
for idx, tc in times:
evt, h_buf = self.queue_frame(rdr, gnm, gprof, tc, last_idx is None)
            if last_idx is not None:
yield wait()
two_evts_ago, last_evt = last_evt, evt
last_buf, last_idx = h_buf, idx
        if last_idx is not None:
yield wait()
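
# A sketch of typical use (everything other than the names defined in this
# module is assumed):
#
#   mgr = RenderManager()
#   times = enumerate(np.linspace(0, 1, 100))
#   for img in mgr.render(gnm, gprof, times):
#       write_frame(img.buf, img.idx)   # `write_frame` is hypothetical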