# cuburn/cuburn/render.py
import os
import sys
import re
import time as timemod
import tempfile
from collections import namedtuple
from itertools import cycle, repeat, chain, izip, imap, ifilter
from ctypes import *
from cStringIO import StringIO
import numpy as np
from numpy import float32 as f32, int32 as i32, uint32 as u32, uint64 as u64
from scipy import ndimage
from fr0stlib import pyflam3
from fr0stlib.pyflam3._flam3 import *
from fr0stlib.pyflam3.constants import *
import pycuda.compiler
import pycuda.driver as cuda
import pycuda.tools
import cuburn.genome
from cuburn import affine
from cuburn.code import util, mwc, iter, filtering, sort
RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time')
Dimensions = namedtuple('Dimensions', 'w h aw ah astride')
def _sync_stream(dst, src):
dst.wait_for_event(cuda.Event(cuda.event_flags.DISABLE_TIMING).record(src))
def argset(obj, **kwargs):
for k, v in kwargs.items():
setattr(obj, k, v)
return obj
class Renderer(object):
"""
Control structure for rendering a series of frames.
"""
    # Number of iterations to run without writing after generating a new
    # point. This value is currently baked into the set of magic constants
    # which govern buffer sizes; changing it here won't actually change what
    # the device code does.
    fuse = 256
    # The palette texture/surface covers the color coordinate range [0,1]
    # with (for now) a fixed 256 equidistant horizontal samples, and spans the
    # temporal range of the frame linearly with this many rows. Increasing
    # this value increases the number of uniquely-dithered samples when using
    # pre-dithered surfaces.
    palette_height = 64
# Maximum width of DE and other spatial filters, and thus in turn the
# amount of padding applied. Note that, for now, this must not be changed!
# The filtering code makes deep assumptions about this value.
gutter = 15
# Accumulation mode. Leave it at 'atomic' for now.
acc_mode = 'atomic'
# TODO
chaos_used = False
cmp_options = ('-use_fast_math', '-maxrregcount', '42')
keep = False
def __init__(self):
self._iter = self.src = self.cubin = self.mod = None
# Ensure class options don't get contaminated on an instance
self.cmp_options = list(self.cmp_options)
def compile(self, genome, keep=None, cmp_options=None):
"""
Compile a kernel capable of rendering every frame in this animation.
The resulting compiled kernel is stored in the ``cubin`` property;
the source is available as ``src``, and is also returned for
inspection and display.
This operation is idempotent, and has no side effects outside of
setting properties on this instance (unless there's a compiler error,
which is a bug); it should therefore be threadsafe as well.
It is, however, rather slow.
"""
keep = self.keep if keep is None else keep
cmp_options = self.cmp_options if cmp_options is None else cmp_options
self._iter = iter.IterCode(self, genome)
self._iter.packer.finalize()
self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer,
self._iter)
with open(os.path.join(tempfile.gettempdir(), 'kernel.cu'), 'w') as fp:
fp.write(self.src)
        self.cubin = pycuda.compiler.compile(
                self.src, keep=keep, options=cmp_options,
                cache_dir=False if keep else None)
        # The docstring promises the source is returned for inspection
        return self.src
def load(self, genome, jit_options=[]):
if not self.cubin:
self.compile(genome)
self.mod = cuda.module_from_buffer(self.cubin, jit_options)
with open('/tmp/iter_kern.cubin', 'wb') as fp:
fp.write(self.cubin)
return self.src
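    # A minimal usage sketch of compile() and load(), not part of the module;
    # the genome object `gnm` is assumed to have been built elsewhere via
    # cuburn.genome:
    #
    #   renderer = Renderer()
    #   renderer.compile(gnm)   # slow; also dumps kernel.cu to the temp dir
    #   renderer.load(gnm)      # JIT-links the cubin into a CUDA module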
def render(self, genome, times, width, height, blend=True):
"""
Render a frame for each timestamp in the iterable value ``times``. This
function returns a generator that will yield a RenderedImage object
containing a shared reference to the output buffer for each specified
frame.
The returned buffer is page-locked host memory. Between the time a
buffer is yielded and the time the next frame's results are requested,
the buffer will not be modified. Thereafter, however, it will be
overwritten by an asynchronous DMA operation coming from the CUDA
device. If you hang on to it for longer than one frame, copy it.
``genome`` is the genome to be rendered. Successive calls to the
`render()` method on one ``Renderer`` object must use genomes which
produce identical compiled code, and this will not be verified by the
renderer. In practice, this means you can alter genome parameter
values, but the full set of keys must remain identical between runs on
the same renderer.
``times`` is a list of (idx, cen_time) tuples, where ``idx`` is passed
unmodified in the RenderedImage return value and ``cen_time`` is the
central time of the current frame in spline-time units. (Any
clock-time or frame-time units in the genome should be preconverted.)
If ``blend`` is False, the output buffer will contain unclipped,
premultiplied RGBA data, without vibrancy, highlight power, or the
alpha elbow applied.
"""
r = self.render_gen(genome, width, height, blend=blend)
next(r)
return ifilter(None, imap(r.send, chain(times, [None])))
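    # A hedged usage sketch of render(); `gnm` and the `save_frame` helper are
    # hypothetical. Buffers are copied because the yielded page-locked memory
    # is recycled on the next iteration:
    #
    #   times = [(i, i / 24.0) for i in range(96)]
    #   for img in renderer.render(gnm, times, 1280, 720):
    #       save_frame(img.idx, np.array(img.buf, copy=True))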
def render_gen(self, genome, width, height, blend=True):
"""
Render frames. This method is wrapped by the ``render()`` method; see
its docstring for warnings and details.
Instead of passing frame times as an iterable, they are passed
individually via the ``generator.send()`` method. There is an
internal pipeline latency of one frame, so the first call to the
``send()`` method will return None, the second call will return the
first frame's result, and so on. To retrieve the last frame in a
sequence, send ``None``.
Direct use of this method is useful for implementing render servers.
"""
last_idx = None
next_frame = yield
if next_frame is None:
return
if not self.mod:
self.load(genome)
filt = filtering.Filtering()
reset_rb_fun = self.mod.get_function("reset_rb")
packer_fun = self.mod.get_function("interp_iter_params")
iter_fun = self.mod.get_function("iter")
# The synchronization model is messy. See helpers/task_model.svg.
iter_stream = cuda.Stream()
filt_stream = cuda.Stream()
if self.acc_mode == 'deferred':
write_stream = cuda.Stream()
write_fun = self.mod.get_function("write_shmem")
else:
write_stream = iter_stream
# These events fire when the corresponding buffer is available for
# reading on the host (i.e. the copy is done). On the first pass, 'a'
# will be ignored, and subsequently moved to 'b'.
event_a = cuda.Event().record(filt_stream)
event_b = None
awidth = width + 2 * self.gutter
aheight = height + 2 * self.gutter
astride = 32 * int(np.ceil(awidth / 32.))
dim = Dimensions(width, height, awidth, aheight, astride)
d_acc_size = self.mod.get_global('acc_size')[0]
cuda.memcpy_htod_async(d_acc_size, u32(list(dim)), write_stream)
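        # Worked example of the padding (illustrative numbers only): for
        # width=1280, height=720 and gutter=15, awidth=1310 and aheight=750;
        # astride rounds awidth up to the next multiple of 32, i.e. 1312.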
nbins = awidth * aheight
# Extra padding in accum helps with write_shmem overruns
d_accum = cuda.mem_alloc(16 * nbins + (1<<16))
d_out = cuda.mem_alloc(16 * nbins)
if self.acc_mode == 'atomic':
d_atom = cuda.mem_alloc(8 * nbins)
flush_fun = self.mod.get_function("flush_atom")
obuf_copy = argset(cuda.Memcpy2D(),
src_y=self.gutter, src_x_in_bytes=16*self.gutter,
src_pitch=16*astride, dst_pitch=16*width,
width_in_bytes=16*width, height=height)
obuf_copy.set_src_device(d_out)
h_out_a = cuda.pagelocked_empty((height, width, 4), f32)
h_out_b = cuda.pagelocked_empty((height, width, 4), f32)
if self.acc_mode == 'deferred':
# Having a fixed, power-of-two log size makes things much easier
log_size = 64 << 20
d_log = cuda.mem_alloc(log_size * 4)
d_log_sorted = cuda.mem_alloc(log_size * 4)
sorter = sort.Sorter(log_size)
            # We need to cover each unique tag - address bits 20-23 - with one
            # write block per sort bin. Or something like that.
nwriteblocks = int(np.ceil(nbins / float(1<<20))) * 256
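            # Illustrative arithmetic: a 1310x750 padded buffer has
            # nbins = 982500 < 2**20, so a single tag bucket suffices and
            # nwriteblocks = 1 * 256.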
# Calculate 'nslots', the number of simultaneous running threads that
# can be active on the GPU during iteration (and thus the number of
# slots for loading and storing RNG and point context that will be
# prepared on the device), and derive 'rb_size', the number of blocks in
# 'nslots'.
iter_threads_per_block = 256
dev_data = pycuda.tools.DeviceData()
occupancy = pycuda.tools.OccupancyRecord(
dev_data, iter_threads_per_block,
iter_fun.shared_size_bytes, iter_fun.num_regs)
nsms = cuda.Context.get_device().multiprocessor_count
rb_size = occupancy.warps_per_mp * nsms / (iter_threads_per_block / 32)
nslots = iter_threads_per_block * rb_size
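        # Illustrative example only (the real numbers depend on the GPU and
        # the compiled kernel): with 48 resident warps per SM, 14 SMs, and
        # 256 threads (8 warps) per block, rb_size = 48 * 14 / 8 = 84 blocks
        # and nslots = 256 * 84 = 21504.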
# Reset the ringbuffer info for the slots
reset_rb_fun(np.int32(rb_size), block=(1,1,1))
d_points = cuda.mem_alloc(nslots * 16)
# This statement may add extra seeds to simplify palette dithering.
seeds = mwc.MWC.make_seeds(max(nslots, 256 * self.palette_height))
d_seeds = cuda.to_device(seeds)
        # We used to auto-calculate this as a multiple of the number of SMs on
        # the device, but since we now use shorter launches and, to a certain
        # extent, allow simultaneous occupancy, that's not as important. The
        # 1024 is a magic constant chosen to give a reasonable, power-of-two
        # log size for deferred mode: 256MB / (4B * FUSE * NTHREADS).
        # Enhancements to the sort engine are needed to make this more
        # flexible.
ntemporal_samples = 1024
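        # Sanity check of that constant (illustrative): 4 B/entry * 256 (fuse)
        # * 256 (threads per block) * 1024 temporal samples = 256 MiB, which
        # matches the 64M-entry, 4-byte log allocated for deferred mode above.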
genome_times, genome_knots = self._iter.packer.pack()
d_genome_times = cuda.to_device(genome_times)
d_genome_knots = cuda.to_device(genome_knots)
info_size = 4 * len(self._iter.packer) * ntemporal_samples
d_infos = cuda.mem_alloc(info_size)
pals = genome.color.palette
if isinstance(pals, basestring):
pals = [0.0, pals, 1.0, pals]
palint_times = np.empty(len(genome_times[0]), f32)
palint_times.fill(100.0)
palint_times[:len(pals)] = [p[0] for p in pals]
d_palint_times = cuda.to_device(palint_times)
d_palint_vals = cuda.to_device(
np.concatenate([p[1].data for p in pals]))
if self.acc_mode in ('deferred', 'atomic'):
palette_fun = self.mod.get_function("interp_palette_hsv_flat")
dsc = argset(cuda.ArrayDescriptor3D(), height=self.palette_height,
width=256, depth=0, format=cuda.array_format.SIGNED_INT32,
num_channels=2, flags=cuda.array3d_flags.SURFACE_LDST)
palarray = cuda.Array(dsc)
tref = self.mod.get_surfref('flatpal')
tref.set_array(palarray, 0)
else:
palette_fun = self.mod.get_function("interp_palette_hsv")
dsc = argset(cuda.ArrayDescriptor(), height=self.palette_height,
width=256, format=cuda.array_format.UNSIGNED_INT8,
num_channels=4)
d_palmem = cuda.mem_alloc(256 * self.palette_height * 4)
tref = self.mod.get_texref('palTex')
tref.set_address_2d(d_palmem, dsc, 1024)
tref.set_format(cuda.array_format.UNSIGNED_INT8, 4)
tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
tref.set_filter_mode(cuda.filter_mode.LINEAR)
while next_frame is not None:
# tc, td, ts, te: central, delta, start, end times
idx, tc = next_frame
td = genome.adj_frame_width(tc)
ts, te = tc - 0.5 * td, tc + 0.5 * td
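            # Numeric example (illustrative): tc=0.5 with a frame width of
            # td=1/24 gives ts ~= 0.4792 and te ~= 0.5208.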
if self.acc_mode in ('deferred', 'atomic'):
                # In these modes, the palette kernel writes to a surface
                # reference but requires dithering, so we pass it the seeds
                # instead of an output pointer
arg0 = d_seeds
else:
arg0 = d_palmem
palette_fun(arg0, d_palint_times, d_palint_vals,
f32(ts), f32(td / self.palette_height),
block=(256,1,1), grid=(self.palette_height,1),
stream=write_stream)
packer_fun(d_infos, d_seeds, d_genome_times, d_genome_knots,
f32(ts), f32(td / ntemporal_samples),
i32(ntemporal_samples), block=(256,1,1),
grid=(int(np.ceil(ntemporal_samples/256.)),1),
stream=iter_stream)
# Reset points so that they will be FUSEd
util.BaseCode.fill_dptr(self.mod, d_points, 4 * nslots,
iter_stream, f32(np.nan))
# Get interpolated control points for debugging
#iter_stream.synchronize()
#d_temp = cuda.from_device(d_infos,
#(ntemporal_samples, len(self._iter.packer)), f32)
#for i, n in zip(d_temp[5], self._iter.packer.packed):
#print '%60s %g' % ('_'.join(n), i)
util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, write_stream)
if self.acc_mode == 'atomic':
util.BaseCode.fill_dptr(self.mod, d_atom, 2 * nbins, write_stream)
nrounds = int( (genome.spp(tc) * width * height)
/ (ntemporal_samples * 256 * 256) ) + 1
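            # Illustrative arithmetic: at 2000 samples per pixel on a 1280x720
            # frame, nrounds = int(2000 * 1280 * 720 / (1024 * 256 * 256)) + 1
            # = 27 + 1 = 28 kernel launches.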
if self.acc_mode == 'deferred':
for i in range(nrounds):
iter_fun(np.uint64(d_log), d_seeds, d_points, d_infos,
block=(32, self._iter.NTHREADS/32, 1),
grid=(ntemporal_samples, 1), stream=iter_stream)
_sync_stream(write_stream, iter_stream)
sorter.sort(d_log_sorted, d_log, log_size, 3, True,
stream=write_stream)
_sync_stream(iter_stream, write_stream)
write_fun(d_accum, d_log_sorted, sorter.dglobal, i32(nbins),
block=(1024, 1, 1), grid=(nwriteblocks, 1),
stream=write_stream)
else:
args = [u64(d_accum), d_seeds, d_points, d_infos]
if self.acc_mode == 'atomic':
args.append(u64(d_atom))
iter_fun(*args, block=(32, self._iter.NTHREADS/32, 1),
grid=(ntemporal_samples, nrounds), stream=iter_stream)
if self.acc_mode == 'atomic':
nblocks = int(np.ceil(np.sqrt(nbins/float(512))))
flush_fun(u64(d_accum), u64(d_atom), i32(nbins),
block=(512, 1, 1), grid=(nblocks, nblocks),
stream=iter_stream)
util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
_sync_stream(filt_stream, write_stream)
filt.de(d_out, d_accum, genome, dim, tc, stream=filt_stream)
_sync_stream(write_stream, filt_stream)
filt.colorclip(d_out, genome, dim, tc, blend, stream=filt_stream)
obuf_copy.set_dst_host(h_out_a)
obuf_copy(filt_stream)
if event_b:
while not event_a.query():
timemod.sleep(0.01)
gpu_time = event_a.time_since(event_b)
result = RenderedImage(h_out_b, last_idx, gpu_time)
else:
result = None
last_idx = idx
event_a, event_b = cuda.Event().record(filt_stream), event_a
h_out_a, h_out_b = h_out_b, h_out_a
# TODO: add ability to flush a frame without breaking the pipe
next_frame = yield result
while not event_a.query():
timemod.sleep(0.001)
gpu_time = event_a.time_since(event_b)
yield RenderedImage(h_out_b, last_idx, gpu_time)