cuburn/cuburn/render.py

import sys
import math
import re
import time
from itertools import cycle, repeat, chain, izip
from ctypes import *
from cStringIO import StringIO
import numpy as np
from scipy import ndimage

from fr0stlib import pyflam3
from fr0stlib.pyflam3._flam3 import *
from fr0stlib.pyflam3.constants import *

import pycuda.compiler
import pycuda.driver as cuda
import pycuda.tools
from pycuda.gpuarray import vec

from cuburn import affine
from cuburn.code import util, mwc, iter, filtering

def _chunk(l, cs):
    """
    Yield the contents of list ``l`` in chunks of size no more than ``cs``.
    """
    for i in range(0, len(l), cs):
        yield l[i:i+cs]

class Genome(object):
    """
    Normalizes and precalculates some properties of a Genome. Assumes that
    Genome argument passed in will not change.
    """
    # Fix the ctypes ugliness since switching to __getattribute__ in 2.7.
    # There are more elegant ways to do this, but I can't be bothered.
    def __getattr__(self, name):
        return getattr(self.cp, name)

    def __init__(self, ctypes_genome):
        self.cp = ctypes_genome
        self.xforms = [self.xform[i] for i in range(self.num_xforms)]
        dens = np.array([x.density for i, x in enumerate(self.xforms)
                         if i != self.final_xform_index])


        num_std_xf = len(dens)
        self.chaos_densities = np.zeros( (num_std_xf,num_std_xf) )
        for r in range(num_std_xf):
            chaos_row = np.array([ctypes_genome.chaos[r][c]
                                  for c in range(num_std_xf)])
            chaos_row = chaos_row * dens
            chaos_row /= np.sum(chaos_row)
            chaos_row = np.cumsum(chaos_row)
            self.chaos_densities[r,:] = chaos_row

        dens /= np.sum(dens)
        self.norm_density = np.cumsum(dens)

        # For performance reasons, defer this calculation
        self._camera_transform = None

    scale = property(lambda cp: 2.0 ** cp.zoom)
    adj_density = property(lambda cp: cp.sample_density * (cp.scale ** 2))
    ppu = property(lambda cp: cp.pixels_per_unit * cp.scale)

    @property
    def camera_transform(self):
        """
        An affine matrix which will transform IFS coordinates to image width
        and height. Assumes that width and height are constant.
        """
        cp = self
        if self._camera_transform is not None:
            return self._camera_transform
        g = Features.gutter
        if cp.estimator:
            # The filter shifts by this amount as a side effect of it being
            # written in a confusing and sloppy manner
            # TODO: this will be weird in an animation where one endpoint has
            # a radius of 0, and the other does not
            g -= Features.gutter / 2 - 1
        self._camera_transform = (
                 affine.translate(0.5 * cp.width + g, 0.5 * cp.height + g)
               * affine.scale(cp.ppu, cp.ppu)
               * affine.translate(-cp._center[0], -cp._center[1])
               * affine.rotate(cp.rotate * 2 * np.pi / 360,
                               cp.rot_center[0],
                               cp.rot_center[1]) )
        return self._camera_transform

class Animation(object):
    """
    Control structure for rendering a series of frames.

    Each animation will dynamically generate a kernel that includes only the
    code necessary to render the genomes provided. The process of generating
    and uploading the kernel takes a small but finite amount of time. In
    general, the kernel generated for all genomes resulting from interpolating
    between two control points will have identical performance, so it is
    wasteful to create more than one animation for any interpolated sequence.

    However, genome sequences interpolated from three or more control points
    with different features enabled will have the code needed to render all
    genomes enabled for every frame. Doing this can hurt performance.

    In other words, it's best to use exactly one Animation for each
    interpolated sequence between one or two genomes.
    """
    cmp_options = ('-use_fast_math', '-maxrregcount', '32')
    keep = False

    def __init__(self, ctypes_genome_array):
        self._g_arr = type(ctypes_genome_array)()
        libflam3.flam3_align(self._g_arr, ctypes_genome_array,
                             len(ctypes_genome_array))
        self.genomes = map(Genome, self._g_arr)
        self.features = Features(self.genomes)
        self._iter = self._de = self.src = self.cubin = self.mod = None

        # Ensure class options don't get contaminated on an instance
        self.cmp_options = list(self.cmp_options)

    def compile(self, keep=None, cmp_options=None):
        """
        Compile a kernel capable of rendering every frame in this animation.
        The resulting compiled kernel is stored in the ``cubin`` property;
        the source is available as ``src``, and is also returned for
        inspection and display.

        This operation is idempotent, and has no side effects outside of
        setting properties on this instance (unless there's a compiler error,
        which is a bug); it should therefore be threadsafe as well.
        It is, however, rather slow.
        """
        keep = self.keep if keep is None else keep
        cmp_options = self.cmp_options if cmp_options is None else cmp_options

        self._iter = iter.IterCode(self.features)
        self._de = filtering.DensityEst(self.features, self.genomes[0])
        cclip = filtering.ColorClip(self.features)
        # TODO: make choice of filtering explicit
        self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer,
                                      self._iter, cclip, self._de)
        self.cubin = pycuda.compiler.compile(
                self.src, keep=keep, options=cmp_options,
                cache_dir=False if keep else None)
        return self.src

    def copy(self):
        """
        Return a copy of this animation without any references to the current
        CUDA context. This can be used to load an animation in multiple CUDA
        contexts without recompiling, so that rendering can proceed across
        multiple devices - but managing that is up to you.
        """
        import copy
        new = copy.copy(self)
        new.mod = None
        return new

    def load(self, jit_options=[]):
        """
        Replace the currently loaded CUDA module in the active CUDA context
        with the compiled code's module. A reference is kept to the module,
        meaning that rendering should henceforth only be called from the
        thread and context in which this function was called.
        """
        if self.cubin is None:
            self.compile()
        self.mod = cuda.module_from_buffer(self.cubin, jit_options)

    def render_frames(self, times=None):
        """
        Render a flame for each genome in the iterable value 'genomes'.
        Returns a Python generator object which will yield one NumPy array
        for each rendered image.

        This method produces a considerable amount of side effects, and should
        not be used lightly. Things may go poorly for you if this method is not
        allowed to run until completion (by exhausting all items in the
        generator object).

        A performance note: while any ready tasks will be scheduled on the GPU
        before yielding a result, spending a lot of time before returning
        control to this function can allow the GPU to become idle. It's best
        to hand the resulting array to another thread after grabbing it from
        the renderer for handling.

        ``times`` is a sequence of center times at which to render, or ``None``
        to render one frame for each genome used to create the animation.
        """
        # Don't see this changing, but empirical tests could prove me wrong
        NRENDERERS = 2
        # TODO: under a slightly modified sequencing, certain buffers can be
        # shared (though this may be unimportant if a good AA technique which
        # doesn't require full SS can be found)
        rdrs = [_AnimRenderer(self) for i in range(NRENDERERS)]

        # Zip up each genome with an alternating renderer, plus enough empty
        # genomes at the end to flush all pending tasks
        times = times if times is not None else [cp.time for cp in self.genomes]
        exttimes = chain(times, repeat(None, NRENDERERS))
        for rdr, time in izip(cycle(rdrs), exttimes):
            if rdr.wait():
                yield rdr.get_result()
            if time is not None:
                rdr.render(time)

    def _interp(self, time, cp):
        flam3_interpolate(self._g_arr, len(self._g_arr), time, 0, byref(cp))

class _AnimRenderer(object):
    # Large launches lock the display for a considerable period and may be
    # killed due to a device timeout; small launches are harder to load-balance
    # on the GPU and incur overhead. This empirical value is multiplied by the
    # number of SMs on the device to determine how many blocks should be in
    # each launch. Extremely high quality, high resolution renders may still
    # encounter a device timeout, and no workaround is in place for that yet.
    SM_FACTOR = 8

    # Currently, palette interpolation is done independently of animation
    # interpolation, so that the process is not biased and so we only need to
    # mess about with one texture per renderer. This many steps will always be
    # used, no matter the number of time steps.
    PAL_HEIGHT = 16


    def __init__(self, anim):
        self.anim = anim
        self.pending = False
        self.stream = cuda.Stream()

        self._nsms = cuda.Context.get_device().multiprocessor_count
        self.cps_per_block = self._nsms * self.SM_FACTOR
        self.ncps = anim.features.max_cps
        self.nblocks = int(math.ceil(self.ncps / float(self.cps_per_block)))

        # These are stored to avoid leaks, not to be stateful in method calls
        self._dst_cp = pyflam3.Genome()
        memset(byref(self._dst_cp), 0, sizeof(self._dst_cp))
        self._cen_cp = pyflam3.Genome()
        memset(byref(self._cen_cp), 0, sizeof(self._cen_cp))

        self.nbins = anim.features.acc_height * anim.features.acc_stride
        self.d_accum = cuda.mem_alloc(16 * self.nbins)
        self.d_out = cuda.mem_alloc(16 * self.nbins)

        info_size = anim._iter.packer.align * self.ncps
        self.d_infos = cuda.mem_alloc(info_size)
        # Defer generation of seeds until they're first needed
        self.d_seeds = None

        # During the main rendering loop, we alternate between two streams and
        # two sets of seeds, synchronizing them at the end of rendering.
        self.alt_stream = cuda.Stream()
        self.d_alt_seeds = None

        # It's less than ideal, but we lock some memory ahead of time
        self.h_infos_locked = cuda.pagelocked_empty((info_size/4,), np.float32)

    def render(self, cen_time):
        assert not self.pending, "Tried to render with results pending!"
        self.pending = True
        a = self.anim

        cen_cp = self._cen_cp
        a._interp(cen_time, cen_cp)
        palette = self._interp_colors(cen_time, cen_cp)

        util.BaseCode.zero_dptr(a.mod, self.d_accum, 4 * self.nbins,
                                self.stream)
        # Ensure all main stream tasks are done before starting alt stream
        self.alt_stream.wait_for_event(cuda.Event().record(self.stream))

        dpal = cuda.make_multichannel_2d_array(palette, 'C')
        tref = a.mod.get_texref('palTex')
        tref.set_array(dpal)
        tref.set_format(cuda.array_format.UNSIGNED_INT8, 4)
        tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
        tref.set_filter_mode(cuda.filter_mode.LINEAR)

        cp = self._dst_cp
        packer = a._iter.packer

        iter_fun = a.mod.get_function("iter")
        #iter_fun.set_cache_config(cuda.func_cache.PREFER_L1)

        # Must be accumulated over all CPs
        gam, vib = 0, 0
        bkgd = np.zeros(3)

        # This is gross, but there are a lot of fiddly corner cases with any
        # index-based iteration scheme.
        times = list(enumerate(self._mk_dts(cen_time, cen_cp, self.ncps)))
        for b, block_times in enumerate(_chunk(times, self.cps_per_block)):
            on_main = b % 2 == 0
            stream = self.stream if on_main else self.alt_stream
            d_seeds = self.d_seeds if on_main else self.d_alt_seeds

            if not d_seeds:
                seeds = mwc.MWC.make_seeds(iter.IterCode.NTHREADS *
                                           self.cps_per_block)
                h_seeds = cuda.pagelocked_empty(seeds.shape, seeds.dtype)
                h_seeds[:] = seeds
                size = seeds.dtype.itemsize * seeds.size
                d_seeds = cuda.mem_alloc(size)
                cuda.memcpy_htod_async(d_seeds, h_seeds, stream)
                if on_main:
                    self.d_seeds = d_seeds
                else:
                    self.d_alt_seeds = d_seeds

            infos = []
            if len(a.genomes) > 1:
                for n, t in block_times:
                    a._interp(t, cp)
                    frac = float(n) / cen_cp.ntemporal_samples
                    info = packer.pack(cp=Genome(cp), cp_step_frac=frac)
                    infos.append(info)
                    gam += cp.gamma
                    vib += cp.vibrancy
                    bkgd += np.array(cp.background)
            else:
                # Can't interpolate normally; just pack copies
                packed = packer.pack(cp=a.genomes[0], cp_step_frac=0)
                infos = [packed] * len(block_times)
                gam += a.genomes[0].gamma * len(block_times)
                vib += a.genomes[0].vibrancy * len(block_times)
                bkgd += np.array(a.genomes[0].background) * len(block_times)

            infos = np.concatenate(infos)
            offset = b * packer.align * self.cps_per_block
            h_infos = self.h_infos_locked[offset/4:offset/4+len(infos)]
            h_infos[:] = infos
            # TODO: portable across 32/64-bit arches?
            d_info_off = int(self.d_infos) + offset
            cuda.memcpy_htod_async(d_info_off, h_infos, stream)

            # TODO: get block config from IterCode
            iter_fun(d_seeds, np.uint64(d_info_off), self.d_accum,
                     block=(32, 16, 1), grid=(len(block_times), 1),
                     texrefs=[tref], stream=stream)

        # Now ensure all alt stream tasks are done before continuing main
        self.stream.wait_for_event(cuda.Event().record(self.alt_stream))

        util.BaseCode.zero_dptr(a.mod, self.d_out, 4 * self.nbins,
                                self.stream)
        a._de.invoke(a.mod, Genome(cen_cp), self.d_accum, self.d_out,
                     self.stream)

        f = np.float32
        n = f(self.ncps)
        gam = f(n / gam)
        vib = f(vib / n)
        hipow = f(cen_cp.highlight_power)
        lin = f(cen_cp.gam_lin_thresh)
        lingam = f(math.pow(cen_cp.gam_lin_thresh, gam-1.0) if lin > 0 else 0)
        bkgd = vec.make_float3(*(bkgd / n))

        # TODO: get block size from colorclip class? It actually does not
        # depend on that being the case
        color_fun = a.mod.get_function("colorclip")
        color_fun(self.d_out, gam, vib, hipow, lin, lingam, bkgd,
                  block=(256, 1, 1), grid=(self.nbins / 256, 1),
                  stream=self.stream)


        # TODO: The stream seems to sync right here, automatically, before
        # returning. I think PyCUDA is forcing a sync when something drops out
        # of scope. Investigate.

    def _pal_to_np(self, cp):
        # Converting palettes by iteration has an enormous performance
        # overhead. We cheat massively and dangerously here.
        pal = cast(pointer(cp.palette), POINTER(c_double * (256 * 5)))
        val = np.frombuffer(buffer(pal.contents), count=256*5)
        return np.uint8(np.reshape(val, (256, 5))[:,1:] * 255.0)

    def _interp_colors(self, cen_time, cen_cp):
        # TODO: any visible difference between uint8 and richer formats?
        pal = np.empty((self.PAL_HEIGHT, 256, 4), dtype=np.uint8)
        a = self.anim

        if len(a.genomes) > 1:
            # The typical case; applying real motion blur
            cp = self._dst_cp
            times = self._mk_dts(cen_time, cen_cp, self.PAL_HEIGHT)
            for n, t in enumerate(times):
                a._interp(t, cp)
                pal[n] = self._pal_to_np(cp)
        else:
            # Cannot call any interp functions on a single genome; rather than
            # have alternate code-paths, just copy the same colors everywhere
            pal[0] = self._pal_to_np(a.genomes[0])
            pal[1:] = pal[0]
        return pal

    def wait(self):
        if self.pending:
            self.stream.synchronize()
            self.pending = False
            return True
        return False

    def get_result(self):
        a = self.anim
        g = a.features.gutter
        obuf_dim = (a.features.acc_height, a.features.acc_stride, 4)
        out = cuda.from_device(self.d_out, obuf_dim, np.float32)
        out = np.delete(out, np.s_[:g], axis=0)
        out = np.delete(out, np.s_[:g], axis=1)
        out = np.delete(out, np.s_[-g:], axis=0)
        out = np.delete(out, np.s_[a.features.width:], axis=1)
        return out

    @staticmethod
    def _mk_dts(cen_time, cen_cp, ncps):
        w = cen_cp.temporal_filter_width
        return [cen_time + w * (t / (ncps - 1.0) - 0.5) for t in range(ncps)]

class Features(object):
    """
    Determine features and constants required to render a particular set of
    genomes. The values of this class are fixed before compilation begins.
    """
    # Constant parameters which control handling of out-of-frame samples:
    # Number of iterations to iterate without write after new point
    fuse = 20
    # Maximum consecutive out-of-bounds points before picking new point
    max_oob = 10
    max_nxforms = 12

    # Height of the texture pallete which gets uploaded to the GPU (assuming
    # that palette-from-texture is enabled). For most genomes, this doesn't
    # need to be very large at all. However, since only an easily-cached
    # fraction of this will be accessed per SM, larger values shouldn't hurt
    # performance too much. Power-of-two, please.
    palette_height = 16

    # Maximum width of DE and other spatial filters, and thus in turn the
    # amount of padding applied. Note that, for now, this must not be changed!
    # The filtering code makes deep assumptions about this value.
    gutter = 16

    # TODO: for now, we always throw away the alpha channel before writing.
    # All code is in place to not do this, we just need to find a way to expose
    # this preference via the API (or push alpha blending entirely on the client,
    # which I'm not opposed to)
    alpha_output_channel = False

    def __init__(self, genomes):
        any = lambda l: bool(filter(None, map(l, genomes)))
        self.max_ntemporal_samples = max(
                [cp.nbatches * cp.ntemporal_samples for cp in genomes])
        self.non_box_temporal_filter = genomes[0].temporal_filter_type
        self.palette_mode = genomes[0].palette_mode and "linear" or "nearest"

        assert len(set([len(cp.xforms) for cp in genomes])) == 1, ("genomes "
            "must have same number of xforms! (use flam3-genome first)")
        self.nxforms = len(genomes[0].xforms)
        self.xforms = [XFormFeatures([cp.xforms[i] for cp in genomes], i)
                       for i in range(self.nxforms)]
        if any(lambda cp: cp.final_xform_enable):
            if not all([cp.final_xform_index == genomes[0].final_xform_index
                        for cp in genomes]):
                raise ValueError("Differing final xform indexes")
            self.final_xform_index = genomes[0].final_xform_index
        else:
            self.final_xform_index = None

        alphas = np.array([c.color[3] for g in genomes
                           for c in g.palette.entries])
        self.pal_has_alpha = np.any(alphas != 1.0)

        self.max_cps = max([cp.ntemporal_samples for cp in genomes])

        self.width = genomes[0].width
        self.height = genomes[0].height
        self.acc_width = genomes[0].width + 2 * self.gutter
        self.acc_height = genomes[0].height + 2 * self.gutter
        self.acc_stride = 32 * int(math.ceil(self.acc_width / 32.))
        self.std_xforms = filter(lambda v: v != self.final_xform_index,
                                 range(self.nxforms))
        self.chaos_used = False
        for cp in genomes:
            for r in range(len(self.std_xforms)):
                for c in range(len(self.std_xforms)):
                    if cp.chaos[r][c] != 1.0:
                        self.chaos_used = True


class XFormFeatures(object):
    def __init__(self, xforms, xform_id):
        self.id = xform_id
        any = lambda l: bool(filter(None, map(l, xforms)))
        self.has_post = any(lambda xf: getattr(xf, 'post', None))
        self.vars = set()
        for x in xforms:
            self.vars = (
                self.vars.union(set([i for i, v in enumerate(x.var) if v])))