New genome representation, and device interp.

2025-07-12 03:05:14 -04:00 · 2011-10-25 15:44:39 -04:00
parent be31708c09
commit 8939a6343a
8 changed files with 1030 additions and 729 deletions
--- a/cuburn/render.py
+++ b/cuburn/render.py
@ -1,12 +1,15 @@
+import os
 import sys
 import math
 import re
-import time
+import time as timemod
+import tempfile
 from itertools import cycle, repeat, chain, izip
 from ctypes import *
 from cStringIO import StringIO
 import numpy as np
 from scipy import ndimage
+import base64

 from fr0stlib import pyflam3
 from fr0stlib.pyflam3._flam3 import *
@ -17,79 +20,11 @@ import pycuda.driver as cuda
 import pycuda.tools
 from pycuda.gpuarray import vec

+import cuburn.genome
 from cuburn import affine
 from cuburn.code import util, mwc, iter, filtering

-def _chunk(l, cs):
-    """
-    Yield the contents of list ``l`` in chunks of size no more than ``cs``.
-    """
-    for i in range(0, len(l), cs):
-        yield l[i:i+cs]
-
-class Genome(object):
-    """
-    Normalizes and precalculates some properties of a Genome. Assumes that
-    Genome argument passed in will not change.
-    """
-    # Fix the ctypes ugliness since switching to __getattribute__ in 2.7.
-    # There are more elegant ways to do this, but I can't be bothered.
-    def __getattr__(self, name):
-        return getattr(self.cp, name)
-
-    def __init__(self, ctypes_genome):
-        self.cp = ctypes_genome
-        self.xforms = [self.xform[i] for i in range(self.num_xforms)]
-        dens = np.array([x.density for i, x in enumerate(self.xforms)
-                         if i != self.final_xform_index])
-
-
-        num_std_xf = len(dens)
-        self.chaos_densities = np.zeros( (num_std_xf,num_std_xf) )
-        for r in range(num_std_xf):
-            chaos_row = np.array([ctypes_genome.chaos[r][c]
-                                  for c in range(num_std_xf)])
-            chaos_row = chaos_row * dens
-            chaos_row /= np.sum(chaos_row)
-            chaos_row = np.cumsum(chaos_row)
-            self.chaos_densities[r,:] = chaos_row
-
-        dens /= np.sum(dens)
-        self.norm_density = np.cumsum(dens)
-
-        # For performance reasons, defer this calculation
-        self._camera_transform = None
-
-    scale = property(lambda cp: 2.0 ** cp.zoom)
-    adj_density = property(lambda cp: cp.sample_density * (cp.scale ** 2))
-    ppu = property(lambda cp: cp.pixels_per_unit * cp.scale)
-
-    @property
-    def camera_transform(self):
-        """
-        An affine matrix which will transform IFS coordinates to image width
-        and height. Assumes that width and height are constant.
-        """
-        cp = self
-        if self._camera_transform is not None:
-            return self._camera_transform
-        g = Features.gutter
-        if cp.estimator:
-            # The filter shifts by this amount as a side effect of it being
-            # written in a confusing and sloppy manner
-            # TODO: this will be weird in an animation where one endpoint has
-            # a radius of 0, and the other does not
-            g -= Features.gutter / 2 - 1
-        self._camera_transform = (
-                 affine.translate(0.5 * cp.width + g, 0.5 * cp.height + g)
-               * affine.scale(cp.ppu, cp.ppu)
-               * affine.translate(-cp._center[0], -cp._center[1])
-               * affine.rotate(cp.rotate * 2 * np.pi / 360,
-                               cp.rot_center[0],
-                               cp.rot_center[1]) )
-        return self._camera_transform
-
-class Animation(object):
+class Renderer(object):
    """
    Control structure for rendering a series of frames.

@ -108,27 +43,13 @@ class Animation(object):
    interpolated sequence between one or two genomes.
    """

-    # Large launches lock the display for a considerable period and may be
-    # killed due to a device timeout; small launches are harder to load-balance
-    # on the GPU and incur overhead. This empirical value is multiplied by the
-    # number of SMs on the device to determine how many blocks should be in
-    # each launch. Extremely high quality, high resolution renders may still
-    # encounter a device timeout, requiring the user to increase the split
-    # amount. This factor is not used in async mode.
-    SM_FACTOR = 8
-
    cmp_options = ('-use_fast_math', '-maxrregcount', '42')
-
-
    keep = False

-    def __init__(self, ctypes_genome_array):
-        self._g_arr = type(ctypes_genome_array)()
-        libflam3.flam3_align(self._g_arr, ctypes_genome_array,
-                             len(ctypes_genome_array))
-        self.genomes = map(Genome, self._g_arr)
-        self.features = Features(self.genomes)
+    def __init__(self, info):
+        self.info = info
        self._iter = self._de = self.src = self.cubin = self.mod = None
+        self.packed_genome = None

        # Ensure class options don't get contaminated on an instance
        self.cmp_options = list(self.cmp_options)
@ -148,12 +69,14 @@ class Animation(object):
        keep = self.keep if keep is None else keep
        cmp_options = self.cmp_options if cmp_options is None else cmp_options

-        self._iter = iter.IterCode(self.features)
-        self._de = filtering.DensityEst(self.features, self.genomes[0])
-        cclip = filtering.ColorClip(self.features)
-        # TODO: make choice of filtering explicit
+        self._iter = iter.IterCode(self.info)
+        self._de = filtering.DensityEst(self.info)
+        cclip = filtering.ColorClip(self.info)
+        self._iter.packer.finalize()
        self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer,
                                      self._iter, cclip, self._de)
+        with open(os.path.join(tempfile.gettempdir(), 'kernel.cu'), 'w') as fp:
+            fp.write(self.src)
        self.cubin = pycuda.compiler.compile(
                self.src, keep=keep, options=cmp_options,
                cache_dir=False if keep else None)
@ -183,11 +106,11 @@ class Animation(object):
        self.mod = cuda.module_from_buffer(self.cubin, jit_options)


-    def render_frames(self, times=None, sync=False):
+    def render(self, times):
        """
        Render a flame for each genome in the iterable value 'genomes'.
        Returns a Python generator object which will yield a 2-tuple of
-        ``(time, buf)``, where ``time`` is the central time of the frame and
+        ``(time, buf)``, where ``time`` is the start time of the frame and
        ``buf`` is a 3D (width, height, channel) NumPy array containing
        [0,1]-valued RGBA components.

@ -196,73 +119,64 @@ class Animation(object):
        allowed to run until completion (by exhausting all items in the
        generator object).

-        A performance note: while any ready tasks will be scheduled on the GPU
-        before yielding a result, spending a lot of time before returning
-        control to this function can allow the GPU to become idle. It's best
-        to hand the resulting array to another thread after grabbing it from
-        the renderer for handling.
-
-        ``times`` is a sequence of center times at which to render, or ``None``
-        to render one frame for each genome used to create the animation.
-
-        If ``sync`` is True, the CPU will sync with the GPU after every block
-        of temporal samples and yield None until the frame is ready. This
-        allows a single-card system to avoid having to go thirty seconds
-        between window refreshes while rendering. Otherwise, tasks will be
-        piled asynchronously on the card so that it is always under load.
+        ``times`` is a sequence of (start, stop) times defining the temporal
+        range to be rendered for each frame. This will change to be more
+        frame-centric in the future, allowing for interpolated temporal width.
        """
        if times == []:
            return

-        f = self.features
-
-        times = times if times is not None else [cp.time for cp in self.genomes]
+        info = self.info
        iter_stream = cuda.Stream()
        filt_stream = cuda.Stream()
-        cen_cp = pyflam3.Genome()
-        dst_cp = pyflam3.Genome()

-        nbins = f.acc_height * f.acc_stride
+        nbins = info.acc_height * info.acc_stride
        d_accum = cuda.mem_alloc(16 * nbins)
        d_out = cuda.mem_alloc(16 * nbins)

        num_sm = cuda.Context.get_device().multiprocessor_count
-        if sync:
-            cps_per_block = num_sm * self.SM_FACTOR
-        else:
-            cps_per_block = f.max_cps
+        cps_per_block = 1024

-        info_size = self._iter.packer.align * cps_per_block
+        genome_times, genome_knots = self._iter.packer.pack()
+        d_genome_times = cuda.to_device(genome_times)
+        d_genome_knots = cuda.to_device(genome_knots)
+        info_size = 4 * len(self._iter.packer) * cps_per_block
        d_infos = cuda.mem_alloc(info_size)
-        d_palmem = cuda.mem_alloc(256 * f.palette_height * 4)
+        d_palmem = cuda.mem_alloc(256 * info.palette_height * 4)

        seeds = mwc.MWC.make_seeds(self._iter.NTHREADS * cps_per_block)
        d_seeds = cuda.to_device(seeds)

-        h_infos = cuda.pagelocked_empty((info_size / 4,), np.float32)
        h_palmem = cuda.pagelocked_empty(
-                (f.palette_height, 256, 4), np.uint8)
-        h_out = cuda.pagelocked_empty((f.acc_height, f.acc_stride, 4), np.float32)
+                (info.palette_height, 256, 4), np.uint8)
+        h_out = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4),
+                                      np.float32)

        filter_done_event = None

-        packer = self._iter.packer
+        packer_fun = self.mod.get_function("interp_iter_params")
        iter_fun = self.mod.get_function("iter")
        #iter_fun.set_cache_config(cuda.func_cache.PREFER_L1)

        util.BaseCode.zero_dptr(self.mod, d_accum, 4 * nbins, filt_stream)

-        last_time = times[0]
+        last_time = times[0][0]

-        for time in times:
-            self._interp(cen_cp, time)
+        # TODO: move palette handling to a separate class; interpolate palettes
+        pal = np.fromstring(base64.b64decode(info.db.palettes.values()[0]),
+                            np.uint8)
+        pal = np.reshape(pal, (256, 3))
+        h_palmem[0,:,:3] = pal
+        h_palmem[1:] = h_palmem[0]

-            h_palmem[:] = self._interp_colors(dst_cp, time,
-                                              cen_cp.temporal_filter_width)
+        for start, stop in times:
+            cen_cp = cuburn.genome.HacketyGenome(info.genome, (start+stop)/2)
+
+            # Palette "interp" already done above; still re-uploaded each frame
            cuda.memcpy_htod_async(d_palmem, h_palmem, iter_stream)
            tref = self.mod.get_texref('palTex')
            array_info = cuda.ArrayDescriptor()
-            array_info.height = f.palette_height
+            array_info.height = info.palette_height
            array_info.width = 256
            array_info.array_format = cuda.array_format.UNSIGNED_INT8
            array_info.num_channels = 4
@ -272,69 +186,52 @@ class Animation(object):
            tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
            tref.set_filter_mode(cuda.filter_mode.LINEAR)

-            # Must be accumulated over all CPs
-            gam, vib = 0, 0
-            bkgd = np.zeros(3)

-            mblur_times = enumerate( np.linspace(-0.5, 0.5, cen_cp.ntemporal_samples)
-                                     * cen_cp.temporal_filter_width + time )
+            if filter_done_event:
+                iter_stream.wait_for_event(filter_done_event)

-            for block_times in _chunk(list(mblur_times), cps_per_block):
-                infos = []
-                if len(self.genomes) > 1:
-                    for n, t in block_times:
-                        self._interp(dst_cp, t)
-                        frac = float(n) / cen_cp.ntemporal_samples
-                        info = packer.pack(cp=Genome(dst_cp), cp_step_frac=frac)
-                        infos.append(info)
-                        gam += dst_cp.gamma
-                        vib += dst_cp.vibrancy
-                        bkgd += np.array(dst_cp.background)
-                else:
-                    # Can't interpolate normally; just pack copies
-                    packed = packer.pack(cp=self.genomes[0], cp_step_frac=0)
-                    infos = [packed] * len(block_times)
-                    gam += self.genomes[0].gamma * len(block_times)
-                    vib += self.genomes[0].vibrancy * len(block_times)
-                    bkgd += np.array(self.genomes[0].background) * len(block_times)
+            width = np.float32((stop-start) / cps_per_block)
+            packer_fun(d_infos, d_genome_times, d_genome_knots,
+                       np.float32(start), width, d_seeds,
+                       block=(256,1,1), grid=(cps_per_block/256,1),
+                       stream=iter_stream)

-                infos = np.concatenate(infos)
-                h_infos[:len(infos)] = infos
-                cuda.memcpy_htod_async(d_infos, h_infos)
+            # Get interpolated control points for debugging
+            #iter_stream.synchronize()
+            #d_temp = cuda.from_device(d_infos,
+                    #(cps_per_block, len(self._iter.packer)), np.float32)
+            #for i, n in zip(d_temp[5], self._iter.packer.packed):
+                #print '%60s %g' % ('_'.join(n), i)

-                if filter_done_event:
-                    iter_stream.wait_for_event(filter_done_event)
+            nsamps = info.density * info.width * info.height / cps_per_block
+            iter_fun(np.uint64(d_accum), d_seeds, d_infos, np.int32(nsamps),
+                     block=(32, self._iter.NTHREADS/32, 1),
+                     grid=(cps_per_block, 1),
+                     texrefs=[tref], stream=iter_stream)

-                # TODO: replace with option to split long runs shorter ones
-                # for interactivity
-                for i in range(1):
-                    iter_fun(d_seeds, d_infos, np.uint64(d_accum),
-                             block=(32, self._iter.NTHREADS/32, 1),
-                             grid=(len(block_times), 1),
-                             texrefs=[tref], stream=iter_stream)
-
-                    if sync:
-                        iter_stream.synchronize()
-                        yield None
-
-            if filter_done_event and not sync:
+            if filter_done_event:
+                while not filt_stream.is_done():
+                    timemod.sleep(0.01)
                filt_stream.synchronize()
                yield last_time, self._trim(h_out)
-                last_time = time
+                last_time = start

            util.BaseCode.zero_dptr(self.mod, d_out, 4 * nbins, filt_stream)
-            self._de.invoke(self.mod, Genome(cen_cp), d_accum, d_out, filt_stream)
+            self._de.invoke(self.mod, cen_cp, d_accum, d_out, filt_stream)
            util.BaseCode.zero_dptr(self.mod, d_accum, 4 * nbins, filt_stream)
            filter_done_event = cuda.Event().record(filt_stream)

            f32 = np.float32
-            n = f32(cen_cp.ntemporal_samples)
-            gam = f32(n / gam)
-            vib = f32(vib / n)
-            hipow = f32(cen_cp.highlight_power)
-            lin = f32(cen_cp.gam_lin_thresh)
-            lingam = f32(math.pow(cen_cp.gam_lin_thresh, gam-1.0) if lin > 0 else 0)
-            bkgd = vec.make_float3(*(bkgd / n))
+            # TODO: implement integration over cubic splines?
+            gam = f32(1 / cen_cp.color.gamma)
+            vib = f32(cen_cp.color.vibrancy)
+            hipow = f32(cen_cp.color.highlight_power)
+            lin = f32(cen_cp.color.gamma_threshold)
+            lingam = f32(math.pow(lin, gam-1.0) if lin > 0 else 0)
+            bkgd = vec.make_float3(
+                    cen_cp.color.background.r,
+                    cen_cp.color.background.g,
+                    cen_cp.color.background.b)

            color_fun = self.mod.get_function("colorclip")
            blocks = int(np.ceil(np.sqrt(nbins / 256)))
@ -343,133 +240,10 @@ class Animation(object):
                      stream=filt_stream)
            cuda.memcpy_dtoh_async(h_out, d_out, filt_stream)

-            if sync:
-                filt_stream.synchronize()
-                yield time, self._trim(h_out)
-
-        if not sync:
-            filt_stream.synchronize()
-            yield time, self._trim(h_out)
-
-    def _interp(self, cp, time):
-        flam3_interpolate(self._g_arr, len(self._g_arr), time, 0, byref(cp))
-
-    @staticmethod
-    def _pal_to_np(cp):
-        # Converting palettes by iteration has an enormous performance
-        # overhead. We cheat massively and dangerously here.
-        pal = cast(pointer(cp.palette), POINTER(c_double * (256 * 5)))
-        val = np.frombuffer(buffer(pal.contents), count=256*5)
-        return np.uint8(np.reshape(val, (256, 5))[:,1:] * 255.0)
-
-    def _interp_colors(self, cp, time, twidth):
-        # TODO: any visible difference between uint8 and richer formats?
-        height = self.features.palette_height
-        pal = np.empty((height, 256, 4), dtype=np.uint8)
-
-        if len(self.genomes) > 1:
-            # The typical case; applying real motion blur
-            times = np.linspace(-0.5, 0.5, height) * twidth + time
-            for n, t in enumerate(times):
-                self._interp(cp, t)
-                pal[n] = self._pal_to_np(cp)
-        else:
-            # Cannot call any interp functions on a single genome; rather than
-            # have alternate code-paths, just copy the same colors everywhere
-            pal[0] = self._pal_to_np(self.genomes[0])
-            pal[1:] = pal[0]
-        return pal
+        filt_stream.synchronize()
+        yield start, self._trim(h_out)

    def _trim(self, result):
-        g = self.features.gutter
+        g = self.info.gutter
        return result[g:-g,g:-g].copy()

-
-class Features(object):
-    """
-    Determine features and constants required to render a particular set of
-    genomes. The values of this class are fixed before compilation begins.
-    """
-    # Constant parameters which control handling of out-of-frame samples:
-    # Number of iterations to iterate without write after new point
-    fuse = 10
-    # Maximum consecutive out-of-bounds points before picking new point
-    max_oob = 10
-    max_nxforms = 12
-
-    # Height of the texture pallete which gets uploaded to the GPU (assuming
-    # that palette-from-texture is enabled). For most genomes, this doesn't
-    # need to be very large at all. However, since only an easily-cached
-    # fraction of this will be accessed per SM, larger values shouldn't hurt
-    # performance too much. Power-of-two, please.
-    palette_height = 16
-
-    # Maximum width of DE and other spatial filters, and thus in turn the
-    # amount of padding applied. Note that, for now, this must not be changed!
-    # The filtering code makes deep assumptions about this value.
-    gutter = 16
-
-    # TODO: for now, we always throw away the alpha channel before writing.
-    # All code is in place to not do this, we just need to find a way to expose
-    # this preference via the API (or push alpha blending entirely on the client,
-    # which I'm not opposed to)
-    alpha_output_channel = False
-
-    def __init__(self, genomes):
-        any = lambda l: bool(filter(None, map(l, genomes)))
-        self.max_ntemporal_samples = max(
-                [cp.nbatches * cp.ntemporal_samples for cp in genomes])
-        self.non_box_temporal_filter = genomes[0].temporal_filter_type
-        self.palette_mode = genomes[0].palette_mode and "linear" or "nearest"
-
-        assert len(set([len(cp.xforms) for cp in genomes])) == 1, ("genomes "
-            "must have same number of xforms! (use flam3-genome first)")
-        self.nxforms = len(genomes[0].xforms)
-        self.xforms = [XFormFeatures([cp.xforms[i] for cp in genomes], i)
-                       for i in range(self.nxforms)]
-        if any(lambda cp: cp.final_xform_enable):
-            if not all([cp.final_xform_index == genomes[0].final_xform_index
-                        for cp in genomes]):
-                raise ValueError("Differing final xform indexes")
-            self.final_xform_index = genomes[0].final_xform_index
-        else:
-            self.final_xform_index = None
-
-        alphas = np.array([c.color[3] for g in genomes
-                           for c in g.palette.entries])
-        self.pal_has_alpha = np.any(alphas != 1.0)
-
-        self.max_cps = max([cp.ntemporal_samples for cp in genomes])
-
-        self.width = genomes[0].width
-        self.height = genomes[0].height
-        self.acc_width = genomes[0].width + 2 * self.gutter
-        self.acc_height = genomes[0].height + 2 * self.gutter
-        self.acc_stride = 32 * int(math.ceil(self.acc_width / 32.))
-        self.std_xforms = filter(lambda v: v != self.final_xform_index,
-                                 range(self.nxforms))
-        self.chaos_used = False
-        for cp in genomes:
-            for r in range(len(self.std_xforms)):
-                for c in range(len(self.std_xforms)):
-                    if cp.chaos[r][c] != 1.0:
-                        self.chaos_used = True
-
-
-
-class XFormFeatures(object):
-    def __init__(self, xforms, xform_id):
-        self.id = xform_id
-        any = lambda l: bool(filter(None, map(l, xforms)))
-
-        self.has_post = any(lambda xf: not self.id_matrix(xf.post))
-        self.vars = set()
-        for x in xforms:
-            self.vars = (
-                self.vars.union(set([i for i, v in enumerate(x.var) if v])))
-
-    @staticmethod
-    def id_matrix(m):
-        return (m[0][0] == 1 and m[1][0] == 0 and m[2][0] == 0 and
-                m[0][1] == 0 and m[1][1] == 1 and m[2][1] == 0)
-