New genome format to support flockutil

2025-12-26 14:59:21 -05:00 · 2011-12-15 11:11:05 -05:00
parent 12655b8611
commit b43481e374
5 changed files with 303 additions and 275 deletions
--- a/cuburn/render.py
+++ b/cuburn/render.py
@ -4,11 +4,11 @@ import re
 import time as timemod
 import tempfile
 from collections import namedtuple
-from itertools import cycle, repeat, chain, izip
+from itertools import cycle, repeat, chain, izip, imap, ifilter
 from ctypes import *
 from cStringIO import StringIO
 import numpy as np
-from numpy import int32 as i32, uint64 as u64
+from numpy import float32 as f32, int32 as i32, uint32 as u32, uint64 as u64
 from scipy import ndimage

 from fr0stlib import pyflam3
@ -24,41 +24,55 @@ from cuburn import affine
 from cuburn.code import util, mwc, iter, filtering, sort

 RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time')
+Dimensions = namedtuple('Dimensions', 'w h aw ah astride')

 def _sync_stream(dst, src):
    dst.wait_for_event(cuda.Event(cuda.event_flags.DISABLE_TIMING).record(src))

+def argset(obj, **kwargs):
+    for k, v in kwargs.items():
+        setattr(obj, k, v)
+    return obj
+
 class Renderer(object):
    """
    Control structure for rendering a series of frames.
-
-    Each animation will dynamically generate a kernel that includes only the
-    code necessary to render the genomes provided. The process of generating
-    and uploading the kernel takes a small but finite amount of time. In
-    general, the kernel generated for all genomes resulting from interpolating
-    between two control points will have identical performance, so it is
-    wasteful to create more than one animation for any interpolated sequence.
-
-    However, genome sequences interpolated from three or more control points
-    with different features enabled will have the code needed to render all
-    genomes enabled for every frame. Doing this can hurt performance.
-
-    In other words, it's best to use exactly one Animation for each
-    interpolated sequence between one or two genomes.
    """

+    # Number of iterations to iterate without write after generating a new
+    # point. This number is currently fixed pretty deeply in the set of magic
+    # constants which govern buffer sizes; changing the value here won't
+    # actually change the code on the device to do something different.
+    fuse = 256
+
+    # The palette texture/surface covers the color coordinate from [0,1] with
+    # (for now, a fixed 256) equidistant horizontal samples, and spans the
+    # temporal range of the frame linearly with this many rows. Increasing
+    # this value increases the number of uniquely-dithered samples when using
+    # pre-dithered surfaces.
+    palette_height = 64
+
+    # Maximum width of DE and other spatial filters, and thus in turn the
+    # amount of padding applied. Note that, for now, this must not be changed!
+    # The filtering code makes deep assumptions about this value.
+    gutter = 15
+
+    # Accumulation mode. Leave it at 'atomic' for now.
+    acc_mode = 'atomic'
+
+    # TODO
+    chaos_used = False
+
    cmp_options = ('-use_fast_math', '-maxrregcount', '42')
    keep = False

-    def __init__(self, info):
-        self.info = info
+    def __init__(self):
        self._iter = self.src = self.cubin = self.mod = None
-        self.packed_genome = None

        # Ensure class options don't get contaminated on an instance
        self.cmp_options = list(self.cmp_options)

-    def compile(self, keep=None, cmp_options=None, jit_options=[]):
+    def compile(self, genome, keep=None, cmp_options=None):
        """
        Compile a kernel capable of rendering every frame in this animation.
        The resulting compiled kernel is stored in the ``cubin`` property;
@ -73,7 +87,7 @@ class Renderer(object):
        keep = self.keep if keep is None else keep
        cmp_options = self.cmp_options if cmp_options is None else cmp_options

-        self._iter = iter.IterCode(self.info)
+        self._iter = iter.IterCode(self, genome)
        self._iter.packer.finalize()
        self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer,
                                      self._iter)
@ -82,41 +96,81 @@ class Renderer(object):
        self.cubin = pycuda.compiler.compile(
                self.src, keep=keep, options=cmp_options,
                cache_dir=False if keep else None)
+
+    def load(self, genome, jit_options=[]):
+        if not self.cubin:
+            self.compile(genome)
        self.mod = cuda.module_from_buffer(self.cubin, jit_options)
        with open('/tmp/iter_kern.cubin', 'wb') as fp:
            fp.write(self.cubin)
        return self.src

-    def render(self, times):
+    def render(self, genome, times, width, height, blend=True):
        """
-        Render a flame for each genome in the iterable value 'genomes'.
-        Returns a RenderedImage object with the rendered buffer in the
-        requested format (3D RGBA ndarray only for now).
+        Render a frame for each timestamp in the iterable value ``times``. This
+        function returns a generator that will yield a RenderedImage object
+        containing a shared reference to the output buffer for each specified
+        frame.

-        This method produces a considerable amount of side effects, and should
-        not be used lightly. Things may go poorly for you if this method is not
-        allowed to run until completion (by exhausting all items in the
-        generator object).
+        The returned buffer is page-locked host memory. Between the time a
+        buffer is yielded and the time the next frame's results are requested,
+        the buffer will not be modified. Thereafter, however, it will be
+        overwritten by an asynchronous DMA operation coming from the CUDA
+        device. If you hang on to it for longer than one frame, copy it.

-        ``times`` is a sequence of (idx, start, stop) times, where index is
-        the logical frame number (though it can be any value) and 'start' and
-        'stop' together define the time range to be rendered for each frame.
+        ``genome`` is the genome to be rendered. Successive calls to the
+        `render()` method on one ``Renderer`` object must use genomes which
+        produce identical compiled code, and this will not be verified by the
+        renderer. In practice, this means you can alter genome parameter
+        values, but the full set of keys must remain identical between runs on
+        the same renderer.
+
+        ``times`` is a list of (idx, cen_time) tuples, where ``idx`` is passed
+        unmodified in the RenderedImage return value and ``cen_time`` is the
+        central time of the current frame in spline-time units. (Any
+        clock-time or frame-time units in the genome should be preconverted.)
+
+        If ``blend`` is False, the output buffer will contain unclipped,
+        premultiplied RGBA data, without vibrancy, highlight power, or the
+        alpha elbow applied.
        """
-        if times == []:
+        r = self.render_gen(genome, width, height, blend=blend)
+        next(r)
+        return ifilter(None, imap(r.send, chain(times, [None])))
+
+    def render_gen(self, genome, width, height, blend=True):
+        """
+        Render frames. This method is wrapped by the ``render()`` method; see
+        its docstring for warnings and details.
+
+        Instead of passing frame times as an iterable, they are passed
+        individually via the ``generator.send()`` method. There is an
+        internal pipeline latency of one frame, so the first call to the
+        ``send()`` method will return None, the second call will return the
+        first frame's result, and so on. To retrieve the last frame in a
+        sequence, send ``None``.
+
+        Direct use of this method is useful for implementing render servers.
+        """
+
+        last_idx = None
+        next_frame = yield
+        if next_frame is None:
            return

+        if not self.mod:
+            self.load(genome)
+
        filt = filtering.Filtering()

        reset_rb_fun = self.mod.get_function("reset_rb")
        packer_fun = self.mod.get_function("interp_iter_params")
        iter_fun = self.mod.get_function("iter")

-        info = self.info
-
        # The synchronization model is messy. See helpers/task_model.svg.
        iter_stream = cuda.Stream()
        filt_stream = cuda.Stream()
-        if info.acc_mode == 'deferred':
+        if self.acc_mode == 'deferred':
            write_stream = cuda.Stream()
            write_fun = self.mod.get_function("write_shmem")
        else:
@ -128,19 +182,30 @@ class Renderer(object):
        event_a = cuda.Event().record(filt_stream)
        event_b = None

-        nbins = info.acc_height * info.acc_stride
+        awidth = width + 2 * self.gutter
+        aheight = height + 2 * self.gutter
+        astride = 32 * int(np.ceil(awidth / 32.))
+        dim = Dimensions(width, height, awidth, aheight, astride)
+        d_acc_size = self.mod.get_global('acc_size')[0]
+        cuda.memcpy_htod_async(d_acc_size, u32(list(dim)), write_stream)
+
+        nbins = awidth * aheight
        # Extra padding in accum helps with write_shmem overruns
        d_accum = cuda.mem_alloc(16 * nbins + (1<<16))
        d_out = cuda.mem_alloc(16 * nbins)
-        if info.acc_mode == 'atomic':
+        if self.acc_mode == 'atomic':
            d_atom = cuda.mem_alloc(8 * nbins)
            flush_fun = self.mod.get_function("flush_atom")

-        acc_size = np.array([info.acc_width, info.acc_height, info.acc_stride])
-        d_acc_size = self.mod.get_global('acc_size')[0]
-        cuda.memcpy_htod_async(d_acc_size, np.uint32(acc_size), write_stream)
+        obuf_copy = argset(cuda.Memcpy2D(),
+            src_y=self.gutter, src_x_in_bytes=16*self.gutter,
+            src_pitch=16*astride, dst_pitch=16*width,
+            width_in_bytes=16*width, height=height)
+        obuf_copy.set_src_device(d_out)
+        h_out_a = cuda.pagelocked_empty((height, width, 4), f32)
+        h_out_b = cuda.pagelocked_empty((height, width, 4), f32)

-        if info.acc_mode == 'deferred':
+        if self.acc_mode == 'deferred':
            # Having a fixed, power-of-two log size makes things much easier
            log_size = 64 << 20
            d_log = cuda.mem_alloc(log_size * 4)
@ -153,9 +218,8 @@ class Renderer(object):
        # Calculate 'nslots', the number of simultaneous running threads that
        # can be active on the GPU during iteration (and thus the number of
        # slots for loading and storing RNG and point context that will be
-        # prepared on the device), 'rb_size' (the number of blocks in
-        # 'nslots'), and determine a number of temporal samples
-        # likely to load-balance effectively
+        # prepared on the device), and derive 'rb_size', the number of blocks in
+        # 'nslots'.
        iter_threads_per_block = 256
        dev_data = pycuda.tools.DeviceData()
        occupancy = pycuda.tools.OccupancyRecord(
@ -169,14 +233,16 @@ class Renderer(object):
        reset_rb_fun(np.int32(rb_size), block=(1,1,1))

        d_points = cuda.mem_alloc(nslots * 16)
-        # We may add extra seeds to simplify palette dithering.
-        seeds = mwc.MWC.make_seeds(max(nslots, 256 * info.palette_height))
+        # This statement may add extra seeds to simplify palette dithering.
+        seeds = mwc.MWC.make_seeds(max(nslots, 256 * self.palette_height))
        d_seeds = cuda.to_device(seeds)

        # We used to auto-calculate this to a multiple of the number of SMs on
        # the device, but since we now use shorter launches and, to a certain
        # extent, allow simultaneous occupancy, that's not as important. The
-        # 1024 is a magic constant, though: FUSE
+        # 1024 is a magic constant to ensure reasonable and power-of-two log
+        # size for deferred: 256MB / (4B * FUSE * NTHREADS). Enhancements to
+        # the sort engine are needed to make this more flexible.
        ntemporal_samples = 1024
        genome_times, genome_knots = self._iter.packer.pack()
        d_genome_times = cuda.to_device(genome_times)
@ -184,37 +250,31 @@ class Renderer(object):
        info_size = 4 * len(self._iter.packer) * ntemporal_samples
        d_infos = cuda.mem_alloc(info_size)

-        pals = info.genome.color.palette
+        pals = genome.color.palette
        if isinstance(pals, basestring):
            pals = [0.0, pals, 1.0, pals]
-        palint_times = np.empty(len(genome_times[0]), np.float32)
+        palint_times = np.empty(len(genome_times[0]), f32)
        palint_times.fill(100.0)
-        palint_times[:len(pals)/2] = pals[::2]
+        palint_times[:len(pals)] = [p[0] for p in pals]
        d_palint_times = cuda.to_device(palint_times)
        d_palint_vals = cuda.to_device(
-                np.concatenate(map(info.db.palettes.get, pals[1::2])))
+                np.concatenate([p[1].data for p in pals]))

-        if info.acc_mode in ('deferred', 'atomic'):
+        if self.acc_mode in ('deferred', 'atomic'):
            palette_fun = self.mod.get_function("interp_palette_hsv_flat")
-            dsc = cuda.ArrayDescriptor3D()
-            dsc.height = info.palette_height
-            dsc.width = 256
-            dsc.depth = 0
-            dsc.format = cuda.array_format.SIGNED_INT32
-            dsc.num_channels = 2
-            dsc.flags = cuda.array3d_flags.SURFACE_LDST
+            dsc = argset(cuda.ArrayDescriptor3D(), height=self.palette_height,
+                    width=256, depth=0, format=cuda.array_format.SIGNED_INT32,
+                    num_channels=2, flags=cuda.array3d_flags.SURFACE_LDST)
            palarray = cuda.Array(dsc)

            tref = self.mod.get_surfref('flatpal')
            tref.set_array(palarray, 0)
        else:
            palette_fun = self.mod.get_function("interp_palette_hsv")
-            dsc = cuda.ArrayDescriptor()
-            dsc.height = info.palette_height
-            dsc.width = 256
-            dsc.format = cuda.array_format.UNSIGNED_INT8
-            dsc.num_channels = 4
-            d_palmem = cuda.mem_alloc(256 * info.palette_height * 4)
+            dsc = argset(cuda.ArrayDescriptor(), height=self.palette_height,
+                    width=256, format=cuda.array_format.UNSIGNED_INT8,
+                    num_channels=4)
+            d_palmem = cuda.mem_alloc(256 * self.palette_height * 4)

            tref = self.mod.get_texref('palTex')
            tref.set_address_2d(d_palmem, dsc, 1024)
@ -222,49 +282,46 @@ class Renderer(object):
            tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
            tref.set_filter_mode(cuda.filter_mode.LINEAR)

-        h_out_a = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4),
-                                        np.float32)
-        h_out_b = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4),
-                                        np.float32)
-        last_idx = None
+        while next_frame is not None:
+            # tc, td, ts, te: central, delta, start, end times
+            idx, tc = next_frame
+            td = genome.adj_frame_width(tc)
+            ts, te = tc - 0.5 * td, tc + 0.5 * td

-        for idx, start, stop in times:
-            twidth = np.float32((stop-start) / info.palette_height)
-            if info.acc_mode in ('deferred', 'atomic'):
-                palette_fun(d_seeds, d_palint_times, d_palint_vals,
-                            np.float32(start), twidth,
-                            block=(256,1,1), grid=(info.palette_height,1),
-                            stream=write_stream)
+            if self.acc_mode in ('deferred', 'atomic'):
+                # In this mode, the palette writes to a surface reference, but
+                # requires dithering, so we pass it the seeds instead
+                arg0 = d_seeds
            else:
-                palette_fun(d_palmem, d_palint_times, d_palint_vals,
-                            np.float32(start), twidth,
-                            block=(256,1,1), grid=(info.palette_height,1),
-                            stream=write_stream)
+                arg0 = d_palmem
+            palette_fun(arg0, d_palint_times, d_palint_vals,
+                        f32(ts), f32(td / self.palette_height),
+                        block=(256,1,1), grid=(self.palette_height,1),
+                        stream=write_stream)

-            width = np.float32((stop-start) / ntemporal_samples)
-            packer_fun(d_infos, d_genome_times, d_genome_knots,
-                       np.float32(start), width, d_seeds,
-                       np.int32(ntemporal_samples), block=(256,1,1),
+            packer_fun(d_infos, d_seeds, d_genome_times, d_genome_knots,
+                       f32(ts), f32(td / ntemporal_samples),
+                       i32(ntemporal_samples), block=(256,1,1),
                       grid=(int(np.ceil(ntemporal_samples/256.)),1),
                       stream=iter_stream)

            # Reset points so that they will be FUSEd
            util.BaseCode.fill_dptr(self.mod, d_points, 4 * nslots,
-                                    iter_stream, np.float32(np.nan))
+                                    iter_stream, f32(np.nan))

            # Get interpolated control points for debugging
            #iter_stream.synchronize()
            #d_temp = cuda.from_device(d_infos,
-                    #(ntemporal_samples, len(self._iter.packer)), np.float32)
+                    #(ntemporal_samples, len(self._iter.packer)), f32)
            #for i, n in zip(d_temp[5], self._iter.packer.packed):
                #print '%60s %g' % ('_'.join(n), i)

            util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, write_stream)
-            if info.acc_mode == 'atomic':
+            if self.acc_mode == 'atomic':
                util.BaseCode.fill_dptr(self.mod, d_atom, 2 * nbins, write_stream)
-            nrounds = ( (info.density * info.width * info.height)
-                      / (ntemporal_samples * 256 * 256) ) + 1
-            if info.acc_mode == 'deferred':
+            nrounds = int( (genome.spp(tc) * width * height)
+                         / (ntemporal_samples * 256 * 256) ) + 1
+            if self.acc_mode == 'deferred':
                for i in range(nrounds):
                    iter_fun(np.uint64(d_log), d_seeds, d_points, d_infos,
                             block=(32, self._iter.NTHREADS/32, 1),
@ -272,18 +329,17 @@ class Renderer(object):
                    _sync_stream(write_stream, iter_stream)
                    sorter.sort(d_log_sorted, d_log, log_size, 3, True,
                                stream=write_stream)
-                    #print cuda.from_device(sorter.dglobal, (256,), np.uint32)
                    _sync_stream(iter_stream, write_stream)
                    write_fun(d_accum, d_log_sorted, sorter.dglobal, i32(nbins),
                              block=(1024, 1, 1), grid=(nwriteblocks, 1),
                              stream=write_stream)
            else:
                args = [u64(d_accum), d_seeds, d_points, d_infos]
-                if info.acc_mode == 'atomic':
+                if self.acc_mode == 'atomic':
                    args.append(u64(d_atom))
                iter_fun(*args, block=(32, self._iter.NTHREADS/32, 1),
                         grid=(ntemporal_samples, nrounds), stream=iter_stream)
-                if info.acc_mode == 'atomic':
+                if self.acc_mode == 'atomic':
                    nblocks = int(np.ceil(np.sqrt(nbins/float(512))))
                    flush_fun(u64(d_accum), u64(d_atom), i32(nbins),
                              block=(512, 1, 1), grid=(nblocks, nblocks),
@ -291,27 +347,29 @@ class Renderer(object):

            util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
            _sync_stream(filt_stream, write_stream)
-            filt.de(d_out, d_accum, info, start, stop, filt_stream)
+            filt.de(d_out, d_accum, genome, dim, tc, stream=filt_stream)
            _sync_stream(write_stream, filt_stream)
-            filt.colorclip(d_out, info, start, stop, filt_stream)
-            cuda.memcpy_dtoh_async(h_out_a, d_out, filt_stream)
+            filt.colorclip(d_out, genome, dim, tc, blend, stream=filt_stream)
+            obuf_copy.set_dst_host(h_out_a)
+            obuf_copy(filt_stream)

            if event_b:
                while not event_a.query():
                    timemod.sleep(0.01)
                gpu_time = event_a.time_since(event_b)
-                yield RenderedImage(self._trim(h_out_b), last_idx, gpu_time)
+                result = RenderedImage(h_out_b, last_idx, gpu_time)
+            else:
+                result = None
+            last_idx = idx

            event_a, event_b = cuda.Event().record(filt_stream), event_a
            h_out_a, h_out_b = h_out_b, h_out_a
-            last_idx = idx
+
+            # TODO: add ability to flush a frame without breaking the pipe
+            next_frame = yield result

        while not event_a.query():
            timemod.sleep(0.001)
        gpu_time = event_a.time_since(event_b)
-        yield RenderedImage(self._trim(h_out_b), last_idx, gpu_time)
-
-    def _trim(self, result):
-        g = self.info.gutter
-        return result[g:-g,g:g+self.info.width].copy()
+        yield RenderedImage(h_out_b, last_idx, gpu_time)