Refactor host rendering code for better load

2025-07-12 03:05:14 -04:00 · 2011-10-15 22:22:43 -04:00
parent 8e99c9c463
commit 9bafbda81a
2 changed files with 157 additions and 214 deletions
--- a/cuburn/render.py
+++ b/cuburn/render.py
@ -107,7 +107,19 @@ class Animation(object):
    In other words, it's best to use exactly one Animation for each
    interpolated sequence between one or two genomes.
    """
    # Large launches lock the display for a considerable period and may be
    # killed due to a device timeout; small launches are harder to load-balance
    # on the GPU and incur overhead. This empirical value is multiplied by the
    # number of SMs on the device to determine how many blocks should be in
    # each launch. Extremely high quality, high resolution renders may still
    # encounter a device timeout, requiring the user to increase the split
    # amount. This factor is not used in async mode.
    SM_FACTOR = 8
    cmp_options = ('-use_fast_math', '-maxrregcount', '42')
    keep = False
    def __init__(self, ctypes_genome_array):
@ -170,7 +182,8 @@ class Animation(object):
            self.compile()
        self.mod = cuda.module_from_buffer(self.cubin, jit_options)
-    def render_frames(self, times=None, block=True):
+
    def render_frames(self, times=None, sync=False):
        """
        Render a flame for each genome in the iterable value 'genomes'.
        Returns a Python generator object which will yield a 2-tuple of
@ -192,260 +205,182 @@ class Animation(object):
        ``times`` is a sequence of center times at which to render, or ``None``
        to render one frame for each genome used to create the animation.
-        ``block`` will cause this thread to spin, waiting for the GPU to
+        If ``sync`` is True, the CPU will sync with the GPU after every block
-        finish the current task. Otherwise, this generator will yield ``None``
+        of temporal samples and yield None until the frame is ready. This
-        until the GPU is finished, for filtering later.
+        allows a single-card system to avoid having to go thirty seconds
        between window refreshes while rendering. Otherwise, tasks will be
        piled asynchronously on the card so that it is always under load.
        """
        f = self.features
        times = times if times is not None else [cp.time for cp in self.genomes]
        iter_stream = cuda.Stream()
        filt_stream = cuda.Stream()
        cen_cp = pyflam3.Genome()
        dst_cp = pyflam3.Genome()
-        if block:
+        nbins = f.acc_height * f.acc_stride
-            rdr = _AnimRenderer(self)
+        d_accum = cuda.mem_alloc(16 * nbins)
-            for t in times:
+        d_out = cuda.mem_alloc(16 * nbins)
-                rdr.render(t)
+
-                yield rdr.get_result()
+        num_sm = cuda.Context.get_device().multiprocessor_count
        if sync:
            cps_per_block = num_sm * self.SM_FACTOR
        else:
-            # TODO: share buffers.
+            cps_per_block = f.max_cps
            rdrs = [_AnimRenderer(self) for i in range(2)]
-            # Zip up each genome with an alternating renderer, plus 2 empty
+        info_size = self._iter.packer.align * cps_per_block
-            # genomes at the end to flush all pending tasks
+        d_infos = cuda.mem_alloc(info_size)
-            exttimes = times[:] + [None, None]
+        d_palmem = cuda.mem_alloc(256 * f.palette_height * 4)
            for rdr, t in izip(cycle(rdrs), exttimes):
                if rdr.pending:
                    while not rdr.done():
                        yield None
                    yield rdr.get_result()
                if t is not None:
                    rdr.render(t)
-    def _interp(self, time, cp):
+        seeds = mwc.MWC.make_seeds(self._iter.NTHREADS * cps_per_block)
-        flam3_interpolate(self._g_arr, len(self._g_arr), time, 0, byref(cp))
+        d_seeds = cuda.to_device(seeds)
-class _AnimRenderer(object):
+        h_infos = cuda.pagelocked_empty((info_size / 4,), np.float32)
-    # Large launches lock the display for a considerable period and may be
+        h_palmem = cuda.pagelocked_empty(
-    # killed due to a device timeout; small launches are harder to load-balance
+                (f.palette_height, 256, 4), np.uint8)
-    # on the GPU and incur overhead. This empirical value is multiplied by the
+        h_out = cuda.pagelocked_empty((f.acc_height, f.acc_stride, 4), np.float32)
    # number of SMs on the device to determine how many blocks should be in
    # each launch. Extremely high quality, high resolution renders may still
    # encounter a device timeout, and no workaround is in place for that yet.
    SM_FACTOR = 8
-    # Currently, palette interpolation is done independently of animation
+        filter_done_event = None
    # interpolation, so that the process is not biased and so we only need to
    # mess about with one texture per renderer. This many steps will always be
    # used, no matter the number of time steps.
    PAL_HEIGHT = 16
-    # Use synchronous launches
+        packer = self._iter.packer
-    sync = False
+        iter_fun = self.mod.get_function("iter")
-    # Delay this long between iterations (only active when sync is True)
+        #iter_fun.set_cache_config(cuda.func_cache.PREFER_L1)
    sleep = None
-    def __init__(self, anim):
+        util.BaseCode.zero_dptr(self.mod, d_accum, 4 * nbins, filt_stream)
        self.anim = anim
        self.pending = False
        self.cen_time = None
        self.stream = cuda.Stream()
-        self._nsms = cuda.Context.get_device().multiprocessor_count
+        last_time = times[0]
        self.cps_per_block = self._nsms * self.SM_FACTOR
        self.ncps = anim.features.max_cps
        self.nblocks = int(math.ceil(self.ncps / float(self.cps_per_block)))
-        # These are stored to avoid leaks, not to be stateful in method calls
+        for time in times:
-        self._dst_cp = pyflam3.Genome()
+            self._interp(cen_cp, time)
        memset(byref(self._dst_cp), 0, sizeof(self._dst_cp))
        self._cen_cp = pyflam3.Genome()
        memset(byref(self._cen_cp), 0, sizeof(self._cen_cp))
-        self.nbins = anim.features.acc_height * anim.features.acc_stride
+            h_palmem[:] = self._interp_colors(dst_cp, time,
-        self.d_accum = cuda.mem_alloc(16 * self.nbins)
+                                              cen_cp.temporal_filter_width)
-        self.d_out = cuda.mem_alloc(16 * self.nbins)
+            cuda.memcpy_htod_async(d_palmem, h_palmem, iter_stream)
            tref = self.mod.get_texref('palTex')
            array_info = cuda.ArrayDescriptor()
            array_info.height = f.palette_height
            array_info.width = 256
            array_info.array_format = cuda.array_format.UNSIGNED_INT8
            array_info.num_channels = 4
            tref.set_address_2d(d_palmem, array_info, 1024)
        info_size = anim._iter.packer.align * self.ncps
        self.d_infos = cuda.mem_alloc(info_size)
        # Defer generation of seeds until they're first needed
        self.d_seeds = None
        # During the main rendering loop, we alternate between two streams and
        # two sets of seeds, synchronizing them at the end of rendering.
        self.alt_stream = cuda.Stream()
        self.d_alt_seeds = None
        # It's less than ideal, but we lock some memory ahead of time
        self.h_infos_locked = cuda.pagelocked_empty((info_size/4,), np.float32)
        if self.sync:
            self.stream = self.alt_stream = None
    def render(self, cen_time):
        assert not self.pending, "Tried to render with results pending!"
        self.pending = True
        self.cen_time = cen_time
        a = self.anim
        cen_cp = self._cen_cp
        a._interp(cen_time, cen_cp)
        palette = self._interp_colors(cen_time, cen_cp)
        util.BaseCode.zero_dptr(a.mod, self.d_accum, 4 * self.nbins,
                                self.stream)
        # Ensure all main stream tasks are done before starting alt stream
        if not self.sync:
            self.alt_stream.wait_for_event(cuda.Event().record(self.stream))
        dpal = cuda.make_multichannel_2d_array(palette, 'C')
        tref = a.mod.get_texref('palTex')
        tref.set_array(dpal)
            tref.set_format(cuda.array_format.UNSIGNED_INT8, 4)
            tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
            tref.set_filter_mode(cuda.filter_mode.LINEAR)
        cp = self._dst_cp
        packer = a._iter.packer
        iter_fun = a.mod.get_function("iter")
        #iter_fun.set_cache_config(cuda.func_cache.PREFER_L1)
            # Must be accumulated over all CPs
            gam, vib = 0, 0
            bkgd = np.zeros(3)
-        # This is gross, but there are a lot of fiddly corner cases with any
+            mblur_times = enumerate( np.linspace(-0.5, 0.5, cen_cp.ntemporal_samples)
-        # index-based iteration scheme.
+                                     * cen_cp.temporal_filter_width + time )
        times = list(enumerate(self._mk_dts(cen_time, cen_cp, self.ncps)))
        for b, block_times in enumerate(_chunk(times, self.cps_per_block)):
            on_main = b % 2 == 0
            stream = self.stream if on_main else self.alt_stream
            d_seeds = self.d_seeds if on_main else self.d_alt_seeds
            if not d_seeds:
                seeds = mwc.MWC.make_seeds(a._iter.NTHREADS *
                                           self.cps_per_block)
                if self.sync:
                    d_seeds = cuda.to_device(seeds)
                else:
                    size = seeds.dtype.itemsize * seeds.size
                    d_seeds = cuda.mem_alloc(size)
                    h_seeds = cuda.pagelocked_empty(seeds.shape, seeds.dtype)
                    h_seeds[:] = seeds
                    cuda.memcpy_htod_async(d_seeds, h_seeds, stream)
                if on_main:
                    self.d_seeds = d_seeds
                else:
                    self.d_alt_seeds = d_seeds
            for block_times in _chunk(list(mblur_times), cps_per_block):
                infos = []
-            if len(a.genomes) > 1:
+                if len(self.genomes) > 1:
                    for n, t in block_times:
-                    a._interp(t, cp)
+                        self._interp(dst_cp, t)
                        frac = float(n) / cen_cp.ntemporal_samples
-                    info = packer.pack(cp=Genome(cp), cp_step_frac=frac)
+                        info = packer.pack(cp=Genome(dst_cp), cp_step_frac=frac)
                        infos.append(info)
-                    gam += cp.gamma
+                        gam += dst_cp.gamma
-                    vib += cp.vibrancy
+                        vib += dst_cp.vibrancy
-                    bkgd += np.array(cp.background)
+                        bkgd += np.array(dst_cp.background)
                else:
                    # Can't interpolate normally; just pack copies
-                packed = packer.pack(cp=a.genomes[0], cp_step_frac=0)
+                    packed = packer.pack(cp=self.genomes[0], cp_step_frac=0)
                    infos = [packed] * len(block_times)
-                gam += a.genomes[0].gamma * len(block_times)
+                    gam += self.genomes[0].gamma * len(block_times)
-                vib += a.genomes[0].vibrancy * len(block_times)
+                    vib += self.genomes[0].vibrancy * len(block_times)
-                bkgd += np.array(a.genomes[0].background) * len(block_times)
+                    bkgd += np.array(self.genomes[0].background) * len(block_times)
                infos = np.concatenate(infos)
-            offset = b * packer.align * self.cps_per_block
+                h_infos[:len(infos)] = infos
-            d_info_off = int(self.d_infos) + offset
+                cuda.memcpy_htod_async(d_infos, h_infos)
            if self.sync:
                cuda.memcpy_htod(d_info_off, infos)
            else:
                h_infos = self.h_infos_locked[offset/4:offset/4+len(infos)]
                h_infos[:] = infos
                cuda.memcpy_htod_async(d_info_off, h_infos, stream)
-            iter_fun(d_seeds, np.uintp(d_info_off), np.uint64(self.d_accum),
+                if filter_done_event:
-                     block=(32, a._iter.NTHREADS/32, 1),
+                    iter_stream.wait_for_event(filter_done_event)
                # TODO: replace with an option to split long runs into shorter
                # ones for interactivity
                for i in range(1):
                    iter_fun(d_seeds, d_infos, np.uint64(d_accum),
                             block=(32, self._iter.NTHREADS/32, 1),
                             grid=(len(block_times), 1),
-                     texrefs=[tref], stream=stream)
+                             texrefs=[tref], stream=iter_stream)
-            if self.sync and self.sleep:
+                    if sync:
-                time.sleep(self.sleep)
+                        iter_stream.synchronize()
                        yield None
-        # Now ensure all alt stream tasks are done before continuing main
+            if filter_done_event and not sync:
-        if not self.sync:
+                filt_stream.synchronize()
-            self.stream.wait_for_event(cuda.Event().record(self.alt_stream))
+                yield last_time, self._trim(h_out)
                last_time = time
-        util.BaseCode.zero_dptr(a.mod, self.d_out, 4 * self.nbins,
+            util.BaseCode.zero_dptr(self.mod, d_out, 4 * nbins, filt_stream)
-                                self.stream)
+            self._de.invoke(self.mod, Genome(cen_cp), d_accum, d_out, filt_stream)
-        a._de.invoke(a.mod, Genome(cen_cp), self.d_accum, self.d_out,
+            util.BaseCode.zero_dptr(self.mod, d_accum, 4 * nbins, filt_stream)
-                     self.stream)
+            filter_done_event = cuda.Event().record(filt_stream)
-        f = np.float32
+            f32 = np.float32
-        n = f(self.ncps)
+            n = f32(cen_cp.ntemporal_samples)
-        gam = f(n / gam)
+            gam = f32(n / gam)
-        vib = f(vib / n)
+            vib = f32(vib / n)
-        hipow = f(cen_cp.highlight_power)
+            hipow = f32(cen_cp.highlight_power)
-        lin = f(cen_cp.gam_lin_thresh)
+            lin = f32(cen_cp.gam_lin_thresh)
-        lingam = f(math.pow(cen_cp.gam_lin_thresh, gam-1.0) if lin > 0 else 0)
+            lingam = f32(math.pow(cen_cp.gam_lin_thresh, gam-1.0) if lin > 0 else 0)
            bkgd = vec.make_float3(*(bkgd / n))
-        # TODO: get block size from colorclip class? It actually does not
+            color_fun = self.mod.get_function("colorclip")
-        # depend on that being the case
+            color_fun(d_out, gam, vib, hipow, lin, lingam, bkgd,
-        color_fun = a.mod.get_function("colorclip")
+                      block=(256, 1, 1), grid=(nbins / 256, 1),
-        color_fun(self.d_out, gam, vib, hipow, lin, lingam, bkgd,
+                      stream=filt_stream)
-                  block=(256, 1, 1), grid=(self.nbins / 256, 1),
+            cuda.memcpy_dtoh_async(h_out, d_out, filt_stream)
                  stream=self.stream)
            if sync:
                filt_stream.synchronize()
                yield time, self._trim(h_out)
-        # TODO: The stream seems to sync right here, automatically, before
+        if not sync:
-        # returning. I think PyCUDA is forcing a sync when something drops out
+            filt_stream.synchronize()
-        # of scope. Investigate.
+            yield time, self._trim(h_out)
-    def _pal_to_np(self, cp):
+    def _interp(self, cp, time):
        flam3_interpolate(self._g_arr, len(self._g_arr), time, 0, byref(cp))
    @staticmethod
    def _pal_to_np(cp):
        """
        Convert the palette of ``cp`` into a ``(256, 4)`` ``uint8`` array.

        The storage behind ``cp.palette`` is reinterpreted in place as 256
        rows of 5 doubles rather than being read entry by entry. The first
        column of every row is dropped; the remaining four are scaled by
        255 and converted to ``uint8``. No rounding or clamping is applied
        here.

        NOTE(review): presumably each row is (index, four colour channels in
        [0, 1]), matching the flam3 palette entry layout -- confirm against
        the pyflam3 ``Genome`` definition before relying on this.
        """
        # Converting palettes by iteration has an enormous performance
        # overhead. We cheat massively and dangerously here.
        # View the palette memory as one flat array of 256 * 5 C doubles.
        pal = cast(pointer(cp.palette), POINTER(c_double * (256 * 5)))
        # ``buffer`` is the Python 2 builtin; frombuffer's default dtype is
        # float, which is what the c_double view above requires.
        val = np.frombuffer(buffer(pal.contents), count=256*5)
        # One row per palette entry; column 0 is discarded. The multiply
        # produces a new array, so the result does not alias ``cp``.
        return np.uint8(np.reshape(val, (256, 5))[:,1:] * 255.0)
-    def _interp_colors(self, cen_time, cen_cp):
+    def _interp_colors(self, cp, time, twidth):
        # TODO: any visible difference between uint8 and richer formats?
-        pal = np.empty((self.PAL_HEIGHT, 256, 4), dtype=np.uint8)
+        height = self.features.palette_height
-        a = self.anim
+        pal = np.empty((height, 256, 4), dtype=np.uint8)
-        if len(a.genomes) > 1:
+        if len(self.genomes) > 1:
            # The typical case; applying real motion blur
-            cp = self._dst_cp
+            times = np.linspace(-0.5, 0.5, height) * twidth + time
            times = self._mk_dts(cen_time, cen_cp, self.PAL_HEIGHT)
            for n, t in enumerate(times):
-                a._interp(t, cp)
+                self._interp(cp, t)
                pal[n] = self._pal_to_np(cp)
        else:
            # Cannot call any interp functions on a single genome; rather than
            # have alternate code-paths, just copy the same colors everywhere
-            pal[0] = self._pal_to_np(a.genomes[0])
+            pal[0] = self._pal_to_np(self.genomes[0])
            pal[1:] = pal[0]
        return pal
-    def done(self):
+    def _trim(self, result):
-        if self.sync:
+        g = self.features.gutter
-            return True
+        return result[g:-g,g:-g].copy()
        return self.stream.is_done()
    def get_result(self):
        """
        Wait for the pending render and fetch the finished frame.

        When not running synchronously, this blocks on ``self.stream``
        first. The ``pending`` flag is cleared, the output buffer is copied
        back from the device as float32 with shape
        ``(acc_height, acc_stride, 4)``, and the gutter is cut from all four
        edges.

        Returns a 2-tuple ``(cen_time, frame)``.

        NOTE(review): the ``g:-g`` slice yields an empty array if the gutter
        is 0 -- this assumes ``features.gutter`` is always positive; confirm.
        """
        if not self.sync:
            self.stream.synchronize()
        self.pending = False

        feats = self.anim.features
        shape = (feats.acc_height, feats.acc_stride, 4)
        frame = cuda.from_device(self.d_out, shape, np.float32)

        gutter = feats.gutter
        trimmed = frame[gutter:-gutter, gutter:-gutter]
        return self.cen_time, trimmed
    @staticmethod
    def _mk_dts(cen_time, cen_cp, ncps):
        w = cen_cp.temporal_filter_width
        return [cen_time + w * (t / (ncps - 1.0) - 0.5) for t in range(ncps)]
 class Features(object):
    """
--- a/main.py
+++ b/main.py
@ -15,6 +15,7 @@ import argparse
 import multiprocessing
 from subprocess import Popen
 from ctypes import *
 from itertools import ifilter
 import numpy as np
 import Image
@ -47,6 +48,7 @@ def save(args, time, raw):
    noalpha = raw[:,:,:3]
    if args.raw:
        real_stdout.write(buffer(np.uint8(noalpha * 255.0)))
        sys.stderr.write('.')
        return
    name = fmt_filename(args, time)
@ -161,7 +163,7 @@ def main(args):
        def on_mouse_motion(x, y, dx, dy):
            """
            Accept a mouse-motion notification and deliberately ignore it.

            NOTE(review): presumably installed as a window event handler; the
            registration is outside the visible code -- confirm.
            """
            return None
-        frames = anim.render_frames(times, block=False)
+        frames = anim.render_frames(times, sync=args.sync)
        def poll(dt):
            out = next(frames, False)
            if out is False:
@ -173,14 +175,20 @@ def main(args):
                imgbuf = np.uint8(buf.flatten() * 255)
                image.set_data('RGBA', -anim.features.width*4, imgbuf.tostring())
                label.text = '%s %4g' % (args.name, time)
            else:
                label.text += '.'
            if args.sleep:
                time.sleep(args.sleep)
        pyglet.clock.set_fps_limit(30)
        pyglet.clock.schedule_interval(poll, 1/30.)
        pyglet.app.run()
    else:
-        for time, out in anim.render_frames(times):
+        for time, out in ifilter(None, anim.render_frames(times, sync=args.sync)):
            save(args, time, out)
            if args.sleep:
                time.sleep(args.sleep)
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Render fractal flames.')