mirror of
				https://github.com/stevenrobertson/cuburn.git
				synced 2025-11-03 18:00:55 -05:00 
			
		
		
		
	
		
			
				
	
	
		
			435 lines
		
	
	
		
			18 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			435 lines
		
	
	
		
			18 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
"""
 | 
						|
Resources and tools to perform rendering.
 | 
						|
"""
 | 
						|
 | 
						|
import os
import sys
import re
import time
import tempfile
from collections import namedtuple

import numpy as np
from numpy import float32 as f32, int32 as i32, uint32 as u32, uint64 as u64

import pycuda.driver as cuda
import pycuda.tools

import filters
import output
from code import util, mwc, iter, interp, sort
from code.util import ClsMod, devlib, filldptrlib, assemble_code, launch
from cuburn.genome.util import palette_decode
 | 
						|
 | 
						|
# Small immutable records handed between pipeline stages: a finished frame
# (host buffer, frame index, GPU wall time) and a padded dimension set.
RenderedImage = namedtuple('RenderedImage', ['buf', 'idx', 'gpu_time'])
Dimensions = namedtuple('Dimensions', ['w', 'h', 'aw', 'ah', 'astride'])
 | 
						|
 | 
						|
class DurationEvent(cuda.Event):
    """
    A CUDA event that remembers a reference event, so the elapsed time
    between the two can be queried directly.

    Note that instances retain a reference to their prior, so an unbroken
    chain of DurationEvents will leak. Use normal events as priors.
    """
    def __init__(self, prior):
        super(DurationEvent, self).__init__()
        self._prior = prior

    def time(self):
        """Return the time elapsed between the prior event and this one."""
        return self.time_since(self._prior)
 | 
						|
 | 
						|
class Framebuffers(object):
    """
    The largest memory allocations, and a stream to serialize their use.

    ``d_front`` and ``d_back`` are separate buffers, each large enough to hold
    four float32 components per pixel (including any gutter pixels added for
    alignment or padding).

    Every user of this set of buffers may use and overwrite the buffers in
    any way, as long as the output for the next stage winds up in the front
    buffer. The front and back buffers can be exchanged by the ``flip()``
    method (which simply exchanges the pointers); while no similar method
    exists for the side buffer, you're free to do the same by taking local
    copies of the references and exchanging them yourself.

    ``d_left`` and ``d_right`` and ``d_uleft`` and ``d_uright`` are similar,
    but without strict dependencies. Each stage is free to stomp these buffers,
    but must be done with them by the next stage.

    There's one spot in the stream interleaving where the behavior is
    different: the ``Output.convert`` call must store its output to the back
    buffer, which will remain untouched until the dtoh copy of the converted
    buffer is finished. This happens while the ``iter`` kernel of the next
    frame writes to the front and side buffers, which means in practice that
    there's essentially no waiting on any buffers.

    If an output module decides to get all krazy and actually replaces the
    references to the buffers on this object - to, say, implement a temporally
    aware tone-mapping or denoising pass - that's probably okay, but just make
    sure it appears like it's expected to.
    """

    # Minimum extension of accumulation buffer beyond output region. Used to
    # alleviate edge effects during filtering. Actual gutter may be larger to
    # accommodate alignment requirements; when it is, that extension will be
    # applied to the lower-right corner of the buffer. This is asymmetrical,
    # but simplifies trimming logic when it's time for that.
    gutter = 12

    @classmethod
    def calc_dim(cls, width, height):
        """
        Given a width and height, return a valid set of dimensions which
        include at least enough gutter to exceed the minimum, and where
        (acc_width % 32) == 0 and (acc_height % 16) == 0.
        """
        awidth = width + 2 * cls.gutter
        aheight = 16 * int(np.ceil((height + 2 * cls.gutter) / 16.))
        astride = 32 * int(np.ceil(awidth / 32.))
        return Dimensions(width, height, awidth, aheight, astride)

    def __init__(self):
        self.stream = cuda.Stream()
        self.pool = pycuda.tools.PageLockedMemoryPool()
        self._clear()

        # These resources rely on the slots/ringbuffer mechanism for sharing,
        # and so can be shared across any number of launches, genomes, and
        # render kernels. Notably, seeds are self-synchronizing, so they're not
        # attached to either stream object.
        self.d_rb = cuda.to_device(np.array([0, 0], dtype=u32))
        seeds = mwc.make_seeds(util.DEFAULT_RB_SIZE * 256)
        self.d_seeds = cuda.to_device(seeds)
        self._len_d_points = util.DEFAULT_RB_SIZE * 256 * 16
        self.d_points = cuda.mem_alloc(self._len_d_points)

    def _clear(self):
        # Drop all framebuffer references (without freeing the device memory);
        # used at construction time and after free().
        self.nbins = self.d_front = self.d_back = None
        self.d_left = self.d_right = self.d_uleft = self.d_uright = None

    def free(self, stream=None):
        """
        Release all framebuffers, synchronizing first on ``stream`` if given,
        or on the whole context otherwise.
        """
        if stream is not None:
            stream.synchronize()
        else:
            cuda.Context.synchronize()
        for p in (self.d_front, self.d_back, self.d_left, self.d_right,
                  self.d_uleft, self.d_uright):
            if p is not None:
                p.free()
        self._clear()

    def alloc(self, dim, stream=None):
        """
        Ensure that this object's framebuffers are large enough to handle the
        given dimensions, allocating new ones if not.

        If ``stream`` is not None and a reallocation is necessary, the stream
        will be synchronized before the old buffers are deallocated.
        """
        nbins = dim.ah * dim.astride
        # Explicit None guard: ``None >= int`` happened to be False on
        # Python 2 but raises TypeError on Python 3.
        if self.nbins is not None and self.nbins >= nbins: return
        # Pass ``stream`` through so free() syncs only that stream, as the
        # docstring promises (the original synced the whole context here).
        if self.nbins is not None: self.free(stream)
        try:
            self.d_front  = cuda.mem_alloc(16 * nbins)
            self.d_back   = cuda.mem_alloc(16 * nbins)
            self.d_left   = cuda.mem_alloc(16 * nbins)
            self.d_right  = cuda.mem_alloc(16 * nbins)
            self.d_uleft  = cuda.mem_alloc(2  * nbins)
            self.d_uright = cuda.mem_alloc(2  * nbins)
            self.nbins = nbins
        except cuda.MemoryError:
            # If a frame that's too large sneaks by the task distributor, we
            # don't want to kill the server, but we also don't want to leave
            # it stuck without any free memory to complete the next alloc.
            # TODO: measure free mem and only take tasks that fit (but that
            # should be done elsewhere)
            self.free(stream)
            # Bare ``raise`` preserves the original traceback (``raise e``
            # discarded it on Python 2).
            raise

    def set_dim(self, width, height, stream=None):
        """
        Compute padded dimensions for given width and height, ensure that the
        buffers are large enough (and reallocate if not), and return the
        calculated dimensions.

        Note that the returned dimensions are always the same for a given
        width, height, and minimum gutter, even if the underlying buffers are
        larger due to a previous allocation.
        """
        dim = self.calc_dim(width, height)
        self.alloc(dim, stream)
        return dim

    def flip(self):
        """Flip the front and back buffers."""
        self.d_front, self.d_back = self.d_back, self.d_front

    def flip_side(self):
        """Flip the left and right buffers (float and uchar)."""
        self.d_left, self.d_right = self.d_right, self.d_left
        self.d_uleft, self.d_uright = self.d_uright, self.d_uleft
 | 
						|
 | 
						|
class DevSrc(object):
    """
    The buffers which represent a genome on-device, in the formats needed to
    serve as a source for interpolating temporal samples.
    """

    # Maximum number of knots per parameter. This also covers the maximum
    # number of palettes allowed.
    max_knots = 1 << util.DEFAULT_SEARCH_ROUNDS

    # Maximum number of parameters per genome. This number is exceedingly
    # high, and should never even come close to being hit.
    max_params = 1024

    def __init__(self):
        # Knot times and values share the same layout, so size them together.
        knot_bytes = 4 * self.max_knots * self.max_params
        self.d_times = cuda.mem_alloc(knot_bytes)
        self.d_knots = cuda.mem_alloc(knot_bytes)
        # One f32 time per palette, and one RGBA f32 256-entry palette per knot.
        self.d_ptimes = cuda.mem_alloc(4 * self.max_knots)
        self.d_pals = cuda.mem_alloc(4 * 4 * 256 * self.max_knots)
 | 
						|
 | 
						|
class DevInfo(object):
    """
    The buffers which hold temporal samples on-device, as used by iter.
    """

    # The palette texture/surface covers the color coordinate from [0,1] with
    # equidistant horizontal samples, and spans the temporal range of the
    # frame linearly with this many rows. Increasing these values increases the
    # number of uniquely-dithered samples when using pre-dithered surfaces, as
    # is done in 'atomic' accumulation.
    palette_width = 256  # TODO: make this setting be respected
    palette_height = 64

    # This used to be determined automagically, but simultaneous occupancy
    # and a much smaller block size simplifies this.
    ntemporal_samples = 1024

    # Number of iterations to iterate without write after generating a new
    # point. This number is currently fixed pretty deeply in the set of magic
    # constants which govern buffer sizes; changing the value here won't
    # actually change the code on the device to do something different.
    # It's here just for documentation purposes.
    fuse = 256

    def __init__(self):
        # One packed parameter block per temporal sample.
        param_bytes = DevSrc.max_params * 4
        self.d_params = cuda.mem_alloc(self.ntemporal_samples * param_bytes)
        # Surface-loadable 3D array descriptor backing the 'flatpal' surfref.
        self.palette_surf_dsc = util.argset(
                cuda.ArrayDescriptor3D(),
                height=self.palette_height, width=self.palette_width, depth=0,
                format=cuda.array_format.SIGNED_INT32,
                num_channels=2, flags=cuda.array3d_flags.SURFACE_LDST)
        self.d_pal_array = cuda.Array(self.palette_surf_dsc)
 | 
						|
 | 
						|
class Renderer(object):
    """
    Compiles and holds the per-genome CUDA module, along with the filter and
    output stages selected by the render profile.
    """

    # Unloading a module triggers a context sync. To keep the renderer
    # asynchronous, and avoid expensive CPU polling, this hangs on to
    # a number of (relatively small) CUDA modules and flushes them together.
    MAX_MODREFS = 20
    # Class-level cache shared by all Renderer instances, keyed by cubin.
    _modrefs = {}

    @classmethod
    def compile(cls, gnm, arch=None, keep=False):
        """
        Compile the iteration kernel for genome ``gnm``.

        Returns a 3-tuple ``(packer, lib, cubin)``.
        """
        packer, lib = iter.mkiterlib(gnm)
        cubin = util.compile('iter', assemble_code(lib), arch=arch, keep=keep)
        return packer, lib, cubin

    def load(self, cubin):
        """
        Return a loaded CUDA module for ``cubin``, reusing a cached module
        when one is available.
        """
        if cubin in self._modrefs:
            return self._modrefs[cubin]
        # Load the cubin that was passed in. (The original read ``self.cubin``
        # here, silently ignoring the argument; that only worked because the
        # sole caller happened to pass ``self.cubin``.)
        mod = cuda.module_from_buffer(cubin)
        if len(self._modrefs) > self.MAX_MODREFS:
            # Flush the whole cache at once; see the class comment above.
            self._modrefs.clear()
        self._modrefs[cubin] = mod
        return mod

    def __init__(self, gnm, gprof, keep=False, arch=None):
        self.packer, self.lib, self.cubin = self.compile(gnm, keep=keep, arch=arch)
        self.mod = self.load(self.cubin)
        self.filts = filters.create(gprof)
        self.out = output.get_output_for_profile(gprof)
 | 
						|
 | 
						|
class RenderManager(ClsMod):
    """
    Drives the full render pipeline for successive frames: genome upload,
    parameter interpolation, iteration, filtering, and output conversion,
    double-buffered across two CUDA streams.
    """

    # Device library shared across genomes: palette interpolation plus the
    # buffer-fill helper used by _iter.
    lib = devlib(deps=[interp.palintlib, filldptrlib])

    def __init__(self):
        super(RenderManager, self).__init__()
        self.fb = Framebuffers()
        # Double-buffered genome-source and temporal-sample buffers; the 'a'
        # member is always the one used for the frame currently being queued,
        # and queue_frame swaps the pairs as it finishes.
        self.src_a, self.src_b = DevSrc(), DevSrc()
        self.info_a, self.info_b = DevInfo(), DevInfo()
        self.stream_a, self.stream_b = cuda.Stream(), cuda.Stream()
        # Events marking the previous frame's filter and dtoh-copy completion;
        # None until the first frame has been queued.
        self.filt_evt = self.copy_evt = None

    def _copy(self, rdr, gnm):
        """
        Queue a copy of a host genome into a set of device interpolation source
        buffers.

        Note that for now, this is broken! It ignores ``gnm``, and only packs
        the genome that was used when creating the renderer.
        """
        times, knots = rdr.packer.pack(gnm, self.fb.pool)
        cuda.memcpy_htod_async(self.src_a.d_times, times, self.stream_a)
        cuda.memcpy_htod_async(self.src_a.d_knots, knots, self.stream_a)

        # Decode the genome's palettes keyed by time, then upload values and
        # times in temporal order.
        palsrc = dict([(v[0], palette_decode(v[1:])) for v in gnm['palette']])
        ptimes, pvals = zip(*sorted(palsrc.items()))
        palettes = self.fb.pool.allocate((len(palsrc), 256, 4), f32)
        palettes[:] = pvals
        palette_times = self.fb.pool.allocate((self.src_a.max_knots,), f32)
        # Pad unused knot slots with a sentinel time far beyond any frame time.
        palette_times.fill(1e9)
        palette_times[:len(ptimes)] = ptimes
        cuda.memcpy_htod_async(self.src_a.d_pals, palettes, self.stream_a)
        cuda.memcpy_htod_async(self.src_a.d_ptimes, palette_times,
                               self.stream_a)

        # TODO: use bilerp tex as src for palette interp

    def _interp(self, rdr, gnm, dim, ts, td):
        """
        Queue palette and iter-parameter interpolation on stream A for a frame
        starting at time ``ts`` with duration ``td`` and dimensions ``dim``.
        """
        # Publish the padded accumulation dimensions to the iter module's
        # 'acc_size' global.
        d_acc_size = rdr.mod.get_global('acc_size')[0]
        p_dim = self.fb.pool.allocate((len(dim),), u32)
        p_dim[:] = dim
        cuda.memcpy_htod_async(d_acc_size, p_dim, self.stream_a)

        # NOTE(review): the palette kernel and surfref come from self.mod
        # (this class's shared lib), while the per-genome params kernel below
        # uses rdr.mod — confirm 'flatpal' is bound on whichever module the
        # consuming kernels actually read it from.
        tref = self.mod.get_surfref('flatpal')
        tref.set_array(self.info_a.d_pal_array, 0)
        launch('interp_palette_flat', self.mod, self.stream_a,
                256, self.info_a.palette_height,
                self.fb.d_rb, self.fb.d_seeds,
                self.src_a.d_ptimes, self.src_a.d_pals,
                f32(ts), f32(td / self.info_a.palette_height))

        nts = self.info_a.ntemporal_samples
        launch('interp_iter_params', rdr.mod, self.stream_a,
                256, np.ceil(nts / 256.),
                self.info_a.d_params, self.src_a.d_times, self.src_a.d_knots,
                f32(ts), f32(td / nts), i32(nts))
        #self._print_interp_knots(rdr)

    def _print_interp_knots(self, rdr, tsidx=5):
        """
        Debug helper: read back and print the interpolated parameter values
        for temporal sample ``tsidx``. Blocks on a device read.
        """
        infos = cuda.from_device(self.info_a.d_params,
                (tsidx + 1, len(rdr.packer)), f32)
        for i, n in zip(infos[-1], rdr.packer.packed):
            print '%60s %g' % ('_'.join(n), i)

    def _iter(self, rdr, gnm, gprof, dim, tc):
        """
        Queue the accumulation passes for the frame centered at time ``tc``,
        splitting iteration into multiple launches interleaved across stream A
        and a private hidden stream.
        """
        tref = rdr.mod.get_surfref('flatpal')
        tref.set_array(self.info_a.d_pal_array, 0)

        # Clear all working buffers before accumulation; the point buffer is
        # NaN-filled so stale points are regenerated rather than iterated.
        nbins = dim.ah * dim.astride
        fill = lambda b, s, v=i32(0): util.fill_dptr(
                self.mod, b, s, stream=self.stream_a, value=v)
        fill(self.fb.d_front,  4 * nbins)
        fill(self.fb.d_left,   4 * nbins)
        fill(self.fb.d_right,  4 * nbins)
        fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan))
        fill(self.fb.d_uleft,  nbins / 2)
        fill(self.fb.d_uright, nbins / 2)

        nts = self.info_a.ntemporal_samples
        # Total samples for this frame (samples-per-pixel at tc times output
        # pixel count), and the number of launch rounds needed to cover it.
        nsamps = (gprof.spp(tc) * dim.w * dim.h)
        nrounds = int(nsamps / (nts * 256. * 256)) + 1

        # Split the launch into multiple rounds, to prevent a system on older
        # GPUs from locking up and to give us a chance to flush some stuff.
        hidden_stream = cuda.Stream()
        iter_stream_left, iter_stream_right = self.stream_a, hidden_stream
        block_size = 4

        while nrounds:
          n = min(nrounds, block_size)
          now = time.time()
          launch('iter', rdr.mod, iter_stream_left, (32, 8, 1), (nts, n),
                 self.fb.d_front, self.fb.d_left,
                 self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
                 self.fb.d_uleft, self.info_a.d_params)
          delta = time.time() - now
          if delta > 0.1:
            # More than 100ms passed attempting to launch. The GPU is likely
            # out of queued execution resources on a long render, and scheduling
            # additional work will just keep spinning the CPU at 100%.
            # Do a blocking sync to free up resources. This may slightly reduce
            # parallelism but makes it a whole heck of a lot easier to keep
            # using the computer while things render.
            print >> sys.stderr, 'Launches became blocking, synchronizing'
            iter_stream_right.synchronize()

          # Make sure the other stream is done flushing before we start
          iter_stream_left.wait_for_event(cuda.Event().record(iter_stream_right))

          launch('flush_atom', rdr.mod, iter_stream_left,
                  (16, 16, 1), (dim.astride / 16, dim.ah / 16),
                  u64(self.fb.d_front), u64(self.fb.d_left),
                  u64(self.fb.d_uleft), i32(nbins))

          # Alternate side buffers and streams each block so iteration and
          # flushing overlap.
          self.fb.flip_side()
          iter_stream_left, iter_stream_right = iter_stream_right, iter_stream_left
          nrounds -= n
          # Grow the per-block round count as the render progresses.
          block_size += block_size / 2

        # Always wait on all events in the hidden stream before continuing on A
        self.stream_a.wait_for_event(cuda.Event().record(hidden_stream))

    def queue_frame(self, rdr, gnm, gprof, tc, copy=True):
        """
        Queue one frame for rendering.

        ``rdr`` is a compiled Renderer module. Caller must ensure that the
        module is compatible with the genome data provided.

        ``gnm`` is the genome to be rendered.

        ``tc`` is the center time at which to render.

        Output width and height are taken from ``gprof``.

        If ``copy`` is False, the genome data will not be recopied for each
        new genome. This function must be called with ``copy=True`` the first
        time a new genome is used, and may be called in that manner
        subsequently without harm. I suspect the performance impact is low, so
        leave ``copy`` to True every time for now.

        The return value is a 2-tuple ``(evt, h_out)``, where ``evt`` is a
        DurationEvent and ``h_out`` is the return value of the output module's
        ``copy`` function. In the typical case, ``h_out`` will be a host
        allocation containing data in an appropriate format for the output
        module's file writer, and ``evt`` indicates when the asynchronous
        DMA copy which will populate ``h_out`` is complete. This can vary
        depending on the output module in use, though.

        This method is absolutely _not_ threadsafe, but it's fine to use it
        alongside non-threaded approaches to concurrency like coroutines.
        """
        # Record the timing baseline on stream B (the previous frame's
        # stream) so the returned DurationEvent spans the whole frame.
        timing_event = cuda.Event().record(self.stream_b)
        # Note: we synchronize on the previous stream if buffers need to be
        # reallocated, which implicitly also syncs the current stream.
        dim = self.fb.set_dim(gprof.width, gprof.height, self.stream_b)

        # TODO: calculate this externally somewhere?
        td = gprof.frame_width(tc) / round(gprof.fps * gprof.duration)
        ts, te = tc - 0.5 * td, tc + 0.5 * td

        # The stream interleaving here is nontrivial.
        # TODO: update diagram and link to it here
        if copy:
            self.src_a, self.src_b = self.src_b, self.src_a
            self._copy(rdr, gnm)
        self._interp(rdr, gnm, dim, ts, td)
        # Don't start iterating until the previous frame's filters are done
        # with the framebuffers.
        if self.filt_evt:
            self.stream_a.wait_for_event(self.filt_evt)
        self._iter(rdr, gnm, gprof, dim, tc)
        # Likewise, don't filter until the previous frame's dtoh copy out of
        # the back buffer has finished.
        if self.copy_evt:
            self.stream_a.wait_for_event(self.copy_evt)
        for filt in rdr.filts:
            params = getattr(gprof.filters, filt.name)
            filt.apply(self.fb, gprof, params, dim, tc, self.stream_a)
        rdr.out.convert(self.fb, gprof, dim, self.stream_a)
        self.filt_evt = cuda.Event().record(self.stream_a)
        h_out = rdr.out.copy(self.fb, dim, self.fb.pool, self.stream_a)
        self.copy_evt = DurationEvent(timing_event).record(self.stream_a)

        # Swap the double-buffered resources so the next frame uses the other
        # set while this one drains.
        self.info_a, self.info_b = self.info_b, self.info_a
        self.stream_a, self.stream_b = self.stream_b, self.stream_a
        return self.copy_evt, h_out
 |