diff --git a/cuburn/render.py b/cuburn/render.py
index f1a311e..13a591b 100644
--- a/cuburn/render.py
+++ b/cuburn/render.py
@@ -107,7 +107,19 @@ class Animation(object):
     In other words, it's best to use exactly one Animation for each
     interpolated sequence between one or two genomes.
     """
+
+    # Large launches lock the display for a considerable period and may be
+    # killed due to a device timeout; small launches are harder to load-balance
+    # on the GPU and incur overhead. This empirical value is multiplied by the
+    # number of SMs on the device to determine how many blocks should be in
+    # each launch. Extremely high quality, high resolution renders may still
+    # encounter a device timeout, requiring the user to increase the split
+    # amount. This factor is not used in async mode.
+    SM_FACTOR = 8
+
     cmp_options = ('-use_fast_math', '-maxrregcount', '42')
+
+    keep = False
 
     def __init__(self, ctypes_genome_array):
@@ -170,7 +182,8 @@ class Animation(object):
         self.compile()
         self.mod = cuda.module_from_buffer(self.cubin, jit_options)
 
-    def render_frames(self, times=None, block=True):
+
+    def render_frames(self, times=None, sync=False):
         """
         Render a flame for each genome in the iterable value 'genomes'.
         Returns a Python generator object which will yield a 2-tuple of
@@ -192,260 +205,182 @@ class Animation(object):
         ``times`` is a sequence of center times at which to render, or
         ``None`` to render one frame for each genome used to create the
         animation.
-        ``block`` will cause this thread to spin, waiting for the GPU to
-        finish the current task. Otherwise, this generator will yield ``None``
-        until the GPU is finished, for filtering later.
+        If ``sync`` is True, the CPU will sync with the GPU after every block
+        of temporal samples and yield None until the frame is ready. This
+        allows a single-card system to avoid having to go thirty seconds
+        between window refreshes while rendering. Otherwise, tasks will be
+        piled asynchronously on the card so that it is always under load.
         """
+
+        f = self.features
+        times = times if times is not None else [cp.time for cp in self.genomes]
+        iter_stream = cuda.Stream()
+        filt_stream = cuda.Stream()
+        cen_cp = pyflam3.Genome()
+        dst_cp = pyflam3.Genome()
-        if block:
-            rdr = _AnimRenderer(self)
-            for t in times:
-                rdr.render(t)
-                yield rdr.get_result()
+        nbins = f.acc_height * f.acc_stride
+        d_accum = cuda.mem_alloc(16 * nbins)
+        d_out = cuda.mem_alloc(16 * nbins)
+
+        num_sm = cuda.Context.get_device().multiprocessor_count
+        if sync:
+            cps_per_block = num_sm * self.SM_FACTOR
         else:
-            # TODO: share buffers.
-            rdrs = [_AnimRenderer(self) for i in range(2)]
+            cps_per_block = f.max_cps
 
-            # Zip up each genome with an alternating renderer, plus 2 empty
-            # genomes at the end to flush all pending tasks
-            exttimes = times[:] + [None, None]
-            for rdr, t in izip(cycle(rdrs), exttimes):
-                if rdr.pending:
-                    while not rdr.done():
-                        yield None
-                    yield rdr.get_result()
-                if t is not None:
-                    rdr.render(t)
+        info_size = self._iter.packer.align * cps_per_block
+        d_infos = cuda.mem_alloc(info_size)
+        d_palmem = cuda.mem_alloc(256 * f.palette_height * 4)
 
-    def _interp(self, time, cp):
-        flam3_interpolate(self._g_arr, len(self._g_arr), time, 0, byref(cp))
+        seeds = mwc.MWC.make_seeds(self._iter.NTHREADS * cps_per_block)
+        d_seeds = cuda.to_device(seeds)
 
-class _AnimRenderer(object):
-    # Large launches lock the display for a considerable period and may be
-    # killed due to a device timeout; small launches are harder to load-balance
-    # on the GPU and incur overhead. This empirical value is multiplied by the
-    # number of SMs on the device to determine how many blocks should be in
-    # each launch. Extremely high quality, high resolution renders may still
-    # encounter a device timeout, and no workaround is in place for that yet.
-    SM_FACTOR = 8
+        h_infos = cuda.pagelocked_empty((info_size / 4,), np.float32)
+        h_palmem = cuda.pagelocked_empty(
+                (f.palette_height, 256, 4), np.uint8)
+        h_out = cuda.pagelocked_empty((f.acc_height, f.acc_stride, 4), np.float32)
-    # Currently, palette interpolation is done independently of animation
-    # interpolation, so that the process is not biased and so we only need to
-    # mess about with one texture per renderer. This many steps will always be
-    # used, no matter the number of time steps.
-    PAL_HEIGHT = 16
+        filter_done_event = None
-    # Use synchronous launches
-    sync = False
-    # Delay this long between iterations (only active when sync is True)
-    sleep = None
-
-    def __init__(self, anim):
-        self.anim = anim
-        self.pending = False
-        self.cen_time = None
-        self.stream = cuda.Stream()
-
-        self._nsms = cuda.Context.get_device().multiprocessor_count
-        self.cps_per_block = self._nsms * self.SM_FACTOR
-        self.ncps = anim.features.max_cps
-        self.nblocks = int(math.ceil(self.ncps / float(self.cps_per_block)))
-
-        # These are stored to avoid leaks, not to be stateful in method calls
-        self._dst_cp = pyflam3.Genome()
-        memset(byref(self._dst_cp), 0, sizeof(self._dst_cp))
-        self._cen_cp = pyflam3.Genome()
-        memset(byref(self._cen_cp), 0, sizeof(self._cen_cp))
-        self.nbins = anim.features.acc_height * anim.features.acc_stride
-        self.d_accum = cuda.mem_alloc(16 * self.nbins)
-        self.d_out = cuda.mem_alloc(16 * self.nbins)
-        info_size = anim._iter.packer.align * self.ncps
-        self.d_infos = cuda.mem_alloc(info_size)
-        # Defer generation of seeds until they're first needed
-        self.d_seeds = None
-        # During the main rendering loop, we alternate between two streams and
-        # two sets of seeds, synchronizing them at the end of rendering.
-        self.alt_stream = cuda.Stream()
-        self.d_alt_seeds = None
-        # It's less than ideal, but we lock some memory ahead of time
-        self.h_infos_locked = cuda.pagelocked_empty((info_size/4,), np.float32)
-        if self.sync:
-            self.stream = self.alt_stream = None
-    def render(self, cen_time):
-        assert not self.pending, "Tried to render with results pending!"
-        self.pending = True
-        self.cen_time = cen_time
-        a = self.anim
-        cen_cp = self._cen_cp
-        a._interp(cen_time, cen_cp)
-        palette = self._interp_colors(cen_time, cen_cp)
-        util.BaseCode.zero_dptr(a.mod, self.d_accum, 4 * self.nbins,
-                                self.stream)
-        # Ensure all main stream tasks are done before starting alt stream
-        if not self.sync:
-            self.alt_stream.wait_for_event(cuda.Event().record(self.stream))
-        dpal = cuda.make_multichannel_2d_array(palette, 'C')
-        tref = a.mod.get_texref('palTex')
-        tref.set_array(dpal)
-        tref.set_format(cuda.array_format.UNSIGNED_INT8, 4)
-        tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
-        tref.set_filter_mode(cuda.filter_mode.LINEAR)
-        cp = self._dst_cp
-        packer = a._iter.packer
-        iter_fun = a.mod.get_function("iter")
+        packer = self._iter.packer
+        iter_fun = self.mod.get_function("iter")
         #iter_fun.set_cache_config(cuda.func_cache.PREFER_L1)
-        # Must be accumulated over all CPs
-        gam, vib = 0, 0
-        bkgd = np.zeros(3)
+        util.BaseCode.zero_dptr(self.mod, d_accum, 4 * nbins, filt_stream)
-        # This is gross, but there are a lot of fiddly corner cases with any
-        # index-based iteration scheme.
-        times = list(enumerate(self._mk_dts(cen_time, cen_cp, self.ncps)))
-        for b, block_times in enumerate(_chunk(times, self.cps_per_block)):
-            on_main = b % 2 == 0
-            stream = self.stream if on_main else self.alt_stream
-            d_seeds = self.d_seeds if on_main else self.d_alt_seeds
+        last_time = times[0]
-            if not d_seeds:
-                seeds = mwc.MWC.make_seeds(a._iter.NTHREADS *
-                                           self.cps_per_block)
-                if self.sync:
-                    d_seeds = cuda.to_device(seeds)
+        for time in times:
+            self._interp(cen_cp, time)
+
+            h_palmem[:] = self._interp_colors(dst_cp, time,
+                                              cen_cp.temporal_filter_width)
+            cuda.memcpy_htod_async(d_palmem, h_palmem, iter_stream)
+            tref = self.mod.get_texref('palTex')
+            array_info = cuda.ArrayDescriptor()
+            array_info.height = f.palette_height
+            array_info.width = 256
+            array_info.array_format = cuda.array_format.UNSIGNED_INT8
+            array_info.num_channels = 4
+            tref.set_address_2d(d_palmem, array_info, 1024)
+
+            tref.set_format(cuda.array_format.UNSIGNED_INT8, 4)
+            tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
+            tref.set_filter_mode(cuda.filter_mode.LINEAR)
+
+            # Must be accumulated over all CPs
+            gam, vib = 0, 0
+            bkgd = np.zeros(3)
+
+            mblur_times = enumerate( np.linspace(-0.5, 0.5, cen_cp.ntemporal_samples)
+                                     * cen_cp.temporal_filter_width + time )
+
+            for block_times in _chunk(list(mblur_times), cps_per_block):
+                infos = []
+                if len(self.genomes) > 1:
+                    for n, t in block_times:
+                        self._interp(dst_cp, t)
+                        frac = float(n) / cen_cp.ntemporal_samples
+                        info = packer.pack(cp=Genome(dst_cp), cp_step_frac=frac)
+                        infos.append(info)
+                        gam += dst_cp.gamma
+                        vib += dst_cp.vibrancy
+                        bkgd += np.array(dst_cp.background)
                 else:
-                    size = seeds.dtype.itemsize * seeds.size
-                    d_seeds = cuda.mem_alloc(size)
-                    h_seeds = cuda.pagelocked_empty(seeds.shape, seeds.dtype)
-                    h_seeds[:] = seeds
-                    cuda.memcpy_htod_async(d_seeds, h_seeds, stream)
-                    if on_main:
-                        self.d_seeds = d_seeds
-                    else:
-                        self.d_alt_seeds = d_seeds
+                    # Can't interpolate normally; just pack copies
+                    packed = packer.pack(cp=self.genomes[0], cp_step_frac=0)
+                    infos = [packed] * len(block_times)
+                    gam += self.genomes[0].gamma * len(block_times)
+                    vib += self.genomes[0].vibrancy * len(block_times)
+                    bkgd += np.array(self.genomes[0].background) * len(block_times)
-            infos = []
-            if len(a.genomes) > 1:
-                for n, t in block_times:
-                    a._interp(t, cp)
-                    frac = float(n) / cen_cp.ntemporal_samples
-                    info = packer.pack(cp=Genome(cp), cp_step_frac=frac)
-                    infos.append(info)
-                    gam += cp.gamma
-                    vib += cp.vibrancy
-                    bkgd += np.array(cp.background)
-            else:
-                # Can't interpolate normally; just pack copies
-                packed = packer.pack(cp=a.genomes[0], cp_step_frac=0)
-                infos = [packed] * len(block_times)
-                gam += a.genomes[0].gamma * len(block_times)
-                vib += a.genomes[0].vibrancy * len(block_times)
-                bkgd += np.array(a.genomes[0].background) * len(block_times)
+                infos = np.concatenate(infos)
+                h_infos[:len(infos)] = infos
+                cuda.memcpy_htod_async(d_infos, h_infos)
-            infos = np.concatenate(infos)
-            offset = b * packer.align * self.cps_per_block
-            d_info_off = int(self.d_infos) + offset
-            if self.sync:
-                cuda.memcpy_htod(d_info_off, infos)
-            else:
-                h_infos = self.h_infos_locked[offset/4:offset/4+len(infos)]
-                h_infos[:] = infos
-                cuda.memcpy_htod_async(d_info_off, h_infos, stream)
+                if filter_done_event:
+                    iter_stream.wait_for_event(filter_done_event)
-            iter_fun(d_seeds, np.uintp(d_info_off), np.uint64(self.d_accum),
-                     block=(32, a._iter.NTHREADS/32, 1),
-                     grid=(len(block_times), 1),
-                     texrefs=[tref], stream=stream)
+                # TODO: replace with option to split long runs into shorter ones
+                # for interactivity
+                for i in range(1):
+                    iter_fun(d_seeds, d_infos, np.uint64(d_accum),
+                             block=(32, self._iter.NTHREADS/32, 1),
+                             grid=(len(block_times), 1),
+                             texrefs=[tref], stream=iter_stream)
-            if self.sync and self.sleep:
-                time.sleep(self.sleep)
+                if sync:
+                    iter_stream.synchronize()
+                    yield None
-        # Now ensure all alt stream tasks are done before continuing main
-        if not self.sync:
-            self.stream.wait_for_event(cuda.Event().record(self.alt_stream))
+            if filter_done_event and not sync:
+                filt_stream.synchronize()
+                yield last_time, self._trim(h_out)
+            last_time = time
-        util.BaseCode.zero_dptr(a.mod, self.d_out, 4 * self.nbins,
-                                self.stream)
-        a._de.invoke(a.mod, Genome(cen_cp), self.d_accum, self.d_out,
-                     self.stream)
+            util.BaseCode.zero_dptr(self.mod, d_out, 4 * nbins, filt_stream)
+            self._de.invoke(self.mod, Genome(cen_cp), d_accum, d_out, filt_stream)
+            util.BaseCode.zero_dptr(self.mod, d_accum, 4 * nbins, filt_stream)
+            filter_done_event = cuda.Event().record(filt_stream)
-        f = np.float32
-        n = f(self.ncps)
-        gam = f(n / gam)
-        vib = f(vib / n)
-        hipow = f(cen_cp.highlight_power)
-        lin = f(cen_cp.gam_lin_thresh)
-        lingam = f(math.pow(cen_cp.gam_lin_thresh, gam-1.0) if lin > 0 else 0)
-        bkgd = vec.make_float3(*(bkgd / n))
+            f32 = np.float32
+            n = f32(cen_cp.ntemporal_samples)
+            gam = f32(n / gam)
+            vib = f32(vib / n)
+            hipow = f32(cen_cp.highlight_power)
+            lin = f32(cen_cp.gam_lin_thresh)
+            lingam = f32(math.pow(cen_cp.gam_lin_thresh, gam-1.0) if lin > 0 else 0)
+            bkgd = vec.make_float3(*(bkgd / n))
-        # TODO: get block size from colorclip class? It actually does not
-        # depend on that being the case
-        color_fun = a.mod.get_function("colorclip")
-        color_fun(self.d_out, gam, vib, hipow, lin, lingam, bkgd,
-                  block=(256, 1, 1), grid=(self.nbins / 256, 1),
-                  stream=self.stream)
+            color_fun = self.mod.get_function("colorclip")
+            color_fun(d_out, gam, vib, hipow, lin, lingam, bkgd,
+                      block=(256, 1, 1), grid=(nbins / 256, 1),
+                      stream=filt_stream)
+            cuda.memcpy_dtoh_async(h_out, d_out, filt_stream)
+            if sync:
+                filt_stream.synchronize()
+                yield time, self._trim(h_out)
-        # TODO: The stream seems to sync right here, automatically, before
-        # returning. I think PyCUDA is forcing a sync when something drops out
-        # of scope. Investigate.
+        if not sync:
+            filt_stream.synchronize()
+            yield time, self._trim(h_out)
-    def _pal_to_np(self, cp):
+    def _interp(self, cp, time):
+        flam3_interpolate(self._g_arr, len(self._g_arr), time, 0, byref(cp))
+
+    @staticmethod
+    def _pal_to_np(cp):
         # Converting palettes by iteration has an enormous performance
         # overhead. We cheat massively and dangerously here.
         pal = cast(pointer(cp.palette), POINTER(c_double * (256 * 5)))
         val = np.frombuffer(buffer(pal.contents), count=256*5)
         return np.uint8(np.reshape(val, (256, 5))[:,1:] * 255.0)
-    def _interp_colors(self, cen_time, cen_cp):
+    def _interp_colors(self, cp, time, twidth):
         # TODO: any visible difference between uint8 and richer formats?
-        pal = np.empty((self.PAL_HEIGHT, 256, 4), dtype=np.uint8)
-        a = self.anim
+        height = self.features.palette_height
+        pal = np.empty((height, 256, 4), dtype=np.uint8)
-        if len(a.genomes) > 1:
+        if len(self.genomes) > 1:
             # The typical case; applying real motion blur
-            cp = self._dst_cp
-            times = self._mk_dts(cen_time, cen_cp, self.PAL_HEIGHT)
+            times = np.linspace(-0.5, 0.5, height) * twidth + time
             for n, t in enumerate(times):
-                a._interp(t, cp)
+                self._interp(cp, t)
                 pal[n] = self._pal_to_np(cp)
         else:
             # Cannot call any interp functions on a single genome; rather than
             # have alternate code-paths, just copy the same colors everywhere
-            pal[0] = self._pal_to_np(a.genomes[0])
+            pal[0] = self._pal_to_np(self.genomes[0])
             pal[1:] = pal[0]
         return pal
-    def done(self):
-        if self.sync:
-            return True
-        return self.stream.is_done()
+    def _trim(self, result):
+        g = self.features.gutter
+        return result[g:-g,g:-g].copy()
-    def get_result(self):
-        if not self.sync:
-            self.stream.synchronize()
-        self.pending = False
-        a = self.anim
-        obuf_dim = (a.features.acc_height, a.features.acc_stride, 4)
-        out = cuda.from_device(self.d_out, obuf_dim, np.float32)
-        g = a.features.gutter
-        return self.cen_time, out[g:-g,g:-g]
-
-    @staticmethod
-    def _mk_dts(cen_time, cen_cp, ncps):
-        w = cen_cp.temporal_filter_width
-        return [cen_time + w * (t / (ncps - 1.0) - 0.5) for t in range(ncps)]
 
 class Features(object):
     """
diff --git a/main.py b/main.py
index a7f7fea..d19680d 100644
--- a/main.py
+++ b/main.py
@@ -15,6 +15,7 @@ import argparse
 import multiprocessing
 from subprocess import Popen
 from ctypes import *
+from itertools import ifilter
 
 import numpy as np
 import Image
@@ -47,6 +48,7 @@ def save(args, time, raw):
     noalpha = raw[:,:,:3]
     if args.raw:
         real_stdout.write(buffer(np.uint8(noalpha * 255.0)))
+        sys.stderr.write('.')
         return
 
     name = fmt_filename(args, time)
@@ -161,7 +163,7 @@ def main(args):
         def on_mouse_motion(x, y, dx, dy):
             pass
 
-        frames = anim.render_frames(times, block=False)
+        frames = anim.render_frames(times, sync=args.sync)
         def poll(dt):
             out = next(frames, False)
             if out is False:
@@ -173,14 +175,20 @@ def main(args):
                 imgbuf = np.uint8(buf.flatten() * 255)
                 image.set_data('RGBA', -anim.features.width*4, imgbuf.tostring())
                 label.text = '%s %4g' % (args.name, time)
+            else:
+                label.text += '.'
+            if args.sleep:
+                time.sleep(args.sleep)
 
         pyglet.clock.set_fps_limit(30)
        pyglet.clock.schedule_interval(poll, 1/30.)
         pyglet.app.run()
     else:
-        for time, out in anim.render_frames(times):
+        for time, out in ifilter(None, anim.render_frames(times, sync=args.sync)):
             save(args, time, out)
+            if args.sleep:
+                time.sleep(args.sleep)
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Render fractal flames.')
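
Reviewer note (not part of the patch): the new render path overlaps work on two CUDA
streams, queuing iteration on iter_stream and filtering on filt_stream, with
filter_done_event keeping the next frame's iteration from touching d_accum before
filtering has consumed it. The sketch below shows that same event hand-off pattern in
isolation; the toy kernel and buffer names are made up for illustration, and it only
assumes a working PyCUDA install.

    # Illustrative sketch of the two-stream/event ordering used above.
    import numpy as np
    import pycuda.autoinit
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule

    mod = SourceModule("__global__ void twice(float *a) { a[threadIdx.x] *= 2.0f; }")
    twice = mod.get_function("twice")

    iter_stream = cuda.Stream()   # produces into the device buffer
    filt_stream = cuda.Stream()   # drains ("filters") the previous result

    d_buf = cuda.mem_alloc(256 * 4)
    h_buf = cuda.pagelocked_empty((256,), np.float32)
    h_buf[:] = 1.0
    cuda.memcpy_htod_async(d_buf, h_buf, iter_stream)

    filter_done = None
    for frame in range(3):
        if filter_done is not None:
            # Don't overwrite the buffer until the previous frame's copy-out,
            # queued on filt_stream, has completed.
            iter_stream.wait_for_event(filter_done)
        twice(d_buf, block=(256, 1, 1), grid=(1, 1), stream=iter_stream)
        # Hand the result to the other stream: the copy-out waits on the kernel.
        filt_stream.wait_for_event(cuda.Event().record(iter_stream))
        cuda.memcpy_dtoh_async(h_buf, d_buf, filt_stream)
        filter_done = cuda.Event().record(filt_stream)

    filt_stream.synchronize()
    assert h_buf[0] == 8.0   # three doublings of 1.0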