mirror of
synced 2025-02-05 11:40:04 -05:00
Refactor host rendering code for better load
This commit is contained in:
@ -107,7 +107,19 @@ class Animation(object):
In other words, it's best to use exactly one Animation for each
In other words, it's best to use exactly one Animation for each
interpolated sequence between one or two genomes.
interpolated sequence between one or two genomes.
# Large launches lock the display for a considerable period and may be
# killed due to a device timeout; small launches are harder to load-balance
# on the GPU and incur overhead. This empirical value is multiplied by the
# number of SMs on the device to determine how many blocks should be in
# each launch. Extremely high quality, high resolution renders may still
# encounter a device timeout, requiring the user to increase the split
# amount. This factor is not used in async mode.
cmp_options = ('-use_fast_math', '-maxrregcount', '42')
cmp_options = ('-use_fast_math', '-maxrregcount', '42')
keep = False
keep = False
def __init__(self, ctypes_genome_array):
def __init__(self, ctypes_genome_array):
@ -170,7 +182,8 @@ class Animation(object):
self.mod = cuda.module_from_buffer(self.cubin, jit_options)
self.mod = cuda.module_from_buffer(self.cubin, jit_options)
def render_frames(self, times=None, block=True):
def render_frames(self, times=None, sync=False):
Render a flame for each genome in the iterable value 'genomes'.
Render a flame for each genome in the iterable value 'genomes'.
Returns a Python generator object which will yield a 2-tuple of
Returns a Python generator object which will yield a 2-tuple of
@ -192,260 +205,182 @@ class Animation(object):
``times`` is a sequence of center times at which to render, or ``None``
``times`` is a sequence of center times at which to render, or ``None``
to render one frame for each genome used to create the animation.
to render one frame for each genome used to create the animation.
``block`` will cause this thread to spin, waiting for the GPU to
If ``sync`` is True, the CPU will sync with the GPU after every block
finish the current task. Otherwise, this generator will yield ``None``
of temporal samples and yield None until the frame is ready. This
until the GPU is finished, for filtering later.
allows a single-card system to avoid having to go thirty seconds
between window refreshes while rendering. Otherwise, tasks will be
piled asynchronously on the card so that it is always under load.
f = self.features
times = times if times is not None else [cp.time for cp in self.genomes]
times = times if times is not None else [cp.time for cp in self.genomes]
iter_stream = cuda.Stream()
filt_stream = cuda.Stream()
cen_cp = pyflam3.Genome()
dst_cp = pyflam3.Genome()
if block:
nbins = f.acc_height * f.acc_stride
rdr = _AnimRenderer(self)
d_accum = cuda.mem_alloc(16 * nbins)
for t in times:
d_out = cuda.mem_alloc(16 * nbins)
yield rdr.get_result()
num_sm = cuda.Context.get_device().multiprocessor_count
if sync:
cps_per_block = num_sm * self.SM_FACTOR
# TODO: share buffers.
cps_per_block = f.max_cps
rdrs = [_AnimRenderer(self) for i in range(2)]
# Zip up each genome with an alternating renderer, plus 2 empty
info_size = self._iter.packer.align * cps_per_block
# genomes at the end to flush all pending tasks
d_infos = cuda.mem_alloc(info_size)
exttimes = times[:] + [None, None]
d_palmem = cuda.mem_alloc(256 * f.palette_height * 4)
for rdr, t in izip(cycle(rdrs), exttimes):
if rdr.pending:
while not rdr.done():
yield None
yield rdr.get_result()
if t is not None:
def _interp(self, time, cp):
seeds = mwc.MWC.make_seeds(self._iter.NTHREADS * cps_per_block)
flam3_interpolate(self._g_arr, len(self._g_arr), time, 0, byref(cp))
d_seeds = cuda.to_device(seeds)
class _AnimRenderer(object):
h_infos = cuda.pagelocked_empty((info_size / 4,), np.float32)
# Large launches lock the display for a considerable period and may be
h_palmem = cuda.pagelocked_empty(
# killed due to a device timeout; small launches are harder to load-balance
(f.palette_height, 256, 4), np.uint8)
# on the GPU and incur overhead. This empirical value is multiplied by the
h_out = cuda.pagelocked_empty((f.acc_height, f.acc_stride, 4), np.float32)
# number of SMs on the device to determine how many blocks should be in
# each launch. Extremely high quality, high resolution renders may still
# encounter a device timeout, and no workaround is in place for that yet.
# Currently, palette interpolation is done independently of animation
filter_done_event = None
# interpolation, so that the process is not biased and so we only need to
# mess about with one texture per renderer. This many steps will always be
# used, no matter the number of time steps.
# Use synchronous launches
packer = self._iter.packer
sync = False
iter_fun = self.mod.get_function("iter")
# Delay this long between iterations (only active when sync is True)
sleep = None
def __init__(self, anim):
self.anim = anim
self.pending = False
self.cen_time = None
self.stream = cuda.Stream()
self._nsms = cuda.Context.get_device().multiprocessor_count
self.cps_per_block = self._nsms * self.SM_FACTOR
self.ncps = anim.features.max_cps
self.nblocks = int(math.ceil(self.ncps / float(self.cps_per_block)))
# These are stored to avoid leaks, not to be stateful in method calls
self._dst_cp = pyflam3.Genome()
memset(byref(self._dst_cp), 0, sizeof(self._dst_cp))
self._cen_cp = pyflam3.Genome()
memset(byref(self._cen_cp), 0, sizeof(self._cen_cp))
self.nbins = anim.features.acc_height * anim.features.acc_stride
self.d_accum = cuda.mem_alloc(16 * self.nbins)
self.d_out = cuda.mem_alloc(16 * self.nbins)
info_size = anim._iter.packer.align * self.ncps
self.d_infos = cuda.mem_alloc(info_size)
# Defer generation of seeds until they're first needed
self.d_seeds = None
# During the main rendering loop, we alternate between two streams and
# two sets of seeds, synchronizing them at the end of rendering.
self.alt_stream = cuda.Stream()
self.d_alt_seeds = None
# It's less than ideal, but we lock some memory ahead of time
self.h_infos_locked = cuda.pagelocked_empty((info_size/4,), np.float32)
if self.sync:
self.stream = self.alt_stream = None
def render(self, cen_time):
assert not self.pending, "Tried to render with results pending!"
self.pending = True
self.cen_time = cen_time
a = self.anim
cen_cp = self._cen_cp
a._interp(cen_time, cen_cp)
palette = self._interp_colors(cen_time, cen_cp)
util.BaseCode.zero_dptr(a.mod, self.d_accum, 4 * self.nbins,
# Ensure all main stream tasks are done before starting alt stream
if not self.sync:
dpal = cuda.make_multichannel_2d_array(palette, 'C')
tref = a.mod.get_texref('palTex')
tref.set_format(cuda.array_format.UNSIGNED_INT8, 4)
cp = self._dst_cp
packer = a._iter.packer
iter_fun = a.mod.get_function("iter")
# Must be accumulated over all CPs
util.BaseCode.zero_dptr(self.mod, d_accum, 4 * nbins, filt_stream)
gam, vib = 0, 0
bkgd = np.zeros(3)
# This is gross, but there are a lot of fiddly corner cases with any
last_time = times[0]
# index-based iteration scheme.
times = list(enumerate(self._mk_dts(cen_time, cen_cp, self.ncps)))
for b, block_times in enumerate(_chunk(times, self.cps_per_block)):
on_main = b % 2 == 0
stream = self.stream if on_main else self.alt_stream
d_seeds = self.d_seeds if on_main else self.d_alt_seeds
if not d_seeds:
for time in times:
seeds = mwc.MWC.make_seeds(a._iter.NTHREADS *
self._interp(cen_cp, time)
if self.sync:
h_palmem[:] = self._interp_colors(dst_cp, time,
d_seeds = cuda.to_device(seeds)
cuda.memcpy_htod_async(d_palmem, h_palmem, iter_stream)
tref = self.mod.get_texref('palTex')
array_info = cuda.ArrayDescriptor()
array_info.height = f.palette_height
array_info.width = 256
array_info.array_format = cuda.array_format.UNSIGNED_INT8
array_info.num_channels = 4
tref.set_address_2d(d_palmem, array_info, 1024)
tref.set_format(cuda.array_format.UNSIGNED_INT8, 4)
# Must be accumulated over all CPs
gam, vib = 0, 0
bkgd = np.zeros(3)
mblur_times = enumerate( np.linspace(-0.5, 0.5, cen_cp.ntemporal_samples)
* cen_cp.temporal_filter_width + time )
for block_times in _chunk(list(mblur_times), cps_per_block):
infos = []
if len(self.genomes) > 1:
for n, t in block_times:
self._interp(dst_cp, t)
frac = float(n) / cen_cp.ntemporal_samples
info = packer.pack(cp=Genome(dst_cp), cp_step_frac=frac)
gam += dst_cp.gamma
vib += dst_cp.vibrancy
bkgd += np.array(dst_cp.background)
size = seeds.dtype.itemsize * seeds.size
# Can't interpolate normally; just pack copies
d_seeds = cuda.mem_alloc(size)
packed = packer.pack(cp=self.genomes[0], cp_step_frac=0)
h_seeds = cuda.pagelocked_empty(seeds.shape, seeds.dtype)
infos = [packed] * len(block_times)
h_seeds[:] = seeds
gam += self.genomes[0].gamma * len(block_times)
cuda.memcpy_htod_async(d_seeds, h_seeds, stream)
vib += self.genomes[0].vibrancy * len(block_times)
if on_main:
bkgd += np.array(self.genomes[0].background) * len(block_times)
self.d_seeds = d_seeds
self.d_alt_seeds = d_seeds
infos = []
infos = np.concatenate(infos)
if len(a.genomes) > 1:
h_infos[:len(infos)] = infos
for n, t in block_times:
cuda.memcpy_htod_async(d_infos, h_infos)
a._interp(t, cp)
frac = float(n) / cen_cp.ntemporal_samples
info = packer.pack(cp=Genome(cp), cp_step_frac=frac)
gam += cp.gamma
vib += cp.vibrancy
bkgd += np.array(cp.background)
# Can't interpolate normally; just pack copies
packed = packer.pack(cp=a.genomes[0], cp_step_frac=0)
infos = [packed] * len(block_times)
gam += a.genomes[0].gamma * len(block_times)
vib += a.genomes[0].vibrancy * len(block_times)
bkgd += np.array(a.genomes[0].background) * len(block_times)
infos = np.concatenate(infos)
if filter_done_event:
offset = b * packer.align * self.cps_per_block
d_info_off = int(self.d_infos) + offset
if self.sync:
cuda.memcpy_htod(d_info_off, infos)
h_infos = self.h_infos_locked[offset/4:offset/4+len(infos)]
h_infos[:] = infos
cuda.memcpy_htod_async(d_info_off, h_infos, stream)
iter_fun(d_seeds, np.uintp(d_info_off), np.uint64(self.d_accum),
# TODO: replace with option to split long runs into shorter ones
block=(32, a._iter.NTHREADS/32, 1),
# for interactivity
grid=(len(block_times), 1),
for i in range(1):
texrefs=[tref], stream=stream)
iter_fun(d_seeds, d_infos, np.uint64(d_accum),
block=(32, self._iter.NTHREADS/32, 1),
grid=(len(block_times), 1),
texrefs=[tref], stream=iter_stream)
if self.sync and self.sleep:
if sync:
yield None
# Now ensure all alt stream tasks are done before continuing main
if filter_done_event and not sync:
if not self.sync:
yield last_time, self._trim(h_out)
last_time = time
util.BaseCode.zero_dptr(a.mod, self.d_out, 4 * self.nbins,
util.BaseCode.zero_dptr(self.mod, d_out, 4 * nbins, filt_stream)
self._de.invoke(self.mod, Genome(cen_cp), d_accum, d_out, filt_stream)
a._de.invoke(a.mod, Genome(cen_cp), self.d_accum, self.d_out,
util.BaseCode.zero_dptr(self.mod, d_accum, 4 * nbins, filt_stream)
filter_done_event = cuda.Event().record(filt_stream)
f = np.float32
f32 = np.float32
n = f(self.ncps)
n = f32(cen_cp.ntemporal_samples)
gam = f(n / gam)
gam = f32(n / gam)
vib = f(vib / n)
vib = f32(vib / n)
hipow = f(cen_cp.highlight_power)
hipow = f32(cen_cp.highlight_power)
lin = f(cen_cp.gam_lin_thresh)
lin = f32(cen_cp.gam_lin_thresh)
lingam = f(math.pow(cen_cp.gam_lin_thresh, gam-1.0) if lin > 0 else 0)
lingam = f32(math.pow(cen_cp.gam_lin_thresh, gam-1.0) if lin > 0 else 0)
bkgd = vec.make_float3(*(bkgd / n))
bkgd = vec.make_float3(*(bkgd / n))
# TODO: get block size from colorclip class? It actually does not
color_fun = self.mod.get_function("colorclip")
# depend on that being the case
color_fun(d_out, gam, vib, hipow, lin, lingam, bkgd,
color_fun = a.mod.get_function("colorclip")
block=(256, 1, 1), grid=(nbins / 256, 1),
color_fun(self.d_out, gam, vib, hipow, lin, lingam, bkgd,
block=(256, 1, 1), grid=(self.nbins / 256, 1),
cuda.memcpy_dtoh_async(h_out, d_out, filt_stream)
if sync:
yield time, self._trim(h_out)
# TODO: The stream seems to sync right here, automatically, before
if not sync:
# returning. I think PyCUDA is forcing a sync when something drops out
# of scope. Investigate.
yield time, self._trim(h_out)
def _pal_to_np(self, cp):
def _interp(self, cp, time):
flam3_interpolate(self._g_arr, len(self._g_arr), time, 0, byref(cp))
def _pal_to_np(cp):
# Converting palettes by iteration has an enormous performance
# Converting palettes by iteration has an enormous performance
# overhead. We cheat massively and dangerously here.
# overhead. We cheat massively and dangerously here.
pal = cast(pointer(cp.palette), POINTER(c_double * (256 * 5)))
pal = cast(pointer(cp.palette), POINTER(c_double * (256 * 5)))
val = np.frombuffer(buffer(pal.contents), count=256*5)
val = np.frombuffer(buffer(pal.contents), count=256*5)
return np.uint8(np.reshape(val, (256, 5))[:,1:] * 255.0)
return np.uint8(np.reshape(val, (256, 5))[:,1:] * 255.0)
def _interp_colors(self, cen_time, cen_cp):
def _interp_colors(self, cp, time, twidth):
# TODO: any visible difference between uint8 and richer formats?
# TODO: any visible difference between uint8 and richer formats?
pal = np.empty((self.PAL_HEIGHT, 256, 4), dtype=np.uint8)
height = self.features.palette_height
a = self.anim
pal = np.empty((height, 256, 4), dtype=np.uint8)
if len(a.genomes) > 1:
if len(self.genomes) > 1:
# The typical case; applying real motion blur
# The typical case; applying real motion blur
cp = self._dst_cp
times = np.linspace(-0.5, 0.5, height) * twidth + time
times = self._mk_dts(cen_time, cen_cp, self.PAL_HEIGHT)
for n, t in enumerate(times):
for n, t in enumerate(times):
a._interp(t, cp)
self._interp(cp, t)
pal[n] = self._pal_to_np(cp)
pal[n] = self._pal_to_np(cp)
# Cannot call any interp functions on a single genome; rather than
# Cannot call any interp functions on a single genome; rather than
# have alternate code-paths, just copy the same colors everywhere
# have alternate code-paths, just copy the same colors everywhere
pal[0] = self._pal_to_np(a.genomes[0])
pal[0] = self._pal_to_np(self.genomes[0])
pal[1:] = pal[0]
pal[1:] = pal[0]
return pal
return pal
def done(self):
def _trim(self, result):
if self.sync:
g = self.features.gutter
return True
return result[g:-g,g:-g].copy()
return self.stream.is_done()
def get_result(self):
if not self.sync:
self.pending = False
a = self.anim
obuf_dim = (a.features.acc_height, a.features.acc_stride, 4)
out = cuda.from_device(self.d_out, obuf_dim, np.float32)
g = a.features.gutter
return self.cen_time, out[g:-g,g:-g]
def _mk_dts(cen_time, cen_cp, ncps):
w = cen_cp.temporal_filter_width
return [cen_time + w * (t / (ncps - 1.0) - 0.5) for t in range(ncps)]
class Features(object):
class Features(object):
@ -15,6 +15,7 @@ import argparse
import multiprocessing
import multiprocessing
from subprocess import Popen
from subprocess import Popen
from ctypes import *
from ctypes import *
from itertools import ifilter
import numpy as np
import numpy as np
import Image
import Image
@ -47,6 +48,7 @@ def save(args, time, raw):
noalpha = raw[:,:,:3]
noalpha = raw[:,:,:3]
if args.raw:
if args.raw:
real_stdout.write(buffer(np.uint8(noalpha * 255.0)))
real_stdout.write(buffer(np.uint8(noalpha * 255.0)))
name = fmt_filename(args, time)
name = fmt_filename(args, time)
@ -161,7 +163,7 @@ def main(args):
def on_mouse_motion(x, y, dx, dy):
def on_mouse_motion(x, y, dx, dy):
frames = anim.render_frames(times, block=False)
frames = anim.render_frames(times, sync=args.sync)
def poll(dt):
def poll(dt):
out = next(frames, False)
out = next(frames, False)
if out is False:
if out is False:
@ -173,14 +175,20 @@ def main(args):
imgbuf = np.uint8(buf.flatten() * 255)
imgbuf = np.uint8(buf.flatten() * 255)
image.set_data('RGBA', -anim.features.width*4, imgbuf.tostring())
image.set_data('RGBA', -anim.features.width*4, imgbuf.tostring())
label.text = '%s %4g' % (args.name, time)
label.text = '%s %4g' % (args.name, time)
label.text += '.'
if args.sleep:
pyglet.clock.schedule_interval(poll, 1/30.)
pyglet.clock.schedule_interval(poll, 1/30.)
for time, out in anim.render_frames(times):
for time, out in ifilter(None, anim.render_frames(times, sync=args.sync)):
save(args, time, out)
save(args, time, out)
if args.sleep:
if __name__ == "__main__":
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Render fractal flames.')
parser = argparse.ArgumentParser(description='Render fractal flames.')
Reference in New Issue
Block a user