Refactor host rendering code for better load

This commit is contained in:
Steven Robertson 2011-10-15 22:22:43 -04:00
parent 8e99c9c463
commit 9bafbda81a
2 changed files with 157 additions and 214 deletions

View File

@ -107,7 +107,19 @@ class Animation(object):
In other words, it's best to use exactly one Animation for each In other words, it's best to use exactly one Animation for each
interpolated sequence between one or two genomes. interpolated sequence between one or two genomes.
""" """
# Large launches lock the display for a considerable period and may be
# killed due to a device timeout; small launches are harder to load-balance
# on the GPU and incur overhead. This empirical value is multiplied by the
# number of SMs on the device to determine how many blocks should be in
# each launch. Extremely high quality, high resolution renders may still
# encounter a device timeout, requiring the user to increase the split
# amount. This factor is not used in async mode.
SM_FACTOR = 8
cmp_options = ('-use_fast_math', '-maxrregcount', '42') cmp_options = ('-use_fast_math', '-maxrregcount', '42')
keep = False keep = False
def __init__(self, ctypes_genome_array): def __init__(self, ctypes_genome_array):
@ -170,7 +182,8 @@ class Animation(object):
self.compile() self.compile()
self.mod = cuda.module_from_buffer(self.cubin, jit_options) self.mod = cuda.module_from_buffer(self.cubin, jit_options)
def render_frames(self, times=None, block=True):
def render_frames(self, times=None, sync=False):
""" """
Render a flame for each genome in the iterable value 'genomes'. Render a flame for each genome in the iterable value 'genomes'.
Returns a Python generator object which will yield a 2-tuple of Returns a Python generator object which will yield a 2-tuple of
@ -192,260 +205,182 @@ class Animation(object):
``times`` is a sequence of center times at which to render, or ``None`` ``times`` is a sequence of center times at which to render, or ``None``
to render one frame for each genome used to create the animation. to render one frame for each genome used to create the animation.
``block`` will cause this thread to spin, waiting for the GPU to If ``sync`` is True, the CPU will sync with the GPU after every block
finish the current task. Otherwise, this generator will yield ``None`` of temporal samples and yield None until the frame is ready. This
until the GPU is finished, for filtering later. allows a single-card system to avoid having to go thirty seconds
between window refreshes while rendering. Otherwise, tasks will be
piled asynchronously on the card so that it is always under load.
""" """
f = self.features
times = times if times is not None else [cp.time for cp in self.genomes] times = times if times is not None else [cp.time for cp in self.genomes]
iter_stream = cuda.Stream()
filt_stream = cuda.Stream()
cen_cp = pyflam3.Genome()
dst_cp = pyflam3.Genome()
if block: nbins = f.acc_height * f.acc_stride
rdr = _AnimRenderer(self) d_accum = cuda.mem_alloc(16 * nbins)
for t in times: d_out = cuda.mem_alloc(16 * nbins)
rdr.render(t)
yield rdr.get_result() num_sm = cuda.Context.get_device().multiprocessor_count
if sync:
cps_per_block = num_sm * self.SM_FACTOR
else: else:
# TODO: share buffers. cps_per_block = f.max_cps
rdrs = [_AnimRenderer(self) for i in range(2)]
# Zip up each genome with an alternating renderer, plus 2 empty info_size = self._iter.packer.align * cps_per_block
# genomes at the end to flush all pending tasks d_infos = cuda.mem_alloc(info_size)
exttimes = times[:] + [None, None] d_palmem = cuda.mem_alloc(256 * f.palette_height * 4)
for rdr, t in izip(cycle(rdrs), exttimes):
if rdr.pending:
while not rdr.done():
yield None
yield rdr.get_result()
if t is not None:
rdr.render(t)
def _interp(self, time, cp): seeds = mwc.MWC.make_seeds(self._iter.NTHREADS * cps_per_block)
flam3_interpolate(self._g_arr, len(self._g_arr), time, 0, byref(cp)) d_seeds = cuda.to_device(seeds)
class _AnimRenderer(object): h_infos = cuda.pagelocked_empty((info_size / 4,), np.float32)
# Large launches lock the display for a considerable period and may be h_palmem = cuda.pagelocked_empty(
# killed due to a device timeout; small launches are harder to load-balance (f.palette_height, 256, 4), np.uint8)
# on the GPU and incur overhead. This empirical value is multiplied by the h_out = cuda.pagelocked_empty((f.acc_height, f.acc_stride, 4), np.float32)
# number of SMs on the device to determine how many blocks should be in
# each launch. Extremely high quality, high resolution renders may still
# encounter a device timeout, and no workaround is in place for that yet.
SM_FACTOR = 8
# Currently, palette interpolation is done independently of animation filter_done_event = None
# interpolation, so that the process is not biased and so we only need to
# mess about with one texture per renderer. This many steps will always be
# used, no matter the number of time steps.
PAL_HEIGHT = 16
# Use synchronous launches packer = self._iter.packer
sync = False iter_fun = self.mod.get_function("iter")
# Delay this long between iterations (only active when sync is True) #iter_fun.set_cache_config(cuda.func_cache.PREFER_L1)
sleep = None
def __init__(self, anim): util.BaseCode.zero_dptr(self.mod, d_accum, 4 * nbins, filt_stream)
self.anim = anim
self.pending = False
self.cen_time = None
self.stream = cuda.Stream()
self._nsms = cuda.Context.get_device().multiprocessor_count last_time = times[0]
self.cps_per_block = self._nsms * self.SM_FACTOR
self.ncps = anim.features.max_cps
self.nblocks = int(math.ceil(self.ncps / float(self.cps_per_block)))
# These are stored to avoid leaks, not to be stateful in method calls for time in times:
self._dst_cp = pyflam3.Genome() self._interp(cen_cp, time)
memset(byref(self._dst_cp), 0, sizeof(self._dst_cp))
self._cen_cp = pyflam3.Genome()
memset(byref(self._cen_cp), 0, sizeof(self._cen_cp))
self.nbins = anim.features.acc_height * anim.features.acc_stride h_palmem[:] = self._interp_colors(dst_cp, time,
self.d_accum = cuda.mem_alloc(16 * self.nbins) cen_cp.temporal_filter_width)
self.d_out = cuda.mem_alloc(16 * self.nbins) cuda.memcpy_htod_async(d_palmem, h_palmem, iter_stream)
tref = self.mod.get_texref('palTex')
array_info = cuda.ArrayDescriptor()
array_info.height = f.palette_height
array_info.width = 256
array_info.array_format = cuda.array_format.UNSIGNED_INT8
array_info.num_channels = 4
tref.set_address_2d(d_palmem, array_info, 1024)
info_size = anim._iter.packer.align * self.ncps
self.d_infos = cuda.mem_alloc(info_size)
# Defer generation of seeds until they're first needed
self.d_seeds = None
# During the main rendering loop, we alternate between two streams and
# two sets of seeds, synchronizing them at the end of rendering.
self.alt_stream = cuda.Stream()
self.d_alt_seeds = None
# It's less than ideal, but we lock some memory ahead of time
self.h_infos_locked = cuda.pagelocked_empty((info_size/4,), np.float32)
if self.sync:
self.stream = self.alt_stream = None
def render(self, cen_time):
assert not self.pending, "Tried to render with results pending!"
self.pending = True
self.cen_time = cen_time
a = self.anim
cen_cp = self._cen_cp
a._interp(cen_time, cen_cp)
palette = self._interp_colors(cen_time, cen_cp)
util.BaseCode.zero_dptr(a.mod, self.d_accum, 4 * self.nbins,
self.stream)
# Ensure all main stream tasks are done before starting alt stream
if not self.sync:
self.alt_stream.wait_for_event(cuda.Event().record(self.stream))
dpal = cuda.make_multichannel_2d_array(palette, 'C')
tref = a.mod.get_texref('palTex')
tref.set_array(dpal)
tref.set_format(cuda.array_format.UNSIGNED_INT8, 4) tref.set_format(cuda.array_format.UNSIGNED_INT8, 4)
tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
tref.set_filter_mode(cuda.filter_mode.LINEAR) tref.set_filter_mode(cuda.filter_mode.LINEAR)
cp = self._dst_cp
packer = a._iter.packer
iter_fun = a.mod.get_function("iter")
#iter_fun.set_cache_config(cuda.func_cache.PREFER_L1)
# Must be accumulated over all CPs # Must be accumulated over all CPs
gam, vib = 0, 0 gam, vib = 0, 0
bkgd = np.zeros(3) bkgd = np.zeros(3)
# This is gross, but there are a lot of fiddly corner cases with any mblur_times = enumerate( np.linspace(-0.5, 0.5, cen_cp.ntemporal_samples)
# index-based iteration scheme. * cen_cp.temporal_filter_width + time )
times = list(enumerate(self._mk_dts(cen_time, cen_cp, self.ncps)))
for b, block_times in enumerate(_chunk(times, self.cps_per_block)):
on_main = b % 2 == 0
stream = self.stream if on_main else self.alt_stream
d_seeds = self.d_seeds if on_main else self.d_alt_seeds
if not d_seeds:
seeds = mwc.MWC.make_seeds(a._iter.NTHREADS *
self.cps_per_block)
if self.sync:
d_seeds = cuda.to_device(seeds)
else:
size = seeds.dtype.itemsize * seeds.size
d_seeds = cuda.mem_alloc(size)
h_seeds = cuda.pagelocked_empty(seeds.shape, seeds.dtype)
h_seeds[:] = seeds
cuda.memcpy_htod_async(d_seeds, h_seeds, stream)
if on_main:
self.d_seeds = d_seeds
else:
self.d_alt_seeds = d_seeds
for block_times in _chunk(list(mblur_times), cps_per_block):
infos = [] infos = []
if len(a.genomes) > 1: if len(self.genomes) > 1:
for n, t in block_times: for n, t in block_times:
a._interp(t, cp) self._interp(dst_cp, t)
frac = float(n) / cen_cp.ntemporal_samples frac = float(n) / cen_cp.ntemporal_samples
info = packer.pack(cp=Genome(cp), cp_step_frac=frac) info = packer.pack(cp=Genome(dst_cp), cp_step_frac=frac)
infos.append(info) infos.append(info)
gam += cp.gamma gam += dst_cp.gamma
vib += cp.vibrancy vib += dst_cp.vibrancy
bkgd += np.array(cp.background) bkgd += np.array(dst_cp.background)
else: else:
# Can't interpolate normally; just pack copies # Can't interpolate normally; just pack copies
packed = packer.pack(cp=a.genomes[0], cp_step_frac=0) packed = packer.pack(cp=self.genomes[0], cp_step_frac=0)
infos = [packed] * len(block_times) infos = [packed] * len(block_times)
gam += a.genomes[0].gamma * len(block_times) gam += self.genomes[0].gamma * len(block_times)
vib += a.genomes[0].vibrancy * len(block_times) vib += self.genomes[0].vibrancy * len(block_times)
bkgd += np.array(a.genomes[0].background) * len(block_times) bkgd += np.array(self.genomes[0].background) * len(block_times)
infos = np.concatenate(infos) infos = np.concatenate(infos)
offset = b * packer.align * self.cps_per_block h_infos[:len(infos)] = infos
d_info_off = int(self.d_infos) + offset cuda.memcpy_htod_async(d_infos, h_infos)
if self.sync:
cuda.memcpy_htod(d_info_off, infos)
else:
h_infos = self.h_infos_locked[offset/4:offset/4+len(infos)]
h_infos[:] = infos
cuda.memcpy_htod_async(d_info_off, h_infos, stream)
iter_fun(d_seeds, np.uintp(d_info_off), np.uint64(self.d_accum), if filter_done_event:
block=(32, a._iter.NTHREADS/32, 1), iter_stream.wait_for_event(filter_done_event)
# TODO: replace with an option to split long runs into shorter ones
# for interactivity
for i in range(1):
iter_fun(d_seeds, d_infos, np.uint64(d_accum),
block=(32, self._iter.NTHREADS/32, 1),
grid=(len(block_times), 1), grid=(len(block_times), 1),
texrefs=[tref], stream=stream) texrefs=[tref], stream=iter_stream)
if self.sync and self.sleep: if sync:
time.sleep(self.sleep) iter_stream.synchronize()
yield None
# Now ensure all alt stream tasks are done before continuing main if filter_done_event and not sync:
if not self.sync: filt_stream.synchronize()
self.stream.wait_for_event(cuda.Event().record(self.alt_stream)) yield last_time, self._trim(h_out)
last_time = time
util.BaseCode.zero_dptr(a.mod, self.d_out, 4 * self.nbins, util.BaseCode.zero_dptr(self.mod, d_out, 4 * nbins, filt_stream)
self.stream) self._de.invoke(self.mod, Genome(cen_cp), d_accum, d_out, filt_stream)
a._de.invoke(a.mod, Genome(cen_cp), self.d_accum, self.d_out, util.BaseCode.zero_dptr(self.mod, d_accum, 4 * nbins, filt_stream)
self.stream) filter_done_event = cuda.Event().record(filt_stream)
f = np.float32 f32 = np.float32
n = f(self.ncps) n = f32(cen_cp.ntemporal_samples)
gam = f(n / gam) gam = f32(n / gam)
vib = f(vib / n) vib = f32(vib / n)
hipow = f(cen_cp.highlight_power) hipow = f32(cen_cp.highlight_power)
lin = f(cen_cp.gam_lin_thresh) lin = f32(cen_cp.gam_lin_thresh)
lingam = f(math.pow(cen_cp.gam_lin_thresh, gam-1.0) if lin > 0 else 0) lingam = f32(math.pow(cen_cp.gam_lin_thresh, gam-1.0) if lin > 0 else 0)
bkgd = vec.make_float3(*(bkgd / n)) bkgd = vec.make_float3(*(bkgd / n))
# TODO: get block size from colorclip class? It actually does not color_fun = self.mod.get_function("colorclip")
# depend on that being the case color_fun(d_out, gam, vib, hipow, lin, lingam, bkgd,
color_fun = a.mod.get_function("colorclip") block=(256, 1, 1), grid=(nbins / 256, 1),
color_fun(self.d_out, gam, vib, hipow, lin, lingam, bkgd, stream=filt_stream)
block=(256, 1, 1), grid=(self.nbins / 256, 1), cuda.memcpy_dtoh_async(h_out, d_out, filt_stream)
stream=self.stream)
if sync:
filt_stream.synchronize()
yield time, self._trim(h_out)
# TODO: The stream seems to sync right here, automatically, before if not sync:
# returning. I think PyCUDA is forcing a sync when something drops out filt_stream.synchronize()
# of scope. Investigate. yield time, self._trim(h_out)
def _interp(self, cp, time):
    """Interpolate this animation's genome array at ``time`` into ``cp``.

    ``cp`` is passed to ``flam3_interpolate`` by reference and is
    overwritten in place; nothing is returned.
    """
    # NOTE(review): the literal 0 is presumably flam3's "stagger"
    # argument -- confirm against the flam3 headers.
    genome_array = self._g_arr
    flam3_interpolate(genome_array, len(genome_array), time, 0, byref(cp))
@staticmethod
def _pal_to_np(cp):
# Converting palettes by iteration has an enormous performance # Converting palettes by iteration has an enormous performance
# overhead. We cheat massively and dangerously here. # overhead. We cheat massively and dangerously here.
pal = cast(pointer(cp.palette), POINTER(c_double * (256 * 5))) pal = cast(pointer(cp.palette), POINTER(c_double * (256 * 5)))
val = np.frombuffer(buffer(pal.contents), count=256*5) val = np.frombuffer(buffer(pal.contents), count=256*5)
return np.uint8(np.reshape(val, (256, 5))[:,1:] * 255.0) return np.uint8(np.reshape(val, (256, 5))[:,1:] * 255.0)
def _interp_colors(self, cen_time, cen_cp): def _interp_colors(self, cp, time, twidth):
# TODO: any visible difference between uint8 and richer formats? # TODO: any visible difference between uint8 and richer formats?
pal = np.empty((self.PAL_HEIGHT, 256, 4), dtype=np.uint8) height = self.features.palette_height
a = self.anim pal = np.empty((height, 256, 4), dtype=np.uint8)
if len(a.genomes) > 1: if len(self.genomes) > 1:
# The typical case; applying real motion blur # The typical case; applying real motion blur
cp = self._dst_cp times = np.linspace(-0.5, 0.5, height) * twidth + time
times = self._mk_dts(cen_time, cen_cp, self.PAL_HEIGHT)
for n, t in enumerate(times): for n, t in enumerate(times):
a._interp(t, cp) self._interp(cp, t)
pal[n] = self._pal_to_np(cp) pal[n] = self._pal_to_np(cp)
else: else:
# Cannot call any interp functions on a single genome; rather than # Cannot call any interp functions on a single genome; rather than
# have alternate code-paths, just copy the same colors everywhere # have alternate code-paths, just copy the same colors everywhere
pal[0] = self._pal_to_np(a.genomes[0]) pal[0] = self._pal_to_np(self.genomes[0])
pal[1:] = pal[0] pal[1:] = pal[0]
return pal return pal
def done(self): def _trim(self, result):
if self.sync: g = self.features.gutter
return True return result[g:-g,g:-g].copy()
return self.stream.is_done()
def get_result(self):
if not self.sync:
self.stream.synchronize()
self.pending = False
a = self.anim
obuf_dim = (a.features.acc_height, a.features.acc_stride, 4)
out = cuda.from_device(self.d_out, obuf_dim, np.float32)
g = a.features.gutter
return self.cen_time, out[g:-g,g:-g]
@staticmethod
def _mk_dts(cen_time, cen_cp, ncps):
w = cen_cp.temporal_filter_width
return [cen_time + w * (t / (ncps - 1.0) - 0.5) for t in range(ncps)]
class Features(object): class Features(object):
""" """

12
main.py
View File

@ -15,6 +15,7 @@ import argparse
import multiprocessing import multiprocessing
from subprocess import Popen from subprocess import Popen
from ctypes import * from ctypes import *
from itertools import ifilter
import numpy as np import numpy as np
import Image import Image
@ -47,6 +48,7 @@ def save(args, time, raw):
noalpha = raw[:,:,:3] noalpha = raw[:,:,:3]
if args.raw: if args.raw:
real_stdout.write(buffer(np.uint8(noalpha * 255.0))) real_stdout.write(buffer(np.uint8(noalpha * 255.0)))
sys.stderr.write('.')
return return
name = fmt_filename(args, time) name = fmt_filename(args, time)
@ -161,7 +163,7 @@ def main(args):
def on_mouse_motion(x, y, dx, dy): def on_mouse_motion(x, y, dx, dy):
pass pass
frames = anim.render_frames(times, block=False) frames = anim.render_frames(times, sync=args.sync)
def poll(dt): def poll(dt):
out = next(frames, False) out = next(frames, False)
if out is False: if out is False:
@ -173,14 +175,20 @@ def main(args):
imgbuf = np.uint8(buf.flatten() * 255) imgbuf = np.uint8(buf.flatten() * 255)
image.set_data('RGBA', -anim.features.width*4, imgbuf.tostring()) image.set_data('RGBA', -anim.features.width*4, imgbuf.tostring())
label.text = '%s %4g' % (args.name, time) label.text = '%s %4g' % (args.name, time)
else:
label.text += '.'
if args.sleep:
time.sleep(args.sleep)
pyglet.clock.set_fps_limit(30) pyglet.clock.set_fps_limit(30)
pyglet.clock.schedule_interval(poll, 1/30.) pyglet.clock.schedule_interval(poll, 1/30.)
pyglet.app.run() pyglet.app.run()
else: else:
for time, out in anim.render_frames(times): for time, out in ifilter(None, anim.render_frames(times, sync=args.sync)):
save(args, time, out) save(args, time, out)
if args.sleep:
time.sleep(args.sleep)
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Render fractal flames.') parser = argparse.ArgumentParser(description='Render fractal flames.')