Refactor host rendering code for better load

Steven Robertson 2011-10-15 22:22:43 -04:00
parent 8e99c9c463
commit 9bafbda81a
2 changed files with 157 additions and 214 deletions


@@ -107,7 +107,19 @@ class Animation(object):
In other words, it's best to use exactly one Animation for each
interpolated sequence between one or two genomes.
"""
# Large launches lock the display for a considerable period and may be
# killed due to a device timeout; small launches are harder to load-balance
# on the GPU and incur overhead. This empirical value is multiplied by the
# number of SMs on the device to determine how many blocks should be in
# each launch. Extremely high quality, high resolution renders may still
# encounter a device timeout, requiring the user to increase the split
# amount. This factor is not used in async mode.
SM_FACTOR = 8
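# For illustration only (hypothetical device figures): a GPU reporting 14
# SMs gives cps_per_block = 14 * 8 = 112 in sync mode, so a frame with 450
# temporal samples would be issued as ceil(450 / 112.0) = 5 kernel launches.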
cmp_options = ('-use_fast_math', '-maxrregcount', '42')
keep = False
def __init__(self, ctypes_genome_array):
@@ -170,7 +182,8 @@ class Animation(object):
self.compile()
self.mod = cuda.module_from_buffer(self.cubin, jit_options)
def render_frames(self, times=None, block=True):
def render_frames(self, times=None, sync=False):
"""
Render a flame for each genome in the iterable value 'genomes'.
Returns a Python generator object which will yield a 2-tuple of
@@ -192,260 +205,182 @@ class Animation(object):
``times`` is a sequence of center times at which to render, or ``None``
to render one frame for each genome used to create the animation.
``block`` will cause this thread to spin, waiting for the GPU to
finish the current task. Otherwise, this generator will yield ``None``
until the GPU is finished, for filtering later.
If ``sync`` is True, the generator synchronizes with the GPU after every
block of temporal samples and yields ``None`` until the frame is ready.
This allows a single-card system to avoid going thirty seconds between
window refreshes while rendering. Otherwise, tasks are piled
asynchronously onto the card so that it is always under load.
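A minimal usage sketch for sync mode; ``anim`` is an ``Animation``
instance and ``display`` is a hypothetical stand-in for whatever
consumes frames, not part of this module:

    for result in anim.render_frames(times, sync=True):
        if result is None:
            display.poll()          # GPU still busy; keep the UI responsive
            continue
        time, frame = result        # one finished frame per center time
        display.show(time, frame)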
"""
f = self.features
times = times if times is not None else [cp.time for cp in self.genomes]
iter_stream = cuda.Stream()
filt_stream = cuda.Stream()
cen_cp = pyflam3.Genome()
dst_cp = pyflam3.Genome()
if block:
rdr = _AnimRenderer(self)
for t in times:
rdr.render(t)
yield rdr.get_result()
nbins = f.acc_height * f.acc_stride
d_accum = cuda.mem_alloc(16 * nbins)
d_out = cuda.mem_alloc(16 * nbins)
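# Note: 16 bytes per bin = four float32 channels per accumulation bin,
# matching the (acc_height, acc_stride, 4) float32 host buffer below.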
num_sm = cuda.Context.get_device().multiprocessor_count
if sync:
cps_per_block = num_sm * self.SM_FACTOR
else:
# TODO: share buffers.
rdrs = [_AnimRenderer(self) for i in range(2)]
cps_per_block = f.max_cps
# Pair each center time with an alternating renderer, plus two None
# entries at the end to flush all pending tasks
exttimes = times[:] + [None, None]
for rdr, t in izip(cycle(rdrs), exttimes):
if rdr.pending:
while not rdr.done():
yield None
yield rdr.get_result()
if t is not None:
rdr.render(t)
info_size = self._iter.packer.align * cps_per_block
d_infos = cuda.mem_alloc(info_size)
d_palmem = cuda.mem_alloc(256 * f.palette_height * 4)
def _interp(self, time, cp):
flam3_interpolate(self._g_arr, len(self._g_arr), time, 0, byref(cp))
seeds = mwc.MWC.make_seeds(self._iter.NTHREADS * cps_per_block)
d_seeds = cuda.to_device(seeds)
class _AnimRenderer(object):
# Large launches lock the display for a considerable period and may be
# killed due to a device timeout; small launches are harder to load-balance
# on the GPU and incur overhead. This empirical value is multiplied by the
# number of SMs on the device to determine how many blocks should be in
# each launch. Extremely high quality, high resolution renders may still
# encounter a device timeout, and no workaround is in place for that yet.
SM_FACTOR = 8
h_infos = cuda.pagelocked_empty((info_size / 4,), np.float32)
h_palmem = cuda.pagelocked_empty(
(f.palette_height, 256, 4), np.uint8)
h_out = cuda.pagelocked_empty((f.acc_height, f.acc_stride, 4), np.float32)
# Currently, palette interpolation is done independently of animation
# interpolation, so that the process is not biased and so we only need to
# mess about with one texture per renderer. This many palette steps are
# always used, regardless of the number of temporal samples.
PAL_HEIGHT = 16
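# For example, a frame rendered with 1000 temporal samples still gets only
# a 16-row palette texture; the texture's linear filter mode interpolates
# between those rows when the kernel samples it.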
filter_done_event = None
# Use synchronous launches
sync = False
# Delay this long between iterations (only active when sync is True)
sleep = None
def __init__(self, anim):
self.anim = anim
self.pending = False
self.cen_time = None
self.stream = cuda.Stream()
self._nsms = cuda.Context.get_device().multiprocessor_count
self.cps_per_block = self._nsms * self.SM_FACTOR
self.ncps = anim.features.max_cps
self.nblocks = int(math.ceil(self.ncps / float(self.cps_per_block)))
# These are stored to avoid leaks, not to be stateful in method calls
self._dst_cp = pyflam3.Genome()
memset(byref(self._dst_cp), 0, sizeof(self._dst_cp))
self._cen_cp = pyflam3.Genome()
memset(byref(self._cen_cp), 0, sizeof(self._cen_cp))
self.nbins = anim.features.acc_height * anim.features.acc_stride
self.d_accum = cuda.mem_alloc(16 * self.nbins)
self.d_out = cuda.mem_alloc(16 * self.nbins)
info_size = anim._iter.packer.align * self.ncps
self.d_infos = cuda.mem_alloc(info_size)
# Defer generation of seeds until they're first needed
self.d_seeds = None
# During the main rendering loop, we alternate between two streams and
# two sets of seeds, synchronizing them at the end of rendering.
self.alt_stream = cuda.Stream()
self.d_alt_seeds = None
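# Sketch of the alternation below: even-numbered blocks of temporal samples
# run on self.stream with self.d_seeds, odd-numbered blocks on
# self.alt_stream with self.d_alt_seeds, so host-side packing and copies
# for one block can overlap device work from the previous one.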
# It's less than ideal, but we lock some memory ahead of time
self.h_infos_locked = cuda.pagelocked_empty((info_size/4,), np.float32)
if self.sync:
self.stream = self.alt_stream = None
def render(self, cen_time):
assert not self.pending, "Tried to render with results pending!"
self.pending = True
self.cen_time = cen_time
a = self.anim
cen_cp = self._cen_cp
a._interp(cen_time, cen_cp)
palette = self._interp_colors(cen_time, cen_cp)
util.BaseCode.zero_dptr(a.mod, self.d_accum, 4 * self.nbins,
self.stream)
# Ensure all main stream tasks are done before starting alt stream
if not self.sync:
self.alt_stream.wait_for_event(cuda.Event().record(self.stream))
dpal = cuda.make_multichannel_2d_array(palette, 'C')
tref = a.mod.get_texref('palTex')
tref.set_array(dpal)
tref.set_format(cuda.array_format.UNSIGNED_INT8, 4)
tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
tref.set_filter_mode(cuda.filter_mode.LINEAR)
cp = self._dst_cp
packer = a._iter.packer
iter_fun = a.mod.get_function("iter")
packer = self._iter.packer
iter_fun = self.mod.get_function("iter")
#iter_fun.set_cache_config(cuda.func_cache.PREFER_L1)
# Must be accumulated over all CPs
gam, vib = 0, 0
bkgd = np.zeros(3)
util.BaseCode.zero_dptr(self.mod, d_accum, 4 * nbins, filt_stream)
# This is gross, but there are a lot of fiddly corner cases with any
# index-based iteration scheme.
times = list(enumerate(self._mk_dts(cen_time, cen_cp, self.ncps)))
for b, block_times in enumerate(_chunk(times, self.cps_per_block)):
on_main = b % 2 == 0
stream = self.stream if on_main else self.alt_stream
d_seeds = self.d_seeds if on_main else self.d_alt_seeds
last_time = times[0]
if not d_seeds:
seeds = mwc.MWC.make_seeds(a._iter.NTHREADS *
self.cps_per_block)
if self.sync:
d_seeds = cuda.to_device(seeds)
for time in times:
self._interp(cen_cp, time)
h_palmem[:] = self._interp_colors(dst_cp, time,
cen_cp.temporal_filter_width)
cuda.memcpy_htod_async(d_palmem, h_palmem, iter_stream)
tref = self.mod.get_texref('palTex')
array_info = cuda.ArrayDescriptor()
array_info.height = f.palette_height
array_info.width = 256
array_info.array_format = cuda.array_format.UNSIGNED_INT8
array_info.num_channels = 4
tref.set_address_2d(d_palmem, array_info, 1024)
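# The pitch of 1024 bytes is 256 texels * 4 uint8 channels per palette row.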
tref.set_format(cuda.array_format.UNSIGNED_INT8, 4)
tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
tref.set_filter_mode(cuda.filter_mode.LINEAR)
# Must be accumulated over all CPs
gam, vib = 0, 0
bkgd = np.zeros(3)
mblur_times = enumerate( np.linspace(-0.5, 0.5, cen_cp.ntemporal_samples)
* cen_cp.temporal_filter_width + time )
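# Worked example (hypothetical values): with 3 temporal samples, a filter
# width of 0.5 and time = 1.0, the sample times are [0.75, 1.0, 1.25],
# i.e. evenly spread across one filter width centered on the frame time.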
for block_times in _chunk(list(mblur_times), cps_per_block):
infos = []
if len(self.genomes) > 1:
for n, t in block_times:
self._interp(dst_cp, t)
frac = float(n) / cen_cp.ntemporal_samples
info = packer.pack(cp=Genome(dst_cp), cp_step_frac=frac)
infos.append(info)
gam += dst_cp.gamma
vib += dst_cp.vibrancy
bkgd += np.array(dst_cp.background)
else:
size = seeds.dtype.itemsize * seeds.size
d_seeds = cuda.mem_alloc(size)
h_seeds = cuda.pagelocked_empty(seeds.shape, seeds.dtype)
h_seeds[:] = seeds
cuda.memcpy_htod_async(d_seeds, h_seeds, stream)
if on_main:
self.d_seeds = d_seeds
else:
self.d_alt_seeds = d_seeds
# Can't interpolate normally; just pack copies
packed = packer.pack(cp=self.genomes[0], cp_step_frac=0)
infos = [packed] * len(block_times)
gam += self.genomes[0].gamma * len(block_times)
vib += self.genomes[0].vibrancy * len(block_times)
bkgd += np.array(self.genomes[0].background) * len(block_times)
infos = []
if len(a.genomes) > 1:
for n, t in block_times:
a._interp(t, cp)
frac = float(n) / cen_cp.ntemporal_samples
info = packer.pack(cp=Genome(cp), cp_step_frac=frac)
infos.append(info)
gam += cp.gamma
vib += cp.vibrancy
bkgd += np.array(cp.background)
else:
# Can't interpolate normally; just pack copies
packed = packer.pack(cp=a.genomes[0], cp_step_frac=0)
infos = [packed] * len(block_times)
gam += a.genomes[0].gamma * len(block_times)
vib += a.genomes[0].vibrancy * len(block_times)
bkgd += np.array(a.genomes[0].background) * len(block_times)
infos = np.concatenate(infos)
h_infos[:len(infos)] = infos
cuda.memcpy_htod_async(d_infos, h_infos)
infos = np.concatenate(infos)
offset = b * packer.align * self.cps_per_block
d_info_off = int(self.d_infos) + offset
if self.sync:
cuda.memcpy_htod(d_info_off, infos)
else:
h_infos = self.h_infos_locked[offset/4:offset/4+len(infos)]
h_infos[:] = infos
cuda.memcpy_htod_async(d_info_off, h_infos, stream)
if filter_done_event:
iter_stream.wait_for_event(filter_done_event)
iter_fun(d_seeds, np.uintp(d_info_off), np.uint64(self.d_accum),
block=(32, a._iter.NTHREADS/32, 1),
grid=(len(block_times), 1),
texrefs=[tref], stream=stream)
# TODO: replace with an option to split long runs into shorter ones
# for interactivity
for i in range(1):
iter_fun(d_seeds, d_infos, np.uint64(d_accum),
block=(32, self._iter.NTHREADS/32, 1),
grid=(len(block_times), 1),
texrefs=[tref], stream=iter_stream)
if self.sync and self.sleep:
time.sleep(self.sleep)
if sync:
iter_stream.synchronize()
yield None
# Now ensure all alt stream tasks are done before continuing main
if not self.sync:
self.stream.wait_for_event(cuda.Event().record(self.alt_stream))
if filter_done_event and not sync:
filt_stream.synchronize()
yield last_time, self._trim(h_out)
last_time = time
util.BaseCode.zero_dptr(a.mod, self.d_out, 4 * self.nbins,
self.stream)
a._de.invoke(a.mod, Genome(cen_cp), self.d_accum, self.d_out,
self.stream)
util.BaseCode.zero_dptr(self.mod, d_out, 4 * nbins, filt_stream)
self._de.invoke(self.mod, Genome(cen_cp), d_accum, d_out, filt_stream)
util.BaseCode.zero_dptr(self.mod, d_accum, 4 * nbins, filt_stream)
filter_done_event = cuda.Event().record(filt_stream)
f = np.float32
n = f(self.ncps)
gam = f(n / gam)
vib = f(vib / n)
hipow = f(cen_cp.highlight_power)
lin = f(cen_cp.gam_lin_thresh)
lingam = f(math.pow(cen_cp.gam_lin_thresh, gam-1.0) if lin > 0 else 0)
bkgd = vec.make_float3(*(bkgd / n))
f32 = np.float32
n = f32(cen_cp.ntemporal_samples)
gam = f32(n / gam)
vib = f32(vib / n)
hipow = f32(cen_cp.highlight_power)
lin = f32(cen_cp.gam_lin_thresh)
lingam = f32(math.pow(cen_cp.gam_lin_thresh, gam-1.0) if lin > 0 else 0)
bkgd = vec.make_float3(*(bkgd / n))
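# At this point gam holds the sum of per-CP gamma values, so n / gam is the
# reciprocal of their mean; vib / n and bkgd / n are plain means.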
# TODO: get block size from colorclip class? It actually does not
# depend on that being the case
color_fun = a.mod.get_function("colorclip")
color_fun(self.d_out, gam, vib, hipow, lin, lingam, bkgd,
block=(256, 1, 1), grid=(self.nbins / 256, 1),
stream=self.stream)
color_fun = self.mod.get_function("colorclip")
color_fun(d_out, gam, vib, hipow, lin, lingam, bkgd,
block=(256, 1, 1), grid=(nbins / 256, 1),
stream=filt_stream)
cuda.memcpy_dtoh_async(h_out, d_out, filt_stream)
if sync:
filt_stream.synchronize()
yield time, self._trim(h_out)
# TODO: The stream seems to sync right here, automatically, before
# returning. I think PyCUDA is forcing a sync when something drops out
# of scope. Investigate.
if not sync:
filt_stream.synchronize()
yield time, self._trim(h_out)
def _pal_to_np(self, cp):
def _interp(self, cp, time):
flam3_interpolate(self._g_arr, len(self._g_arr), time, 0, byref(cp))
@staticmethod
def _pal_to_np(cp):
# Converting palettes by iteration has an enormous performance
# overhead. We cheat massively and dangerously here.
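# Each palette entry is laid out as five doubles (an index value followed
# by four color channels in [0, 1]); viewing the whole palette as a flat
# (256, 5) buffer lets us drop the index column and rescale to uint8 in
# one vectorized pass.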
pal = cast(pointer(cp.palette), POINTER(c_double * (256 * 5)))
val = np.frombuffer(buffer(pal.contents), count=256*5)
return np.uint8(np.reshape(val, (256, 5))[:,1:] * 255.0)
def _interp_colors(self, cen_time, cen_cp):
def _interp_colors(self, cp, time, twidth):
# TODO: any visible difference between uint8 and richer formats?
pal = np.empty((self.PAL_HEIGHT, 256, 4), dtype=np.uint8)
a = self.anim
height = self.features.palette_height
pal = np.empty((height, 256, 4), dtype=np.uint8)
if len(a.genomes) > 1:
if len(self.genomes) > 1:
# The typical case; applying real motion blur
cp = self._dst_cp
times = self._mk_dts(cen_time, cen_cp, self.PAL_HEIGHT)
times = np.linspace(-0.5, 0.5, height) * twidth + time
for n, t in enumerate(times):
a._interp(t, cp)
self._interp(cp, t)
pal[n] = self._pal_to_np(cp)
else:
# Cannot call any interp functions on a single genome; rather than
# have alternate code-paths, just copy the same colors everywhere
pal[0] = self._pal_to_np(a.genomes[0])
pal[0] = self._pal_to_np(self.genomes[0])
pal[1:] = pal[0]
return pal
def done(self):
if self.sync:
return True
return self.stream.is_done()
def _trim(self, result):
g = self.features.gutter
return result[g:-g,g:-g].copy()
def get_result(self):
if not self.sync:
self.stream.synchronize()
self.pending = False
a = self.anim
obuf_dim = (a.features.acc_height, a.features.acc_stride, 4)
out = cuda.from_device(self.d_out, obuf_dim, np.float32)
g = a.features.gutter
return self.cen_time, out[g:-g,g:-g]
@staticmethod
def _mk_dts(cen_time, cen_cp, ncps):
w = cen_cp.temporal_filter_width
return [cen_time + w * (t / (ncps - 1.0) - 0.5) for t in range(ncps)]
class Features(object):
"""

main.py (12 changed lines)

@@ -15,6 +15,7 @@ import argparse
import multiprocessing
from subprocess import Popen
from ctypes import *
from itertools import ifilter
import numpy as np
import Image
@@ -47,6 +48,7 @@ def save(args, time, raw):
noalpha = raw[:,:,:3]
if args.raw:
real_stdout.write(buffer(np.uint8(noalpha * 255.0)))
sys.stderr.write('.')
return
name = fmt_filename(args, time)
@@ -161,7 +163,7 @@ def main(args):
def on_mouse_motion(x, y, dx, dy):
pass
frames = anim.render_frames(times, block=False)
frames = anim.render_frames(times, sync=args.sync)
def poll(dt):
out = next(frames, False)
if out is False:
@@ -173,14 +175,20 @@ def main(args):
imgbuf = np.uint8(buf.flatten() * 255)
image.set_data('RGBA', -anim.features.width*4, imgbuf.tostring())
label.text = '%s %4g' % (args.name, time)
else:
label.text += '.'
if args.sleep:
time.sleep(args.sleep)
pyglet.clock.set_fps_limit(30)
pyglet.clock.schedule_interval(poll, 1/30.)
pyglet.app.run()
else:
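# In sync mode the generator yields None while the GPU is still busy;
# ifilter(None, ...) drops those placeholders so only finished
# (time, frame) pairs reach save().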
for time, out in anim.render_frames(times):
for time, out in ifilter(None, anim.render_frames(times, sync=args.sync)):
save(args, time, out)
if args.sleep:
time.sleep(args.sleep)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Render fractal flames.')