Mirror of https://github.com/stevenrobertson/cuburn.git (synced 2025-02-05 11:40:04 -05:00)
Actually asynchronous rendering.
This change didn't affect GPU performance at all, but it did improve CPU startup time, and should also improve overall render time for long-running animations.
parent 8c7e86c7c7
commit 095936666e
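
Note (not part of the commit): the diff below switches cuburn to the standard PyCUDA dual-stream idiom — two streams, a page-locked memory pool so host-to-device copies can be queued asynchronously, and events to order work across streams. The following is a minimal standalone sketch of that pattern under assumed names; the `scale` kernel, buffer names, and sizes are illustrative only and do not appear in cuburn.

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.tools
from pycuda.compiler import SourceModule

# Trivial kernel used only to have something to queue on each stream.
mod = SourceModule("""
__global__ void scale(float *buf, float k)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    buf[i] *= k;
}
""")
scale = mod.get_function("scale")

pool = pycuda.tools.PageLockedMemoryPool()
main_stream, alt_stream = cuda.Stream(), cuda.Stream()

n = 1 << 20
data = np.random.rand(n).astype(np.float32)

# Stage the upload in page-locked memory so memcpy_htod_async can be queued
# on a stream without blocking the host.
h_buf = pool.allocate(data.shape, data.dtype)
h_buf[:] = data
d_buf = cuda.mem_alloc(h_buf.nbytes)
cuda.memcpy_htod_async(d_buf, h_buf, main_stream)
scale(d_buf, np.float32(2.0), block=(256, 1, 1), grid=(n // 256, 1),
      stream=main_stream)

# Make the alternate stream wait for everything queued on the main stream so
# far, without a host-side synchronize.
alt_stream.wait_for_event(cuda.Event().record(main_stream))
scale(d_buf, np.float32(0.5), block=(256, 1, 1), grid=(n // 256, 1),
      stream=alt_stream)

# Join back: the main stream proceeds only after the alt stream's work is done.
main_stream.wait_for_event(cuda.Event().record(alt_stream))
cuda.memcpy_dtoh_async(h_buf, d_buf, main_stream)
main_stream.synchronize()
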
@@ -14,6 +14,7 @@ from fr0stlib.pyflam3.constants import *
 
 import pycuda.compiler
 import pycuda.driver as cuda
+import pycuda.tools
 from pycuda.gpuarray import vec
 
 from cuburn import affine
@@ -175,10 +176,12 @@ class Animation(object):
         """
         # Don't see this changing, but empirical tests could prove me wrong
         NRENDERERS = 2
+        # This could be shared too?
+        pool = pycuda.tools.PageLockedMemoryPool()
         # TODO: under a slightly modified sequencing, certain buffers can be
         # shared (though this may be unimportant if a good AA technique which
         # doesn't require full SS can be found)
-        rdrs = [_AnimRenderer(self) for i in range(NRENDERERS)]
+        rdrs = [_AnimRenderer(self, pool) for i in range(NRENDERERS)]
 
         # Zip up each genome with an alternating renderer, plus enough empty
         # genomes at the end to flush all pending tasks
@@ -211,8 +214,9 @@ class _AnimRenderer(object):
     PAL_HEIGHT = 16
 
 
-    def __init__(self, anim):
+    def __init__(self, anim, pool):
         self.anim = anim
+        self.pool = pool
         self.pending = False
         self.stream = cuda.Stream()
 
@@ -232,8 +236,13 @@ class _AnimRenderer(object):
         self.d_accum = cuda.mem_alloc(16 * self.nbins)
         self.d_out = cuda.mem_alloc(16 * self.nbins)
         self.d_infos = cuda.mem_alloc(anim._iter.packer.align * self.ncps)
-        # Defer allocation until first needed
-        self.d_seeds = [None] * self.nblocks
+        # Defer generation of seeds until they're first needed
+        self.d_seeds = None
+
+        # During the main rendering loop, we alternate between two streams and
+        # two sets of seeds, synchronizing them at the end of rendering.
+        self.alt_stream = cuda.Stream()
+        self.d_alt_seeds = None
 
     def render(self, cen_time):
         assert not self.pending, "Tried to render with results pending!"
@@ -246,13 +255,9 @@ class _AnimRenderer(object):
 
         util.BaseCode.zero_dptr(a.mod, self.d_accum, 4 * self.nbins,
                                 self.stream)
+        # Ensure all main stream tasks are done before starting alt stream
+        self.alt_stream.wait_for_event(cuda.Event().record(self.stream))
 
-        # ------------------------------------------------------------
-        # TODO WARNING TODO WARNING TODO WARNING TODO WARNING TODO
-        # This will replace the palette while it's in use by the other
-        # rendering function. Need to pass palette texref in function
-        # invocation.
-        # ------------------------------------------------------------
         dpal = cuda.make_multichannel_2d_array(palette, 'C')
         tref = a.mod.get_texref('palTex')
         tref.set_array(dpal)
@@ -274,6 +279,23 @@ class _AnimRenderer(object):
         # index-based iteration scheme.
         times = list(enumerate(self._mk_dts(cen_time, cen_cp, self.ncps)))
         for b, block_times in enumerate(_chunk(times, self.cps_per_block)):
+            on_main = b % 2 == 0
+            stream = self.stream if on_main else self.alt_stream
+            d_seeds = self.d_seeds if on_main else self.d_alt_seeds
+
+            if not d_seeds:
+                seeds = mwc.MWC.make_seeds(iter.IterCode.NTHREADS *
+                                           self.cps_per_block)
+                h_seeds = self.pool.allocate(seeds.shape, seeds.dtype)
+                h_seeds[:] = seeds
+                size = seeds.dtype.itemsize * seeds.size
+                d_seeds = cuda.mem_alloc(size)
+                cuda.memcpy_htod_async(d_seeds, h_seeds, stream)
+                if on_main:
+                    self.d_seeds = d_seeds
+                else:
+                    self.d_alt_seeds = d_seeds
+
             infos = []
             if len(a.genomes) > 1:
                 for n, t in block_times:
@@ -286,8 +308,6 @@ class _AnimRenderer(object):
                     bkgd += np.array(cp.background)
             else:
                 # Can't interpolate normally; just pack copies
-                # TODO: this still packs the genome 20 times or so instead of
-                # once
                 packed = packer.pack(cp=a.genomes[0], cp_step_frac=0)
                 infos = [packed] * len(block_times)
                 gam += a.genomes[0].gamma * len(block_times)
@@ -295,32 +315,21 @@ class _AnimRenderer(object):
                 bkgd += np.array(a.genomes[0].background) * len(block_times)
 
             infos = np.concatenate(infos)
+            h_infos = self.pool.allocate(infos.shape, infos.dtype)
+            h_infos[:] = infos
             offset = b * packer.align * self.cps_per_block
             # TODO: portable across 32/64-bit arches?
             d_info_off = int(self.d_infos) + offset
-            cuda.memcpy_htod(d_info_off, infos)
+            cuda.memcpy_htod_async(d_info_off, h_infos, stream)
 
-            if not self.d_seeds[b]:
-                seeds = mwc.MWC.make_seeds(iter.IterCode.NTHREADS *
-                                           self.cps_per_block)
-                self.d_seeds[b] = cuda.to_device(seeds)
-
             # TODO: get block config from IterCode
             # TODO: print timing information
-            iter_fun(self.d_seeds[b], np.uint64(d_info_off),
-                     self.d_accum, texrefs=[tref],
+            iter_fun(d_seeds, np.uint64(d_info_off), self.d_accum,
                      block=(32, 16, 1), grid=(len(block_times), 1),
-                     stream=self.stream)
+                     texrefs=[tref], stream=stream)
 
-        # MAJOR TODO: for now, we kill almost all parallelism by forcing the
-        # stream here. Later, once we've decided on a density-buffer prefilter,
-        # we will move it to the GPU, allowing it to be embedded in the stream
-        # and letting the remaining code be asynchronous.
-        #self.stream.synchronize()
-        #dbuf_dim = (a.features.acc_height, a.features.acc_stride)
-        #dbuf = cuda.from_device(self.d_den, dbuf_dim, np.float32)
-        #dbuf = ndimage.filters.gaussian_filter(dbuf, 0.6)
-        #cuda.memcpy_htod(self.d_den, dbuf)
+        # Now ensure all alt stream tasks are done before continuing main
+        self.stream.wait_for_event(cuda.Event().record(self.alt_stream))
 
         util.BaseCode.zero_dptr(a.mod, self.d_out, 4 * self.nbins,
                                 self.stream)
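
A note on the staging buffers the diff adds (commentary, not part of the commit): cuda.memcpy_htod_async is only truly asynchronous when the host side of the copy is page-locked, which is why the seeds and packed control-point data are first copied into arrays obtained from the PageLockedMemoryPool instead of being handed to the copy directly. A sketch of that staging step, with `arr`, `pool`, and `stream` as placeholder names:

# Stage a pageable numpy array through a page-locked buffer so the
# host-to-device copy can be queued on a stream without blocking the host.
h_buf = pool.allocate(arr.shape, arr.dtype)   # pinned host memory
h_buf[:] = arr                                # CPU-side copy into pinned buffer
d_buf = cuda.mem_alloc(arr.nbytes)            # device destination
cuda.memcpy_htod_async(d_buf, h_buf, stream)  # queued; returns immediately
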