mirror of https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 11:40:04 -05:00
Improve asynchrony; improve palette interp perf.
commit 81f61d4d5d
parent 7b9bb165ac
@@ -237,7 +237,6 @@ class _AnimRenderer(object):
         self.nblocks = int(math.ceil(self.ncps / float(self.cps_per_block)))
 
         # These are stored to avoid leaks, not to be stateful in method calls
-        # TODO: ensure proper cleanup is done
         self._dst_cp = pyflam3.Genome()
         memset(byref(self._dst_cp), 0, sizeof(self._dst_cp))
         self._cen_cp = pyflam3.Genome()
@@ -257,6 +256,9 @@ class _AnimRenderer(object):
         self.alt_stream = cuda.Stream()
         self.d_alt_seeds = None
 
+        # It's less than ideal, but we lock some memory ahead of time
+        self.h_infos_locked = cuda.pagelocked_empty((info_size/4,), np.float32)
+
     def render(self, cen_time):
         assert not self.pending, "Tried to render with results pending!"
         self.pending = True
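The pre-locked buffer above is the asynchrony half of this commit: a CUDA
host-to-device copy can only overlap with kernel execution when the host
source is pinned (page-locked) memory, and pinning pages is itself an
expensive call, so it is hoisted out of the per-frame path and done once at
setup. A minimal sketch of that pattern using stock PyCUDA (buffer names and
sizes here are illustrative, not cuburn's):

    import numpy as np
    import pycuda.autoinit
    import pycuda.driver as cuda

    stream = cuda.Stream()
    # Allocate pinned host memory once, up front; this call is slow.
    h_buf = cuda.pagelocked_empty((1024,), np.float32)
    d_buf = cuda.mem_alloc(h_buf.nbytes)

    h_buf[:] = np.arange(1024, dtype=np.float32)   # repack per frame
    cuda.memcpy_htod_async(d_buf, h_buf, stream)   # returns immediately
    # ... kernels enqueued on `stream` can overlap with the copy ...
    stream.synchronize()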
@@ -328,15 +330,14 @@ class _AnimRenderer(object):
             bkgd += np.array(a.genomes[0].background) * len(block_times)
 
             infos = np.concatenate(infos)
-            h_infos = cuda.pagelocked_empty(infos.shape, infos.dtype)
-            h_infos[:] = infos
             offset = b * packer.align * self.cps_per_block
+            h_infos = self.h_infos_locked[offset/4:offset/4+len(infos)]
+            h_infos[:] = infos
             # TODO: portable across 32/64-bit arches?
             d_info_off = int(self.d_infos) + offset
             cuda.memcpy_htod_async(d_info_off, h_infos, stream)
 
             # TODO: get block config from IterCode
-            # TODO: print timing information
             iter_fun(d_seeds, np.uint64(d_info_off), self.d_accum,
                     block=(32, 16, 1), grid=(len(block_times), 1),
                     texrefs=[tref], stream=stream)
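Note that the new code slices the pre-locked buffer instead of allocating: a
NumPy slice of a page-locked array is a view into the same pinned allocation,
so it can still feed memcpy_htod_async without a staging copy or a fresh
pagelocked_empty per block. A sketch of that slicing pattern, reusing h_buf,
d_buf, and stream from the sketch above (block size and count are made up):

    floats_per_block = 256
    for b in range(4):
        off = b * floats_per_block
        view = h_buf[off:off + floats_per_block]   # still pinned; no copy
        view[:] = b                                # pack this block in place
        # Destination is a raw integer address, as in the diff above;
        # off is in floats, so scale by 4 to get a byte offset.
        cuda.memcpy_htod_async(int(d_buf) + off * 4, view, stream)
    stream.synchronize()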
@@ -365,6 +366,18 @@ class _AnimRenderer(object):
                 block=(256, 1, 1), grid=(self.nbins / 256, 1),
                 stream=self.stream)
 
+
+        # TODO: The stream seems to sync right here, automatically, before
+        # returning. I think PyCUDA is forcing a sync when something drops out
+        # of scope. Investigate.
+
+    def _pal_to_np(self, cp):
+        # Converting palettes by iteration has an enormous performance
+        # overhead. We cheat massively and dangerously here.
+        pal = cast(pointer(cp.palette), POINTER(c_double * (256 * 5)))
+        val = np.frombuffer(buffer(pal.contents), count=256*5)
+        return np.uint8(np.reshape(val, (256, 5))[:,1:] * 255.0)
+
     def _interp_colors(self, cen_time, cen_cp):
         # TODO: any visible difference between uint8 and richer formats?
         pal = np.empty((self.PAL_HEIGHT, 256, 4), dtype=np.uint8)
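The new _pal_to_np is the palette-interp half of the commit: it reads the
whole ctypes palette through one pointer cast and one frombuffer instead of
touching 256 x 5 Python-level fields. A self-contained sketch of the same
trick; the Entry layout (one index double plus four color doubles) is an
assumption inferred from the 256 * 5 cast, and it is written for Python 3,
where the Python 2 buffer() call is unnecessary:

    import numpy as np
    from ctypes import POINTER, Structure, c_double, cast, pointer

    class Entry(Structure):
        # Hypothetical layout matching the 256 * 5 doubles the cast implies.
        _fields_ = [('index', c_double), ('color', c_double * 4)]

    palette = (Entry * 256)()
    for i, e in enumerate(palette):
        e.index = i
        e.color[:] = (0.25, 0.5, 0.75, 1.0)

    # One cast + one frombuffer: a zero-copy view of all 1280 doubles.
    flat = cast(pointer(palette), POINTER(c_double * (256 * 5)))
    val = np.frombuffer(flat.contents, dtype=np.float64)
    rgba = np.uint8(val.reshape(256, 5)[:, 1:] * 255.0)  # drop index column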
@@ -376,14 +389,11 @@ class _AnimRenderer(object):
             times = self._mk_dts(cen_time, cen_cp, self.PAL_HEIGHT)
             for n, t in enumerate(times):
                 a._interp(t, cp)
-                for i, e in enumerate(cp.palette.entries):
-                    pal[n][i] = np.uint8(np.array(e.color) * 255.0)
+                pal[n] = self._pal_to_np(cp)
         else:
             # Cannot call any interp functions on a single genome; rather than
             # have alternate code-paths, just copy the same colors everywhere
-            for i, e in enumerate(a.genomes[0].palette.entries):
-                # TODO: This triggers a RuntimeWarning
-                pal[0][i] = np.uint8(np.array(e.color) * 255.0)
+            pal[0] = self._pal_to_np(a.genomes[0])
         pal[1:] = pal[0]
         return pal
 
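With this change each palette row is filled by a single vectorized call
instead of 256 per-entry conversions, and the loop flagged as triggering a
RuntimeWarning disappears with it. A rough way to compare the two approaches
on the host, reusing palette and the imports from the previous sketch
(timings will vary by machine):

    import timeit

    def per_entry():
        out = np.empty((256, 4), dtype=np.uint8)
        for i, e in enumerate(palette):
            out[i] = np.uint8(np.array(e.color) * 255.0)
        return out

    def vectorized():
        flat = cast(pointer(palette), POINTER(c_double * (256 * 5)))
        val = np.frombuffer(flat.contents, dtype=np.float64)
        return np.uint8(val.reshape(256, 5)[:, 1:] * 255.0)

    print(timeit.timeit(per_entry, number=1000))
    print(timeit.timeit(vectorized, number=1000))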