mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-07-12 03:05:14 -04:00
Speed enhancement: alpha packing.
When the alpha channel is used in a color palette, the code now replaces the blue channel in the accumulation buffer with a pair of U16s, which encode the values of the blue and alpha channels as fractions of the density value. When the alpha channel is always 1.0, the blue channel works as normal. Density is now always the last element in the accumulation buffer. Eliminating the separate IO operations improved total runtime by more than 30% on my card; the extra calculations reduce that improvement to 20% when alpha is present (though that can be optimized further).
This commit is contained in:
@ -229,7 +229,6 @@ class _AnimRenderer(object):
|
||||
memset(byref(self._cen_cp), 0, sizeof(self._cen_cp))
|
||||
|
||||
self.nbins = anim.features.acc_height * anim.features.acc_stride
|
||||
self.d_den = cuda.mem_alloc(4 * self.nbins)
|
||||
self.d_accum = cuda.mem_alloc(16 * self.nbins)
|
||||
self.d_out = cuda.mem_alloc(16 * self.nbins)
|
||||
self.d_infos = cuda.mem_alloc(anim._iter.packer.align * self.ncps)
|
||||
@ -245,8 +244,6 @@ class _AnimRenderer(object):
|
||||
a._interp(cen_time, cen_cp)
|
||||
palette = self._interp_colors(cen_time, cen_cp)
|
||||
|
||||
util.BaseCode.zero_dptr(a.mod, self.d_den, self.nbins,
|
||||
self.stream)
|
||||
util.BaseCode.zero_dptr(a.mod, self.d_accum, 4 * self.nbins,
|
||||
self.stream)
|
||||
|
||||
@ -311,7 +308,7 @@ class _AnimRenderer(object):
|
||||
# TODO: get block config from IterCode
|
||||
# TODO: print timing information
|
||||
iter_fun(self.d_seeds[b], np.uint64(d_info_off),
|
||||
self.d_accum, self.d_den, texrefs=[tref],
|
||||
self.d_accum, texrefs=[tref],
|
||||
block=(32, 16, 1), grid=(len(block_times), 1),
|
||||
stream=self.stream)
|
||||
|
||||
@ -327,8 +324,7 @@ class _AnimRenderer(object):
|
||||
|
||||
util.BaseCode.zero_dptr(a.mod, self.d_out, 4 * self.nbins,
|
||||
self.stream)
|
||||
a._de.invoke(a.mod, Genome(cen_cp),
|
||||
self.d_accum, self.d_out, self.d_den,
|
||||
a._de.invoke(a.mod, Genome(cen_cp), self.d_accum, self.d_out,
|
||||
self.stream)
|
||||
|
||||
f = np.float32
|
||||
@ -381,11 +377,6 @@ class _AnimRenderer(object):
|
||||
g = a.features.gutter
|
||||
obuf_dim = (a.features.acc_height, a.features.acc_stride, 4)
|
||||
out = cuda.from_device(self.d_out, obuf_dim, np.float32)
|
||||
#dacc = cuda.from_device(self.d_accum, obuf_dim, np.float32)
|
||||
#daccw = dacc[:,:,3]
|
||||
#print daccw.sum()
|
||||
# TODO: performance?
|
||||
g = a.features.gutter
|
||||
out = np.delete(out, np.s_[:g], axis=0)
|
||||
out = np.delete(out, np.s_[:g], axis=1)
|
||||
out = np.delete(out, np.s_[-g:], axis=0)
|
||||
@ -447,6 +438,10 @@ class Features(object):
|
||||
else:
|
||||
self.final_xform_index = None
|
||||
|
||||
alphas = np.array([c.color[3] for g in genomes
|
||||
for c in g.palette.entries])
|
||||
self.pal_has_alpha = np.any(alphas != 1.0)
|
||||
|
||||
self.max_cps = max([cp.ntemporal_samples for cp in genomes])
|
||||
|
||||
self.width = genomes[0].width
|
||||
|
Reference in New Issue
Block a user