Fix missing control points in async version.
The allocation pool was recycling each frame's host buffer as soon as it left the current scope, before its contents had been copied to the device, so the next frame's data could overwrite the buffer while the async copy was still pending. Without the pool we simply allocate fresh page-locked chunks of the same size each time. I don't think this has any real performance impact, but that should be verified.
commit 9b03f557c2
parent b081bc9378
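For context on the race this commit fixes: with PyCUDA's PageLockedMemoryPool, a staging buffer is returned to the pool as soon as its last Python reference dies, so the next allocate() can hand back the very same chunk while a previous memcpy_htod_async from it is still in flight. A minimal sketch of the two patterns (the upload_broken/upload_fixed helpers are hypothetical illustrations; only the PyCUDA calls appear in the actual diff):

import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.tools

stream = cuda.Stream()
pool = pycuda.tools.PageLockedMemoryPool()

def upload_broken(data, d_buf):
    # Broken: h_buf goes back to the pool the moment it leaves scope,
    # so the next frame's allocate() can reuse and overwrite the chunk
    # while this async copy is still pending.
    h_buf = pool.allocate(data.shape, data.dtype)
    h_buf[:] = data
    cuda.memcpy_htod_async(d_buf, h_buf, stream)

def upload_fixed(data, d_buf):
    # This commit's approach: a fresh page-locked buffer each time,
    # returned so the caller keeps it alive until the stream syncs.
    h_buf = cuda.pagelocked_empty(data.shape, data.dtype)
    h_buf[:] = data
    cuda.memcpy_htod_async(d_buf, h_buf, stream)
    return h_buf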
@@ -241,7 +241,6 @@ void density_est(float4 *pixbuf, float4 *outbuf,
                 de_add(si, jj, -ii, scaled);
 
                 iif += 1;
-                // TODO: validate that the above avoids bank conflicts
             }
         }
     }
@@ -176,12 +176,10 @@ class Animation(object):
         """
         # Don't see this changing, but empirical tests could prove me wrong
         NRENDERERS = 2
-        # This could be shared too?
-        pool = pycuda.tools.PageLockedMemoryPool()
         # TODO: under a slightly modified sequencing, certain buffers can be
         # shared (though this may be unimportant if a good AA technique which
         # doesn't require full SS can be found)
-        rdrs = [_AnimRenderer(self, pool) for i in range(NRENDERERS)]
+        rdrs = [_AnimRenderer(self) for i in range(NRENDERERS)]
 
         # Zip up each genome with an alternating renderer, plus enough empty
         # genomes at the end to flush all pending tasks
@@ -196,8 +194,6 @@ class Animation(object):
     def _interp(self, time, cp):
         flam3_interpolate(self._g_arr, len(self._g_arr), time, 0, byref(cp))
 
-
-
 class _AnimRenderer(object):
     # Large launches lock the display for a considerable period and may be
     # killed due to a device timeout; small launches are harder to load-balance
@@ -214,9 +210,8 @@ class _AnimRenderer(object):
     PAL_HEIGHT = 16
 
 
-    def __init__(self, anim, pool):
+    def __init__(self, anim):
         self.anim = anim
-        self.pool = pool
         self.pending = False
         self.stream = cuda.Stream()
 
@@ -235,7 +230,9 @@ class _AnimRenderer(object):
         self.nbins = anim.features.acc_height * anim.features.acc_stride
         self.d_accum = cuda.mem_alloc(16 * self.nbins)
         self.d_out = cuda.mem_alloc(16 * self.nbins)
-        self.d_infos = cuda.mem_alloc(anim._iter.packer.align * self.ncps)
+
+        info_size = anim._iter.packer.align * self.ncps
+        self.d_infos = cuda.mem_alloc(info_size)
         # Defer generation of seeds until they're first needed
         self.d_seeds = None
 
@@ -286,7 +283,7 @@ class _AnimRenderer(object):
         if not d_seeds:
             seeds = mwc.MWC.make_seeds(iter.IterCode.NTHREADS *
                                        self.cps_per_block)
-            h_seeds = self.pool.allocate(seeds.shape, seeds.dtype)
+            h_seeds = cuda.pagelocked_empty(seeds.shape, seeds.dtype)
             h_seeds[:] = seeds
             size = seeds.dtype.itemsize * seeds.size
             d_seeds = cuda.mem_alloc(size)
@@ -315,7 +312,7 @@ class _AnimRenderer(object):
             bkgd += np.array(a.genomes[0].background) * len(block_times)
 
         infos = np.concatenate(infos)
-        h_infos = self.pool.allocate(infos.shape, infos.dtype)
+        h_infos = cuda.pagelocked_empty(infos.shape, infos.dtype)
         h_infos[:] = infos
         offset = b * packer.align * self.cps_per_block
         # TODO: portable across 32/64-bit arches?
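As a usage note, asynchronous host-to-device copies only behave correctly when the source is page-locked and stays referenced until the stream has consumed it. A minimal end-to-end sketch of the fixed pattern (infos here is a stand-in array, not the actual packer output from the diff):

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda

stream = cuda.Stream()
infos = np.zeros(1024, dtype=np.float32)  # stand-in for the packed control-point data

d_infos = cuda.mem_alloc(infos.nbytes)
h_infos = cuda.pagelocked_empty(infos.shape, infos.dtype)
h_infos[:] = infos
cuda.memcpy_htod_async(d_infos, h_infos, stream)
# ... kernels reading d_infos would be launched on `stream` here ...
stream.synchronize()  # h_infos must stay referenced until this point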