mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 11:40:04 -05:00
Limit the maximum number of separate xf buffers
This commit is contained in:
parent
45b75d3fa5
commit
c054c757bd
@ -228,9 +228,7 @@ void iter(
|
|||||||
float4 old_point = points[this_rb_idx];
|
float4 old_point = points[this_rb_idx];
|
||||||
float x = old_point.x, y = old_point.y, color = old_point.z;
|
float x = old_point.x, y = old_point.y, color = old_point.z;
|
||||||
|
|
||||||
{{if info.chaos_used}}
|
{{if not info.chaos_used}}
|
||||||
int last_xf_used = 0;
|
|
||||||
{{else}}
|
|
||||||
// Shared memory size can be reduced by a factor of four using a slower
|
// Shared memory size can be reduced by a factor of four using a slower
|
||||||
// 4-stage reduce, but on Fermi hardware shmem use isn't a bottleneck
|
// 4-stage reduce, but on Fermi hardware shmem use isn't a bottleneck
|
||||||
__shared__ float swap[{{4*NTHREADS}}];
|
__shared__ float swap[{{4*NTHREADS}}];
|
||||||
@ -240,10 +238,10 @@ void iter(
|
|||||||
if (threadIdx.y == 0 && threadIdx.x < {{NWARPS*2}})
|
if (threadIdx.y == 0 && threadIdx.x < {{NWARPS*2}})
|
||||||
cosel[threadIdx.x] = mwc_next_01(rctx);
|
cosel[threadIdx.x] = mwc_next_01(rctx);
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
int last_xf_used = 0;
|
|
||||||
{{endif}}
|
{{endif}}
|
||||||
|
|
||||||
bool fuse = false;
|
bool fuse = false;
|
||||||
|
int last_xf_used = 0;
|
||||||
|
|
||||||
// This condition checks for large numbers, Infs, and NaNs.
|
// This condition checks for large numbers, Infs, and NaNs.
|
||||||
if (!(-(fabsf(x) + fabsf(y)) > -1.0e6f)) {
|
if (!(-(fabsf(x) + fabsf(y)) > -1.0e6f)) {
|
||||||
@ -358,8 +356,8 @@ void iter(
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t i = (last_xf_used * acc_size.aheight + iy)
|
uint32_t ibase = (last_xf_used % {{info.max_nxf}}) * acc_size.aheight;
|
||||||
* acc_size.astride + ix;
|
uint32_t i = (ibase + iy) * acc_size.astride + ix;
|
||||||
{{if info.acc_mode == 'atomic'}}
|
{{if info.acc_mode == 'atomic'}}
|
||||||
asm volatile ({{crep("""
|
asm volatile ({{crep("""
|
||||||
{
|
{
|
||||||
|
@ -54,6 +54,12 @@ class Renderer(object):
|
|||||||
# Accumulation mode. Leave it at 'atomic' for now.
|
# Accumulation mode. Leave it at 'atomic' for now.
|
||||||
acc_mode = 'atomic'
|
acc_mode = 'atomic'
|
||||||
|
|
||||||
|
# At most this many separate buffers for xforms will be allocated, after
|
||||||
|
# which further xforms will wrap to the first when writing. Currently it
|
||||||
|
# is compiled in, so power-of-two and no runtime maximization. Current
|
||||||
|
# value of 16 fits into a 1GB card at 1080p.
|
||||||
|
max_nxf = 16
|
||||||
|
|
||||||
# TODO
|
# TODO
|
||||||
chaos_used = False
|
chaos_used = False
|
||||||
|
|
||||||
@ -186,6 +192,7 @@ class Renderer(object):
|
|||||||
|
|
||||||
nbins = astride * aheight
|
nbins = astride * aheight
|
||||||
nxf = len(filter(lambda g: g != 'final', genome.xforms))
|
nxf = len(filter(lambda g: g != 'final', genome.xforms))
|
||||||
|
nxf = min(nxf, self.max_nxf)
|
||||||
d_accum = cuda.mem_alloc(16 * nbins * nxf)
|
d_accum = cuda.mem_alloc(16 * nbins * nxf)
|
||||||
d_out = cuda.mem_alloc(16 * nbins)
|
d_out = cuda.mem_alloc(16 * nbins)
|
||||||
if self.acc_mode == 'atomic':
|
if self.acc_mode == 'atomic':
|
||||||
|
Loading…
Reference in New Issue
Block a user