From c054c757bd598842d4a94e802f773441baba7f97 Mon Sep 17 00:00:00 2001 From: Steven Robertson Date: Sun, 22 Jan 2012 23:52:09 -0500 Subject: [PATCH] Limit the maximum number of separate xf buffers --- cuburn/code/iter.py | 10 ++++------ cuburn/render.py | 7 +++++++ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py index 03c8bc6..7cacc81 100644 --- a/cuburn/code/iter.py +++ b/cuburn/code/iter.py @@ -228,9 +228,7 @@ void iter( float4 old_point = points[this_rb_idx]; float x = old_point.x, y = old_point.y, color = old_point.z; -{{if info.chaos_used}} - int last_xf_used = 0; -{{else}} +{{if not info.chaos_used}} // Shared memory size can be reduced by a factor of four using a slower // 4-stage reduce, but on Fermi hardware shmem use isn't a bottleneck __shared__ float swap[{{4*NTHREADS}}]; @@ -240,10 +238,10 @@ void iter( if (threadIdx.y == 0 && threadIdx.x < {{NWARPS*2}}) cosel[threadIdx.x] = mwc_next_01(rctx); __syncthreads(); - int last_xf_used = 0; {{endif}} bool fuse = false; + int last_xf_used = 0; // This condition checks for large numbers, Infs, and NaNs. if (!(-(fabsf(x) + fabsf(y)) > -1.0e6f)) { @@ -358,8 +356,8 @@ void iter( continue; } - uint32_t i = (last_xf_used * acc_size.aheight + iy) - * acc_size.astride + ix; + uint32_t ibase = (last_xf_used % {{info.max_nxf}}) * acc_size.aheight; + uint32_t i = (ibase + iy) * acc_size.astride + ix; {{if info.acc_mode == 'atomic'}} asm volatile ({{crep(""" { diff --git a/cuburn/render.py b/cuburn/render.py index 2f0d09a..c666926 100644 --- a/cuburn/render.py +++ b/cuburn/render.py @@ -54,6 +54,12 @@ class Renderer(object): # Accumulation mode. Leave it at 'atomic' for now. acc_mode = 'atomic' + # At most this many separate accumulation buffers for xforms will be + # allocated; xforms past that limit wrap around (xform index modulo + # max_nxf) and share the earlier buffers when writing. The value is compiled + # in, so keep it a power of two; it is not maximized at runtime. Current value of 16 fits into a 1GB card at 1080p.
+ max_nxf = 16 + # TODO chaos_used = False @@ -186,6 +192,7 @@ class Renderer(object): nbins = astride * aheight nxf = len(filter(lambda g: g != 'final', genome.xforms)) + nxf = min(nxf, self.max_nxf) d_accum = cuda.mem_alloc(16 * nbins * nxf) d_out = cuda.mem_alloc(16 * nbins) if self.acc_mode == 'atomic':