From c054c757bd598842d4a94e802f773441baba7f97 Mon Sep 17 00:00:00 2001
From: Steven Robertson <steven@strobe.cc>
Date: Sun, 22 Jan 2012 23:52:09 -0500
Subject: [PATCH] Limit the maximum number of separate xf buffers

---
 cuburn/code/iter.py | 10 ++++------
 cuburn/render.py    |  7 +++++++
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py
index 03c8bc6..7cacc81 100644
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@@ -228,9 +228,7 @@ void iter(
     float4 old_point = points[this_rb_idx];
     float x = old_point.x, y = old_point.y, color = old_point.z;
 
-{{if info.chaos_used}}
-    int last_xf_used = 0;
-{{else}}
+{{if not info.chaos_used}}
     // Shared memory size can be reduced by a factor of four using a slower
     // 4-stage reduce, but on Fermi hardware shmem use isn't a bottleneck
     __shared__ float swap[{{4*NTHREADS}}];
@@ -240,10 +238,10 @@ void iter(
     if (threadIdx.y == 0 && threadIdx.x < {{NWARPS*2}})
         cosel[threadIdx.x] = mwc_next_01(rctx);
     __syncthreads();
-    int last_xf_used = 0;
 {{endif}}
 
     bool fuse = false;
+    int last_xf_used = 0;
 
     // This condition checks for large numbers, Infs, and NaNs.
     if (!(-(fabsf(x) + fabsf(y)) > -1.0e6f)) {
@@ -358,8 +356,8 @@ void iter(
             continue;
         }
 
-        uint32_t i = (last_xf_used * acc_size.aheight + iy)
-                   * acc_size.astride + ix;
+        uint32_t ibase = (last_xf_used % {{info.max_nxf}}) * acc_size.aheight;
+        uint32_t i = (ibase + iy) * acc_size.astride + ix;
 {{if info.acc_mode == 'atomic'}}
         asm volatile ({{crep("""
 {
diff --git a/cuburn/render.py b/cuburn/render.py
index 2f0d09a..c666926 100644
--- a/cuburn/render.py
+++ b/cuburn/render.py
@@ -54,6 +54,12 @@ class Renderer(object):
     # Accumulation mode. Leave it at 'atomic' for now.
     acc_mode = 'atomic'
 
+    # At most this many separate xform accumulation buffers are allocated;
+    # xforms beyond that wrap around to the first buffers when writing. The
+    # value is compiled into the kernel, so keep it a power of two; it is not
+    # maximized at runtime. The current value of 16 fits a 1GB card at 1080p.
+    max_nxf = 16
+
     # TODO
     chaos_used = False
 
@@ -186,6 +192,7 @@ class Renderer(object):
 
         nbins = astride * aheight
         nxf = len(filter(lambda g: g != 'final', genome.xforms))
+        nxf = min(nxf, self.max_nxf)
         d_accum = cuda.mem_alloc(16 * nbins * nxf)
         d_out = cuda.mem_alloc(16 * nbins)
         if self.acc_mode == 'atomic':