mirror of
				https://github.com/stevenrobertson/cuburn.git
				synced 2025-11-04 02:10:45 -05:00 
			
		
		
		
	Limit the maximum number of separate xf buffers
This commit is contained in:
		@ -228,9 +228,7 @@ void iter(
 | 
				
			|||||||
    float4 old_point = points[this_rb_idx];
 | 
					    float4 old_point = points[this_rb_idx];
 | 
				
			||||||
    float x = old_point.x, y = old_point.y, color = old_point.z;
 | 
					    float x = old_point.x, y = old_point.y, color = old_point.z;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
{{if info.chaos_used}}
 | 
					{{if not info.chaos_used}}
 | 
				
			||||||
    int last_xf_used = 0;
 | 
					 | 
				
			||||||
{{else}}
 | 
					 | 
				
			||||||
    // Shared memory size can be reduced by a factor of four using a slower
 | 
					    // Shared memory size can be reduced by a factor of four using a slower
 | 
				
			||||||
    // 4-stage reduce, but on Fermi hardware shmem use isn't a bottleneck
 | 
					    // 4-stage reduce, but on Fermi hardware shmem use isn't a bottleneck
 | 
				
			||||||
    __shared__ float swap[{{4*NTHREADS}}];
 | 
					    __shared__ float swap[{{4*NTHREADS}}];
 | 
				
			||||||
@ -240,10 +238,10 @@ void iter(
 | 
				
			|||||||
    if (threadIdx.y == 0 && threadIdx.x < {{NWARPS*2}})
 | 
					    if (threadIdx.y == 0 && threadIdx.x < {{NWARPS*2}})
 | 
				
			||||||
        cosel[threadIdx.x] = mwc_next_01(rctx);
 | 
					        cosel[threadIdx.x] = mwc_next_01(rctx);
 | 
				
			||||||
    __syncthreads();
 | 
					    __syncthreads();
 | 
				
			||||||
    int last_xf_used = 0;
 | 
					 | 
				
			||||||
{{endif}}
 | 
					{{endif}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    bool fuse = false;
 | 
					    bool fuse = false;
 | 
				
			||||||
 | 
					    int last_xf_used = 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // This condition checks for large numbers, Infs, and NaNs.
 | 
					    // This condition checks for large numbers, Infs, and NaNs.
 | 
				
			||||||
    if (!(-(fabsf(x) + fabsf(y)) > -1.0e6f)) {
 | 
					    if (!(-(fabsf(x) + fabsf(y)) > -1.0e6f)) {
 | 
				
			||||||
@ -358,8 +356,8 @@ void iter(
 | 
				
			|||||||
            continue;
 | 
					            continue;
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        uint32_t i = (last_xf_used * acc_size.aheight + iy)
 | 
					        uint32_t ibase = (last_xf_used % {{info.max_nxf}}) * acc_size.aheight;
 | 
				
			||||||
                   * acc_size.astride + ix;
 | 
					        uint32_t i = (ibase + iy) * acc_size.astride + ix;
 | 
				
			||||||
{{if info.acc_mode == 'atomic'}}
 | 
					{{if info.acc_mode == 'atomic'}}
 | 
				
			||||||
        asm volatile ({{crep("""
 | 
					        asm volatile ({{crep("""
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
 | 
				
			|||||||
@ -54,6 +54,12 @@ class Renderer(object):
 | 
				
			|||||||
    # Accumulation mode. Leave it at 'atomic' for now.
 | 
					    # Accumulation mode. Leave it at 'atomic' for now.
 | 
				
			||||||
    acc_mode = 'atomic'
 | 
					    acc_mode = 'atomic'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # At most this many separate buffers for xforms will be allocated, after
 | 
				
			||||||
 | 
					    # which further xforms will wrap to the first when writing. Currently it
 | 
				
			||||||
 | 
					    # is compiled in, so power-of-two and no runtime maximization. Current
 | 
				
			||||||
 | 
					    # value of 16 fits into a 1GB card at 1080p.
 | 
				
			||||||
 | 
					    max_nxf = 16
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # TODO
 | 
					    # TODO
 | 
				
			||||||
    chaos_used = False
 | 
					    chaos_used = False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -186,6 +192,7 @@ class Renderer(object):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        nbins = astride * aheight
 | 
					        nbins = astride * aheight
 | 
				
			||||||
        nxf = len(filter(lambda g: g != 'final', genome.xforms))
 | 
					        nxf = len(filter(lambda g: g != 'final', genome.xforms))
 | 
				
			||||||
 | 
					        nxf = min(nxf, self.max_nxf)
 | 
				
			||||||
        d_accum = cuda.mem_alloc(16 * nbins * nxf)
 | 
					        d_accum = cuda.mem_alloc(16 * nbins * nxf)
 | 
				
			||||||
        d_out = cuda.mem_alloc(16 * nbins)
 | 
					        d_out = cuda.mem_alloc(16 * nbins)
 | 
				
			||||||
        if self.acc_mode == 'atomic':
 | 
					        if self.acc_mode == 'atomic':
 | 
				
			||||||
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user