Fix missing control points in async version.
The allocation pool was recycling each frame's host buffer as soon as it left the current scope, before its contents had been copied to the device, so the next frame's data could overwrite the buffer while the async copy was still pending. Without the pool we simply allocate fresh page-locked chunks of the same size each time. I don't think this has any real performance impact, but that should be verified.
commit 9b03f557c2
parent b081bc9378
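For context on the race this commit fixes: with PyCUDA's PageLockedMemoryPool, a staging buffer is returned to the pool as soon as its last Python reference dies, so the next allocate() can hand back the very same chunk while a previous memcpy_htod_async from it is still in flight. A minimal sketch of the two patterns (the upload_broken/upload_fixed helpers are hypothetical illustrations; only the PyCUDA calls appear in the actual diff):

import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.tools

stream = cuda.Stream()
pool = pycuda.tools.PageLockedMemoryPool()

def upload_broken(data, d_buf):
    # Broken: h_buf goes back to the pool the moment it leaves scope,
    # so the next frame's allocate() can reuse and overwrite the chunk
    # while this async copy is still pending.
    h_buf = pool.allocate(data.shape, data.dtype)
    h_buf[:] = data
    cuda.memcpy_htod_async(d_buf, h_buf, stream)

def upload_fixed(data, d_buf):
    # This commit's approach: a fresh page-locked buffer each time,
    # returned so the caller keeps it alive until the stream syncs.
    h_buf = cuda.pagelocked_empty(data.shape, data.dtype)
    h_buf[:] = data
    cuda.memcpy_htod_async(d_buf, h_buf, stream)
    return h_buf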
@@ -241,7 +241,6 @@ void density_est(float4 *pixbuf, float4 *outbuf,
                 de_add(si, jj, -ii, scaled);
 
                 iif += 1;
-                // TODO: validate that the above avoids bank conflicts
             }
         }
     }
@@ -176,12 +176,10 @@ class Animation(object):
         """
         # Don't see this changing, but empirical tests could prove me wrong
         NRENDERERS = 2
-        # This could be shared too?
-        pool = pycuda.tools.PageLockedMemoryPool()
         # TODO: under a slightly modified sequencing, certain buffers can be
         # shared (though this may be unimportant if a good AA technique which
         # doesn't require full SS can be found)
-        rdrs = [_AnimRenderer(self, pool) for i in range(NRENDERERS)]
+        rdrs = [_AnimRenderer(self) for i in range(NRENDERERS)]
 
         # Zip up each genome with an alternating renderer, plus enough empty
         # genomes at the end to flush all pending tasks
@@ -196,8 +194,6 @@ class Animation(object):
     def _interp(self, time, cp):
         flam3_interpolate(self._g_arr, len(self._g_arr), time, 0, byref(cp))
 
-
-
 class _AnimRenderer(object):
     # Large launches lock the display for a considerable period and may be
     # killed due to a device timeout; small launches are harder to load-balance
@@ -214,9 +210,8 @@ class _AnimRenderer(object):
     PAL_HEIGHT = 16
 
 
-    def __init__(self, anim, pool):
+    def __init__(self, anim):
         self.anim = anim
-        self.pool = pool
         self.pending = False
         self.stream = cuda.Stream()
 
@@ -235,7 +230,9 @@ class _AnimRenderer(object):
         self.nbins = anim.features.acc_height * anim.features.acc_stride
         self.d_accum = cuda.mem_alloc(16 * self.nbins)
         self.d_out = cuda.mem_alloc(16 * self.nbins)
-        self.d_infos = cuda.mem_alloc(anim._iter.packer.align * self.ncps)
+
+        info_size = anim._iter.packer.align * self.ncps
+        self.d_infos = cuda.mem_alloc(info_size)
         # Defer generation of seeds until they're first needed
         self.d_seeds = None
 
@@ -286,7 +283,7 @@ class _AnimRenderer(object):
         if not d_seeds:
             seeds = mwc.MWC.make_seeds(iter.IterCode.NTHREADS *
                                        self.cps_per_block)
-            h_seeds = self.pool.allocate(seeds.shape, seeds.dtype)
+            h_seeds = cuda.pagelocked_empty(seeds.shape, seeds.dtype)
             h_seeds[:] = seeds
             size = seeds.dtype.itemsize * seeds.size
             d_seeds = cuda.mem_alloc(size)
@@ -315,7 +312,7 @@ class _AnimRenderer(object):
             bkgd += np.array(a.genomes[0].background) * len(block_times)
 
         infos = np.concatenate(infos)
-        h_infos = self.pool.allocate(infos.shape, infos.dtype)
+        h_infos = cuda.pagelocked_empty(infos.shape, infos.dtype)
         h_infos[:] = infos
         offset = b * packer.align * self.cps_per_block
         # TODO: portable across 32/64-bit arches?
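As a usage note, asynchronous host-to-device copies only behave correctly when the source is page-locked and stays referenced until the stream has consumed it. A minimal end-to-end sketch of the fixed pattern (infos here is a stand-in array, not the actual packer output from the diff):

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda

stream = cuda.Stream()
infos = np.zeros(1024, dtype=np.float32)  # stand-in for the packed control-point data

d_infos = cuda.mem_alloc(infos.nbytes)
h_infos = cuda.pagelocked_empty(infos.shape, infos.dtype)
h_infos[:] = infos
cuda.memcpy_htod_async(d_infos, h_infos, stream)
# ... kernels reading d_infos would be launched on `stream` here ...
stream.synchronize()  # h_infos must stay referenced until this point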