diff --git a/cuburn/code/filtering.py b/cuburn/code/filtering.py
index 2ad17bf..0e4e8d9 100644
--- a/cuburn/code/filtering.py
+++ b/cuburn/code/filtering.py
@@ -241,7 +241,6 @@ void density_est(float4 *pixbuf, float4 *outbuf,
                         de_add(si, jj, -ii, scaled);
                         iif += 1;
-                        // TODO: validate that the above avoids bank conflicts
                     }
                 }
             }
 
diff --git a/cuburn/render.py b/cuburn/render.py
index d0ba434..7bba5bd 100644
--- a/cuburn/render.py
+++ b/cuburn/render.py
@@ -176,12 +176,10 @@ class Animation(object):
         """
         # Don't see this changing, but empirical tests could prove me wrong
         NRENDERERS = 2
-        # This could be shared too?
-        pool = pycuda.tools.PageLockedMemoryPool()
         # TODO: under a slightly modified sequencing, certain buffers can be
         # shared (though this may be unimportant if a good AA technique which
         # doesn't require full SS can be found)
-        rdrs = [_AnimRenderer(self, pool) for i in range(NRENDERERS)]
+        rdrs = [_AnimRenderer(self) for i in range(NRENDERERS)]
 
         # Zip up each genome with an alternating renderer, plus enough empty
         # genomes at the end to flush all pending tasks
@@ -196,8 +194,6 @@ class Animation(object):
     def _interp(self, time, cp):
         flam3_interpolate(self._g_arr, len(self._g_arr), time, 0, byref(cp))
 
-
-
 class _AnimRenderer(object):
     # Large launches lock the display for a considerable period and may be
     # killed due to a device timeout; small launches are harder to load-balance
@@ -214,9 +210,8 @@ class _AnimRenderer(object):
 
     PAL_HEIGHT = 16
 
-    def __init__(self, anim, pool):
+    def __init__(self, anim):
         self.anim = anim
-        self.pool = pool
         self.pending = False
         self.stream = cuda.Stream()
 
@@ -235,7 +230,9 @@ class _AnimRenderer(object):
         self.nbins = anim.features.acc_height * anim.features.acc_stride
         self.d_accum = cuda.mem_alloc(16 * self.nbins)
         self.d_out = cuda.mem_alloc(16 * self.nbins)
-        self.d_infos = cuda.mem_alloc(anim._iter.packer.align * self.ncps)
+
+        info_size = anim._iter.packer.align * self.ncps
+        self.d_infos = cuda.mem_alloc(info_size)
 
         # Defer generation of seeds until they're first needed
         self.d_seeds = None
@@ -286,7 +283,7 @@ class _AnimRenderer(object):
         if not d_seeds:
             seeds = mwc.MWC.make_seeds(iter.IterCode.NTHREADS *
                                        self.cps_per_block)
-            h_seeds = self.pool.allocate(seeds.shape, seeds.dtype)
+            h_seeds = cuda.pagelocked_empty(seeds.shape, seeds.dtype)
             h_seeds[:] = seeds
             size = seeds.dtype.itemsize * seeds.size
             d_seeds = cuda.mem_alloc(size)
@@ -315,7 +312,7 @@ class _AnimRenderer(object):
             bkgd += np.array(a.genomes[0].background) * len(block_times)
 
         infos = np.concatenate(infos)
-        h_infos = self.pool.allocate(infos.shape, infos.dtype)
+        h_infos = cuda.pagelocked_empty(infos.shape, infos.dtype)
         h_infos[:] = infos
         offset = b * packer.align * self.cps_per_block
         # TODO: portable across 32/64-bit arches?
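
For context on the host-buffer change in render.py: the patch stops drawing pinned buffers from a shared pycuda.tools.PageLockedMemoryPool and instead allocates them directly with pycuda.driver.pagelocked_empty(). A minimal sketch of the resulting staging pattern follows, assuming PyCUDA with a working CUDA context; the array contents here are placeholders, not cuburn's actual seed or info layout:

    import numpy as np
    import pycuda.autoinit           # creates a context on the default device
    import pycuda.driver as cuda

    # Host data to upload (placeholder values).
    seeds = np.arange(1024, dtype=np.uint32)

    # Allocate a page-locked (pinned) host buffer of matching shape/dtype,
    # stage the data into it, then copy it to a fresh device allocation.
    h_seeds = cuda.pagelocked_empty(seeds.shape, seeds.dtype)
    h_seeds[:] = seeds
    d_seeds = cuda.mem_alloc(seeds.dtype.itemsize * seeds.size)
    cuda.memcpy_htod(d_seeds, h_seeds)

Pinned host buffers are what allow asynchronous host-to-device copies on a stream, which is presumably why the staging arrays stay page-locked even after the shared pool is dropped.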