Remove SS from DE, and improve performance.

2025-07-05 15:55:14 -04:00 · 2012-01-20 09:35:12 -05:00
parent e1914a9c87
commit 1398706886
3 changed files with 83 additions and 67 deletions
--- a/cuburn/render.py
+++ b/cuburn/render.py
@ -20,7 +20,7 @@ from cuburn import affine
 from cuburn.code import util, mwc, iter, interp, filtering, sort

 RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time')
-Dimensions = namedtuple('Dimensions', 'w h aw ah astride ss')
+Dimensions = namedtuple('Dimensions', 'w h aw ah astride')

 def _sync_stream(dst, src):
    dst.wait_for_event(cuda.Event(cuda.event_flags.DISABLE_TIMING).record(src))
@ -54,7 +54,7 @@ class Renderer(object):
    # Maximum width of DE and other spatial filters, and thus in turn the
    # amount of padding applied. Note that, for now, this must not be changed!
    # The filtering code makes deep assumptions about this value.
-    gutter = 15
+    gutter = 10

    # Accumulation mode. Leave it at 'atomic' for now.
    acc_mode = 'atomic'
@ -138,7 +138,7 @@ class Renderer(object):
        next(r)
        return ifilter(None, imap(r.send, chain(times, [None])))

-    def render_gen(self, genome, width, height, ss=1, blend=True):
+    def render_gen(self, genome, width, height, blend=True):
        """
        Render frames. This method is wrapped by the ``render()`` method; see
        its docstring for warnings and details.
@ -182,25 +182,24 @@ class Renderer(object):
        event_a = cuda.Event().record(filt_stream)
        event_b = None

-        owidth = width + 2 * self.gutter
-        oheight = height + 2 * self.gutter
-        ostride = 32 * int(np.ceil(owidth / 32.))
-        awidth, aheight, astride = owidth * ss, oheight * ss, ostride * ss
-        dim = Dimensions(width, height, awidth, aheight, astride, ss)
+        awidth = width + 2 * self.gutter
+        aheight = 32 * int(np.ceil((height + 2 * self.gutter) / 32.))
+        astride = 32 * int(np.ceil(awidth / 32.))
+        dim = Dimensions(width, height, awidth, aheight, astride)
        d_acc_size = self.mod.get_global('acc_size')[0]
        cuda.memcpy_htod_async(d_acc_size, u32(list(dim)), write_stream)

-        nbins = awidth * aheight
+        nbins = astride * aheight
        # Extra padding in accum helps with write_shmem overruns
        d_accum = cuda.mem_alloc(16 * nbins + (1<<16))
-        d_out = cuda.mem_alloc(16 * oheight * ostride)
+        d_out = cuda.mem_alloc(16 * aheight * astride)
        if self.acc_mode == 'atomic':
            d_atom = cuda.mem_alloc(8 * nbins)
            flush_fun = self.mod.get_function("flush_atom")

        obuf_copy = argset(cuda.Memcpy2D(),
            src_y=self.gutter, src_x_in_bytes=16*self.gutter,
-            src_pitch=16*ostride, dst_pitch=16*width,
+            src_pitch=16*astride, dst_pitch=16*width,
            width_in_bytes=16*width, height=height)
        obuf_copy.set_src_device(d_out)
        h_out_a = cuda.pagelocked_empty((height, width, 4), f32)
@ -344,10 +343,8 @@ class Renderer(object):
                              block=(512, 1, 1), grid=(nblocks, nblocks),
                              stream=iter_stream)

+            util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
            _sync_stream(filt_stream, write_stream)
-            filt.blur_density(d_accum, d_out, dim, stream=filt_stream)
-            util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins / ss ** 2,
-                                    filt_stream)
            filt.de(d_out, d_accum, genome, dim, tc, stream=filt_stream)
            _sync_stream(write_stream, filt_stream)
            filt.colorclip(d_out, genome, dim, tc, blend, stream=filt_stream)