diff --git a/cuburn/code/filters.py b/cuburn/code/filters.py index 16d4733..4e6eb3d 100644 --- a/cuburn/code/filters.py +++ b/cuburn/code/filters.py @@ -71,6 +71,7 @@ denblurlib = devlib(deps=[texshearlib], decls=''' texture chan4_src; texture chan1_src; +// Call the Python function set_blur_width() to override these defaults. __constant__ float gauss_coefs[7] = { 0.00443305f, 0.05400558f, 0.24203623f, 0.39905028f, 0.24203623f, 0.05400558f, 0.00443305f diff --git a/cuburn/filters.py b/cuburn/filters.py index 8bf1d17..93c7d91 100644 --- a/cuburn/filters.py +++ b/cuburn/filters.py @@ -8,6 +8,13 @@ from pycuda.gpuarray import vec import code.filters from code.util import ClsMod, argset, launch2 +def set_blur_width(mod, pool, stdev=1, stream=None): + coefs = pool.allocate((7,), f32) + coefs[:] = np.exp(np.float32(np.arange(-3, 4))**2/(-2*stdev**2)) + coefs /= np.sum(coefs) + ptr, size = mod.get_global('gauss_coefs') + cuda.memcpy_htod_async(ptr, coefs, stream) + def mktref(mod, n): tref = mod.get_texref(n) tref.set_filter_mode(cuda.filter_mode.POINT) @@ -44,6 +51,7 @@ class Bilateral(Filter, ClsMod): tref = mktref(self.mod, 'chan4_src') grad_dsc = mkdsc(dim, 1) grad_tref = mktref(self.mod, 'chan1_src') + set_blur_width(self.mod, fb.pool, stream=stream) for pattern in range(self.directions): # Scale spatial parameter so that a "pixel" is equivalent to an @@ -89,6 +97,7 @@ class HaloClip(Filter, ClsMod): dsc = mkdsc(dim, 1) tref = mktref(self.mod, 'chan1_src') + set_blur_width(self.mod, fb.pool, stream=stream) launch2('apply_gamma', self.mod, stream, dim, fb.d_side, fb.d_front, gam) tref.set_address_2d(fb.d_side, dsc, 4 * params.astride) @@ -104,7 +113,6 @@ class HaloClip(Filter, ClsMod): class ColorClip(Filter, ClsMod): lib = code.filters.colorcliplib def apply(self, fb, gprof, params, dim, tc, stream=None): - # TODO: implement integration over cubic splines? gam = f32(1 / params.gamma(tc)) vib = f32(params.vibrance(tc)) hipow = f32(params.highlight_power(tc)) diff --git a/cuburn/render.py b/cuburn/render.py index 723fb25..b272a8e 100644 --- a/cuburn/render.py +++ b/cuburn/render.py @@ -73,6 +73,7 @@ class Framebuffers(object): def __init__(self): self.stream = cuda.Stream() + self.pool = pycuda.tools.PageLockedMemoryPool() self._clear() # These resources rely on the slots/ringbuffer mechanism for sharing, @@ -218,8 +219,6 @@ class RenderManager(ClsMod): def __init__(self): super(RenderManager, self).__init__() - self.pool = pycuda.tools.PageLockedMemoryPool() - self.fb = Framebuffers() self.src_a, self.src_b = DevSrc(), DevSrc() self.info_a, self.info_b = DevInfo(), DevInfo() @@ -234,15 +233,15 @@ class RenderManager(ClsMod): Note that for now, this is broken! It ignores ``gnm``, and only packs the genome that was used when creating the renderer. """ - times, knots = rdr.packer.pack(gnm, self.pool) + times, knots = rdr.packer.pack(gnm, self.fb.pool) cuda.memcpy_htod_async(self.src_a.d_times, times, self.stream_a) cuda.memcpy_htod_async(self.src_a.d_knots, knots, self.stream_a) palsrc = dict([(v[0], palette_decode(v[1:])) for v in gnm['palette']]) ptimes, pvals = zip(*sorted(palsrc.items())) - palettes = self.pool.allocate((len(palsrc), 256, 4), f32) + palettes = self.fb.pool.allocate((len(palsrc), 256, 4), f32) palettes[:] = pvals - palette_times = self.pool.allocate((self.src_a.max_knots,), f32) + palette_times = self.fb.pool.allocate((self.src_a.max_knots,), f32) palette_times.fill(1e9) palette_times[:len(ptimes)] = ptimes cuda.memcpy_htod_async(self.src_a.d_pals, palettes, self.stream_a) @@ -253,7 +252,7 @@ class RenderManager(ClsMod): def _interp(self, rdr, gnm, dim, ts, td): d_acc_size = rdr.mod.get_global('acc_size')[0] - p_dim = self.pool.allocate((len(dim),), u32) + p_dim = self.fb.pool.allocate((len(dim),), u32) p_dim[:] = dim cuda.memcpy_htod_async(d_acc_size, p_dim, self.stream_a) @@ -363,7 +362,7 @@ class RenderManager(ClsMod): filt.apply(self.fb, gprof, params, dim, tc, self.stream_a) rdr.out.convert(self.fb, gprof, dim, self.stream_a) self.filt_evt = cuda.Event().record(self.stream_a) - h_out = rdr.out.copy(self.fb, dim, self.pool, self.stream_a) + h_out = rdr.out.copy(self.fb, dim, self.fb.pool, self.stream_a) self.copy_evt = cuda.Event().record(self.stream_a) self.info_a, self.info_b = self.info_b, self.info_a