diff --git a/cuburn/code/filtering.py b/cuburn/code/filtering.py index f1280fe..9a15379 100644 --- a/cuburn/code/filtering.py +++ b/cuburn/code/filtering.py @@ -1,15 +1,18 @@ +import numpy as np +import pycuda.compiler +from pycuda.gpuarray import vec + from cuburn.code.util import * -class ColorClip(HunkOCode): - def __init__(self, info): - self.defs = self.defs_tmpl.substitute(info=info) +_CODE = ''' +#include - defs_tmpl = Template(''' __global__ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow, - float linrange, float lingam, float3 bkgd, int fbsize) { - int i = gtid(); + float linrange, float lingam, float3 bkgd, int fbsize, + int alpha_output_channel) { + int i = threadIdx.x + blockDim.x * (blockIdx.x + gridDim.x * blockIdx.y); if (i >= fbsize) return; float4 pix = pixbuf[i]; @@ -61,16 +64,16 @@ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow, pix.y += (1.0f - vibrancy) * powf(opix.y, gamma); pix.z += (1.0f - vibrancy) * powf(opix.z, gamma); - {{if info.alpha_output_channel}} - float 1_alpha = 1 / alpha; - pix.x *= 1_alpha; - pix.y *= 1_alpha; - pix.z *= 1_alpha; - {{else}} - pix.x += (1.0f - alpha) * bkgd.x; - pix.y += (1.0f - alpha) * bkgd.y; - pix.z += (1.0f - alpha) * bkgd.z; - {{endif}} + if (alpha_output_channel) { + float one_alpha = 1.0f / alpha; + pix.x *= one_alpha; + pix.y *= one_alpha; + pix.z *= one_alpha; + } else { + pix.x += (1.0f - alpha) * bkgd.x; + pix.y += (1.0f - alpha) * bkgd.y; + pix.z += (1.0f - alpha) * bkgd.z; + } pix.w = alpha; // Clamp values. I think this is superfluous, but I'm not certain. @@ -80,24 +83,8 @@ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow, pixbuf[i] = pix; } -''') -class DensityEst(HunkOCode): - """ - NOTE: for now, this *must* be invoked with a block size of (32,32,1), and - a grid size of (W/32,1). At least 21 pixel gutters are required, and the - stride and height probably need to be multiples of 32. - """ - def __init__(self, info): - self.info = info - - headers = "#include\n" - @property - def defs(self): - return self.defs_tmpl.substitute(info=self.info) - - defs_tmpl = Template(''' #define W 21 // Filter width (regardless of standard deviation chosen) #define W2 10 // Half of filter width, rounded down #define FW 52 // Width of local result storage (NW+W2+W2) @@ -116,9 +103,7 @@ __device__ void de_add(int ibase, int ii, int jj, float4 scaled) { __global__ void logscale(float4 *pixbuf, float4 *outbuf, float k1, float k2) { int i = blockDim.x * blockIdx.x + threadIdx.x; - float den; float4 pix = pixbuf[i]; - read_pix(pix, den); float ls = fmaxf(0, k1 * logf(1.0f + pix.w * k2) / pix.w); pix.x *= ls; @@ -137,21 +122,19 @@ void logscale(float4 *pixbuf, float4 *outbuf, float k1, float k2) { __global__ void density_est(float4 *pixbuf, float4 *outbuf, float est_sd, float neg_est_curve, float est_min, - float k1, float k2) { + float k1, float k2, int height, int stride) { for (int i = threadIdx.x + 32*threadIdx.y; i < FW2; i += 32) de_r[i] = de_g[i] = de_b[i] = de_a[i] = 0.0f; __syncthreads(); - for (int imrow = threadIdx.y + W2; imrow < ({{info.acc_height}} - W2); imrow += 32) + for (int imrow = threadIdx.y + W2; imrow < (height - W2); imrow += 32) { - int idx = {{info.acc_stride}} * imrow + - + blockIdx.x * 32 + threadIdx.x + W2; + int idx = stride * imrow + blockIdx.x * 32 + threadIdx.x + W2; float4 in = pixbuf[idx]; - float den; - read_pix(in, den); + float den = in.w; - if (in.w > 0 && den > 0) { + if (den > 0) { // Compute a fast and dirty approximation of a "gradient" using // a [[-1 0 0][0 0 0][0 0 1]]/4 matrix (and its reflection) @@ -168,10 +151,10 @@ void density_est(float4 *pixbuf, float4 *outbuf, // like MLAA. float *dens = reinterpret_cast(pixbuf); int didx = idx * 4 + 3; - float x = 0.25f * ( dens[didx+{{info.acc_stride*4}}+4] - - dens[didx-{{info.acc_stride*4}}-4] ); - float y = 0.25f * ( dens[didx+{{info.acc_stride*4}}-4] - - dens[didx-{{info.acc_stride*4}}+4] ); + float x = 0.25f * ( dens[didx+stride*4+4] + - dens[didx-stride*4-4] ); + float y = 0.25f * ( dens[didx+stride*4-4] + - dens[didx-stride*4+4] ); float diag_mag = sqrtf(x*x + y*y); float ls = k1 * logf(1.0f + in.w * k2) / in.w; @@ -272,7 +255,7 @@ void density_est(float4 *pixbuf, float4 *outbuf, __syncthreads(); // TODO: could coalesce this, but what a pain for (int i = threadIdx.x; i < FW; i += 32) { - idx = {{info.acc_stride}} * imrow + blockIdx.x * 32 + i + W2; + idx = stride * imrow + blockIdx.x * 32 + i + W2; int si = threadIdx.y * FW + i; float *out = reinterpret_cast(&outbuf[idx]); atomicAdd(out, de_r[si]); @@ -301,34 +284,70 @@ void density_est(float4 *pixbuf, float4 *outbuf, __syncthreads(); } } +''' -''') +class Filtering(object): - def invoke(self, mod, cp, abufd, obufd, stream=None): - # TODO: add no-est version - # TODO: come up with a general way to average these parameters + mod = None - k1 = np.float32(cp.color.brightness * 268 / 256) + @classmethod + def init_mod(cls): + if cls.mod is None: + cls.mod = pycuda.compiler.SourceModule(_CODE, + options=['-use_fast_math', '-maxrregcount', '32']) + + def __init__(self): + self.init_mod() + + def de(self, ddst, dsrc, info, start, stop, stream=None): + # TODO: use integration to obtain parameter values + t = (start + stop) / 2 + cp = info.genome + + k1 = np.float32(cp.color.brightness(t) * 268 / 256) # Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now # s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w)) - area = self.info.height / (cp.camera.scale ** 2 * self.info.width) - k2 = np.float32(1 / (area * self.info.density )) + area = info.height / (cp.camera.scale(t) ** 2 * info.width) + k2 = np.float32(1 / (area * info.density)) if cp.de.radius == 0: - nbins = self.info.acc_height * self.info.acc_stride - fun = mod.get_function("logscale") - t = fun(abufd, obufd, k1, k2, + nbins = info.acc_height * info.acc_stride + fun = self.mod.get_function("logscale") + t = fun(dsrc, ddst, k1, k2, block=(512, 1, 1), grid=(nbins/512, 1), stream=stream) else: # flam3_gaussian_filter() uses an implicit standard deviation of # 0.5, but the DE filters scale filter distance by the default # spatial support factor of 1.5, so the effective base SD is # (0.5/1.5)=1/3. - est_sd = np.float32(cp.de.radius / 3.) - neg_est_curve = np.float32(-cp.de.curve) - est_min = np.float32(cp.de.minimum / 3.) - fun = mod.get_function("density_est") - fun(abufd, obufd, est_sd, neg_est_curve, est_min, k1, k2, - block=(32, 32, 1), grid=(self.info.acc_width/32, 1), - stream=stream) + est_sd = np.float32(cp.de.radius(t) / 3.) + neg_est_curve = np.float32(-cp.de.curve(t)) + est_min = np.float32(cp.de.minimum(t) / 3.) + fun = self.mod.get_function("density_est") + fun(dsrc, ddst, est_sd, neg_est_curve, est_min, k1, k2, + np.int32(info.acc_height), np.int32(info.acc_stride), + block=(32, 32, 1), grid=(info.acc_width/32, 1), stream=stream) + + def colorclip(self, dbuf, info, start, stop, stream=None): + f32 = np.float32 + t = (start + stop) / 2 + cp = info.genome + nbins = info.acc_height * info.acc_stride + + # TODO: implement integration over cubic splines? + gam = f32(1 / cp.color.gamma(t)) + vib = f32(cp.color.vibrancy(t)) + hipow = f32(cp.color.highlight_power(t)) + lin = f32(cp.color.gamma_threshold(t)) + lingam = f32(lin ** (gam-1.0) if lin > 0 else 0) + bkgd = vec.make_float3( + cp.color.background.r(t), + cp.color.background.g(t), + cp.color.background.b(t)) + + color_fun = self.mod.get_function("colorclip") + blocks = int(np.ceil(np.sqrt(nbins / 256))) + color_fun(dbuf, gam, vib, hipow, lin, lingam, bkgd, np.int32(nbins), + np.int32(0), + block=(256, 1, 1), grid=(blocks, blocks), stream=stream) diff --git a/cuburn/render.py b/cuburn/render.py index 08a795e..e39210c 100644 --- a/cuburn/render.py +++ b/cuburn/render.py @@ -1,6 +1,5 @@ import os import sys -import math import re import time as timemod import tempfile @@ -18,7 +17,6 @@ from fr0stlib.pyflam3.constants import * import pycuda.compiler import pycuda.driver as cuda import pycuda.tools -from pycuda.gpuarray import vec import cuburn.genome from cuburn import affine @@ -50,13 +48,13 @@ class Renderer(object): def __init__(self, info): self.info = info - self._iter = self._de = self.src = self.cubin = self.mod = None + self._iter = self.src = self.cubin = self.mod = None self.packed_genome = None # Ensure class options don't get contaminated on an instance self.cmp_options = list(self.cmp_options) - def compile(self, keep=None, cmp_options=None): + def compile(self, keep=None, cmp_options=None, jit_options=[]): """ Compile a kernel capable of rendering every frame in this animation. The resulting compiled kernel is stored in the ``cubin`` property; @@ -72,41 +70,18 @@ class Renderer(object): cmp_options = self.cmp_options if cmp_options is None else cmp_options self._iter = iter.IterCode(self.info) - self._de = filtering.DensityEst(self.info) - cclip = filtering.ColorClip(self.info) self._iter.packer.finalize() self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer, - self._iter, cclip, self._de) + self._iter) with open(os.path.join(tempfile.gettempdir(), 'kernel.cu'), 'w') as fp: fp.write(self.src) self.cubin = pycuda.compiler.compile( self.src, keep=keep, options=cmp_options, cache_dir=False if keep else None) - return self.src - - def copy(self): - """ - Return a copy of this animation without any references to the current - CUDA context. This can be used to load an animation in multiple CUDA - contexts without recompiling, so that rendering can proceed across - multiple devices - but managing that is up to you. - """ - import copy - new = copy.copy(self) - new.mod = None - return new - - def load(self, jit_options=[]): - """ - Replace the currently loaded CUDA module in the active CUDA context - with the compiled code's module. A reference is kept to the module, - meaning that rendering should henceforth only be called from the - thread and context in which this function was called. - """ - if self.cubin is None: - self.compile() self.mod = cuda.module_from_buffer(self.cubin, jit_options) - + with open('/tmp/iter_kern.cubin', 'wb') as fp: + fp.write(self.cubin) + return self.src def render(self, times): """ @@ -126,6 +101,8 @@ class Renderer(object): if times == []: return + filt = filtering.Filtering() + reset_rb_fun = self.mod.get_function("reset_rb") packer_fun = self.mod.get_function("interp_iter_params") palette_fun = self.mod.get_function("interp_palette_hsv") @@ -193,8 +170,6 @@ class Renderer(object): last_idx = None for idx, start, stop in times: - cen_cp = cuburn.genome.HacketyGenome(info.genome, (start+stop)/2) - width = np.float32((stop-start) / info.palette_height) palette_fun(d_palmem, d_palint_times, d_palint_vals, np.float32(start), width, @@ -234,25 +209,11 @@ class Renderer(object): grid=(ntemporal_samples, 1), texrefs=[tref], stream=stream) + stream.synchronize() + util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, stream) - self._de.invoke(self.mod, cen_cp, d_accum, d_out, stream) - - f32 = np.float32 - # TODO: implement integration over cubic splines? - gam = f32(1 / cen_cp.color.gamma) - vib = f32(cen_cp.color.vibrancy) - hipow = f32(cen_cp.color.highlight_power) - lin = f32(cen_cp.color.gamma_threshold) - lingam = f32(math.pow(lin, gam-1.0) if lin > 0 else 0) - bkgd = vec.make_float3( - cen_cp.color.background.r, - cen_cp.color.background.g, - cen_cp.color.background.b) - - color_fun = self.mod.get_function("colorclip") - blocks = int(np.ceil(np.sqrt(nbins / 256))) - color_fun(d_out, gam, vib, hipow, lin, lingam, bkgd, np.int32(nbins), - block=(256, 1, 1), grid=(blocks, blocks), stream=stream) + filt.de(d_out, d_accum, info, start, stop, stream) + filt.colorclip(d_out, info, start, stop, stream) cuda.memcpy_dtoh_async(h_out_a, d_out, stream) if event_b: