mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-03-16 00:11:30 -04:00
Support CUDA 4.1. Split filtering into new module.
The new toolkit generates filtering code that uses too many registers, so this change splits filtering into its own module, allowing it to be compiled with its own register usage limit. As a bonus, this should improve startup time in general, since the filtering code is now fixed and does not need to be recompiled.
This commit is contained in:
parent
cea91d75bf
commit
3147fd40d2
@ -1,15 +1,18 @@
|
|||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pycuda.compiler
|
||||||
|
from pycuda.gpuarray import vec
|
||||||
|
|
||||||
from cuburn.code.util import *
|
from cuburn.code.util import *
|
||||||
|
|
||||||
class ColorClip(HunkOCode):
|
_CODE = '''
|
||||||
def __init__(self, info):
|
#include<math_constants.h>
|
||||||
self.defs = self.defs_tmpl.substitute(info=info)
|
|
||||||
|
|
||||||
defs_tmpl = Template('''
|
|
||||||
__global__
|
__global__
|
||||||
void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
|
void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
|
||||||
float linrange, float lingam, float3 bkgd, int fbsize) {
|
float linrange, float lingam, float3 bkgd, int fbsize,
|
||||||
int i = gtid();
|
int alpha_output_channel) {
|
||||||
|
int i = threadIdx.x + blockDim.x * (blockIdx.x + gridDim.x * blockIdx.y);
|
||||||
if (i >= fbsize) return;
|
if (i >= fbsize) return;
|
||||||
|
|
||||||
float4 pix = pixbuf[i];
|
float4 pix = pixbuf[i];
|
||||||
@ -61,16 +64,16 @@ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
|
|||||||
pix.y += (1.0f - vibrancy) * powf(opix.y, gamma);
|
pix.y += (1.0f - vibrancy) * powf(opix.y, gamma);
|
||||||
pix.z += (1.0f - vibrancy) * powf(opix.z, gamma);
|
pix.z += (1.0f - vibrancy) * powf(opix.z, gamma);
|
||||||
|
|
||||||
{{if info.alpha_output_channel}}
|
if (alpha_output_channel) {
|
||||||
float 1_alpha = 1 / alpha;
|
float one_alpha = 1.0f / alpha;
|
||||||
pix.x *= 1_alpha;
|
pix.x *= one_alpha;
|
||||||
pix.y *= 1_alpha;
|
pix.y *= one_alpha;
|
||||||
pix.z *= 1_alpha;
|
pix.z *= one_alpha;
|
||||||
{{else}}
|
} else {
|
||||||
pix.x += (1.0f - alpha) * bkgd.x;
|
pix.x += (1.0f - alpha) * bkgd.x;
|
||||||
pix.y += (1.0f - alpha) * bkgd.y;
|
pix.y += (1.0f - alpha) * bkgd.y;
|
||||||
pix.z += (1.0f - alpha) * bkgd.z;
|
pix.z += (1.0f - alpha) * bkgd.z;
|
||||||
{{endif}}
|
}
|
||||||
pix.w = alpha;
|
pix.w = alpha;
|
||||||
|
|
||||||
// Clamp values. I think this is superfluous, but I'm not certain.
|
// Clamp values. I think this is superfluous, but I'm not certain.
|
||||||
@ -80,24 +83,8 @@ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
|
|||||||
|
|
||||||
pixbuf[i] = pix;
|
pixbuf[i] = pix;
|
||||||
}
|
}
|
||||||
''')
|
|
||||||
|
|
||||||
class DensityEst(HunkOCode):
|
|
||||||
"""
|
|
||||||
NOTE: for now, this *must* be invoked with a block size of (32,32,1), and
|
|
||||||
a grid size of (W/32,1). At least 21 pixel gutters are required, and the
|
|
||||||
stride and height probably need to be multiples of 32.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, info):
|
|
||||||
self.info = info
|
|
||||||
|
|
||||||
headers = "#include<math_constants.h>\n"
|
|
||||||
@property
|
|
||||||
def defs(self):
|
|
||||||
return self.defs_tmpl.substitute(info=self.info)
|
|
||||||
|
|
||||||
defs_tmpl = Template('''
|
|
||||||
#define W 21 // Filter width (regardless of standard deviation chosen)
|
#define W 21 // Filter width (regardless of standard deviation chosen)
|
||||||
#define W2 10 // Half of filter width, rounded down
|
#define W2 10 // Half of filter width, rounded down
|
||||||
#define FW 52 // Width of local result storage (NW+W2+W2)
|
#define FW 52 // Width of local result storage (NW+W2+W2)
|
||||||
@ -116,9 +103,7 @@ __device__ void de_add(int ibase, int ii, int jj, float4 scaled) {
|
|||||||
__global__
|
__global__
|
||||||
void logscale(float4 *pixbuf, float4 *outbuf, float k1, float k2) {
|
void logscale(float4 *pixbuf, float4 *outbuf, float k1, float k2) {
|
||||||
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||||
float den;
|
|
||||||
float4 pix = pixbuf[i];
|
float4 pix = pixbuf[i];
|
||||||
read_pix(pix, den);
|
|
||||||
|
|
||||||
float ls = fmaxf(0, k1 * logf(1.0f + pix.w * k2) / pix.w);
|
float ls = fmaxf(0, k1 * logf(1.0f + pix.w * k2) / pix.w);
|
||||||
pix.x *= ls;
|
pix.x *= ls;
|
||||||
@ -137,21 +122,19 @@ void logscale(float4 *pixbuf, float4 *outbuf, float k1, float k2) {
|
|||||||
__global__
|
__global__
|
||||||
void density_est(float4 *pixbuf, float4 *outbuf,
|
void density_est(float4 *pixbuf, float4 *outbuf,
|
||||||
float est_sd, float neg_est_curve, float est_min,
|
float est_sd, float neg_est_curve, float est_min,
|
||||||
float k1, float k2) {
|
float k1, float k2, int height, int stride) {
|
||||||
for (int i = threadIdx.x + 32*threadIdx.y; i < FW2; i += 32)
|
for (int i = threadIdx.x + 32*threadIdx.y; i < FW2; i += 32)
|
||||||
de_r[i] = de_g[i] = de_b[i] = de_a[i] = 0.0f;
|
de_r[i] = de_g[i] = de_b[i] = de_a[i] = 0.0f;
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
for (int imrow = threadIdx.y + W2; imrow < ({{info.acc_height}} - W2); imrow += 32)
|
for (int imrow = threadIdx.y + W2; imrow < (height - W2); imrow += 32)
|
||||||
{
|
{
|
||||||
int idx = {{info.acc_stride}} * imrow +
|
int idx = stride * imrow + blockIdx.x * 32 + threadIdx.x + W2;
|
||||||
+ blockIdx.x * 32 + threadIdx.x + W2;
|
|
||||||
|
|
||||||
float4 in = pixbuf[idx];
|
float4 in = pixbuf[idx];
|
||||||
float den;
|
float den = in.w;
|
||||||
read_pix(in, den);
|
|
||||||
|
|
||||||
if (in.w > 0 && den > 0) {
|
if (den > 0) {
|
||||||
|
|
||||||
// Compute a fast and dirty approximation of a "gradient" using
|
// Compute a fast and dirty approximation of a "gradient" using
|
||||||
// a [[-1 0 0][0 0 0][0 0 1]]/4 matrix (and its reflection)
|
// a [[-1 0 0][0 0 0][0 0 1]]/4 matrix (and its reflection)
|
||||||
@ -168,10 +151,10 @@ void density_est(float4 *pixbuf, float4 *outbuf,
|
|||||||
// like MLAA.
|
// like MLAA.
|
||||||
float *dens = reinterpret_cast<float*>(pixbuf);
|
float *dens = reinterpret_cast<float*>(pixbuf);
|
||||||
int didx = idx * 4 + 3;
|
int didx = idx * 4 + 3;
|
||||||
float x = 0.25f * ( dens[didx+{{info.acc_stride*4}}+4]
|
float x = 0.25f * ( dens[didx+stride*4+4]
|
||||||
- dens[didx-{{info.acc_stride*4}}-4] );
|
- dens[didx-stride*4-4] );
|
||||||
float y = 0.25f * ( dens[didx+{{info.acc_stride*4}}-4]
|
float y = 0.25f * ( dens[didx+stride*4-4]
|
||||||
- dens[didx-{{info.acc_stride*4}}+4] );
|
- dens[didx-stride*4+4] );
|
||||||
float diag_mag = sqrtf(x*x + y*y);
|
float diag_mag = sqrtf(x*x + y*y);
|
||||||
|
|
||||||
float ls = k1 * logf(1.0f + in.w * k2) / in.w;
|
float ls = k1 * logf(1.0f + in.w * k2) / in.w;
|
||||||
@ -272,7 +255,7 @@ void density_est(float4 *pixbuf, float4 *outbuf,
|
|||||||
__syncthreads();
|
__syncthreads();
|
||||||
// TODO: could coalesce this, but what a pain
|
// TODO: could coalesce this, but what a pain
|
||||||
for (int i = threadIdx.x; i < FW; i += 32) {
|
for (int i = threadIdx.x; i < FW; i += 32) {
|
||||||
idx = {{info.acc_stride}} * imrow + blockIdx.x * 32 + i + W2;
|
idx = stride * imrow + blockIdx.x * 32 + i + W2;
|
||||||
int si = threadIdx.y * FW + i;
|
int si = threadIdx.y * FW + i;
|
||||||
float *out = reinterpret_cast<float*>(&outbuf[idx]);
|
float *out = reinterpret_cast<float*>(&outbuf[idx]);
|
||||||
atomicAdd(out, de_r[si]);
|
atomicAdd(out, de_r[si]);
|
||||||
@ -301,34 +284,70 @@ void density_est(float4 *pixbuf, float4 *outbuf,
|
|||||||
__syncthreads();
|
__syncthreads();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
'''
|
||||||
|
|
||||||
''')
|
class Filtering(object):
|
||||||
|
|
||||||
def invoke(self, mod, cp, abufd, obufd, stream=None):
|
mod = None
|
||||||
# TODO: add no-est version
|
|
||||||
# TODO: come up with a general way to average these parameters
|
|
||||||
|
|
||||||
k1 = np.float32(cp.color.brightness * 268 / 256)
|
@classmethod
|
||||||
|
def init_mod(cls):
|
||||||
|
if cls.mod is None:
|
||||||
|
cls.mod = pycuda.compiler.SourceModule(_CODE,
|
||||||
|
options=['-use_fast_math', '-maxrregcount', '32'])
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.init_mod()
|
||||||
|
|
||||||
|
def de(self, ddst, dsrc, info, start, stop, stream=None):
|
||||||
|
# TODO: use integration to obtain parameter values
|
||||||
|
t = (start + stop) / 2
|
||||||
|
cp = info.genome
|
||||||
|
|
||||||
|
k1 = np.float32(cp.color.brightness(t) * 268 / 256)
|
||||||
# Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
|
# Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
|
||||||
# s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
|
# s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
|
||||||
area = self.info.height / (cp.camera.scale ** 2 * self.info.width)
|
area = info.height / (cp.camera.scale(t) ** 2 * info.width)
|
||||||
k2 = np.float32(1 / (area * self.info.density ))
|
k2 = np.float32(1 / (area * info.density))
|
||||||
|
|
||||||
if cp.de.radius == 0:
|
if cp.de.radius == 0:
|
||||||
nbins = self.info.acc_height * self.info.acc_stride
|
nbins = info.acc_height * info.acc_stride
|
||||||
fun = mod.get_function("logscale")
|
fun = self.mod.get_function("logscale")
|
||||||
t = fun(abufd, obufd, k1, k2,
|
t = fun(dsrc, ddst, k1, k2,
|
||||||
block=(512, 1, 1), grid=(nbins/512, 1), stream=stream)
|
block=(512, 1, 1), grid=(nbins/512, 1), stream=stream)
|
||||||
else:
|
else:
|
||||||
# flam3_gaussian_filter() uses an implicit standard deviation of
|
# flam3_gaussian_filter() uses an implicit standard deviation of
|
||||||
# 0.5, but the DE filters scale filter distance by the default
|
# 0.5, but the DE filters scale filter distance by the default
|
||||||
# spatial support factor of 1.5, so the effective base SD is
|
# spatial support factor of 1.5, so the effective base SD is
|
||||||
# (0.5/1.5)=1/3.
|
# (0.5/1.5)=1/3.
|
||||||
est_sd = np.float32(cp.de.radius / 3.)
|
est_sd = np.float32(cp.de.radius(t) / 3.)
|
||||||
neg_est_curve = np.float32(-cp.de.curve)
|
neg_est_curve = np.float32(-cp.de.curve(t))
|
||||||
est_min = np.float32(cp.de.minimum / 3.)
|
est_min = np.float32(cp.de.minimum(t) / 3.)
|
||||||
fun = mod.get_function("density_est")
|
fun = self.mod.get_function("density_est")
|
||||||
fun(abufd, obufd, est_sd, neg_est_curve, est_min, k1, k2,
|
fun(dsrc, ddst, est_sd, neg_est_curve, est_min, k1, k2,
|
||||||
block=(32, 32, 1), grid=(self.info.acc_width/32, 1),
|
np.int32(info.acc_height), np.int32(info.acc_stride),
|
||||||
stream=stream)
|
block=(32, 32, 1), grid=(info.acc_width/32, 1), stream=stream)
|
||||||
|
|
||||||
|
def colorclip(self, dbuf, info, start, stop, stream=None):
|
||||||
|
f32 = np.float32
|
||||||
|
t = (start + stop) / 2
|
||||||
|
cp = info.genome
|
||||||
|
nbins = info.acc_height * info.acc_stride
|
||||||
|
|
||||||
|
# TODO: implement integration over cubic splines?
|
||||||
|
gam = f32(1 / cp.color.gamma(t))
|
||||||
|
vib = f32(cp.color.vibrancy(t))
|
||||||
|
hipow = f32(cp.color.highlight_power(t))
|
||||||
|
lin = f32(cp.color.gamma_threshold(t))
|
||||||
|
lingam = f32(lin ** (gam-1.0) if lin > 0 else 0)
|
||||||
|
bkgd = vec.make_float3(
|
||||||
|
cp.color.background.r(t),
|
||||||
|
cp.color.background.g(t),
|
||||||
|
cp.color.background.b(t))
|
||||||
|
|
||||||
|
color_fun = self.mod.get_function("colorclip")
|
||||||
|
blocks = int(np.ceil(np.sqrt(nbins / 256)))
|
||||||
|
color_fun(dbuf, gam, vib, hipow, lin, lingam, bkgd, np.int32(nbins),
|
||||||
|
np.int32(0),
|
||||||
|
block=(256, 1, 1), grid=(blocks, blocks), stream=stream)
|
||||||
|
|
||||||
|
@ -1,6 +1,5 @@
|
|||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import math
|
|
||||||
import re
|
import re
|
||||||
import time as timemod
|
import time as timemod
|
||||||
import tempfile
|
import tempfile
|
||||||
@ -18,7 +17,6 @@ from fr0stlib.pyflam3.constants import *
|
|||||||
import pycuda.compiler
|
import pycuda.compiler
|
||||||
import pycuda.driver as cuda
|
import pycuda.driver as cuda
|
||||||
import pycuda.tools
|
import pycuda.tools
|
||||||
from pycuda.gpuarray import vec
|
|
||||||
|
|
||||||
import cuburn.genome
|
import cuburn.genome
|
||||||
from cuburn import affine
|
from cuburn import affine
|
||||||
@ -50,13 +48,13 @@ class Renderer(object):
|
|||||||
|
|
||||||
def __init__(self, info):
|
def __init__(self, info):
|
||||||
self.info = info
|
self.info = info
|
||||||
self._iter = self._de = self.src = self.cubin = self.mod = None
|
self._iter = self.src = self.cubin = self.mod = None
|
||||||
self.packed_genome = None
|
self.packed_genome = None
|
||||||
|
|
||||||
# Ensure class options don't get contaminated on an instance
|
# Ensure class options don't get contaminated on an instance
|
||||||
self.cmp_options = list(self.cmp_options)
|
self.cmp_options = list(self.cmp_options)
|
||||||
|
|
||||||
def compile(self, keep=None, cmp_options=None):
|
def compile(self, keep=None, cmp_options=None, jit_options=[]):
|
||||||
"""
|
"""
|
||||||
Compile a kernel capable of rendering every frame in this animation.
|
Compile a kernel capable of rendering every frame in this animation.
|
||||||
The resulting compiled kernel is stored in the ``cubin`` property;
|
The resulting compiled kernel is stored in the ``cubin`` property;
|
||||||
@ -72,41 +70,18 @@ class Renderer(object):
|
|||||||
cmp_options = self.cmp_options if cmp_options is None else cmp_options
|
cmp_options = self.cmp_options if cmp_options is None else cmp_options
|
||||||
|
|
||||||
self._iter = iter.IterCode(self.info)
|
self._iter = iter.IterCode(self.info)
|
||||||
self._de = filtering.DensityEst(self.info)
|
|
||||||
cclip = filtering.ColorClip(self.info)
|
|
||||||
self._iter.packer.finalize()
|
self._iter.packer.finalize()
|
||||||
self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer,
|
self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer,
|
||||||
self._iter, cclip, self._de)
|
self._iter)
|
||||||
with open(os.path.join(tempfile.gettempdir(), 'kernel.cu'), 'w') as fp:
|
with open(os.path.join(tempfile.gettempdir(), 'kernel.cu'), 'w') as fp:
|
||||||
fp.write(self.src)
|
fp.write(self.src)
|
||||||
self.cubin = pycuda.compiler.compile(
|
self.cubin = pycuda.compiler.compile(
|
||||||
self.src, keep=keep, options=cmp_options,
|
self.src, keep=keep, options=cmp_options,
|
||||||
cache_dir=False if keep else None)
|
cache_dir=False if keep else None)
|
||||||
return self.src
|
|
||||||
|
|
||||||
def copy(self):
|
|
||||||
"""
|
|
||||||
Return a copy of this animation without any references to the current
|
|
||||||
CUDA context. This can be used to load an animation in multiple CUDA
|
|
||||||
contexts without recompiling, so that rendering can proceed across
|
|
||||||
multiple devices - but managing that is up to you.
|
|
||||||
"""
|
|
||||||
import copy
|
|
||||||
new = copy.copy(self)
|
|
||||||
new.mod = None
|
|
||||||
return new
|
|
||||||
|
|
||||||
def load(self, jit_options=[]):
|
|
||||||
"""
|
|
||||||
Replace the currently loaded CUDA module in the active CUDA context
|
|
||||||
with the compiled code's module. A reference is kept to the module,
|
|
||||||
meaning that rendering should henceforth only be called from the
|
|
||||||
thread and context in which this function was called.
|
|
||||||
"""
|
|
||||||
if self.cubin is None:
|
|
||||||
self.compile()
|
|
||||||
self.mod = cuda.module_from_buffer(self.cubin, jit_options)
|
self.mod = cuda.module_from_buffer(self.cubin, jit_options)
|
||||||
|
with open('/tmp/iter_kern.cubin', 'wb') as fp:
|
||||||
|
fp.write(self.cubin)
|
||||||
|
return self.src
|
||||||
|
|
||||||
def render(self, times):
|
def render(self, times):
|
||||||
"""
|
"""
|
||||||
@ -126,6 +101,8 @@ class Renderer(object):
|
|||||||
if times == []:
|
if times == []:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
filt = filtering.Filtering()
|
||||||
|
|
||||||
reset_rb_fun = self.mod.get_function("reset_rb")
|
reset_rb_fun = self.mod.get_function("reset_rb")
|
||||||
packer_fun = self.mod.get_function("interp_iter_params")
|
packer_fun = self.mod.get_function("interp_iter_params")
|
||||||
palette_fun = self.mod.get_function("interp_palette_hsv")
|
palette_fun = self.mod.get_function("interp_palette_hsv")
|
||||||
@ -193,8 +170,6 @@ class Renderer(object):
|
|||||||
last_idx = None
|
last_idx = None
|
||||||
|
|
||||||
for idx, start, stop in times:
|
for idx, start, stop in times:
|
||||||
cen_cp = cuburn.genome.HacketyGenome(info.genome, (start+stop)/2)
|
|
||||||
|
|
||||||
width = np.float32((stop-start) / info.palette_height)
|
width = np.float32((stop-start) / info.palette_height)
|
||||||
palette_fun(d_palmem, d_palint_times, d_palint_vals,
|
palette_fun(d_palmem, d_palint_times, d_palint_vals,
|
||||||
np.float32(start), width,
|
np.float32(start), width,
|
||||||
@ -234,25 +209,11 @@ class Renderer(object):
|
|||||||
grid=(ntemporal_samples, 1),
|
grid=(ntemporal_samples, 1),
|
||||||
texrefs=[tref], stream=stream)
|
texrefs=[tref], stream=stream)
|
||||||
|
|
||||||
|
stream.synchronize()
|
||||||
|
|
||||||
util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, stream)
|
util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, stream)
|
||||||
self._de.invoke(self.mod, cen_cp, d_accum, d_out, stream)
|
filt.de(d_out, d_accum, info, start, stop, stream)
|
||||||
|
filt.colorclip(d_out, info, start, stop, stream)
|
||||||
f32 = np.float32
|
|
||||||
# TODO: implement integration over cubic splines?
|
|
||||||
gam = f32(1 / cen_cp.color.gamma)
|
|
||||||
vib = f32(cen_cp.color.vibrancy)
|
|
||||||
hipow = f32(cen_cp.color.highlight_power)
|
|
||||||
lin = f32(cen_cp.color.gamma_threshold)
|
|
||||||
lingam = f32(math.pow(lin, gam-1.0) if lin > 0 else 0)
|
|
||||||
bkgd = vec.make_float3(
|
|
||||||
cen_cp.color.background.r,
|
|
||||||
cen_cp.color.background.g,
|
|
||||||
cen_cp.color.background.b)
|
|
||||||
|
|
||||||
color_fun = self.mod.get_function("colorclip")
|
|
||||||
blocks = int(np.ceil(np.sqrt(nbins / 256)))
|
|
||||||
color_fun(d_out, gam, vib, hipow, lin, lingam, bkgd, np.int32(nbins),
|
|
||||||
block=(256, 1, 1), grid=(blocks, blocks), stream=stream)
|
|
||||||
cuda.memcpy_dtoh_async(h_out_a, d_out, stream)
|
cuda.memcpy_dtoh_async(h_out_a, d_out, stream)
|
||||||
|
|
||||||
if event_b:
|
if event_b:
|
||||||
|
Loading…
Reference in New Issue
Block a user