mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 11:40:04 -05:00
Another non-working checkin
This commit is contained in:
parent
fac6f838a4
commit
3d94c256a9
@ -4,32 +4,22 @@ from cuburn.code.util import *
|
|||||||
class ColorClip(HunkOCode):
|
class ColorClip(HunkOCode):
|
||||||
defs = """
|
defs = """
|
||||||
__global__
|
__global__
|
||||||
void logfilt(float4 *pixbuf, float k1, float k2,
|
void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow) {
|
||||||
float gamma, float vibrancy, float highpow) {
|
|
||||||
// TODO: test if over an edge of the framebuffer
|
// TODO: test if over an edge of the framebuffer
|
||||||
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||||
float4 pix = pixbuf[i];
|
float4 pix = pixbuf[i];
|
||||||
|
|
||||||
if (pix.w <= 0) return;
|
if (pix.w <= 0) return;
|
||||||
|
|
||||||
float ls = k1 * logf(1.0 + pix.w * k2) / pix.w;
|
|
||||||
pix.x *= ls;
|
|
||||||
pix.y *= ls;
|
|
||||||
pix.z *= ls;
|
|
||||||
pix.w *= ls;
|
|
||||||
|
|
||||||
float4 opix = pix;
|
float4 opix = pix;
|
||||||
|
|
||||||
// TODO: linearized bottom range
|
// TODO: linearized bottom range
|
||||||
float alpha = powf(pix.w, gamma);
|
float alpha = powf(pix.w, gamma);
|
||||||
ls = vibrancy * alpha / pix.w;
|
float ls = vibrancy * alpha / pix.w;
|
||||||
|
|
||||||
float maxc = fmaxf(pix.x, fmaxf(pix.y, pix.z));
|
float maxc = fmaxf(pix.x, fmaxf(pix.y, pix.z));
|
||||||
float newls = 1 / maxc;
|
float newls = 1 / maxc;
|
||||||
|
|
||||||
// TODO: detect if highlight power is globally disabled and drop
|
|
||||||
// this branch
|
|
||||||
|
|
||||||
if (maxc * ls > 1 && highpow >= 0) {
|
if (maxc * ls > 1 && highpow >= 0) {
|
||||||
// TODO: does CUDA autopromote the int here to a float before GPU?
|
// TODO: does CUDA autopromote the int here to a float before GPU?
|
||||||
float lsratio = powf(newls / ls, highpow);
|
float lsratio = powf(newls / ls, highpow);
|
||||||
@ -65,15 +55,18 @@ void logfilt(float4 *pixbuf, float k1, float k2,
|
|||||||
|
|
||||||
class DensityEst(HunkOCode):
|
class DensityEst(HunkOCode):
|
||||||
"""
|
"""
|
||||||
NOTE: for now, this *must* be invoked with a block size of (32,16,1), and
|
NOTE: for now, this *must* be invoked with a block size of (32,32,1), and
|
||||||
a grid size of (W/32) for vertical filtering or (H/32) for horizontal.
|
a grid size of (W/32,1). At least 7 pixel gutters are required, and the
|
||||||
|
stride and height probably need to be multiples of 32.
|
||||||
It will probably fail for images that are not multiples of 32.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Note, changing this does not yet have any effect, it's just informational
|
||||||
|
MAX_WIDTH=15
|
||||||
|
|
||||||
def __init__(self, features, cp):
|
def __init__(self, features, cp):
|
||||||
self.features, self.cp = features, cp
|
self.features, self.cp = features, cp
|
||||||
|
|
||||||
|
headers = "#include<math_constants.h>\n"
|
||||||
@property
|
@property
|
||||||
def defs(self):
|
def defs(self):
|
||||||
return self.defs_tmpl.substitute(features=self.features, cp=self.cp)
|
return self.defs_tmpl.substitute(features=self.features, cp=self.cp)
|
||||||
@ -81,109 +74,184 @@ class DensityEst(HunkOCode):
|
|||||||
defs_tmpl = Template("""
|
defs_tmpl = Template("""
|
||||||
#define W 15 // Filter width (regardless of standard deviation chosen)
|
#define W 15 // Filter width (regardless of standard deviation chosen)
|
||||||
#define W2 7 // Half of filter width, rounded down
|
#define W2 7 // Half of filter width, rounded down
|
||||||
#define NW 16 // Number of warps in each set of points
|
#define FW 46 // Width of local result storage (NW+W2+W2)
|
||||||
#define FW 30 // Width of local result storage per-lane (NW+W2+W2)
|
#define FW2 (FW*FW)
|
||||||
#define BX 32 // The size of a block's X dimension (== 1 warp)
|
|
||||||
|
__shared__ float de_r[FW2], de_g[FW2], de_b[FW2], de_a[FW2];
|
||||||
|
|
||||||
|
__device__ void de_add(int ibase, int ii, int jj, float4 scaled) {
|
||||||
|
int idx = ibase + FW * ii + jj;
|
||||||
|
atomicAdd(de_r+idx, scaled.x);
|
||||||
|
atomicAdd(de_g+idx, scaled.y);
|
||||||
|
atomicAdd(de_b+idx, scaled.z);
|
||||||
|
atomicAdd(de_a+idx, scaled.w);
|
||||||
|
}
|
||||||
|
|
||||||
__global__
|
__global__
|
||||||
void density_est(float4 *pixbuf, float *denbuf, int vertical) {
|
void logscale(float4 *pixbuf, float4 *outbuf, float k1, float k2) {
|
||||||
__shared__ float r[BX*FW], g[BX*FW], b[BX*FW], a[BX*FW];
|
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||||
|
float4 pix = pixbuf[i];
|
||||||
|
|
||||||
int ibase; // The index of the first element within this lane's strip
|
float ls = fmaxf(0, k1 * logf(1.0 + pix.w * k2) / pix.w);
|
||||||
int imax; // The maximum offset from the first element in the strip
|
pix.x *= ls;
|
||||||
int istride; // Number of indices until next point to filter
|
pix.y *= ls;
|
||||||
|
pix.z *= ls;
|
||||||
|
pix.w *= ls;
|
||||||
|
|
||||||
if (vertical) {
|
outbuf[i] = pix;
|
||||||
ibase = threadIdx.x + blockIdx.x * BX;
|
}
|
||||||
imax = {{features.acc_height}};
|
|
||||||
istride = {{features.acc_stride}};
|
|
||||||
} else {
|
|
||||||
ibase = (blockIdx.x * BX + threadIdx.x) * {{features.acc_stride}};
|
|
||||||
imax = {{features.acc_width}};
|
|
||||||
istride = 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = threadIdx.x + BX*threadIdx.y; i < BX*FW; i += NW * BX)
|
__global__
|
||||||
r[i] = g[i] = b[i] = a[i] = 0.0f;
|
void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf,
|
||||||
|
float k1, float k2) {
|
||||||
|
for (int i = threadIdx.x + 32*threadIdx.y; i < FW2; i += 32)
|
||||||
|
de_r[i] = de_g[i] = de_b[i] = de_a[i] = 0.0f;
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
for (int imrow = threadIdx.y + W2; imrow < {{features.acc_height}}; imrow += 32)
|
||||||
|
{
|
||||||
|
int idx = {{features.acc_stride}} * imrow +
|
||||||
|
+ blockIdx.x * 32 + threadIdx.x + W2;
|
||||||
|
|
||||||
for (int i = threadIdx.y; i < imax; i += NW) {
|
|
||||||
int idx = ibase+i*istride;
|
|
||||||
float4 in = pixbuf[idx];
|
float4 in = pixbuf[idx];
|
||||||
float den = denbuf[idx];
|
float den = denbuf[idx];
|
||||||
|
|
||||||
int j = (threadIdx.y + W2) * 32 + threadIdx.x;
|
if (in.w > 0 && den > 0) {
|
||||||
|
float ls = k1 * 12 * logf(1.0 + in.w * k2) / in.w;
|
||||||
|
in.x *= ls;
|
||||||
|
in.y *= ls;
|
||||||
|
in.z *= ls;
|
||||||
|
in.w *= ls;
|
||||||
|
|
||||||
float sd = {{0.35 * cp.estimator}} / powf(den+1.0f, {{cp.estimator_curve}});
|
// Calculate standard deviation of Gaussian kernel.
|
||||||
{{if cp.estimator_minimum > 1}}
|
// flam3_gaussian_filter() uses an implicit standard deviation of 0.5,
|
||||||
sd = fmaxf(sd, {{cp.estimator_minimum}});
|
// but the DE filters scale filter distance by the default spatial
|
||||||
{{endif}}
|
// support factor of 1.5, so the effective base SD is (0.5/1.5)=1/3.
|
||||||
sd *= sd;
|
float sd = {{cp.estimator / 3.}};
|
||||||
|
|
||||||
// TODO: log scaling here? matches flam3, but, ick
|
// The base SD is then scaled in inverse proportion to the density of
|
||||||
// TODO: investigate harm caused by varying standard deviation in a
|
// the point being scaled.
|
||||||
// separable environment
|
sd *= powf(den+1.0f, {{-cp.estimator_curve}});
|
||||||
float coeff = rsqrtf(2.0f*M_PI*sd*sd);
|
|
||||||
atomicAdd(r+j, in.x * coeff);
|
|
||||||
atomicAdd(g+j, in.y * coeff);
|
|
||||||
atomicAdd(b+j, in.z * coeff);
|
|
||||||
atomicAdd(a+j, in.w * coeff);
|
|
||||||
sd = -0.5/sd;
|
|
||||||
|
|
||||||
// #pragma unroll
|
// Clamp the final standard deviation. Things will go badly if the
|
||||||
for (int k = 1; k <= W2; k++) {
|
// minimum is undershot.
|
||||||
float scale = exp(sd*k*k)*coeff;
|
sd = fmaxf(sd, {{max(cp.estimator_minimum / 3., 0.3)}} );
|
||||||
idx = j+k*32;
|
|
||||||
atomicAdd(r+idx, in.x * scale);
|
// This five-term polynomial approximates the sum of the filters with
|
||||||
atomicAdd(g+idx, in.y * scale);
|
// the clamping logic used here. See helpers/filt_err.py.
|
||||||
atomicAdd(b+idx, in.z * scale);
|
float filtsum;
|
||||||
atomicAdd(a+idx, in.w * scale);
|
filtsum = -0.20885075f * sd + 0.90557721f;
|
||||||
idx = j-k*32;
|
filtsum = filtsum * sd + 5.28363054f;
|
||||||
atomicAdd(r+idx, in.x * scale);
|
filtsum = filtsum * sd + -0.11733533f;
|
||||||
atomicAdd(g+idx, in.y * scale);
|
filtsum = filtsum * sd + 0.35670333f;
|
||||||
atomicAdd(b+idx, in.z * scale);
|
float filtscale = 1 / filtsum;
|
||||||
atomicAdd(a+idx, in.w * scale);
|
|
||||||
|
// The reciprocal SD scaling coeffecient in the Gaussian exponent.
|
||||||
|
// exp(-x^2/(2*sd^2)) = exp2f(x^2*rsd)
|
||||||
|
float rsd = -0.5f * CUDART_L2E_F / (sd * sd);
|
||||||
|
|
||||||
|
int si = (threadIdx.y + W2) * FW + threadIdx.x + W2;
|
||||||
|
for (int jj = 0; jj <= W2; jj++) {
|
||||||
|
float jj2f = jj;
|
||||||
|
jj2f *= jj2f;
|
||||||
|
|
||||||
|
float iif = 0;
|
||||||
|
for (int ii = 0; ii <= jj; ii++) {
|
||||||
|
iif += 1;
|
||||||
|
|
||||||
|
float coeff = exp2f((jj2f + iif * iif) * rsd) * filtscale;
|
||||||
|
if (coeff < 0.0001f) break;
|
||||||
|
|
||||||
|
float4 scaled;
|
||||||
|
scaled.x = in.x * coeff;
|
||||||
|
scaled.y = in.y * coeff;
|
||||||
|
scaled.z = in.z * coeff;
|
||||||
|
scaled.w = in.w * coeff;
|
||||||
|
|
||||||
|
de_add(si, ii, jj, scaled);
|
||||||
|
if (jj == 0) continue;
|
||||||
|
de_add(si, ii, -jj, scaled);
|
||||||
|
if (ii != 0) {
|
||||||
|
de_add(si, -ii, jj, scaled);
|
||||||
|
de_add(si, -ii, -jj, scaled);
|
||||||
|
if (ii == jj) continue;
|
||||||
|
}
|
||||||
|
de_add(si, jj, ii, scaled);
|
||||||
|
de_add(si, -jj, ii, scaled);
|
||||||
|
|
||||||
|
if (ii == 0) continue;
|
||||||
|
de_add(si, -jj, -ii, scaled);
|
||||||
|
de_add(si, jj, -ii, scaled);
|
||||||
|
|
||||||
|
// TODO: validate that the above avoids bank conflicts
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
float4 out;
|
// TODO: could coalesce this, but what a pain
|
||||||
j = threadIdx.y * BX + threadIdx.x;
|
for (int i = threadIdx.x; i < FW; i += 32) {
|
||||||
out.x = r[j];
|
idx = {{features.acc_stride}} * imrow + blockIdx.x * 32 + i + W2;
|
||||||
out.y = g[j];
|
int si = threadIdx.y * FW + i;
|
||||||
out.z = b[j];
|
float *out = reinterpret_cast<float*>(&outbuf[idx]);
|
||||||
out.w = a[j];
|
atomicAdd(out, de_r[si]);
|
||||||
idx = ibase+(i-W2)*istride;
|
atomicAdd(out+1, de_g[si]);
|
||||||
if (idx > 0)
|
atomicAdd(out+2, de_b[si]);
|
||||||
pixbuf[idx] = out;
|
atomicAdd(out+3, de_a[si]);
|
||||||
__syncthreads();
|
}
|
||||||
|
|
||||||
|
if (threadIdx.y == 5000) {
|
||||||
|
for (int i = threadIdx.x; i < FW; i += 32) {
|
||||||
|
idx = {{features.acc_stride}} * (imrow + 32)
|
||||||
|
+ blockIdx.x * 32 + i + W2;
|
||||||
|
int si = 32 * FW + i;
|
||||||
|
float *out = reinterpret_cast<float*>(&outbuf[idx]);
|
||||||
|
atomicAdd(out, 0.2 + de_r[si]);
|
||||||
|
atomicAdd(out+1, de_g[si]);
|
||||||
|
atomicAdd(out+2, de_b[si]);
|
||||||
|
atomicAdd(out+3, de_a[si]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__syncthreads();
|
||||||
// TODO: shift instead of copying
|
// TODO: shift instead of copying
|
||||||
idx = threadIdx.x + BX * threadIdx.y;
|
int tid = threadIdx.y * 32 + threadIdx.x;
|
||||||
if (threadIdx.y < NW-2) {
|
for (int i = tid; i < FW*(W2+W2); i += 512) {
|
||||||
r[idx] = r[idx + BX*NW];
|
de_r[i] = de_r[i+FW*32];
|
||||||
g[idx] = g[idx + BX*NW];
|
de_g[i] = de_g[i+FW*32];
|
||||||
b[idx] = b[idx + BX*NW];
|
de_b[i] = de_b[i+FW*32];
|
||||||
a[idx] = a[idx + BX*NW];
|
de_a[i] = de_a[i+FW*32];
|
||||||
}
|
}
|
||||||
__syncthreads();
|
|
||||||
|
|
||||||
r[idx + BX*(NW-2)] = 0.0f;
|
__syncthreads();
|
||||||
g[idx + BX*(NW-2)] = 0.0f;
|
for (int i = tid + FW*(W2+W2); i < FW2; i += 512) {
|
||||||
b[idx + BX*(NW-2)] = 0.0f;
|
de_r[i] = 0.0f;
|
||||||
a[idx + BX*(NW-2)] = 0.0f;
|
de_g[i] = 0.0f;
|
||||||
|
de_b[i] = 0.0f;
|
||||||
|
de_a[i] = 0.0f;
|
||||||
|
}
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
""")
|
""")
|
||||||
|
|
||||||
def invoke(self, mod, abufd, dbufd):
|
def invoke(self, mod, abufd, obufd, dbufd):
|
||||||
fun = mod.get_function("density_est")
|
# TODO: add no-est version
|
||||||
t = fun(abufd, dbufd, np.int32(0),
|
# TODO: come up with a general way to average these parameters
|
||||||
block=(32, 16, 1), grid=(self.features.acc_height/32,1),
|
k1 = self.cp.brightness * 268 / 256
|
||||||
time_kernel=True)
|
area = self.features.width * self.features.height / self.cp.ppu ** 2
|
||||||
print "Horizontal density estimation: %g" % t
|
k2 = 1 / (area * self.cp.adj_density)
|
||||||
|
|
||||||
t = fun(abufd, dbufd, np.int32(1),
|
if self.cp.estimator == 0:
|
||||||
block=(32, 16, 1), grid=(self.features.acc_width/32,1),
|
fun = mod.get_function("logscale")
|
||||||
|
t = fun(abufd, obufd, np.float32(k1), np.float32(k2),
|
||||||
|
block=(self.features.acc_width, 1, 1),
|
||||||
|
grid=(self.features.acc_height, 1), time_kernel=True)
|
||||||
|
else:
|
||||||
|
fun = mod.get_function("density_est")
|
||||||
|
t = fun(abufd, obufd, dbufd, np.float32(k1), np.float32(k2),
|
||||||
|
block=(32, 32, 1), grid=(self.features.acc_stride/32 - 1, 1),
|
||||||
time_kernel=True)
|
time_kernel=True)
|
||||||
print "Vertical density estimation: %g" % t
|
print "Density estimation: %g" % t
|
||||||
|
|
||||||
|
@ -220,19 +220,15 @@ def render(features, cps):
|
|||||||
|
|
||||||
npix = features.width * features.height
|
npix = features.width * features.height
|
||||||
|
|
||||||
k1 = cp.brightness * 268 / 256
|
obufd = cuda.to_device(abuf)
|
||||||
area = features.width * features.height / cp.ppu ** 2
|
de.invoke(mod, abufd, obufd, dbufd)
|
||||||
k2 = 1 / (area * cp.adj_density)
|
|
||||||
|
|
||||||
de.invoke(mod, abufd, dbufd)
|
fun = mod.get_function("colorclip")
|
||||||
|
t = fun(obufd, f(1 / cp.gamma), f(cp.vibrancy), f(cp.highlight_power),
|
||||||
fun = mod.get_function("logfilt")
|
|
||||||
t = fun(abufd, f(k1), f(k2),
|
|
||||||
f(1 / cp.gamma), f(cp.vibrancy), f(cp.highlight_power),
|
|
||||||
block=(256,1,1), grid=(npix/256,1), time_kernel=True)
|
block=(256,1,1), grid=(npix/256,1), time_kernel=True)
|
||||||
print "Completed color filtering in %g seconds" % t
|
print "Completed color filtering in %g seconds" % t
|
||||||
|
|
||||||
abuf = cuda.from_device_like(abufd, abuf)
|
abuf = cuda.from_device_like(obufd, abuf)
|
||||||
return abuf, dbuf
|
return abuf, dbuf
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user