mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-04-21 00:51:31 -04:00
Absurdly complicated enhancements to filtering.
This commit is contained in:
parent
c572f62d7d
commit
b4132c7cd9
@ -9,9 +9,6 @@ from pycuda.gpuarray import vec
|
||||
from cuburn.code.util import *
|
||||
|
||||
_CODE = r'''
|
||||
#include<math_constants.h>
|
||||
#include<stdio.h>
|
||||
|
||||
__global__
|
||||
void colorclip(float4 *pixbuf, float gamma, float vibrance, float highpow,
|
||||
float linrange, float lingam, float3 bkgd,
|
||||
@ -100,10 +97,11 @@ void colorclip(float4 *pixbuf, float gamma, float vibrance, float highpow,
|
||||
}
|
||||
|
||||
__global__
|
||||
void logscale(float4 *pixbuf, float4 *outbuf, float k1, float k2) {
|
||||
void logscale(float4 *outbuf, const float4 *pixbuf, float k1, float k2) {
|
||||
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
float4 pix = pixbuf[i];
|
||||
|
||||
// float ls = fmaxf(0, k1 * logf(1.0f + pix.w * pix.w * k2 / (1 + pix.w)) / pix.w);
|
||||
float ls = fmaxf(0, k1 * logf(1.0f + pix.w * k2) / pix.w);
|
||||
pix.x *= ls;
|
||||
pix.y *= ls;
|
||||
@ -113,173 +111,13 @@ void logscale(float4 *pixbuf, float4 *outbuf, float k1, float k2) {
|
||||
outbuf[i] = pix;
|
||||
}
|
||||
|
||||
// SS must be no greater than 4x
|
||||
// Element-wise computation of ``dst[i]=dst[i]+src[i]*scale``.
|
||||
__global__
|
||||
void blur_density_h(float *scratch, const float4 *pixbuf, int astride) {
|
||||
// TODO: respect supersampling? configurable blur width?
|
||||
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (x > astride) return;
|
||||
int y = blockIdx.y;
|
||||
const float *dsrc =
|
||||
reinterpret_cast<const float*>(&pixbuf[astride * y]) + 3;
|
||||
|
||||
float den = 0.0f;
|
||||
|
||||
den += dsrc[4*max(0, x-2)] * 0.00135f;
|
||||
den += dsrc[4*max(0, x-1)] * 0.1573f;
|
||||
den += dsrc[4*x] * 0.6827f;
|
||||
den += dsrc[4*min(astride-1, x+1)] * 0.1573f;
|
||||
den += dsrc[4*min(astride-1, x+2)] * 0.00135f;
|
||||
scratch[astride * y + x] = den;
|
||||
}
|
||||
|
||||
__global__
|
||||
void blur_density_v(float4 *pixbuf, const float *scratch,
|
||||
int astride, int aheight) {
|
||||
// TODO: respect supersampling? configurable blur width?
|
||||
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (x > astride) return;
|
||||
int y = blockIdx.y;
|
||||
const float *dsrc = scratch + x;
|
||||
|
||||
float den = 0.0f;
|
||||
|
||||
den += dsrc[astride*max(0, y-2)] * 0.00135f;
|
||||
den += dsrc[astride*max(0, y-1)] * 0.1573f;
|
||||
den += dsrc[astride*y] * 0.6827f;
|
||||
den += dsrc[astride*min(astride-1, y+1)] * 0.1573f;
|
||||
den += dsrc[astride*min(astride-1, y+2)] * 0.00135f;
|
||||
|
||||
float *ddst = reinterpret_cast<float*>(&pixbuf[astride * y + x]) + 3;
|
||||
*ddst = min(*ddst, den);
|
||||
}
|
||||
|
||||
#define W 21 // Filter width (regardless of standard deviation chosen)
|
||||
#define W2 10 // Half of filter width, rounded down
|
||||
#define FW 53 // Width of local result storage (NW+W2+W2)
|
||||
#define FW2 (FW*FW)
|
||||
|
||||
__global__
|
||||
void density_est(float4 *outbuf, const float4 *pixbuf,
|
||||
float scale_coeff, float est_curve, float edge_clamp,
|
||||
float k1, float k2, int height, int stride) {
|
||||
__shared__ float de_r[FW2], de_g[FW2], de_b[FW2], de_a[FW2];
|
||||
|
||||
// The max supported radius is really 7, but the extra units simplify the
|
||||
// logic in the bottlenecked section below.
|
||||
__shared__ int minradius[32];
|
||||
|
||||
for (int i = threadIdx.x + 32*threadIdx.y; i < FW2; i += 1024)
|
||||
de_r[i] = de_g[i] = de_b[i] = de_a[i] = 0.0f;
|
||||
__syncthreads();
|
||||
|
||||
for (int imrow = threadIdx.y; imrow < height; imrow += 32) {
|
||||
// Prepare the shared voting buffer. Syncing afterward is
|
||||
// almost unnecessary but we do it to be safe
|
||||
if (threadIdx.y == 0)
|
||||
minradius[threadIdx.x] = 0.0f;
|
||||
__syncthreads();
|
||||
|
||||
int col = blockIdx.x * 32 + threadIdx.x;
|
||||
int in_idx = stride * imrow + col;
|
||||
float4 in = pixbuf[in_idx];
|
||||
float den = in.w;
|
||||
|
||||
float ls = k1 * logf(1.0f + in.w * k2) / in.w;
|
||||
|
||||
// The synchronization model used now prevents us from
|
||||
// cutting early in the case of a zero point, so we just carry
|
||||
// it along with us here
|
||||
if (den <= 0) ls = 0.0f;
|
||||
|
||||
in.x *= ls;
|
||||
in.y *= ls;
|
||||
in.z *= ls;
|
||||
in.w *= ls;
|
||||
|
||||
// Base index of destination for writes
|
||||
int si = (threadIdx.y + W2) * FW + threadIdx.x + W2;
|
||||
|
||||
// Calculate scaling coefficient for the Gaussian kernel. This
|
||||
// does not match with a normal Gaussian; it just fits with
|
||||
// flam3's implementation.
|
||||
float scale = powf(den, est_curve) * scale_coeff;
|
||||
|
||||
// Force a minimum blur radius. This works out to be a
|
||||
// standard deviation of about 0.35px. Also force a maximum,
|
||||
// which limits spherical error to about 2 quanta at 10 bit
|
||||
// precision.
|
||||
scale = max(0.30f, min(scale, 2.0f));
|
||||
|
||||
// Determine a minimum radius for this image section.
|
||||
int radius = (int) min(ceilf(2.12132f / scale), 10.0f);
|
||||
if (den <= 0) radius = 0;
|
||||
minradius[radius] = radius;
|
||||
|
||||
// Go bottlenecked to compute the maximum radius in this block
|
||||
__syncthreads();
|
||||
if (threadIdx.y == 0) {
|
||||
int blt = __ballot(minradius[threadIdx.x]);
|
||||
minradius[0] = 31 - __clz(blt);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
radius = minradius[0];
|
||||
|
||||
for (int jj = -radius; jj <= radius; jj++) {
|
||||
float jjf = (jj - 0.5f) * scale;
|
||||
float jdiff = erff(jjf + scale) - erff(jjf);
|
||||
|
||||
for (int ii = -radius; ii <= radius; ii++) {
|
||||
float iif = (ii - 0.5f) * scale;
|
||||
float coeff = 0.25f * jdiff * (erff(iif + scale) - erff(iif));
|
||||
|
||||
int idx = si + FW * ii + jj;
|
||||
de_r[idx] += in.x * coeff;
|
||||
de_g[idx] += in.y * coeff;
|
||||
de_b[idx] += in.z * coeff;
|
||||
de_a[idx] += in.w * coeff;
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
// TODO: could coalesce this, but what a pain
|
||||
for (int i = threadIdx.x; i < FW; i += 32) {
|
||||
int out_idx = stride * imrow + blockIdx.x * 32 + i;
|
||||
int si = threadIdx.y * FW + i;
|
||||
float *out = reinterpret_cast<float*>(&outbuf[out_idx]);
|
||||
atomicAdd(out, de_r[si]);
|
||||
atomicAdd(out+1, de_g[si]);
|
||||
atomicAdd(out+2, de_b[si]);
|
||||
atomicAdd(out+3, de_a[si]);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
int tid = threadIdx.y * 32 + threadIdx.x;
|
||||
for (int i = tid; i < FW*(W2+W2); i += 1024) {
|
||||
de_r[i] = de_r[i+FW*32];
|
||||
de_g[i] = de_g[i+FW*32];
|
||||
de_b[i] = de_b[i+FW*32];
|
||||
de_a[i] = de_a[i+FW*32];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
for (int i = tid + FW*(W2+W2); i < FW2; i += 1024) {
|
||||
de_r[i] = 0.0f;
|
||||
de_g[i] = 0.0f;
|
||||
de_b[i] = 0.0f;
|
||||
de_a[i] = 0.0f;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
__global__
|
||||
void fma_buf(float4 *dst, const float4 *sub, int astride, float scale) {
|
||||
void fma_buf(float4 *dst, const float4 *src, int astride, float scale) {
|
||||
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
int i = y * astride + x;
|
||||
float4 d = dst[i], s = sub[i];
|
||||
float4 d = dst[i], s = src[i];
|
||||
d.x += s.x * scale;
|
||||
d.y += s.y * scale;
|
||||
d.z += s.z * scale;
|
||||
@ -288,125 +126,250 @@ void fma_buf(float4 *dst, const float4 *sub, int astride, float scale) {
|
||||
}
|
||||
|
||||
texture<float4, cudaTextureType2D> bilateral_src;
|
||||
texture<float, cudaTextureType2D> blur_src;
|
||||
|
||||
// Apply a Gaussian-esque blur to the density channel of the texture in
|
||||
// ``bilateral_src`` in the horizontal direction, and write it to ``dst``, a
|
||||
// one-channel buffer
|
||||
__global__ void blur_h(float *dst) {
|
||||
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
float den = 0.0f;
|
||||
den += tex2D(bilateral_src, x - 2, y).w * 0.00135f;
|
||||
den += tex2D(bilateral_src, x - 1, y).w * 0.1573f;
|
||||
den += tex2D(bilateral_src, x, y).w * 0.6827f;
|
||||
den += tex2D(bilateral_src, x + 1, y).w * 0.1573f;
|
||||
den += tex2D(bilateral_src, x + 2, y).w * 0.00135f;
|
||||
dst[y * (blockDim.x * gridDim.x) + x] = den;
|
||||
}
|
||||
|
||||
// As above, but with a one-channel texture as source
|
||||
__global__ void blur_h_1cp(float *dst) {
|
||||
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
float den = 0.0f;
|
||||
den += tex2D(blur_src, x - 2, y) * 0.00135f;
|
||||
den += tex2D(blur_src, x - 1, y) * 0.1573f;
|
||||
den += tex2D(blur_src, x, y) * 0.6827f;
|
||||
den += tex2D(blur_src, x + 1, y) * 0.1573f;
|
||||
den += tex2D(blur_src, x + 2, y) * 0.00135f;
|
||||
dst[y * (blockDim.x * gridDim.x) + x] = den;
|
||||
}
|
||||
|
||||
// As above, but in the vertical direction
|
||||
__global__ void blur_v(float *dst) {
|
||||
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
float den = 0.0f;
|
||||
den += tex2D(blur_src, x, y - 2) * 0.00135f;
|
||||
den += tex2D(blur_src, x, y - 1) * 0.1573f;
|
||||
den += tex2D(blur_src, x, y ) * 0.6827f;
|
||||
den += tex2D(blur_src, x, y + 1) * 0.1573f;
|
||||
den += tex2D(blur_src, x, y + 2) * 0.00135f;
|
||||
dst[y * (blockDim.x * gridDim.x) + x] = den;
|
||||
}
|
||||
|
||||
/* sstd: spatial standard deviation (Gaussian filter)
|
||||
* cstd: color standard deviation (Gaussian on the range [0, 1], where 1
|
||||
* represents an "opposite" color).
|
||||
*
|
||||
* Density is controlled by a power-of-two Gompertz distribution:
|
||||
* v = 1 - 2^(-sum^dpow * 2^((dhalfpt - x) * dspeed))
|
||||
*
|
||||
* dhalfpt: The difference in density values between two points at which the
|
||||
* filter admits 50% of the spatial and color kernels, when dpow
|
||||
* is 0. `3` seems to be a good fit for most images at decent
|
||||
* sampling levels.
|
||||
* dspeed: The sharpness of the filter's cutoff around dhalfpt. At `1`, the
|
||||
* filter admits 75% of a point that differs by one fewer than
|
||||
* `dhalfpt` density steps from the current point (when dpow is 0);
|
||||
* at `2`, it admits 93.75% of the same. `0.5` works pretty well.
|
||||
* dpow: The change of filter intensity as density scales. This should be
|
||||
* set automatically in response to changes in expected density per
|
||||
* cell.
|
||||
*/
|
||||
__global__
|
||||
void bilateral(float4 *dst, int astride, float dstd, float rstd) {
|
||||
int xi = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int yi = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
float x = xi, y = yi;
|
||||
void bilateral(float4 *dst, float sstd, float cstd,
|
||||
float dhalfpt, float dspeed, float dpow) {
|
||||
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
const float r = 9.0f;
|
||||
const int r = 7;
|
||||
|
||||
// Precalculate the spatial coeffecients.
|
||||
__shared__ float spa_coefs[32];
|
||||
if (threadIdx.y == 0) {
|
||||
float df = threadIdx.x;
|
||||
spa_coefs[threadIdx.x] = expf(df * df / (-M_SQRT2 * sstd));
|
||||
}
|
||||
|
||||
// 3.0f compensates for [0,3] range of `cdiff`
|
||||
float cscale = 1.0f / (-M_SQRT2 * 3.0f * cstd);
|
||||
|
||||
// Gather the center point, and pre-average the color values for easier
|
||||
// comparison.
|
||||
float4 cen = tex2D(bilateral_src, x, y);
|
||||
if (cen.w > 0.0f) {
|
||||
float cdrcp = 1.0f / cen.w;
|
||||
cen.x *= cdrcp;
|
||||
cen.y *= cdrcp;
|
||||
cen.z *= cdrcp;
|
||||
}
|
||||
|
||||
// Compute the Sobel directional derivative of a pre-blurred version of
|
||||
// the density channel at the center point.
|
||||
float nw = tex2D(blur_src, x - 1, y - 1);
|
||||
float ne = tex2D(blur_src, x + 1, y - 1);
|
||||
float sw = tex2D(blur_src, x - 1, y + 1);
|
||||
float se = tex2D(blur_src, x + 1, y + 1);
|
||||
float h = ne + se + 2 * tex2D(blur_src, x + 1, y)
|
||||
-(nw + sw + 2 * tex2D(blur_src, x - 1, y));
|
||||
float v = se + sw + 2 * tex2D(blur_src, x, y + 1)
|
||||
-(ne + nw + 2 * tex2D(blur_src, x, y - 1));
|
||||
|
||||
// TODO: figure out how to work `mag` in to scaling the degree to which
|
||||
// the directionality filter clamps
|
||||
float mag = sqrtf(h * h + v * v);
|
||||
|
||||
float cen_den = tex2D(bilateral_src, x, y).w;
|
||||
float4 out = make_float4(0, 0, 0, 0);
|
||||
float weightsum = 0.0f;
|
||||
|
||||
for (float i = -r; i <= r; i++) {
|
||||
for (float j = -r; j <= r; j++) {
|
||||
// Be extra-sure spatial coeffecients have been written
|
||||
__syncthreads();
|
||||
|
||||
for (int i = -r; i <= r; i++) {
|
||||
for (int j = -r; j <= r; j++) {
|
||||
float4 pix = tex2D(bilateral_src, x + i, y + j);
|
||||
float den_diff = log2f(fabsf(pix.w - cen_den) + 1.0f);
|
||||
float factor = expf(i * i * dstd)
|
||||
* expf(j * j * dstd)
|
||||
* expf(den_diff * den_diff * rstd);
|
||||
float cdiff = 0.5f;
|
||||
|
||||
if (pix.w > 0.0f && cen.w > 0.0f) {
|
||||
float pdrcp = 1.0f / pix.w;
|
||||
float yd = pix.x * pdrcp - cen.x;
|
||||
float ud = pix.y * pdrcp - cen.y;
|
||||
float vd = pix.z * pdrcp - cen.z;
|
||||
cdiff = yd * yd + ud * ud + vd * vd;
|
||||
}
|
||||
float ddiff = dspeed * (dhalfpt - fabsf(pix.w - cen.w) - 1.0f);
|
||||
float dsum = -powf(0.5f * (pix.w + cen.w + 1.0f), dpow);
|
||||
float dfact = 1.0f - exp2f(dsum * exp2f(ddiff));
|
||||
float angfact = (h * i + v * j) / (sqrtf(i*i + j*j) * mag + 1.0e-10f);
|
||||
|
||||
// Oh, this is ridiculous. But it works!
|
||||
float factor = spa_coefs[abs(i)] * spa_coefs[abs(j)]
|
||||
* expf(cscale * cdiff) * dfact
|
||||
* exp2f(2.0f * (angfact - 1.0f));
|
||||
|
||||
weightsum += factor;
|
||||
out.x += factor * pix.x;
|
||||
out.y += factor * pix.y;
|
||||
out.z += factor * pix.z;
|
||||
out.w += factor * pix.w;
|
||||
weightsum += factor;
|
||||
}
|
||||
}
|
||||
float weightrcp = 1.0f / weightsum;
|
||||
|
||||
float weightrcp = 1.0f / (weightsum + 1e-10f);
|
||||
out.x *= weightrcp;
|
||||
out.y *= weightrcp;
|
||||
out.z *= weightrcp;
|
||||
out.w *= weightrcp;
|
||||
|
||||
dst[yi * astride + xi] = out;
|
||||
}
|
||||
// Uncomment to write directional gradient using YUV colors
|
||||
/*
|
||||
out.x = mag;
|
||||
out.y = h;
|
||||
out.z = v;
|
||||
out.w = mag;
|
||||
*/
|
||||
|
||||
const int astride = blockDim.x * gridDim.x;
|
||||
dst[y * astride + x] = out;
|
||||
}
|
||||
'''
|
||||
|
||||
class Filtering(object):
|
||||
|
||||
class Filtering(HunkOCode):
|
||||
mod = None
|
||||
defs = _CODE
|
||||
|
||||
@classmethod
|
||||
def init_mod(cls):
|
||||
if cls.mod is None:
|
||||
cls.mod = pycuda.compiler.SourceModule(_CODE,
|
||||
cls.mod = pycuda.compiler.SourceModule(assemble_code(BaseCode, cls),
|
||||
options=['-use_fast_math', '-maxrregcount', '32'])
|
||||
|
||||
def __init__(self):
|
||||
self.init_mod()
|
||||
|
||||
def blur_density(self, ddst, dscratch, dim, stream=None):
|
||||
blur_h = self.mod.get_function("blur_density_h")
|
||||
blur_h(dscratch, ddst, i32(dim.astride), block=(192, 1, 1),
|
||||
grid=(dim.astride / 192, dim.ah), stream=stream)
|
||||
blur_v = self.mod.get_function("blur_density_v")
|
||||
blur_v(ddst, dscratch, i32(dim.astride), i32(dim.ah), block=(192, 1, 1),
|
||||
grid=(dim.astride / 192, dim.ah), stream=stream)
|
||||
def de(self, ddst, dsrc, dscratch, gnm, dim, tc, nxf, stream=None):
|
||||
# Helper variables and functions to keep it clean
|
||||
sb = 16 * dim.astride
|
||||
bs = sb * dim.ah
|
||||
bl, gr = (32, 8, 1), (dim.astride / 32, dim.ah / 8)
|
||||
|
||||
def de(self, ddst, dsrc, gnm, dim, tc, nxf, stream=None):
|
||||
def launch(f, *args, **kwargs):
|
||||
f(*args, block=bl, grid=gr, stream=stream, **kwargs)
|
||||
mkdsc = lambda c: argset(cuda.ArrayDescriptor(), height=dim.ah,
|
||||
width=dim.astride, num_channels=c,
|
||||
format=cuda.array_format.FLOAT)
|
||||
def mktref(n):
|
||||
tref = self.mod.get_texref(n)
|
||||
tref.set_filter_mode(cuda.filter_mode.POINT)
|
||||
tref.set_address_mode(0, cuda.address_mode.WRAP)
|
||||
tref.set_address_mode(1, cuda.address_mode.WRAP)
|
||||
return tref
|
||||
|
||||
dsc = mkdsc(4)
|
||||
tref = mktref('bilateral_src')
|
||||
grad_dsc = mkdsc(1)
|
||||
grad_tref = mktref('blur_src')
|
||||
|
||||
bilateral, blur_h, blur_h_1cp, blur_v, fma_buf = map(
|
||||
self.mod.get_function,
|
||||
'bilateral blur_h blur_h_1cp blur_v fma_buf'.split())
|
||||
ROUNDS = 2 # TODO: user customizable?
|
||||
|
||||
def do_bilateral(bsrc, bdst):
|
||||
tref.set_address_2d(bsrc, dsc, sb)
|
||||
launch(blur_h, np.intp(bdst), texrefs=[tref])
|
||||
grad_tref.set_address_2d(bdst, grad_dsc, sb / 4)
|
||||
launch(blur_v, dscratch, texrefs=[grad_tref])
|
||||
grad_tref.set_address_2d(dscratch, grad_dsc, sb / 4)
|
||||
launch(blur_h_1cp, np.intp(bdst), texrefs=[tref])
|
||||
grad_tref.set_address_2d(bdst, grad_dsc, sb / 4)
|
||||
launch(blur_v, dscratch, texrefs=[grad_tref])
|
||||
grad_tref.set_address_2d(dscratch, grad_dsc, sb / 4)
|
||||
launch(bilateral, np.intp(bdst), f32(4), f32(0.1),
|
||||
f32(5), f32(0.4), f32(0.6), texrefs=[tref, grad_tref])
|
||||
return bdst, bsrc
|
||||
|
||||
# Filter the first xform, using `ddst` as an intermediate buffer.
|
||||
# End result is the filtered copy in `dsrc`.
|
||||
a, b = dsrc, ddst
|
||||
for i in range(ROUNDS):
|
||||
a, b = do_bilateral(a, b)
|
||||
if ROUNDS % 2:
|
||||
cuda.memcpy_dtod_async(b, a, bs, stream)
|
||||
|
||||
# Filter the remaining xforms, using `ddst` as an intermediate
|
||||
# buffer, then add the result to `dsrc` (at the zero'th xform).
|
||||
for x in range(1, nxf):
|
||||
a, b = int(dsrc) + x * bs, ddst
|
||||
for i in range(ROUNDS):
|
||||
a, b = do_bilateral(a, b)
|
||||
launch(fma_buf, dsrc, np.intp(a), i32(dim.astride), f32(1))
|
||||
|
||||
# Log-scale the accumulated buffer in `dsrc`.
|
||||
k1 = f32(gnm.color.brightness(tc) * 268 / 256)
|
||||
# Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
|
||||
# s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
|
||||
area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w)
|
||||
k2 = f32(1.0 / (area * gnm.spp(tc)))
|
||||
|
||||
if gnm.de.radius(tc) == 0 or True:
|
||||
# Stride in bytes, and buffer size
|
||||
sb = 16 * dim.astride
|
||||
bs = sb * dim.ah
|
||||
|
||||
|
||||
dsc = argset(cuda.ArrayDescriptor(), height=dim.ah,
|
||||
width=dim.astride, format=cuda.array_format.FLOAT,
|
||||
num_channels=4)
|
||||
tref = self.mod.get_texref('bilateral_src')
|
||||
tref.set_filter_mode(cuda.filter_mode.POINT)
|
||||
tref.set_address_mode(0, cuda.address_mode.WRAP)
|
||||
tref.set_address_mode(1, cuda.address_mode.WRAP)
|
||||
|
||||
bilateral = self.mod.get_function('bilateral')
|
||||
fma_buf = self.mod.get_function('fma_buf')
|
||||
for i in range(nxf):
|
||||
tref.set_address_2d(int(dsrc) + bs * i, dsc, sb)
|
||||
bilateral(ddst, i32(dim.astride), f32(-0.1), f32(-0.1),
|
||||
block=(32, 8, 1), grid=(dim.astride / 32, dim.ah / 8),
|
||||
texrefs=[tref], stream=stream)
|
||||
if i == 0:
|
||||
cuda.memcpy_dtod_async(dsrc, ddst, bs, stream)
|
||||
else:
|
||||
fma_buf(dsrc, ddst, i32(dim.astride), f32(1.0),
|
||||
block=(32, 8, 1), grid=(dim.astride / 32, dim.ah / 8),
|
||||
stream=stream)
|
||||
|
||||
tref.set_address_2d(dsrc, dsc, sb)
|
||||
ROUNDS = 3
|
||||
|
||||
a, b = dsrc, ddst
|
||||
for i in range(ROUNDS):
|
||||
bilateral(b, i32(dim.astride), f32(-0.1), f32(-0.2),
|
||||
block=(32, 8, 1), grid=(dim.astride / 32, dim.ah / 8),
|
||||
texrefs=[tref], stream=stream)
|
||||
a, b = b, a
|
||||
|
||||
nbins = dim.ah * dim.astride
|
||||
fun = self.mod.get_function("logscale")
|
||||
# This function only looks at one pixel, so using the same
|
||||
# parameter for input and output is OK (if e.g. ROUNDS is odd)
|
||||
t = fun(a, ddst, k1, k2,
|
||||
block=(512, 1, 1), grid=(nbins/512, 1), stream=stream)
|
||||
else:
|
||||
scale_coeff = f32(1.0 / gnm.de.radius(tc))
|
||||
est_curve = f32(gnm.de.curve(tc))
|
||||
# TODO: experiment with this
|
||||
edge_clamp = f32(1.2)
|
||||
fun = self.mod.get_function("density_est")
|
||||
fun(ddst, dsrc, scale_coeff, est_curve, edge_clamp, k1, k2,
|
||||
i32(dim.ah), i32(dim.astride),
|
||||
block=(32, 32, 1), grid=(dim.aw/32, 1), stream=stream)
|
||||
nbins = dim.ah * dim.astride
|
||||
logscale = self.mod.get_function("logscale")
|
||||
t = logscale(ddst, dsrc, k1, k2,
|
||||
block=(512, 1, 1), grid=(nbins/512, 1), stream=stream)
|
||||
|
||||
def colorclip(self, dbuf, gnm, dim, tc, blend, stream=None):
|
||||
nbins = dim.ah * dim.astride
|
||||
|
@ -198,6 +198,10 @@ class Renderer(object):
|
||||
if self.acc_mode == 'atomic':
|
||||
d_atom = cuda.mem_alloc(8 * nbins * nxf)
|
||||
flush_fun = self.mod.get_function("flush_atom")
|
||||
else:
|
||||
# d_atom is also used as a scratch buffer during filtering, so we
|
||||
# need it at least this size
|
||||
d_atom = cuda.mem_alloc(4 * nbins)
|
||||
|
||||
obuf_copy = util.argset(cuda.Memcpy2D(),
|
||||
src_y=self.gutter, src_x_in_bytes=16*self.gutter,
|
||||
@ -350,7 +354,8 @@ class Renderer(object):
|
||||
|
||||
util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
|
||||
_sync_stream(filt_stream, write_stream)
|
||||
filt.de(d_out, d_accum, genome, dim, tc, nxf, stream=filt_stream)
|
||||
filt.de(d_out, d_accum, d_atom, genome, dim, tc, nxf,
|
||||
stream=filt_stream)
|
||||
_sync_stream(write_stream, filt_stream)
|
||||
filt.colorclip(d_out, genome, dim, tc, blend, stream=filt_stream)
|
||||
obuf_copy.set_dst_host(h_out_a)
|
||||
|
Loading…
Reference in New Issue
Block a user