mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 11:40:04 -05:00
Experimental supersampling and DE changes
This commit is contained in:
parent
11c729d370
commit
8c29212821
@ -7,8 +7,9 @@ from pycuda.gpuarray import vec
|
|||||||
|
|
||||||
from cuburn.code.util import *
|
from cuburn.code.util import *
|
||||||
|
|
||||||
_CODE = '''
|
_CODE = r'''
|
||||||
#include<math_constants.h>
|
#include<math_constants.h>
|
||||||
|
#include<stdio.h>
|
||||||
|
|
||||||
__global__
|
__global__
|
||||||
void colorclip(float4 *pixbuf, float gamma, float vibrance, float highpow,
|
void colorclip(float4 *pixbuf, float gamma, float vibrance, float highpow,
|
||||||
@ -87,22 +88,6 @@ void colorclip(float4 *pixbuf, float gamma, float vibrance, float highpow,
|
|||||||
pixbuf[i] = pix;
|
pixbuf[i] = pix;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#define W 15 // Filter width (regardless of standard deviation chosen)
|
|
||||||
#define W2 7 // Half of filter width, rounded down
|
|
||||||
#define FW 46 // Width of local result storage (NW+W2+W2)
|
|
||||||
#define FW2 (FW*FW)
|
|
||||||
|
|
||||||
__shared__ float de_r[FW2], de_g[FW2], de_b[FW2], de_a[FW2];
|
|
||||||
|
|
||||||
__device__ void de_add(int ibase, int ii, int jj, float4 scaled) {
|
|
||||||
int idx = ibase + FW * ii + jj;
|
|
||||||
atomicAdd(de_r+idx, scaled.x);
|
|
||||||
atomicAdd(de_g+idx, scaled.y);
|
|
||||||
atomicAdd(de_b+idx, scaled.z);
|
|
||||||
atomicAdd(de_a+idx, scaled.w);
|
|
||||||
}
|
|
||||||
|
|
||||||
__global__
|
__global__
|
||||||
void logscale(float4 *pixbuf, float4 *outbuf, float k1, float k2) {
|
void logscale(float4 *pixbuf, float4 *outbuf, float k1, float k2) {
|
||||||
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||||
@ -117,48 +102,82 @@ void logscale(float4 *pixbuf, float4 *outbuf, float k1, float k2) {
|
|||||||
outbuf[i] = pix;
|
outbuf[i] = pix;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// SS must be no greater than 4x
|
||||||
|
__global__
|
||||||
|
void blur_density_h(float *scratch, const float4 *pixbuf, int astride) {
|
||||||
|
// TODO: respect supersampling? configurable blur width?
|
||||||
|
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
if (x > astride) return;
|
||||||
|
int y = blockIdx.y;
|
||||||
|
const float *dsrc =
|
||||||
|
reinterpret_cast<const float*>(&pixbuf[astride * y]) + 3;
|
||||||
|
|
||||||
// See helpers/filt_err.py for source of these values.
|
float den = 0.0f;
|
||||||
#define MAX_SCALE -0.12f
|
|
||||||
#define MIN_SCALE -9.2103404f
|
den += dsrc[4*max(0, x-2)] * 0.00135f;
|
||||||
|
den += dsrc[4*max(0, x-1)] * 0.1573f;
|
||||||
|
den += dsrc[4*x] * 0.6827f;
|
||||||
|
den += dsrc[4*min(astride-1, x+1)] * 0.1573f;
|
||||||
|
den += dsrc[4*min(astride-1, x+2)] * 0.00135f;
|
||||||
|
scratch[astride * y + x] = den;
|
||||||
|
}
|
||||||
|
|
||||||
__global__
|
__global__
|
||||||
void density_est(float4 *pixbuf, float4 *outbuf,
|
void blur_density_v(float4 *pixbuf, const float *scratch,
|
||||||
|
int astride, int aheight) {
|
||||||
|
// TODO: respect supersampling? configurable blur width?
|
||||||
|
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
if (x > astride) return;
|
||||||
|
int y = blockIdx.y;
|
||||||
|
const float *dsrc = scratch + x;
|
||||||
|
|
||||||
|
float den = 0.0f;
|
||||||
|
|
||||||
|
den += dsrc[astride*max(0, y-2)] * 0.00135f;
|
||||||
|
den += dsrc[astride*max(0, y-1)] * 0.1573f;
|
||||||
|
den += dsrc[astride*y] * 0.6827f;
|
||||||
|
den += dsrc[astride*min(astride-1, y+1)] * 0.1573f;
|
||||||
|
den += dsrc[astride*min(astride-1, y+2)] * 0.00135f;
|
||||||
|
|
||||||
|
float *ddst = reinterpret_cast<float*>(&pixbuf[astride * y + x]) + 3;
|
||||||
|
*ddst = den;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define W 15 // Filter width (regardless of standard deviation chosen)
|
||||||
|
#define W2 7 // Half of filter width, rounded down
|
||||||
|
#define FW 46 // Width of local result storage (NW+W2+W2)
|
||||||
|
#define FW2 (FW*FW)
|
||||||
|
|
||||||
|
__global__
|
||||||
|
void density_est(float4 *outbuf, const float4 *pixbuf,
|
||||||
float scale_coeff, float est_curve, float edge_clamp,
|
float scale_coeff, float est_curve, float edge_clamp,
|
||||||
float k1, float k2, int height, int stride) {
|
float k1, float k2, int height, int stride, int ss) {
|
||||||
for (int i = threadIdx.x + 32*threadIdx.y; i < FW2; i += 32)
|
__shared__ float de_r[FW2], de_g[FW2], de_b[FW2], de_a[FW2];
|
||||||
|
|
||||||
|
for (int i = threadIdx.x + 32*threadIdx.y; i < FW2; i += 1024)
|
||||||
de_r[i] = de_g[i] = de_b[i] = de_a[i] = 0.0f;
|
de_r[i] = de_g[i] = de_b[i] = de_a[i] = 0.0f;
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
for (int imrow = threadIdx.y + W2; imrow < (height - W2); imrow += 32)
|
for (int imrow = threadIdx.y; imrow < height; imrow += 32) {
|
||||||
{
|
for (int ssx = 0; ssx < ss; ssx++) {
|
||||||
int idx = stride * imrow + blockIdx.x * 32 + threadIdx.x + W2;
|
float ssxo = (0.5f + ssx) / ss;
|
||||||
|
for (int ssy = 0; ssy < ss; ssy++) {
|
||||||
|
float ssyo = (0.5f + ssy) / ss;
|
||||||
|
|
||||||
float4 in = pixbuf[idx];
|
int col = blockIdx.x * 32 + threadIdx.x;
|
||||||
|
int in_idx = ss * (stride * (imrow * ss + ssy) + col) + ssx;
|
||||||
|
float4 in = pixbuf[in_idx];
|
||||||
|
// TODO: compute maximum filter radius needed across all warps
|
||||||
|
// to limit rounds in advance
|
||||||
float den = in.w;
|
float den = in.w;
|
||||||
|
|
||||||
if (den > 0) {
|
|
||||||
|
|
||||||
// Compute a fast and dirty approximation of a "gradient" using
|
|
||||||
// a [[-1 0 0][0 0 0][0 0 1]]/4 matrix (and its reflection)
|
|
||||||
// for angled edge detection, and limit blurring in those regions
|
|
||||||
// to both provide a bit of smoothing and prevent irregular
|
|
||||||
// bleed-out along gradients close to the image grid.
|
|
||||||
//
|
|
||||||
// For such a simple operator - particularly one whose entire
|
|
||||||
// justification is "it feels right" - it gives very good results
|
|
||||||
// over a wide range of images without any per-flame
|
|
||||||
// parameter tuning. In rare cases, color clamping and extreme
|
|
||||||
// palette changes can cause aliasing to reappear after the DE
|
|
||||||
// step; the only way to fix that is through a color-buffer AA
|
|
||||||
// like MLAA.
|
|
||||||
float *dens = reinterpret_cast<float*>(pixbuf);
|
|
||||||
int didx = idx * 4 + 3;
|
|
||||||
float x = dens[didx+stride*4+4] - dens[didx-stride*4-4];
|
|
||||||
float y = dens[didx+stride*4-4] - dens[didx-stride*4+4];
|
|
||||||
float diag_mag = sqrtf(x*x + y*y) * 0.3333333f;
|
|
||||||
|
|
||||||
float ls = k1 * logf(1.0f + in.w * k2) / in.w;
|
float ls = k1 * logf(1.0f + in.w * k2) / in.w;
|
||||||
|
|
||||||
|
// The synchronization model used now prevents us from
|
||||||
|
// cutting early in the case of a zero point, so we just carry
|
||||||
|
// it along with us here
|
||||||
|
if (den <= 0) ls = 0.0f;
|
||||||
|
|
||||||
in.x *= ls;
|
in.x *= ls;
|
||||||
in.y *= ls;
|
in.y *= ls;
|
||||||
in.z *= ls;
|
in.z *= ls;
|
||||||
@ -170,76 +189,23 @@ void density_est(float4 *pixbuf, float4 *outbuf,
|
|||||||
// Calculate scaling coefficient for the Gaussian kernel. This
|
// Calculate scaling coefficient for the Gaussian kernel. This
|
||||||
// does not match with a normal Gaussian; it just fits with
|
// does not match with a normal Gaussian; it just fits with
|
||||||
// flam3's implementation.
|
// flam3's implementation.
|
||||||
float scale = powf(den, est_curve) * scale_coeff;
|
float scale = powf(den * ss, est_curve) * scale_coeff;
|
||||||
|
scale = min(scale, 1.41421f);
|
||||||
|
|
||||||
// If the gradient scale is smaller than the minimum scale, we're
|
for (int jj = -W2; jj < W2; jj++) {
|
||||||
// probably on a strong edge; blur slightly.
|
float jjf = (jj - ssxo) * scale;
|
||||||
if (diag_mag > den * edge_clamp) {
|
float jdiff = erff(jjf + scale) - erff(jjf);
|
||||||
scale = -2.0f;
|
|
||||||
// Uncomment to see which pixels are being clamped
|
|
||||||
// de_g[si] = 1.0f;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Below a certain threshold, only one coeffecient would be
|
for (int ii = -W2; ii < W2; ii++) {
|
||||||
// retained anyway; we hop right to it.
|
float iif = (ii - ssyo) * scale;
|
||||||
if (scale <= MIN_SCALE) {
|
float coeff = 0.25f * jdiff * (erff(iif + scale) - erff(iif));
|
||||||
de_add(si, 0, 0, in);
|
|
||||||
} else {
|
|
||||||
// These polynomials approximates the reciprocal of the sum of
|
|
||||||
// all retained filter coefficients. See helpers/filt_err.py.
|
|
||||||
float filtsum;
|
|
||||||
if (scale < -1.1f) {
|
|
||||||
filtsum = 5.20066078e-06f;
|
|
||||||
filtsum = filtsum * scale + 2.14025771e-04f;
|
|
||||||
filtsum = filtsum * scale + 3.62761668e-03f;
|
|
||||||
filtsum = filtsum * scale + 3.21970172e-02f;
|
|
||||||
filtsum = filtsum * scale + 1.54297248e-01f;
|
|
||||||
filtsum = filtsum * scale + 3.42210710e-01f;
|
|
||||||
filtsum = filtsum * scale + 3.06015890e-02f;
|
|
||||||
filtsum = filtsum * scale + 1.33724615e-01f;
|
|
||||||
} else {
|
|
||||||
filtsum = -1.23516649e-01f;
|
|
||||||
filtsum = filtsum * scale + -5.14862895e-01f;
|
|
||||||
filtsum = filtsum * scale + -8.61198902e-01f;
|
|
||||||
filtsum = filtsum * scale + -7.41916001e-01f;
|
|
||||||
filtsum = filtsum * scale + -3.51667106e-01f;
|
|
||||||
filtsum = filtsum * scale + -9.07439440e-02f;
|
|
||||||
filtsum = filtsum * scale + -3.30008656e-01f;
|
|
||||||
filtsum = filtsum * scale + -4.78249392e-04f;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int jj = 0; jj <= W2; jj++) {
|
|
||||||
float jj2f = jj;
|
|
||||||
jj2f *= jj2f;
|
|
||||||
|
|
||||||
float iif = 0;
|
|
||||||
for (int ii = 0; ii <= jj; ii++) {
|
|
||||||
float coeff = expf((jj2f + iif * iif) * scale)
|
|
||||||
* filtsum;
|
|
||||||
if (coeff < 0.0001f) break;
|
|
||||||
iif += 1;
|
|
||||||
|
|
||||||
float4 scaled;
|
|
||||||
scaled.x = in.x * coeff;
|
|
||||||
scaled.y = in.y * coeff;
|
|
||||||
scaled.z = in.z * coeff;
|
|
||||||
scaled.w = in.w * coeff;
|
|
||||||
|
|
||||||
de_add(si, ii, jj, scaled);
|
|
||||||
if (jj == 0) continue;
|
|
||||||
de_add(si, ii, -jj, scaled);
|
|
||||||
if (ii != 0) {
|
|
||||||
de_add(si, -ii, jj, scaled);
|
|
||||||
de_add(si, -ii, -jj, scaled);
|
|
||||||
if (ii == jj) continue;
|
|
||||||
}
|
|
||||||
de_add(si, jj, ii, scaled);
|
|
||||||
de_add(si, -jj, ii, scaled);
|
|
||||||
|
|
||||||
if (ii == 0) continue;
|
|
||||||
de_add(si, -jj, -ii, scaled);
|
|
||||||
de_add(si, jj, -ii, scaled);
|
|
||||||
|
|
||||||
|
int idx = si + FW * ii + jj;
|
||||||
|
de_r[idx] += in.x * coeff;
|
||||||
|
de_g[idx] += in.y * coeff;
|
||||||
|
de_b[idx] += in.z * coeff;
|
||||||
|
de_a[idx] += in.w * coeff;
|
||||||
|
__syncthreads();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -248,9 +214,9 @@ void density_est(float4 *pixbuf, float4 *outbuf,
|
|||||||
__syncthreads();
|
__syncthreads();
|
||||||
// TODO: could coalesce this, but what a pain
|
// TODO: could coalesce this, but what a pain
|
||||||
for (int i = threadIdx.x; i < FW; i += 32) {
|
for (int i = threadIdx.x; i < FW; i += 32) {
|
||||||
idx = stride * imrow + blockIdx.x * 32 + i + W2;
|
int out_idx = stride * imrow + blockIdx.x * 32 + i + W2;
|
||||||
int si = threadIdx.y * FW + i;
|
int si = threadIdx.y * FW + i;
|
||||||
float *out = reinterpret_cast<float*>(&outbuf[idx]);
|
float *out = reinterpret_cast<float*>(&outbuf[out_idx]);
|
||||||
atomicAdd(out, de_r[si]);
|
atomicAdd(out, de_r[si]);
|
||||||
atomicAdd(out+1, de_g[si]);
|
atomicAdd(out+1, de_g[si]);
|
||||||
atomicAdd(out+2, de_b[si]);
|
atomicAdd(out+2, de_b[si]);
|
||||||
@ -259,7 +225,7 @@ void density_est(float4 *pixbuf, float4 *outbuf,
|
|||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
int tid = threadIdx.y * 32 + threadIdx.x;
|
int tid = threadIdx.y * 32 + threadIdx.x;
|
||||||
for (int i = tid; i < FW*(W2+W2); i += 512) {
|
for (int i = tid; i < FW*(W2+W2); i += 1024) {
|
||||||
de_r[i] = de_r[i+FW*32];
|
de_r[i] = de_r[i+FW*32];
|
||||||
de_g[i] = de_g[i+FW*32];
|
de_g[i] = de_g[i+FW*32];
|
||||||
de_b[i] = de_b[i+FW*32];
|
de_b[i] = de_b[i+FW*32];
|
||||||
@ -267,7 +233,7 @@ void density_est(float4 *pixbuf, float4 *outbuf,
|
|||||||
}
|
}
|
||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
for (int i = tid + FW*(W2+W2); i < FW2; i += 512) {
|
for (int i = tid + FW*(W2+W2); i < FW2; i += 1024) {
|
||||||
de_r[i] = 0.0f;
|
de_r[i] = 0.0f;
|
||||||
de_g[i] = 0.0f;
|
de_g[i] = 0.0f;
|
||||||
de_b[i] = 0.0f;
|
de_b[i] = 0.0f;
|
||||||
@ -291,12 +257,20 @@ class Filtering(object):
|
|||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.init_mod()
|
self.init_mod()
|
||||||
|
|
||||||
|
def blur_density(self, ddst, dscratch, dim, stream=None):
|
||||||
|
blur_h = self.mod.get_function("blur_density_h")
|
||||||
|
blur_h(dscratch, ddst, i32(dim.astride), block=(192, 1, 1),
|
||||||
|
grid=(dim.astride / 192, dim.ah), stream=stream)
|
||||||
|
blur_v = self.mod.get_function("blur_density_v")
|
||||||
|
blur_v(ddst, dscratch, i32(dim.astride), i32(dim.ah), block=(192, 1, 1),
|
||||||
|
grid=(dim.astride / 192, dim.ah), stream=stream)
|
||||||
|
|
||||||
def de(self, ddst, dsrc, gnm, dim, tc, stream=None):
|
def de(self, ddst, dsrc, gnm, dim, tc, stream=None):
|
||||||
k1 = f32(gnm.color.brightness(tc) * 268 / 256)
|
k1 = f32(gnm.color.brightness(tc) * 268 / 256)
|
||||||
# Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
|
# Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
|
||||||
# s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
|
# s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
|
||||||
area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w)
|
area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w)
|
||||||
k2 = f32(1 / (area * gnm.spp(tc)))
|
k2 = f32(1.0 / (area * gnm.spp(tc)))
|
||||||
|
|
||||||
if gnm.de.radius == 0:
|
if gnm.de.radius == 0:
|
||||||
nbins = dim.ah * dim.astride
|
nbins = dim.ah * dim.astride
|
||||||
@ -304,17 +278,17 @@ class Filtering(object):
|
|||||||
t = fun(dsrc, ddst, k1, k2,
|
t = fun(dsrc, ddst, k1, k2,
|
||||||
block=(512, 1, 1), grid=(nbins/512, 1), stream=stream)
|
block=(512, 1, 1), grid=(nbins/512, 1), stream=stream)
|
||||||
else:
|
else:
|
||||||
scale_coeff = f32(-(1 + gnm.de.radius(tc)) ** -2.0)
|
scale_coeff = f32(1.0 / gnm.de.radius(tc))
|
||||||
est_curve = f32(2 * gnm.de.curve(tc))
|
est_curve = f32(gnm.de.curve(tc))
|
||||||
# TODO: experiment with this
|
# TODO: experiment with this
|
||||||
edge_clamp = f32(1.2)
|
edge_clamp = f32(1.2)
|
||||||
fun = self.mod.get_function("density_est")
|
fun = self.mod.get_function("density_est")
|
||||||
fun(dsrc, ddst, scale_coeff, est_curve, edge_clamp, k1, k2,
|
fun(ddst, dsrc, scale_coeff, est_curve, edge_clamp, k1, k2,
|
||||||
i32(dim.ah), i32(dim.astride), block=(32, 32, 1),
|
i32(dim.ah / dim.ss), i32(dim.astride / dim.ss), i32(dim.ss),
|
||||||
grid=(dim.aw/32, 1), stream=stream)
|
block=(32, 32, 1), grid=(dim.aw/(32*dim.ss), 1), stream=stream)
|
||||||
|
|
||||||
def colorclip(self, dbuf, gnm, dim, tc, blend, stream=None):
|
def colorclip(self, dbuf, gnm, dim, tc, blend, stream=None):
|
||||||
nbins = dim.ah * dim.astride
|
nbins = dim.ah * dim.astride / dim.ss ** 2
|
||||||
|
|
||||||
# TODO: implement integration over cubic splines?
|
# TODO: implement integration over cubic splines?
|
||||||
gam = f32(1 / gnm.color.gamma(tc))
|
gam = f32(1 / gnm.color.gamma(tc))
|
||||||
@ -332,4 +306,3 @@ class Filtering(object):
|
|||||||
color_fun(dbuf, gam, vib, hipow, lin, lingam, bkgd, i32(nbins),
|
color_fun(dbuf, gam, vib, hipow, lin, lingam, bkgd, i32(nbins),
|
||||||
i32(blend), block=(256, 1, 1), grid=(blocks, blocks),
|
i32(blend), block=(256, 1, 1), grid=(blocks, blocks),
|
||||||
stream=stream)
|
stream=stream)
|
||||||
|
|
||||||
|
@ -68,7 +68,7 @@ def precalc_camera(pcam):
|
|||||||
float rot = {{pre_cam.rotation}} * M_PI / 180.0f;
|
float rot = {{pre_cam.rotation}} * M_PI / 180.0f;
|
||||||
float rotsin = sin(rot), rotcos = cos(rot);
|
float rotsin = sin(rot), rotcos = cos(rot);
|
||||||
float cenx = {{pre_cam.center.x}}, ceny = {{pre_cam.center.y}};
|
float cenx = {{pre_cam.center.x}}, ceny = {{pre_cam.center.y}};
|
||||||
float scale = {{pre_cam.scale}} * acc_size.width;
|
float scale = {{pre_cam.scale}} * acc_size.width * acc_size.ss;
|
||||||
|
|
||||||
{{pre_cam._set('xx')}} = scale * rotcos;
|
{{pre_cam._set('xx')}} = scale * rotcos;
|
||||||
{{pre_cam._set('xy')}} = scale * -rotsin;
|
{{pre_cam._set('xy')}} = scale * -rotsin;
|
||||||
@ -126,6 +126,7 @@ typedef struct {
|
|||||||
uint32_t awidth;
|
uint32_t awidth;
|
||||||
uint32_t aheight;
|
uint32_t aheight;
|
||||||
uint32_t astride;
|
uint32_t astride;
|
||||||
|
uint32_t ss;
|
||||||
} acc_size_t;
|
} acc_size_t;
|
||||||
__constant__ acc_size_t acc_size;
|
__constant__ acc_size_t acc_size;
|
||||||
|
|
||||||
@ -203,7 +204,7 @@ void iter(
|
|||||||
mwc_st rctx = msts[this_rb_idx];
|
mwc_st rctx = msts[this_rb_idx];
|
||||||
|
|
||||||
{{precalc_camera(pcp.camera)}}
|
{{precalc_camera(pcp.camera)}}
|
||||||
if (threadIdx.y == 5 && threadIdx.x == 4) {
|
if (acc_size.ss == 1 && threadIdx.y == 5 && threadIdx.x == 4) {
|
||||||
float ditherwidth = {{pcp.camera.dither_width}} * 0.33f;
|
float ditherwidth = {{pcp.camera.dither_width}} * 0.33f;
|
||||||
float u0 = mwc_next_01(rctx);
|
float u0 = mwc_next_01(rctx);
|
||||||
float r = ditherwidth * sqrtf(-2.0f * log2f(u0) / M_LOG2E);
|
float r = ditherwidth * sqrtf(-2.0f * log2f(u0) / M_LOG2E);
|
||||||
|
@ -20,7 +20,7 @@ from cuburn import affine
|
|||||||
from cuburn.code import util, mwc, iter, interp, filtering, sort
|
from cuburn.code import util, mwc, iter, interp, filtering, sort
|
||||||
|
|
||||||
RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time')
|
RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time')
|
||||||
Dimensions = namedtuple('Dimensions', 'w h aw ah astride')
|
Dimensions = namedtuple('Dimensions', 'w h aw ah astride ss')
|
||||||
|
|
||||||
def _sync_stream(dst, src):
|
def _sync_stream(dst, src):
|
||||||
dst.wait_for_event(cuda.Event(cuda.event_flags.DISABLE_TIMING).record(src))
|
dst.wait_for_event(cuda.Event(cuda.event_flags.DISABLE_TIMING).record(src))
|
||||||
@ -138,7 +138,7 @@ class Renderer(object):
|
|||||||
next(r)
|
next(r)
|
||||||
return ifilter(None, imap(r.send, chain(times, [None])))
|
return ifilter(None, imap(r.send, chain(times, [None])))
|
||||||
|
|
||||||
def render_gen(self, genome, width, height, blend=True):
|
def render_gen(self, genome, width, height, ss=1, blend=True):
|
||||||
"""
|
"""
|
||||||
Render frames. This method is wrapped by the ``render()`` method; see
|
Render frames. This method is wrapped by the ``render()`` method; see
|
||||||
its docstring for warnings and details.
|
its docstring for warnings and details.
|
||||||
@ -182,24 +182,25 @@ class Renderer(object):
|
|||||||
event_a = cuda.Event().record(filt_stream)
|
event_a = cuda.Event().record(filt_stream)
|
||||||
event_b = None
|
event_b = None
|
||||||
|
|
||||||
awidth = width + 2 * self.gutter
|
owidth = width + 2 * self.gutter
|
||||||
aheight = height + 2 * self.gutter
|
oheight = height + 2 * self.gutter
|
||||||
astride = 32 * int(np.ceil(awidth / 32.))
|
ostride = 32 * int(np.ceil(owidth / 32.))
|
||||||
dim = Dimensions(width, height, awidth, aheight, astride)
|
awidth, aheight, astride = owidth * ss, oheight * ss, ostride * ss
|
||||||
|
dim = Dimensions(width, height, awidth, aheight, astride, ss)
|
||||||
d_acc_size = self.mod.get_global('acc_size')[0]
|
d_acc_size = self.mod.get_global('acc_size')[0]
|
||||||
cuda.memcpy_htod_async(d_acc_size, u32(list(dim)), write_stream)
|
cuda.memcpy_htod_async(d_acc_size, u32(list(dim)), write_stream)
|
||||||
|
|
||||||
nbins = awidth * aheight
|
nbins = awidth * aheight
|
||||||
# Extra padding in accum helps with write_shmem overruns
|
# Extra padding in accum helps with write_shmem overruns
|
||||||
d_accum = cuda.mem_alloc(16 * nbins + (1<<16))
|
d_accum = cuda.mem_alloc(16 * nbins + (1<<16))
|
||||||
d_out = cuda.mem_alloc(16 * nbins)
|
d_out = cuda.mem_alloc(16 * oheight * ostride)
|
||||||
if self.acc_mode == 'atomic':
|
if self.acc_mode == 'atomic':
|
||||||
d_atom = cuda.mem_alloc(8 * nbins)
|
d_atom = cuda.mem_alloc(8 * nbins)
|
||||||
flush_fun = self.mod.get_function("flush_atom")
|
flush_fun = self.mod.get_function("flush_atom")
|
||||||
|
|
||||||
obuf_copy = argset(cuda.Memcpy2D(),
|
obuf_copy = argset(cuda.Memcpy2D(),
|
||||||
src_y=self.gutter, src_x_in_bytes=16*self.gutter,
|
src_y=self.gutter, src_x_in_bytes=16*self.gutter,
|
||||||
src_pitch=16*astride, dst_pitch=16*width,
|
src_pitch=16*ostride, dst_pitch=16*width,
|
||||||
width_in_bytes=16*width, height=height)
|
width_in_bytes=16*width, height=height)
|
||||||
obuf_copy.set_src_device(d_out)
|
obuf_copy.set_src_device(d_out)
|
||||||
h_out_a = cuda.pagelocked_empty((height, width, 4), f32)
|
h_out_a = cuda.pagelocked_empty((height, width, 4), f32)
|
||||||
@ -343,8 +344,10 @@ class Renderer(object):
|
|||||||
block=(512, 1, 1), grid=(nblocks, nblocks),
|
block=(512, 1, 1), grid=(nblocks, nblocks),
|
||||||
stream=iter_stream)
|
stream=iter_stream)
|
||||||
|
|
||||||
util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
|
|
||||||
_sync_stream(filt_stream, write_stream)
|
_sync_stream(filt_stream, write_stream)
|
||||||
|
filt.blur_density(d_accum, d_out, dim, stream=filt_stream)
|
||||||
|
util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins / ss ** 2,
|
||||||
|
filt_stream)
|
||||||
filt.de(d_out, d_accum, genome, dim, tc, stream=filt_stream)
|
filt.de(d_out, d_accum, genome, dim, tc, stream=filt_stream)
|
||||||
_sync_stream(write_stream, filt_stream)
|
_sync_stream(write_stream, filt_stream)
|
||||||
filt.colorclip(d_out, genome, dim, tc, blend, stream=filt_stream)
|
filt.colorclip(d_out, genome, dim, tc, blend, stream=filt_stream)
|
||||||
|
Loading…
Reference in New Issue
Block a user