mirror of https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 11:40:04 -05:00

commit 60a45c9a20 (parent 4cfd328f85)
Sweeping refactor. More bugs undoubtedly remain.
@@ -1,9 +0,0 @@
"""
Contains the PTX fragments which will drive the device, and helper functions
to combine those fragments.
"""

import util
import mwc
import iter
cuburn/code/color.py (new file, 77 lines)
@@ -0,0 +1,77 @@
import numpy as np

from util import devlib

# The JPEG YUV full-range matrix, without bias into the positive regime.
# This assumes input color space is CIERGB D65, encoded with gamma 2.2.
# Note that some interpolated colors may exceed the sRGB and YUV gamuts.
YUV_MATRIX = np.matrix([[ 0.299,     0.587,     0.114],
                        [-0.168736, -0.331264,  0.5],
                        [ 0.5,      -0.418688, -0.081312]])

yuvlib = devlib(decls='''
__device__ float3 rgb2yuv(float3 rgb);
__device__ float3 yuv2rgb(float3 yuv);
''', defs=r'''
/* This conversion uses the JPEG full-range standard. Note that UV have range
 * [-0.5, 0.5], so consider biasing the results. */
__device__ float3 rgb2yuv(float3 rgb) {
    return make_float3(
         0.299f    * rgb.x + 0.587f    * rgb.y + 0.114f    * rgb.z,
        -0.168736f * rgb.x - 0.331264f * rgb.y + 0.5f      * rgb.z,
         0.5f      * rgb.x - 0.418688f * rgb.y - 0.081312f * rgb.z);
}

__device__ float3 yuv2rgb(float3 yuv) {
    return make_float3(
        yuv.x + 1.402f * yuv.z,
        yuv.x - 0.34414f * yuv.y - 0.71414f * yuv.z,
        yuv.x + 1.772f * yuv.y);
}
''')

hsvlib = devlib(decls='''
__device__ float3 rgb2hsv(float3 rgb);
__device__ float3 hsv2rgb(float3 hsv);
''', defs=r'''
__device__ float3 rgb2hsv(float3 rgb) {
    float M = fmaxf(fmaxf(rgb.x, rgb.y), rgb.z);
    float m = fminf(fminf(rgb.x, rgb.y), rgb.z);
    float C = M - m;

    float s = M > 0.0f ? C / M : 0.0f;

    float h = 0.0f;
    if (s != 0.0f) {
        C = 1.0f / C;
        float rc = (M - rgb.x) * C;
        float gc = (M - rgb.y) * C;
        float bc = (M - rgb.z) * C;

        if      (rgb.x == M) h = bc - gc;
        else if (rgb.y == M) h = 2 + rc - bc;
        else                 h = 4 + gc - rc;

        if (h < 0) h += 6.0f;
    }
    return make_float3(h, s, M);
}

__device__ float3 hsv2rgb(float3 hsv) {
    float whole = floorf(hsv.x);
    float frac = hsv.x - whole;
    float val = hsv.z;
    float min = val * (1 - hsv.y);
    float mid = val * (1 - (hsv.y * frac));
    float alt = val * (1 - (hsv.y * (1 - frac)));

    float3 out;
    if      (whole == 0.0f) { out.x = val; out.y = alt; out.z = min; }
    else if (whole == 1.0f) { out.x = mid; out.y = val; out.z = min; }
    else if (whole == 2.0f) { out.x = min; out.y = val; out.z = alt; }
    else if (whole == 3.0f) { out.x = min; out.y = mid; out.z = val; }
    else if (whole == 4.0f) { out.x = alt; out.y = min; out.z = val; }
    else                    { out.x = val; out.y = min; out.z = mid; }
    return out;
}
''')
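For reference, a quick host-side check (not part of this commit) that the matrix above and the coefficients inlined in the kernels describe the same transform:

import numpy as np

YUV = np.array([[ 0.299,     0.587,     0.114],
                [-0.168736, -0.331264,  0.5],
                [ 0.5,      -0.418688, -0.081312]])

rgb = np.array([0.25, 0.5, 0.75])
yuv = np.dot(YUV, rgb)                    # rgb2yuv, same constants
back = np.dot(np.linalg.inv(YUV), yuv)    # exact inverse
assert np.allclose(back, rgb)
# The kernel's yuv2rgb hard-codes the rounded inverse coefficients
# (1.402, 0.34414, 0.71414, 1.772), so on the device the round trip
# agrees only to roughly 1e-4.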
@@ -1,13 +1,7 @@
import numpy as np
from numpy import float32 as f32, int32 as i32
from util import devlib
from color import yuvlib

import pycuda.driver as cuda
import pycuda.compiler
from pycuda.gpuarray import vec

from cuburn.code.util import *

_CODE = r'''
texshearlib = devlib(defs=r'''
// Filter directions specified in degrees, using image/texture addressing
// [(0,0) is upper left corner, 90 degrees is down].

@@ -22,7 +16,9 @@ __constant__ float2 addressing_patterns[16] = {
    { 1.0f, -0.333333f}, { 0.333333f,  1.0f}, // 14, 15: -15, 75
};

// Mon dieu! A C++ feature?
// Mon dieu! A C++ feature? Gotta close the "extern C" added by the compiler.
}

template <typename T> __device__ T
tex_shear(texture<T, cudaTextureType2D> ref, int pattern,
          float x, float y, float radius) {
@@ -39,100 +35,14 @@ tex_shear(texture<T, cudaTextureType2D> ref, int pattern,
}

extern "C" {
''')
__global__
void colorclip(float4 *pixbuf, float gamma, float vibrance, float highpow,
               float linrange, float lingam, float3 bkgd,
               int fbsize, int blend_background_color) {
    int i = threadIdx.x + blockDim.x * (blockIdx.x + gridDim.x * blockIdx.y);
    if (i >= fbsize) return;

    float4 pix = pixbuf[i];

    if (pix.w <= 0) {
        pixbuf[i] = make_float4(bkgd.x, bkgd.y, bkgd.z, 0.0f);
        return;
    }
    pix.y -= 0.5f * pix.w;
    pix.z -= 0.5f * pix.w;
    float3 tmp = yuv2rgb(make_float3(pix.x, pix.y, pix.z));
    pix.x = tmp.x;
    pix.y = tmp.y;
    pix.z = tmp.z;

    pix.x = fmaxf(0.0f, pix.x);
    pix.y = fmaxf(0.0f, pix.y);
    pix.z = fmaxf(0.0f, pix.z);

    float4 opix = pix;

    float alpha = powf(pix.w, gamma);
    if (pix.w < linrange) {
        float frac = pix.w / linrange;
        alpha = (1.0f - frac) * pix.w * lingam + frac * alpha;
    }

    if (!blend_background_color) {
        float ls = alpha / pix.w;
        pix.x *= ls;
        pix.y *= ls;
        pix.z *= ls;
        pix.w = alpha;
        pixbuf[i] = pix;
        return;
    }

    float ls = vibrance * alpha / pix.w;
    alpha = fminf(1.0f, fmaxf(0.0f, alpha));

    float maxc = fmaxf(pix.x, fmaxf(pix.y, pix.z));
    float maxa = maxc * ls;
    float newls = 1.0f / maxc;

    if (maxa > 1.0f && highpow >= 0.0f) {
        float lsratio = powf(newls / ls, highpow);
        pix.x *= newls;
        pix.y *= newls;
        pix.z *= newls;

        // Reduce saturation (according to the HSV model) by proportionally
        // increasing the values of the other colors.
        pix.x = maxc - (maxc - pix.x) * lsratio;
        pix.y = maxc - (maxc - pix.y) * lsratio;
        pix.z = maxc - (maxc - pix.z) * lsratio;
    } else {
        float adjhlp = -highpow;
        if (adjhlp > 1.0f || maxa <= 1.0f) adjhlp = 1.0f;
        if (maxc > 0.0f) {
            float adj = ((1.0f - adjhlp) * newls + adjhlp * ls);
            pix.x *= adj;
            pix.y *= adj;
            pix.z *= adj;
        }
    }

    pix.x += (1.0f - vibrance) * powf(opix.x, gamma);
    pix.y += (1.0f - vibrance) * powf(opix.y, gamma);
    pix.z += (1.0f - vibrance) * powf(opix.z, gamma);

    pix.x += (1.0f - alpha) * bkgd.x;
    pix.y += (1.0f - alpha) * bkgd.y;
    pix.z += (1.0f - alpha) * bkgd.z;

    pix.x = fminf(1.0f, pix.x);
    pix.y = fminf(1.0f, pix.y);
    pix.z = fminf(1.0f, pix.z);
    pix.w = alpha;

    pixbuf[i] = pix;
}

__global__
void logscale(float4 *outbuf, const float4 *pixbuf, float k1, float k2) {
logscalelib = devlib(defs=r'''
__global__ void
logscale(float4 *outbuf, const float4 *pixbuf, float k1, float k2) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    float4 pix = pixbuf[i];

    // float ls = fmaxf(0, k1 * logf(1.0f + pix.w * pix.w * k2 / (1 + pix.w)) / pix.w);
    float ls = fmaxf(0, k1 * logf(1.0f + pix.w * k2) / pix.w);
    pix.x *= ls;
    pix.y *= ls;
@@ -141,10 +51,12 @@ void logscale(float4 *outbuf, const float4 *pixbuf, float k1, float k2) {

    outbuf[i] = pix;
}
''')
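The commented-out line preserves an alternative curve; the active one is the usual flame log-density mapping. A host-side sketch of the scale factor's behavior (illustrative only):

from math import log

def logscale_ls(w, k1, k2):
    # per-bin scale factor for the accumulated color; w is the density
    # (hit count) accumulated in that bin
    if w <= 0:
        return 0.0
    return max(0.0, k1 * log(1.0 + w * k2) / w)

# With k1 = k2 = 1: densities 1, 10, 100, 1000 scale by roughly
# 0.693, 0.240, 0.046, 0.007, compressing the attractor's huge dynamic
# range into something displayable.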
fmabuflib = devlib(defs=r'''
// Element-wise computation of ``dst[i]=dst[i]+src[i]*scale``.
__global__
void fma_buf(float4 *dst, const float4 *src, int astride, float scale) {
__global__ void
fma_buf(float4 *dst, const float4 *src, int astride, float scale) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int i = y * astride + x;
@@ -155,23 +67,16 @@ void fma_buf(float4 *dst, const float4 *src, int astride, float scale) {
    d.w += s.w * scale;
    dst[i] = d;
}
''')

texture<float4, cudaTextureType2D> bilateral_src;
denblurlib = devlib(decls='''
texture<float, cudaTextureType2D> blur_src;

__global__ void logscale_den(float *dst, float k2) {
    int xi = blockIdx.x * blockDim.x + threadIdx.x;
    int yi = blockIdx.y * blockDim.y + threadIdx.y;
    float4 pix = tex2D(bilateral_src, xi, yi);
    float out = logf(1.0f + pix.w * k2);
    dst[yi * (blockDim.x * gridDim.x) + xi] = out;
}

__constant__ float gauss_coefs[9] = {
    0.00443305f, 0.05400558f, 0.24203623f, 0.39905028f,
    0.24203623f, 0.05400558f, 0.00443305f
};
''', defs=r'''
// Apply a Gaussian-esque blur to the density channel of the texture in
// ``bilateral_src`` in the horizontal direction, and write it to ``dst``, a
// one-channel buffer.
@@ -203,7 +108,11 @@ __global__ void den_blur_1c(float *dst, int pattern, int upsample) {
            * gauss_coefs[i];
    dst[yi * (blockDim.x * gridDim.x) + xi] = den;
}
''')
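The seven coefficients form a normalized Gaussian window (they sum to 1), so one horizontal pass and one vertical pass give a cheap separable 2-D blur. A host-side sketch of the same convolution (NumPy only, not part of this commit):

import numpy as np

gauss = np.float32([0.00443305, 0.05400558, 0.24203623, 0.39905028,
                    0.24203623, 0.05400558, 0.00443305])
assert abs(gauss.sum() - 1.0) < 1e-6

def blur_1d(den, axis):
    # same math as den_blur, minus the texture addressing and shear;
    # np.roll wraps at the edges, loosely matching WRAP address mode
    out = np.zeros_like(den)
    for i, c in enumerate(gauss):
        out += c * np.roll(den, i - 3, axis=axis)
    return out

den = np.random.rand(64, 64).astype(np.float32)
blurred = blur_1d(blur_1d(den, 0), 1)   # two 1-D passes = separable 2-D blur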
bilaterallib = devlib(deps=[logscalelib, texshearlib, denblurlib], decls='''
texture<float4, cudaTextureType2D> bilateral_src;
''', defs=r'''
/* sstd: spatial standard deviation (Gaussian filter)
 * cstd: color standard deviation (Gaussian on the range [0, 1], where 1
 *       represents an "opposite" color).
@@ -215,9 +124,8 @@ __global__ void den_blur_1c(float *dst, int pattern, int upsample) {
 * gspeed: Speed of (exp2f) Gompertz distribution governing how much to
 *         tighten gradients. Zero and negative values OK.
 */
__global__
void bilateral(float4 *dst, int pattern, int radius,
__global__ void
bilateral(float4 *dst, int pattern, int radius,
          float sstd, float cstd, float dstd, float dpow, float gspeed)
{
    int xi = blockIdx.x * blockDim.x + threadIdx.x;
@@ -317,124 +225,83 @@ void bilateral(float4 *dst, int pattern, int radius,
    const int astride = blockDim.x * gridDim.x;
    dst[yi * astride + xi] = out;
}
''')
} // end extern "C"
'''
colorcliplib = devlib(deps=[yuvlib], defs=r'''
__global__ void
colorclip(float4 *pixbuf, float gamma, float vibrance, float highpow,
          float linrange, float lingam, float3 bkgd, int fbsize)
{
    int i = threadIdx.x + blockDim.x * (blockIdx.x + gridDim.x * blockIdx.y);
    if (i >= fbsize) return;

class Filtering(HunkOCode):
    mod = None
    defs = _CODE
    float4 pix = pixbuf[i];

    @classmethod
    def init_mod(cls):
        if cls.mod is None:
            cls.mod = pycuda.compiler.SourceModule(assemble_code(BaseCode, cls),
                    options=['-use_fast_math', '-maxrregcount', '32'],
                    no_extern_c=True)
    if (pix.w <= 0) {
        pixbuf[i] = make_float4(bkgd.x, bkgd.y, bkgd.z, 0.0f);
        return;
    }
    pix.y -= 0.5f * pix.w;
    pix.z -= 0.5f * pix.w;
    float3 tmp = yuv2rgb(make_float3(pix.x, pix.y, pix.z));
    pix.x = tmp.x;
    pix.y = tmp.y;
    pix.z = tmp.z;

    def __init__(self):
        self.init_mod()
    pix.x = fmaxf(0.0f, pix.x);
    pix.y = fmaxf(0.0f, pix.y);
    pix.z = fmaxf(0.0f, pix.z);

    def de(self, ddst, dsrc, dscratch, gnm, dim, tc, nxf, stream=None):
    float4 opix = pix;

        # Helper variables and functions to keep it clean
        sb = 16 * dim.astride
        bs = sb * dim.ah
        bl, gr = (32, 8, 1), (dim.astride / 32, dim.ah / 8)
    float alpha = powf(pix.w, gamma);
    if (pix.w < linrange) {
        float frac = pix.w / linrange;
        alpha = (1.0f - frac) * pix.w * lingam + frac * alpha;
    }

        def launch(f, *args, **kwargs):
            f(*args, block=bl, grid=gr, stream=stream, **kwargs)
        mkdsc = lambda c: argset(cuda.ArrayDescriptor(), height=dim.ah,
                                 width=dim.astride, num_channels=c,
                                 format=cuda.array_format.FLOAT)
        def mktref(n):
            tref = self.mod.get_texref(n)
            tref.set_filter_mode(cuda.filter_mode.POINT)
            tref.set_address_mode(0, cuda.address_mode.WRAP)
            tref.set_address_mode(1, cuda.address_mode.WRAP)
            return tref
    float ls = vibrance * alpha / pix.w;
    alpha = fminf(1.0f, fmaxf(0.0f, alpha));

        dsc = mkdsc(4)
        tref = mktref('bilateral_src')
        grad_dsc = mkdsc(1)
        grad_tref = mktref('blur_src')
    float maxc = fmaxf(pix.x, fmaxf(pix.y, pix.z));
    float maxa = maxc * ls;
    float newls = 1.0f / maxc;

        bilateral, logscale_den, den_blur, den_blur_1c, fma_buf = map(
            self.mod.get_function,
            'bilateral logscale_den den_blur den_blur_1c fma_buf'.split())
    if (maxa > 1.0f && highpow >= 0.0f) {
        float lsratio = powf(newls / ls, highpow);
        pix.x *= newls;
        pix.y *= newls;
        pix.z *= newls;

        # Number of different shear patterns to use when filtering. Must be
        # even, since we depend on buffer bouncing (but I don't think that it's
        # a requirement for the filter itself to get decent results).
        DIRECTIONS = 8
        // Reduce saturation (according to the HSV model) by proportionally
        // increasing the values of the other colors.
        pix.x = maxc - (maxc - pix.x) * lsratio;
        pix.y = maxc - (maxc - pix.y) * lsratio;
        pix.z = maxc - (maxc - pix.z) * lsratio;
    } else {
        float adjhlp = -highpow;
        if (adjhlp > 1.0f || maxa <= 1.0f) adjhlp = 1.0f;
        if (maxc > 0.0f) {
            float adj = ((1.0f - adjhlp) * newls + adjhlp * ls);
            pix.x *= adj;
            pix.y *= adj;
            pix.z *= adj;
        }
    }

        def do_bilateral(bsrc, bdst, pattern, r=15, sstd=6, cstd=0.05,
                         dstd=1.5, dpow=0.8, gspeed=4.0):
            # Scale spatial parameter so that a "pixel" is equivalent to an
            # actual pixel at 1080p
            sstd *= dim.w / 1920.
    pix.x += (1.0f - vibrance) * powf(opix.x, gamma);
    pix.y += (1.0f - vibrance) * powf(opix.y, gamma);
    pix.z += (1.0f - vibrance) * powf(opix.z, gamma);

            tref.set_address_2d(bsrc, dsc, sb)
    pix.x += (1.0f - alpha) * bkgd.x;
    pix.y += (1.0f - alpha) * bkgd.y;
    pix.z += (1.0f - alpha) * bkgd.z;

            # Blur density two octaves along sampling vector, ultimately
            # storing in `dscratch`
            launch(den_blur, np.intp(bdst), i32(pattern), i32(0),
                   texrefs=[tref])
            grad_tref.set_address_2d(bdst, grad_dsc, sb / 4)
            launch(den_blur_1c, dscratch, i32(pattern), i32(1),
                   texrefs=[grad_tref])
            grad_tref.set_address_2d(dscratch, grad_dsc, sb / 4)
    pix.x = fminf(1.0f, pix.x);
    pix.y = fminf(1.0f, pix.y);
    pix.z = fminf(1.0f, pix.z);
    pix.w = alpha;

            launch(bilateral, np.intp(bdst), i32(pattern), i32(r),
                   f32(sstd), f32(cstd), f32(dstd), f32(dpow), f32(gspeed),
                   texrefs=[tref, grad_tref])

        def do_bilateral_range(bsrc, bdst, npats, *args, **kwargs):
            for i in range(npats):
                do_bilateral(bsrc, bdst, i, *args, **kwargs)
                bdst, bsrc = bsrc, bdst
            if npats % 2:
                cuda.memcpy_dtod_async(bdst, bsrc, bs, stream)

        # Filter the first xform, using `ddst` as an intermediate buffer.
        # End result is the filtered copy in `dsrc`.
        do_bilateral_range(dsrc, ddst, DIRECTIONS)

        # Filter the remaining xforms, using `ddst` as an intermediate
        # buffer, then add the result to `dsrc` (at the zero'th xform).
        for x in range(1, nxf):
            src = int(dsrc) + x * bs
            do_bilateral_range(src, ddst, DIRECTIONS)
            launch(fma_buf, dsrc, np.intp(src), i32(dim.astride), f32(1))

        # Log-scale the accumulated buffer in `dsrc`.
        k1 = f32(gnm.color.brightness(tc) * 268 / 256)
        # Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
        # s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
        area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w)
        k2 = f32(1.0 / (area * gnm.spp(tc)))

        nbins = dim.ah * dim.astride
        logscale = self.mod.get_function("logscale")
        t = logscale(ddst, dsrc, k1, k2,
                     block=(512, 1, 1), grid=(nbins/512, 1), stream=stream)

    def colorclip(self, dbuf, gnm, dim, tc, blend, stream=None):
        nbins = dim.ah * dim.astride

        # TODO: implement integration over cubic splines?
        gam = f32(1 / gnm.color.gamma(tc))
        vib = f32(gnm.color.vibrance(tc))
        hipow = f32(gnm.color.highlight_power(tc))
        lin = f32(gnm.color.gamma_threshold(tc))
        lingam = f32(lin ** (gam-1.0) if lin > 0 else 0)
        bkgd = vec.make_float3(
            gnm.color.background.r(tc),
            gnm.color.background.g(tc),
            gnm.color.background.b(tc))

        color_fun = self.mod.get_function("colorclip")
        blocks = int(np.ceil(np.sqrt(nbins / 256)))
        color_fun(dbuf, gam, vib, hipow, lin, lingam, bkgd, i32(nbins),
                  i32(blend), block=(256, 1, 1), grid=(blocks, blocks),
                  stream=stream)
    pixbuf[i] = pix;
}
''')
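After the refactor, a host module no longer subclasses HunkOCode; it assembles the devlibs it needs and compiles them directly. A minimal sketch of driving colorcliplib (not part of this commit; 'filt' is an assumed module name for this file, and the parameter values are placeholders):

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.gpuarray import vec

from util import assemble_code, compile, launch
from filt import colorcliplib

mod = cuda.module_from_buffer(compile('colorclip', assemble_code(colorcliplib)))

nbins = 1024 * 1024
dbuf = cuda.mem_alloc(16 * nbins)                 # one float4 per bin
bkgd = vec.make_float3(0.0, 0.0, 0.0)
blocks = int(np.ceil(np.sqrt(nbins / 256.)))
# args: pixbuf, gamma, vibrance, highpow, linrange, lingam, bkgd, fbsize
launch('colorclip', mod, None, 256, (blocks, blocks),
       dbuf, np.float32(1.0 / 2.2), np.float32(1.0), np.float32(1.0),
       np.float32(0.01), np.float32(0.1), bkgd, np.int32(nbins))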
@@ -2,8 +2,10 @@ from collections import OrderedDict
from itertools import cycle
import numpy as np

import tempita
from util import HunkOCode, Template, BaseCode, assemble_code
import util
from util import Template, assemble_code, devlib, binsearchlib, ringbuflib
from color import yuvlib
from mwc import mwclib

class GenomePackerName(str):
    """Class to indicate that a property is precalculated on the device"""
@@ -115,17 +117,10 @@ class GenomePackerPrecalc(GenomePackerView):
    def _code(self, code):
        self.packer.precalc_code.append(code)

class GenomePacker(HunkOCode):
class GenomePacker(object):
    """
    Packs a genome for use in iteration.
    """

    # 2^search_rounds is the maximum number of control points, including
    # endpoints, that can be used in a single genome. It should be okay to
    # change this number without touching other code, but 32 samples fits
    # nicely on a single cache line.
    search_rounds = 5

    def __init__(self, tname):
        """
        Create a new DataPacker.
@@ -152,6 +147,7 @@ class GenomePacker(HunkOCode):

        self.packed = None
        self.genome = None
        self.search_rounds = util.DEFAULT_SEARCH_ROUNDS

    def __len__(self):
        assert self._len is not None, 'len() called before finalize()'
@@ -189,42 +185,41 @@ class GenomePacker(HunkOCode):

        self._len = len(self.packed)

        self.decls = self._decls.substitute(
            packed=self.packed, tname=self.tname,
            search_rounds=self.search_rounds)
        self.defs = self._defs.substitute(
        decls = self._decls.substitute(packed=self.packed, tname=self.tname)
        defs = self._defs.substitute(
            packed_direct=self.packed_direct, tname=self.tname,
            precalc_code=self.precalc_code,
            search_rounds=self.search_rounds)

        return devlib(deps=[catmullromlib], decls=decls, defs=defs)

    def pack(self):
    def pack(self, pool=None):
        """
        Return a packed copy of the genome ready for uploading to the GPU as a
        3D NDArray, with the first element being the times and the second
        being the values.
        Return a packed copy of the genome ready for uploading to the GPU,
        as two float32 NDArrays for the knot times and values.
        """
        width = 1 << self.search_rounds
        out = np.empty((2, len(self.genome), width), dtype=np.float32)
        # Ensure that unused values at the top are always big (must be >2.0)
        out[0].fill(1e9)
        if pool:
            times = pool.allocate((len(self.genome), width), 'f4')
            knots = pool.allocate((len(self.genome), width), 'f4')
        else:
            times, knots = np.empty((2, len(self.genome), width), 'f4')
        times.fill(1e9)

        for idx, gname in enumerate(self.genome):
            attr = self.ns[gname[0]]
            for g in gname[1:]:
                attr = getattr(attr, g)
            out[0][idx][:len(attr.knots[0])] = attr.knots[0]
            out[1][idx][:len(attr.knots[1])] = attr.knots[1]
        return out
            times[idx,:len(attr.knots[0])] = attr.knots[0]
            knots[idx,:len(attr.knots[1])] = attr.knots[1]
        return times, knots

    _defs = Template(r"""

__global__
void interp_{{tname}}(
__global__ void interp_{{tname}}(
        {{tname}}* out,
        const float *times, const float *knots,
        float tstart, float tstep, int maxid
) {
        float tstart, float tstep, int maxid)
{
    int id = gtid();
    if (id >= maxid) return;
    out = &out[id];
@@ -249,7 +244,6 @@ void interp_{{tname}}(
    }
    {{endfor}}
}

""")

    _decls = Template(r"""
@@ -259,22 +253,13 @@ typedef struct {
{{endfor}}
} {{tname}};

/* Search through the fixed-size list 'hay' to find the rightmost index which
 * contains a value strictly smaller than the input 'needle'. The crazy
 * bitwise search is just for my own amusement.
 */
__device__
int bitwise_binsearch(const float *hay, float needle) {
    int lo = 0;

    // TODO: improve efficiency on 64-bit arches
    {{for i in range(search_rounds-1, -1, -1)}}
    if (needle > hay[lo + {{1 << i}}])
        lo += {{1 << i}};
    {{endfor}}
    return lo;
}
""")

catmullromlib = devlib(deps=[binsearchlib], decls=r'''
__device__ __noinline__
float catmull_rom(const float *times, const float *knots, float t);
''', defs=r'''
__device__ __noinline__
float catmull_rom(const float *times, const float *knots, float t) {
    int idx = bitwise_binsearch(times, t);
@@ -303,67 +288,14 @@ float catmull_rom(const float *times, const float *knots, float t) {
           + m2 * (   ttt -        tt)
           + k2 * (-2.0f*ttt + 3.0f*tt);
}
''')
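The kernel's full body is elided in this hunk; for orientation, a host-side reference of the same spline family (an assumed tangent convention, consistent with the visible m2/k2 terms above, which use the standard cubic Hermite basis with tt = t^2 and ttt = t^3):

import numpy as np

def catmull_rom_ref(times, knots, t):
    # Nonuniform Catmull-Rom evaluation; requires one knot on each side
    # of the active segment, as GenomePacker guarantees.
    i = np.searchsorted(times, t) - 1     # plays the role of bitwise_binsearch
    t0, t1, t2, t3 = times[i-1:i+3]
    k0, k1, k2, k3 = knots[i-1:i+3]
    tt = (t - t1) / (t2 - t1)             # normalized position in [t1, t2]
    # finite-difference tangents, scaled to the segment width
    m1 = (t2 - t1) * (k2 - k0) / (t2 - t0)
    m2 = (t2 - t1) * (k3 - k1) / (t3 - t1)
    tt2, tt3 = tt * tt, tt * tt * tt
    return (k1 * ( 2.0 * tt3 - 3.0 * tt2 + 1.0)
          + m1 * (       tt3 - 2.0 * tt2 + tt)
          + m2 * (       tt3 -       tt2)
          + k2 * (-2.0 * tt3 + 3.0 * tt2))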
__global__
void test_cr(const float *times, const float *knots, const float *t, float *r) {
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    r[i] = catmull_rom(times, knots, t[i]);
}
""")

class Palette(HunkOCode):
    # The JPEG YUV full-range matrix, without bias into the positive regime.
    # This assumes input color space is CIERGB D65, encoded with gamma 2.2.
    # Note that some interpolated colors may exceed the sRGB and YUV gamuts.
    YUV = np.matrix([[ 0.299,     0.587,     0.114],
                     [-0.168736, -0.331264,  0.5],
                     [ 0.5,      -0.418688, -0.081312]])

    def __init__(self, interp_mode):
        assert interp_mode in self.modes
        self.mode = interp_mode
        self.defs = self._defs.substitute(mode=interp_mode)

    def prepare(self, palettes):
        """
        Produce palettes suitable for uploading to the device. Returns an
        array of palettes in the same size and shape as the input.

        This function will never modify its argument, but may return it
        unmodified for certain interpolation modes.
        """
        if self.mode == 'yuvpolar':
            ys, uvrs, uvts, alphas = zip(*map(self.rgbtoyuvpolar, palettes))
            # Center all medians as closely to 0 as possible
            means = np.mean(uvts, axis=1)
            newmeans = (means + np.pi) % (2 * np.pi) - np.pi
            uvts = (newmeans - means).reshape((-1, 1)) + uvts
            zipped = zip(ys, uvrs, uvts, alphas)
            return np.array(zipped, dtype='f4').transpose((0, 2, 1))
        return palettes

    @classmethod
    def rgbtoyuvpolar(cls, pal):
        # TODO: premultiply alpha or some nonsense like that?
        y, u, v = np.array(cls.YUV * pal.T[:3])
        uvr = np.hypot(u, v)
        uvt = np.unwrap(np.arctan2(v, u))
        return y, uvr, uvt, pal.T[3]

    @classmethod
    def yuvpolartorgb(cls, y, uvr, uvt, a):
        u = uvr * np.cos(uvt)
        v = uvr * np.sin(uvt)
        r, g, b = np.array(cls.YUV.I * np.array([y, u, v]))
        # Ensure Fortran order so that the memory gets laid out correctly
        return np.array([r, g, b, a], order='F').T

    modes = ['hsv', 'yuv', 'yuvpolar']
    decls = "surface<void, cudaSurfaceType2D> flatpal;\n"

    _defs = Template(r"""
__device__
float4 interp_color(const float *times, const float4 *sources, float time) {
palintlib = devlib(deps=[binsearchlib, ringbuflib, yuvlib, mwclib], decls='''
surface<void, cudaSurfaceType2D> flatpal;
''', defs=r'''
__device__ float4
interp_color(const float *times, const float4 *sources, float time)
{
    int idx = fmaxf(bitwise_binsearch(times, time) + 1, 1);
    float lf = (times[idx] - time) / (times[idx] - times[idx-1]);
    float rf = 1.0f - lf;
@@ -375,42 +307,11 @@ float4 interp_color(const float *times, const float4 *sources, float time) {
    float3 l3 = make_float3(left.x, left.y, left.z);
    float3 r3 = make_float3(right.x, right.y, right.z);

{{if mode == 'hsv'}}
    float3 lhsv = rgb2hsv(l3);
    float3 rhsv = rgb2hsv(r3);

    if (fabs(lhsv.x - rhsv.x) > 3.0f)
        if (lhsv.x < rhsv.x)
            lhsv.x += 6.0f;
        else
            rhsv.x += 6.0f;

    float3 hsv;
    hsv.x = lhsv.x * lf + rhsv.x * rf;
    hsv.y = lhsv.y * lf + rhsv.y * rf;
    hsv.z = lhsv.z * lf + rhsv.z * rf;

    if (hsv.x > 6.0f)
        hsv.x -= 6.0f;
    if (hsv.x < 0.0f)
        hsv.x += 6.0f;

    yuv = rgb2yuv(hsv2rgb(hsv));
{{elif mode.startswith('yuv')}}
    {{if mode == 'yuv'}}
    float3 lyuv = rgb2yuv(l3);
    float3 ryuv = rgb2yuv(r3);
    yuv.x = lyuv.x * lf + ryuv.x * rf;
    yuv.y = lyuv.y * lf + ryuv.y * rf;
    yuv.z = lyuv.z * lf + ryuv.z * rf;
    {{elif mode == 'yuvpolar'}}
    yuv.x = l3.x * lf + r3.x * rf;
    float radius = l3.y * lf + r3.y * rf;
    float angle = l3.z * lf + r3.z * rf;
    yuv.y = radius * cosf(angle);
    yuv.z = radius * sinf(angle);
    {{endif}}
{{endif}}

    yuv.y += 0.5f;
    yuv.z += 0.5f;
@@ -418,28 +319,13 @@ float4 interp_color(const float *times, const float4 *sources, float time) {
    return make_float4(yuv.x, yuv.y, yuv.z, left.w * lf + right.w * rf);
}

__global__
void interp_palette(uchar4 *outs,
__global__ void interp_palette_flat(
        ringbuf *rb, mwc_st *rctxs,
        const float *times, const float4 *sources,
        float tstart, float tstep) {
    float time = tstart + blockIdx.x * tstep;
    float4 yuva = interp_color(times, sources, time);

    uchar4 out;
    out.x = yuva.x * 255.0f;
    out.y = yuva.y * 255.0f;
    out.z = yuva.z * 255.0f;
    out.w = yuva.w * 255.0f;
    outs[blockDim.x * blockIdx.x + threadIdx.x] = out;
}

__global__
void interp_palette_flat(mwc_st *rctxs,
        const float *times, const float4 *sources,
        float tstart, float tstep) {

        float tstart, float tstep)
{
    mwc_st rctx = rctxs[rb_incr(rb->head, threadIdx.x)];
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    mwc_st rctx = rctxs[gid];

    float time = tstart + blockIdx.x * tstep;
    float4 yuva = interp_color(times, sources, time);
@@ -447,17 +333,30 @@ void interp_palette_flat(mwc_st *rctxs,
    // TODO: pack Y at full precision, UV at quarter
    uint2 out;

    uint32_t y = min(255, (uint32_t) (yuva.x * 255.0f + 0.49f * mwc_next_11(rctx)));
    uint32_t u = min(255, (uint32_t) (yuva.y * 255.0f + 0.49f * mwc_next_11(rctx)));
    uint32_t v = min(255, (uint32_t) (yuva.z * 255.0f + 0.49f * mwc_next_11(rctx)));
    uint32_t y = yuva.x * 255.0f + 0.49f * mwc_next_11(rctx);
    uint32_t u = yuva.y * 255.0f + 0.49f * mwc_next_11(rctx);
    uint32_t v = yuva.z * 255.0f + 0.49f * mwc_next_11(rctx);
    y = min(255, y);
    u = min(255, u);
    v = min(255, v);
    out.y = (1 << 22) | (y << 4);
    out.x = (u << 18) | v;

    surf2Dwrite(out, flatpal, 8 * threadIdx.x, blockIdx.x);
    rctxs[gid] = rctx;
    rctxs[rb_incr(rb->tail, threadIdx.x)] = rctx;
}
""")
''')
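interp_palette_flat packs each dithered palette entry into a uint2 before writing it to the flatpal surface; the layout leaves headroom above each 8-bit channel so entries can later be accumulated in place (see write_shmem in iter.py). A plain-Python view of the packing:

def pack_flatpal(y, u, v):
    # y, u, v are the dithered values clamped to [0, 255] above
    hi = (1 << 22) | (y << 4)   # density bias in the top bits; Y starts at
                                # bit 4 with 18 bits of accumulation headroom
    lo = (u << 18) | v          # U starts at bit 18, V at bit 0, each with
                                # headroom; U's carries spill into hi's low bits
    return hi, lo               # stored as out.y and out.x respectively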
testcrlib = devlib(defs=r'''
__global__ void
test_cr(const float *times, const float *knots, const float *t, float *r) {
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    r[i] = catmull_rom(times, knots, t[i]);
}
''')

if __name__ == "__main__":
    # Test spline evaluation. This code will probably drift pretty often.
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule
    import pycuda.autoinit
@@ -2,15 +2,16 @@
The main iteration loop.
"""

from cuburn.code import mwc, variations, interp
from cuburn.code.util import *
import variations
import interp
from util import Template, devlib, ringbuflib
from mwc import mwclib

def precalc_densities(pcp, std_xforms):
    # This pattern recurs a few times for precalc segments. Unfortunately,
    # namespace stuff means it's not easy to functionalize this boilerplate
    pre_cp = pcp._precalc()
    pre_cp._code(Template(r"""

        float sum = 0.0f;

        {{for n in std_xforms}}
@@ -30,7 +31,6 @@ def precalc_densities(pcp, std_xforms):
def precalc_chaos(pcp, std_xforms):
    pre_cp = pcp._precalc()
    pre_cp._code(Template("""

        float sum, rsum;

        {{for p in std_xforms}}
@@ -50,7 +50,6 @@ def precalc_chaos(pcp, std_xforms):
        {{endfor}}

        {{endfor}}

    """, name='precalc_chaos').substitute(locals()))

def precalc_camera(pcam):
@@ -64,7 +63,6 @@ def precalc_camera(pcam):
    # . matrix([X],[Y],[1]);

    pre_cam._code(Template(r"""

        float rot = {{pre_cam.rotation}} * M_PI / 180.0f;
        float rotsin = sin(rot), rotcos = cos(rot);
        float cenx = {{pre_cam.center.x}}, ceny = {{pre_cam.center.y}};
@@ -79,13 +77,11 @@ def precalc_camera(pcam):
        {{pre_cam._set('yy')}} = scale * rotcos;
        {{pre_cam._set('yo')}} = scale * -(rotsin * cenx + rotcos * ceny)
                               + 0.5f * acc_size.aheight;

    """).substitute(locals()))
    """, 'precalc_camera').substitute(locals()))
def precalc_xf_affine(px):
    pre = px._precalc()
    pre._code(Template(r"""

        float pri = {{pre.angle}} * M_PI / 180.0f;
        float spr = {{pre.spread}} * M_PI / 180.0f;

@@ -98,27 +94,22 @@ def precalc_xf_affine(px):
        {{pre._set('yy')}} = magy * sin(pri+spr);
        {{pre._set('xo')}} = {{pre.offset.x}};
        {{pre._set('yo')}} = -{{pre.offset.y}};
    """, 'precalc_xf_affine').substitute(locals()))

    """).substitute(locals()))
def apply_affine(x, y, xo, yo, packer):
    return Template("""
    {{xo}} = {{packer.xx}} * {{x}} + {{packer.xy}} * {{y}} + {{packer.xo}};
    {{yo}} = {{packer.yx}} * {{x}} + {{packer.yy}} * {{y}} + {{packer.yo}};
    """, 'apply_affine').substitute(locals())
class IterCode(HunkOCode):
    # The number of threads per block
    NTHREADS = 256
# The number of threads per block used in the iteration function. Don't change
# it lightly; the code may depend on it in unusual ways.
NTHREADS = 256

    def __init__(self, info, genome):
        self.packer = interp.GenomePacker('iter_params')
        self.pcp = self.packer.view('params', genome, 'cp')

        iterbody = self._iterbody(info, genome)
        bodies = [self._xfbody(i,x) for i,x in sorted(genome.xforms.items())]
        bodies.append(iterbody)
        self.defs = '\n'.join(bodies)

    decls = """
iter_decls = """
// Note: for normalized lookups, uchar4 actually returns floats
texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> palTex;
__shared__ iter_params params;
__device__ int rb_head, rb_tail, rb_size;

typedef struct {
    uint32_t width;
@@ -128,12 +119,9 @@ typedef struct {
    uint32_t astride;
} acc_size_t;
__constant__ acc_size_t acc_size;

"""

    def _xfbody(self, xfid, xform):
        px = self.pcp.xforms[xfid]
        tmpl = Template(r"""
iter_xf_body_code = r"""
__device__
void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
    float tx, ty;
@@ -162,44 +150,30 @@ void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
    float csp = {{px.color_speed}};
    color = color * (1.0f - csp) + {{px.color}} * csp;
};
""", 'apply_xf_'+xfid)
"""

def iter_xf_body(pcp, xfid, xform):
    px = pcp.xforms[xfid]
    tmpl = Template(iter_xf_body_code, 'apply_xf_'+xfid)

    g = dict(globals())
    g.update(locals())
    return tmpl.substitute(g)

    def _iterbody(self, info, genome):
        tmpl = Template(r'''

__global__ void reset_rb(int size) {
    rb_head = rb_tail = 0;
    rb_size = size;
}

__global__
void iter(
        uint64_t out_ptr,
        mwc_st *msts,
        float4 *points,
        const iter_params *all_params
{{if info.acc_mode == 'atomic'}}
        , uint64_t atom_ptr
{{endif}}
) {
    const iter_params *global_params = &(all_params[blockIdx.x]);

iter_body_code = r'''
__global__ void
iter(uint64_t out_ptr, uint64_t atom_ptr,
     ringbuf *rb, mwc_st *msts, float4 *points,
     const iter_params *all_params)
{
    // load params to shared memory cooperatively
    const iter_params *global_params = &(all_params[blockIdx.x]);
    for (int i = threadIdx.y * blockDim.x + threadIdx.x;
         i < (sizeof(iter_params) / 4); i += blockDim.x * blockDim.y)
        reinterpret_cast<float*>(&params)[i] =
            reinterpret_cast<const float*>(global_params)[i];

    __shared__ int rb_idx;
    if (threadIdx.y == 1 && threadIdx.x == 1)
        rb_idx = 32 * blockDim.y * (atomicAdd(&rb_head, 1) % rb_size);

    __syncthreads();
    int this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;
    int this_rb_idx = rb_incr(rb->head, blockDim.x * threadIdx.y + threadIdx.x);
    mwc_st rctx = msts[this_rb_idx];

    {{precalc_camera(pcp.camera)}}
@@ -209,22 +183,15 @@ void iter(
        {{pcp.camera.yo}} += ditherwidth * mwc_next_11(rctx);
    }

{{if info.acc_mode == 'global'}}
    __shared__ float time_frac;
    time_frac = blockIdx.x / (float) gridDim.x;
{{else}}
{{if info.acc_mode == 'atomic'}}
    // TODO: spare the register, reuse at call site?
    int time = blockIdx.x >> 4;
{{endif}}
    float color_dither = 0.49f * mwc_next_11(rctx);
{{endif}}

    // TODO: 4th channel unused. Kill or use for something helpful
    float4 old_point = points[this_rb_idx];
    float x = old_point.x, y = old_point.y, color = old_point.z;

{{if not info.chaos_used}}
{{if not chaos_used}}
    // Shared memory size can be reduced by a factor of four using a slower
    // 4-stage reduce, but on Fermi hardware shmem use isn't a bottleneck
    __shared__ float swap[{{4*NTHREADS}}];
@@ -250,7 +217,7 @@ void iter(
    // TODO: link up with FUSE, etc
    for (int round = 0; round < 256; round++) {

{{if info.chaos_used}}
{{if chaos_used}}

        {{precalc_chaos(pcp, std_xforms)}}

@@ -315,16 +282,7 @@ void iter(

{{endif}}

{{if info.acc_mode == 'deferred'}}
        int tid = threadIdx.y * 32 + threadIdx.x;
        int offset = 4 * (256 * (256 * blockIdx.x + round) + tid);
        int *log = reinterpret_cast<int*>(out_ptr + offset);
{{endif}}

        if (fuse) {
{{if info.acc_mode == 'deferred'}}
            *log = 0xffffffff;
{{endif}}
            continue;
        }

@@ -346,15 +304,11 @@ void iter(
        uint32_t ix = trunca(cx), iy = trunca(cy);

        if (ix >= acc_size.astride || iy >= acc_size.aheight) {
{{if info.acc_mode == 'deferred'}}
            *log = 0xffffffff;
{{endif}}
            continue;
        }

        uint32_t ibase = (last_xf_used % {{info.max_nxf}}) * acc_size.aheight;
        uint32_t i = (ibase + iy) * acc_size.astride + ix;
{{if info.acc_mode == 'atomic'}}
        uint32_t i = iy * acc_size.astride + ix;

        asm volatile ({{crep("""
{
    // To prevent overflow, we need to flush each pixel before the density
@@ -432,38 +386,45 @@ oflow_end:
        """)}} :: "f"(cc), "f"(color_dither), "r"(time), "r"(i),
        "l"(atom_ptr), "f"(cosel[threadIdx.y + {{NWARPS}}]),
        "l"(out_ptr));
{{endif}}

{{if info.acc_mode == 'global'}}
        float4 outcol = tex2D(palTex, cc, time_frac);
        float4 *accbuf = reinterpret_cast<float4*>(out_ptr + (16*i));
        float4 pix = *accbuf;
        pix.x += outcol.x;
        pix.y += outcol.y;
        pix.z += outcol.z;
        pix.w += 1.0f;
        *accbuf = pix;
{{elif info.acc_mode == 'deferred'}}
        // 'color' gets the top 8 bits. TODO: add dithering via precalc.
        uint32_t icolor = fminf(1.0f, cc) * 255.0f + color_dither;
        asm("bfi.b32 %0, %1, %0, 24, 8;" : "+r"(i) : "r"(icolor));
        *log = i;
{{endif}}
    }

    if (threadIdx.x == 0 && threadIdx.y == 0)
        rb_idx = 32 * blockDim.y * (atomicAdd(&rb_tail, 1) % rb_size);
    __syncthreads();
    this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;

    this_rb_idx = rb_incr(rb->tail, blockDim.x * threadIdx.y + threadIdx.x);
    points[this_rb_idx] = make_float4(x, y, color, 0.0f);
    msts[this_rb_idx] = rctx;
    return;
}
'''
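The kernel consumes an RNG state slot derived from rb->head on entry and retires it at a slot derived from rb->tail on exit, so states migrate between blocks across launches instead of being bound to one thread. ringbuflib itself is outside this diff; a rough CPU model of the intended slot rotation (an assumption; the device implementation differs in detail):

class RingBuf(object):
    # rough model: each block bumps a counter once, and its threads take
    # consecutive slots under the returned base
    def __init__(self, nslots, width):
        self.head = self.tail = 0
        self.nslots, self.width = nslots, width

    def incr(self, which):
        base = getattr(self, which) % (self.nslots // self.width)
        setattr(self, which, getattr(self, which) + 1)
        return base * self.width

rb = RingBuf(nslots=8192, width=256)
read_base = rb.incr('head')    # this block's states come from here...
write_base = rb.incr('tail')   # ...and are retired here for a later block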
{{if info.acc_mode == 'atomic'}}
__global__
void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
def iter_body(cp, pcp):
    # For legacy reasons, 'cp' is used here instead of 'genome'.
    tmpl = Template(iter_body_code, 'iter_body')
    NWARPS = NTHREADS / 32
    std_xforms = [n for n in sorted(cp.xforms) if n != 'final']

    # TODO: detect this properly and use it
    chaos_used = False

    vars = globals()
    vars.update(locals())
    return tmpl.substitute(vars)

def mkiterlib(genome):
    packer = interp.GenomePacker('iter_params')
    pcp = packer.view('params', genome, 'cp')

    iterbody = iter_body(genome, pcp)
    bodies = [iter_xf_body(pcp, i, x) for i, x in sorted(genome.xforms.items())]
    bodies.append(iterbody)
    packer_lib = packer.finalize()

    lib = devlib(deps=[packer_lib, mwclib, ringbuflib],
                 # We grab the surf decl from palintlib as well
                 decls=iter_decls + interp.palintlib.decls,
                 defs='\n'.join(bodies))
    return packer, lib

flushatomlib = devlib(defs=Template(r'''
__global__ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
    int i = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    if (i >= nbins) return;
    asm volatile ({{crep("""
@@ -499,168 +460,4 @@ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
    }
    """)}} :: "r"(i), "l"(atom_ptr), "l"(out_ptr));
}

{{endif}}

{{if info.acc_mode == 'deferred'}}

// Block size, shared accumulation bits, shared accumulation width.
#define BS 1024
#define SHAB 12
#define SHAW (1<<SHAB)

// Read the point log, accumulate in shared memory, and write the results.
// This kernel is to be launched with one block for every 4,096 addresses to
// be processed, and will handle those addresses.
//
// log_bounds is an array mapping radix values to the first index in the log
// with that radix position. For performance reasons in other parts of the
// code, the radix may actually include bits within the lower SHAB part of the
// address, or it might not cover the first few bits after the SHAB part;
// log_bounds_shift covers that. glob_addr_bits specifies the number of bits
// above SHAB which are address bits.

__global__ void
__launch_bounds__(BS, 1)
write_shmem(
        float4 *acc,
        const uint32_t *log,
        const uint32_t *log_bounds,
        uint32_t nbins
) {
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

    // These two accumulators, used in write_shmem, hold {density, red} and
    // {green, blue} values as packed u16 pairs. The fixed size represents
    // 4,096 pixels in the accumulator.
    __shared__ uint32_t s_acc[SHAW*2];

    int idx = tid;
    for (int i = 0; i < (SHAW * 2 / BS); i++) {
        s_acc[idx] = 0;
        idx += BS;
    }
    __syncthreads();

    // Shut the compiler up
    idx = s_acc[0];

    // log_bounds[] holds inclusive prefix sums, so that log_bounds[0] is the
    // largest index with radix 0, and so on.
    int lb_idx_hi = bid & 0xff;
    int lb_idx_lo = lb_idx_hi - 1;

    int idx_lo;
    if (lb_idx_lo > 0) idx_lo = log_bounds[lb_idx_lo] & ~(BS - 1);
    else               idx_lo = 0;
    int idx_hi = (log_bounds[lb_idx_hi] & ~(BS - 1)) + BS;

    float rnrounds = {{'%d.0f' % info.palette_height}} / (idx_hi - idx_lo);
    float time = tid * rnrounds;
    float time_step = BS * rnrounds;

    int magic = ((blockIdx.x & 0xff) << 3) + ((blockIdx.x & 0xf00) << 12);
    int magic_mask = 0xf007f8;

    for (int i = idx_lo + tid; i < idx_hi; i += BS) {
        int entry = log[i];

        asm volatile ({{crep("""
{
    .reg .pred q;
    .reg .u32 shoff, color, time, d, r, g, b, hi, lo, hiw, low, tmp;
    .reg .u64 ptr;
    .reg .f32 rf, gf, bf, df, rg, gg, dg, bg;

    // TODO: opacity
    and.b32 tmp, %0, %4;
    setp.eq.u32 q, tmp, %3;
    @!q bra before_sync;

    and.b32 shoff, %0, 0xff800;
    shr.b32 shoff, shoff, 5;
    bfi.b32 shoff, %0, shoff, 3, 3;

    bfe.u32 color, %0, 24, 8;
    shl.b32 color, color, 3;
    cvt.rni.u32.f32 time, %1;

    suld.b.2d.v2.b32.clamp {lo, hi}, [flatpal, {color, time}];
    ld.shared.v2.u32 {low, hiw}, [shoff];
    add.cc.u32 lo, lo, low;
    addc.u32 hi, hi, hiw;
    setp.hs.u32 q, hi, (1023 << 22);
    @q bra oflow_sync;
    st.shared.v2.u32 [shoff], {hi, lo};
before_sync:
    bar.sync 0;
    bra oflow_write_end;
oflow_sync:
    st.shared.v2.u32 [shoff], {0, 0};

    // TODO: opacity
    bfi.b32 shoff, %0, 0, 4, 24;
    cvt.u64.u32 ptr, shoff;
    add.u64 ptr, ptr, %2;
    ld.global.v4.f32 {dg,bg,gg,rg}, [ptr];
    bar.sync 0;

    bfe.u32 r, hi, 4, 18;
    bfe.u32 g, lo, 18, 14;
    bfi.b32 g, hi, g, 14, 4;
    and.b32 b, lo, ((1<<18)-1);
    cvt.rn.f32.u32 rf, r;
    cvt.rn.f32.u32 gf, g;
    cvt.rn.f32.u32 bf, b;
    fma.rn.ftz.f32 rf, rf, (1.0/255.0), rg;
    fma.rn.ftz.f32 gf, gf, (1.0/255.0), gg;
    fma.rn.ftz.f32 bf, bf, (1.0/255.0), bg;
    add.f32 df, df, dg;
    st.global.v4.f32 [ptr], {df,bf,gf,rf};

oflow_write_end:
}
    """)}} :: "r"(entry), "f"(time), "l"(acc), "r"(magic), "r"(magic_mask));
        time += time_step;
    }

    __syncthreads();

    idx = tid;
    int glo_idx = magic | (((idx << 8) | idx) & 0xff807);

    for (int i = 0; i < (SHAW / BS) && glo_idx < nbins; i++) {
        int d, r, g, b;
        float4 pix = acc[glo_idx];
        asm({{crep("""
{
    .reg .u32 hi, lo;
    ld.shared.v2.u32 {lo, hi}, [%4];
    shr.u32 %0, hi, 22;
    bfe.u32 %1, hi, 4, 18;
    bfe.u32 %2, lo, 18, 14;
    bfi.b32 %2, hi, %2, 14, 4;
    and.b32 %3, lo, ((1<<18)-1);
}
    """)}} : "=r"(d), "=r"(r), "=r"(g), "=r"(b) : "r"(idx*8));
        pix.x += r / 255.0f;
        pix.y += g / 255.0f;
        pix.z += b / 255.0f;
        pix.w += d;
        acc[glo_idx] = pix;
        idx += BS;
        glo_idx += (BS << 8);
    }
}
{{endif}}
''', 'iter_kern')
        return tmpl.substitute(
            info = info,
            cp = genome,
            pcp = self.pcp,
            NTHREADS = self.NTHREADS,
            NWARPS = self.NTHREADS / 32,
            std_xforms = [n for n in sorted(genome.xforms) if n != 'final'],
            **globals())
''', 'flush_atom').substitute())
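The bfe/bfi fields in write_shmem define the packed accumulator word pair: 10 bits of density and 18 bits of red in the high word, green split across the two words, and blue in the low 18 bits. The same unpacking in plain Python (for reference only):

def unpack_acc(hi, lo):
    # field layout per the PTX in write_shmem
    d = hi >> 22                         # 10-bit density
    r = (hi >> 4) & ((1 << 18) - 1)      # 18-bit red
    g = ((hi & 0xf) << 14) | (lo >> 18)  # 18-bit green, split across words
    b = lo & ((1 << 18) - 1)             # 18-bit blue
    return d, r / 255.0, g / 255.0, b / 255.0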
@@ -6,18 +6,53 @@ import os
import warnings
import numpy as np

from util import *
from util import devlib, assemble_code

class MWC(HunkOCode):
    decls = """
# Keeping this live in the module isn't necessary, but loading the mults
# can be surprisingly slow.
mults = None

def load_mults():
    pfpath = os.path.join(os.path.dirname(__file__), 'primes.bin')
    if os.path.isfile(pfpath):
        with open(pfpath) as fp:
            return np.frombuffer(fp.read(), dtype='<u4')

    warnings.warn('primes.bin not found, trying to download it')
    import bz2, urllib2
    ufp = urllib2.urlopen('http://aduro.strobe.cc/primes.diff.bin.bz2')
    diffs = np.frombuffer(bz2.decompress(ufp.read()), dtype='<u2')
    mults = np.cumsum(-np.array(diffs, dtype='<u4'), dtype='<u4')
    with open(pfpath, 'wb') as fp:
        fp.write(mults)
    return mults

def make_seeds(nthreads, host_seed=None):
    global mults
    if mults is None:
        mults = load_mults()
    if host_seed:
        rand = np.random.RandomState(host_seed)
    else:
        rand = np.random

    # Create the seed structures. TODO: check that struct is 4-byte aligned
    seeds = np.empty((3, nthreads), dtype=np.uint32, order='F')
    seeds[0][:] = mults[:nthreads]

    # Excludes 0xffffffff for 32-bit compatibility with laziness
    seeds[1] = rand.randint(1, 0x7fffffff, size=nthreads)
    seeds[2] = rand.randint(1, 0x7fffffff, size=nthreads)

    return seeds

mwclib = devlib(decls=r'''
typedef struct {
    uint32_t mul;
    uint32_t state;
    uint32_t carry;
} mwc_st;
"""

defs = r"""
''', defs=r'''
__device__ uint32_t mwc_next(mwc_st &st) {
    asm("{\n\t"
        ".reg .u32 tmp;\n\t"
@@ -40,46 +75,9 @@ __device__ float mwc_next_11(mwc_st &st) {
        : "=f"(ret) : "r"(val));
    return ret;
}
"""
''')
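mwc_next implements a multiply-with-carry step on the {mul, state, carry} struct above (the asm body is elided in this hunk). One MWC step in Python, a standard formulation assumed to match the kernel:

def mwc_next_py(st):
    # 32x32 -> 64-bit multiply-accumulate per output word
    t = st['mul'] * st['state'] + st['carry']
    st['state'] = t & 0xffffffff   # low word: the RNG output
    st['carry'] = t >> 32          # high word: carried forward
    return st['state']

# Real multipliers come from primes.bin via make_seeds; this value is
# illustrative only.
st = dict(mul=0xfffefa35, state=1, carry=1)
samples = [mwc_next_py(st) for _ in range(4)]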
    @staticmethod
    def load_mults():
        pfpath = os.path.join(os.path.dirname(__file__), 'primes.bin')
        if os.path.isfile(pfpath):
            with open(pfpath) as fp:
                return np.frombuffer(fp.read(), dtype='<u4')

        warnings.warn('primes.bin not found, trying to download it')
        import bz2, urllib2
        ufp = urllib2.urlopen('http://aduro.strobe.cc/primes.diff.bin.bz2')
        diffs = np.frombuffer(bz2.decompress(ufp.read()), dtype='<u2')
        mults = np.cumsum(-np.array(diffs, dtype='<u4'), dtype='<u4')
        with open(pfpath, 'wb') as fp:
            fp.write(mults)
        return mults

    mults = None
    @classmethod
    def make_seeds(cls, nthreads, host_seed=None):
        if cls.mults is None:
            cls.mults = cls.load_mults()
        if host_seed:
            rand = np.random.RandomState(host_seed)
        else:
            rand = np.random

        # Create the seed structures. TODO: check that struct is 4-byte aligned
        seeds = np.empty((3, nthreads), dtype=np.uint32, order='F')
        seeds[0][:] = cls.mults[:nthreads]

        # Excludes 0xffffffff for 32-bit compatibility with laziness
        seeds[1] = rand.randint(1, 0x7fffffff, size=nthreads)
        seeds[2] = rand.randint(1, 0x7fffffff, size=nthreads)

        return seeds

class MWCTest(HunkOCode):
    defs = """
mwctestlib = devlib(deps=[mwclib], defs="""
__global__ void test_mwc(mwc_st *msts, uint64_t *sums, float nrounds) {
    mwc_st rctx = msts[gtid()];
    uint64_t sum = 0;
@@ -87,19 +85,18 @@ __global__ void test_mwc(mwc_st *msts, uint64_t *sums, float nrounds) {
    sums[gtid()] = sum;
    msts[gtid()] = rctx;
}
"""
""")

@classmethod
def test_mwc(cls, rounds=5000, nblocks=64, blockwidth=512):
def test_mwc(rounds=5000, nblocks=64, blockwidth=512):
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule
    import time

    nthreads = blockwidth * nblocks
    seeds = MWC.make_seeds(nthreads, host_seed = 5)
    seeds = make_seeds(nthreads, host_seed=42)
    dseeds = cuda.to_device(seeds)

    mod = SourceModule(assemble_code(BaseCode, MWC, cls))
    mod = SourceModule(assemble_code(mwctestlib))

    for trial in range(2):
        print "Trial %d, on CPU: " % trial,
@@ -133,5 +130,4 @@ __global__ void test_mwc(mwc_st *msts, uint64_t *sums, float nrounds) {

if __name__ == "__main__":
    import pycuda.autoinit
    MWCTest.test_mwc()

    test_mwc()

cuburn/code/output.py (new file, 34 lines)
@@ -0,0 +1,34 @@
from util import devlib, ringbuflib
from mwc import mwclib

f32tou8lib = devlib(deps=[ringbuflib, mwclib], defs=r'''
// Perform a conversion from float32 values to uint8 ones, applying
// pixel- and channel-independent dithering to reduce suprathreshold banding
// artifacts. Clamps values larger than 1.0f.
// TODO: move to a separate module?
// TODO: less inefficient mwc_st handling?
__global__ void f32_to_u8(
        ringbuf *rb, mwc_st *rctxs, uchar4 *dst, const float4 *src,
        int gutter, int dstride, int sstride, int height)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x > dstride || y > height) return;
    int isrc = sstride * (y + gutter) + x + gutter;

    int tid = blockDim.x * threadIdx.y + threadIdx.x;
    mwc_st rctx = rctxs[rb_incr(rb->head, tid)];

    float4 in = src[isrc];
    uchar4 out = make_uchar4(
        fminf(1.0f, in.x) * 255.0f + 0.49f * mwc_next_11(rctx),
        fminf(1.0f, in.y) * 255.0f + 0.49f * mwc_next_11(rctx),
        fminf(1.0f, in.z) * 255.0f + 0.49f * mwc_next_11(rctx),
        fminf(1.0f, in.w) * 255.0f + 0.49f * mwc_next_11(rctx)
    );

    int idst = dstride * y + x;
    dst[idst] = out;
    rctxs[rb_incr(rb->tail, tid)] = rctx;
}
''')
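The 0.49f * mwc_next_11 term injects just under half a code of uniform noise before the float-to-int conversion truncates, so a value can never shift by more than one output code, but neighboring pixels quantize independently. A host-side check of the same quantizer (illustrative):

import random

def dither_u8(v, rng=random):
    # same arithmetic as f32_to_u8: clamp, scale by 255, add up to
    # +/-0.49 of noise, then truncate like the C cast does
    return int(min(1.0, v) * 255.0 + 0.49 * rng.uniform(-1.0, 1.0))

# A value near a code boundary no longer snaps to one side: 127.9/255
# lands on 127 or 128 depending on the noise, so a smooth ramp becomes a
# noisy mix of adjacent codes instead of a hard band edge.
vals = set(dither_u8(127.9 / 255.0) for _ in range(1000))
assert vals <= set([127, 128])   # both occur in practice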
@@ -1,55 +1,119 @@
"""
Provides tools and miscellaneous functions for building device code.
"""
import os
import tempfile
from collections import namedtuple

import pycuda.driver as cuda
import pycuda.compiler
import numpy as np
import tempita

def crep(s):
    """Escape for PTX assembly"""
    if isinstance(s, unicode):
        s = s.encode('utf-8')
    return '"%s"' % s.encode("string_escape")

def argset(obj, **kwargs):
    """
    Allow an object with many properties to be set using one call.

    >>> x = argset(X(), a=1, b=2)
    >>> x.a
    1
    """
    for k, v in kwargs.items():
        setattr(obj, k, v)
    return obj

def launch(name, mod, stream, block, grid, *args, **kwargs):
    """
    Oft-used kernel launch helper. Provides a nice boost in readability of
    densely-packed asynchronous code launches.
    """
    fun = mod.get_function(name)
    if isinstance(block, (int, np.number)):
        block = (int(block), 1, 1)
    if isinstance(grid, (int, np.number)):
        grid = (int(grid), 1)
    fun(*args, block=block, grid=grid, stream=stream, **kwargs)

def crep(s):
    """Multiline literal escape for inline PTX assembly."""
    if isinstance(s, unicode):
        s = s.encode('utf-8')
    return '"%s"' % s.encode("string_escape")

class Template(tempita.Template):
    """
    A Tempita template with extra stuff in the default namespace.
    """
    default_namespace = tempita.Template.default_namespace.copy()
Template.default_namespace.update({'np': np, 'crep': crep})

class HunkOCode(object):
    """An apparently passive container for device code."""
    # Use property objects to make these dynamic
    headers = ''
    decls = ''
    defs = ''
# Passive container for device code.
DevLib = namedtuple('DevLib', 'deps headers decls defs')

def assemble_code(*sections):
    return ''.join([''.join([getattr(sect, kind) for sect in sections])
                    for kind in ['headers', 'decls', 'defs']])
def devlib(deps=(), headers='', decls='', defs=''):
    """Create a library of device code."""
    # This exists because namedtuple doesn't support default args
    return DevLib(deps, headers, decls, defs)

def apply_affine(x, y, xo, yo, packer):
    return Template("""
    {{xo}} = {{packer.xx}} * {{x}} + {{packer.xy}} * {{y}} + {{packer.xo}};
    {{yo}} = {{packer.yx}} * {{x}} + {{packer.yy}} * {{y}} + {{packer.yo}};
    """).substitute(locals())
def assemble_code(*libs):
    seen = set()
    out = []
    def go(lib):
        map(go, lib.deps)
        code = lib[1:]
        if code not in seen:
            seen.add(code)
            out.append(code)
    go(stdlib)
    map(go, libs)
    return ''.join(sum(zip(*out), ()))
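assemble_code walks the dependency graph depth-first, dedups repeated libs, and then regroups sections so that every header precedes every decl and every decl precedes every def; that is what the zip(*out) does. A tiny demonstration with stand-in libs:

from util import devlib, assemble_code

a = devlib(headers='H1 ', decls='D1 ', defs='F1 ')
b = devlib(deps=[a], headers='H2 ', decls='D2 ', defs='F2 ')
print assemble_code(b)
# stdlib's sections come first, followed by: H1 H2 D1 D2 F1 F2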
|
||||
|
||||
class BaseCode(HunkOCode):
|
||||
headers = """
|
||||
DEFAULT_CMP_OPTIONS = ('-use_fast_math', '-maxrregcount', '42')
|
||||
DEFAULT_SAVE_KERNEL = True
|
||||
def compile(name, src, opts=DEFAULT_CMP_OPTIONS, save=DEFAULT_SAVE_KERNEL):
|
||||
"""
|
||||
Compile a module. Returns a copy of the source (for inspection or
|
||||
display) and the compiled cubin.
|
||||
"""
|
||||
dir = tempfile.gettempdir()
|
||||
if save:
|
||||
with open(os.path.join(dir, name + '_kern.cu'), 'w') as fp:
|
||||
fp.write(src)
|
||||
cubin = pycuda.compiler.compile(src, options=list(opts))
|
||||
if save:
|
||||
with open(os.path.join(dir, name + '_kern.cubin'), 'w') as fp:
|
||||
fp.write(cubin)
|
||||
return cubin
|
||||
|
||||
class ClsMod(object):
|
||||
"""
|
||||
Convenience class or mixin that automatically compiles and loads a module
|
||||
once per class, saving potentially expensive code regeneration. Only use
|
||||
if your class does not employ run-time code generation.
|
||||
"""
|
||||
mod = None
|
||||
# Supply your own DevLib on this property
|
||||
lib = None
|
||||
|
||||
def __init__(self):
|
||||
super(ClsMod, self).__init__()
|
||||
self.load()
|
||||
|
||||
@classmethod
|
||||
def load(cls, name=None):
|
||||
if cls.mod is None:
|
||||
if name is None:
|
||||
name = cls.__name__.lower()
|
||||
cubin = compile(name, assemble_code(cls.lib))
|
||||
cls.mod = cuda.module_from_buffer(cubin)
|
||||
|
||||
# This lib is included with *every* assembled module. It contains no global
|
||||
# functions, so it shouldn't slow down compilation time too much.
|
||||
stdlib = devlib(headers="""
|
||||
#include<cuda.h>
|
||||
#include<stdint.h>
|
||||
#include<stdio.h>
|
||||
"""
|
||||
|
||||
decls = """
|
||||
float3 rgb2hsv(float3 rgb);
|
||||
float3 hsv2rgb(float3 hsv);
|
||||
"""
|
||||
|
||||
defs = Template(r"""
|
||||
""", decls=r"""
|
||||
#undef M_E
|
||||
#undef M_LOG2E
|
||||
#undef M_LOG10E
|
||||
@ -84,33 +148,79 @@ float3 hsv2rgb(float3 hsv);
|
||||
#define bfe_decl(d, s, o, w) \
    int d; \
    bfe(d, s, o, w)

// TODO: use launch parameter preconfig to eliminate unnecessary parts
__device__
uint32_t gtid() {
""", defs=r'''
__device__ uint32_t gtid() {
    return threadIdx.x + blockDim.x *
            (threadIdx.y + blockDim.y *
                (threadIdx.z + blockDim.z *
                    (blockIdx.x + (gridDim.x * blockIdx.y))));
}

__device__
uint32_t trunca(float f) {
__device__ uint32_t trunca(float f) {
    // truncate as used in address calculations. note the use of a signed
    // conversion is intentional here (simplifies image bounds checking).
    uint32_t ret;
    asm("cvt.rni.s32.f32 %0, %1;" : "=r"(ret) : "f"(f));
    return ret;
}
''')

__global__
void fill_dptr(uint32_t* dptr, int size, uint32_t value) {
def mkbinsearchlib(rounds):
    """
    Search through the fixed-size list 'hay' to find the rightmost index which
    contains a value strictly smaller than the input 'needle'. The list must
    be exactly '2^rounds' long, although padding at the top with very large
    numbers or even +inf effectively shortens it.
    """
    # TODO: this doesn't optimize well on a 64-bit arch, not that it's a
    # performance-critical chunk of code or anything
    src = Template(r'''
__device__ int bitwise_binsearch(const float *hay, float needle) {
    int lo = 0;

    // TODO: improve efficiency on 64-bit arches
    {{for i in range(search_rounds-1, -1, -1)}}
    if (needle > hay[lo + {{1 << i}}])
        lo += {{1 << i}};
    {{endfor}}
    return lo;
}
''', 'bitwise_binsearch')
    return devlib(defs=src.substitute(search_rounds=rounds))

# 2^search_rounds is the maximum number of knots allowed in a single spline.
# This includes the four required knots, so a 5-round search supports 28
# interior knots in the domain (0, 1). 2^5 fits nicely on a single cache line.
DEFAULT_SEARCH_ROUNDS = 5
binsearchlib = mkbinsearchlib(DEFAULT_SEARCH_ROUNDS)

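# A host-side reference for the unrolled device search above (a sketch, not
# part of this commit): each round conditionally adds a power of two, homing
# in on the rightmost entry strictly smaller than the needle.
def _binsearch_ref(hay, needle, rounds=DEFAULT_SEARCH_ROUNDS):
    lo = 0
    for i in range(rounds - 1, -1, -1):
        if needle > hay[lo + (1 << i)]:
            lo += 1 << i
    return lo
# e.g. _binsearch_ref([0., 1/3., 2/3., 1.] + [float('inf')] * 28, 0.5) == 1
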
filldptrlib = devlib(defs=r'''
__global__ void
fill_dptr(uint32_t* dptr, int size, uint32_t value) {
    int i = (gridDim.x * blockIdx.y + blockIdx.x) * blockDim.x + threadIdx.x;
    if (i < size) {
        dptr[i] = value;
    }
}
''')
def fill_dptr(mod, dptr, size, stream=None, value=np.uint32(0)):
    """
    A memory zeroer which can be embedded in a stream, unlike the various
    memset routines. Size is the number of 4-byte words in the pointer;
    value is the word to fill it with. If value is not an np.uint32, it
    will be coerced to a buffer and the first four bytes taken.
    """
    if not isinstance(value, np.uint32):
        if isinstance(value, int):
            value = np.uint32(value)
        else:
            value = np.frombuffer(buffer(value), np.uint32)[0]
    blocks = int(np.ceil(np.sqrt(size / 1024.)))
    launch('fill_dptr', mod, stream, (1024, 1, 1), (blocks, blocks),
           dptr, np.int32(size), value)

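# Usage sketch (assumed calling convention, mirroring calls made later in
# this commit): clear a 4-float-per-bin accumulator, or poison a point
# buffer with NaN so the iteration kernel knows to regenerate each point.
#
#   fill_dptr(mod, d_front, 4 * nbins, stream)
#   fill_dptr(mod, d_points, len_d_points / 4, stream, f32(np.nan))
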
writehalflib = devlib(defs=r'''
/* read_half and write_half decode and encode, respectively, two
 * floating-point values from a 32-bit value (typed as a 'float' for
 * convenience but not really). The values are packed into u16s as a
@@ -122,8 +232,8 @@ void fill_dptr(uint32_t* dptr, int size, uint32_t value) {
 * approach when the alpha channel is present.
 */

__device__
void read_half(float &x, float &y, float xy, float den) {
__device__ void
read_half(float &x, float &y, float xy, float den) {
    asm("\n\t{"
        "\n\t   .reg .u16 x, y;"
        "\n\t   .reg .f32 rc;"
@@ -137,8 +247,8 @@ void read_half(float &x, float &y, float xy, float den) {
        : "=f"(x), "=f"(y) : "f"(xy), "f"(den));
}

__device__
void write_half(float &xy, float x, float y, float den) {
__device__ void
write_half(float &xy, float x, float y, float den) {
    asm("\n\t{"
        "\n\t   .reg .u16 x, y;"
        "\n\t   .reg .f32 rc, xf, yf;"
@@ -152,87 +262,41 @@ void write_half(float &xy, float x, float y, float den) {
        "\n\t}"
        : "=f"(xy) : "f"(x), "f"(y), "f"(den));
}
''')

/* This conversion uses the JPEG full-range standard. Note that UV have range
 * [-0.5, 0.5], so consider biasing the results. */
__device__
float3 rgb2yuv(float3 rgb) {
    return make_float3(
        0.299f * rgb.x + 0.587f * rgb.y + 0.114f * rgb.z,
        -0.168736f * rgb.x - 0.331264f * rgb.y + 0.5f * rgb.z,
        0.5f * rgb.x - 0.418688f * rgb.y - 0.081312f * rgb.z);
}

__device__
float3 yuv2rgb(float3 yuv) {
    return make_float3(
        yuv.x + 1.402f * yuv.z,
        yuv.x - 0.34414f * yuv.y - 0.71414f * yuv.z,
        yuv.x + 1.772f * yuv.y);
}

__device__
float3 rgb2hsv(float3 rgb) {
    float M = fmaxf(fmaxf(rgb.x, rgb.y), rgb.z);
    float m = fminf(fminf(rgb.x, rgb.y), rgb.z);
    float C = M - m;

    float s = M > 0.0f ? C / M : 0.0f;

    float h = 0.0f;
    if (s != 0.0f) {
        C = 1.0f / C;
        float rc = (M - rgb.x) * C;
        float gc = (M - rgb.y) * C;
        float bc = (M - rgb.z) * C;

        if      (rgb.x == M) h = bc - gc;
        else if (rgb.y == M) h = 2 + rc - bc;
        else                 h = 4 + gc - rc;

        if (h < 0) h += 6.0f;
    }
    return make_float3(h, s, M);
}

__device__
float3 hsv2rgb(float3 hsv) {

    float whole = floorf(hsv.x);
    float frac = hsv.x - whole;
    float val = hsv.z;
    float min = val * (1 - hsv.y);
    float mid = val * (1 - (hsv.y * frac));
    float alt = val * (1 - (hsv.y * (1 - frac)));

    float3 out;
    if      (whole == 0.0f) { out.x = val; out.y = alt; out.z = min; }
    else if (whole == 1.0f) { out.x = mid; out.y = val; out.z = min; }
    else if (whole == 2.0f) { out.x = min; out.y = val; out.z = alt; }
    else if (whole == 3.0f) { out.x = min; out.y = mid; out.z = val; }
    else if (whole == 4.0f) { out.x = alt; out.y = min; out.z = val; }
    else                    { out.x = val; out.y = min; out.z = mid; }
    return out;
}
""").substitute()

    @staticmethod
    def fill_dptr(mod, dptr, size, stream=None, value=np.uint32(0)):
def mkringbuflib(rb_size):
    """
        A memory zeroer which can be embedded in a stream, unlike the various
        memset routines. Size is the number of 4-byte words in the pointer;
        value is the word to fill it with. If value is not an np.uint32, it
        will be coerced to a buffer and the first four bytes taken.
    A ringbuffer for access to shared resources.

    Some components, such as the MWC contexts, are expensive to generate, and
    have no affinity to a particular block. Rather than maintain a separate
    copy of each of these objects for every thread block in a launch, we want
    to keep only enough copies of these resources around to service every
    thread block that could possibly be active simultaneously on one card,
    which is often considerably smaller. This library provides a simple
    ringbuffer type and an increment function, used in a couple of places to
    implement this kind of sharing.
    """
        fill = mod.get_function("fill_dptr")
        if not isinstance(value, np.uint32):
            if isinstance(value, int):
                value = np.uint32(value)
            else:
                value = np.frombuffer(buffer(value), np.uint32)[0]
        blocks = int(np.ceil(np.sqrt(size / 1024 + 1)))
        fill(dptr, np.int32(size), value, stream=stream,
             block=(1024, 1, 1), grid=(blocks, blocks, 1))

    return devlib(headers="#define RB_SIZE_MASK %d" % (rb_size - 1), decls='''
typedef struct {
    int head;
    int tail;
} ringbuf;
''', defs=r'''
__shared__ int rb_idx;
__device__ int rb_incr(int &rb_base, int tidx) {
    if (threadIdx.y == 1 && threadIdx.x == 1)
        rb_idx = 256 * (atomicAdd(&rb_base, 1) & RB_SIZE_MASK);
    __syncthreads();
    return rb_idx + tidx;
}
''')

# For now, the number of entries is fixed to a value known to work on all
# Fermi cards. Autodetection, or perhaps just a global limit increase, will be
# done when I get my hands on a Kepler device. The fixed size assumes blocks
# of 256 threads, although even at that size there are pathological cases that
# could break the assumption.
DEFAULT_RB_SIZE = 64
ringbuflib = mkringbuflib(DEFAULT_RB_SIZE)
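
# A host-side sketch (illustration only, not from this commit) of the slot
# scheme rb_incr implements on the device: each launched block atomically
# claims the next of rb_size slots, so at most rb_size * 256 persistent
# contexts (seeds, points) ever need to exist, regardless of grid size.
class _RingbufSim(object):
    def __init__(self, rb_size=DEFAULT_RB_SIZE):
        self.head, self.mask = 0, rb_size - 1
    def incr(self, tidx):
        base, self.head = self.head, self.head + 1
        return 256 * (base & self.mask) + tidx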
105  cuburn/filters.py  Normal file
@@ -0,0 +1,105 @@
import numpy as np
from numpy import float32 as f32, int32 as i32

import pycuda.driver as cuda
import pycuda.compiler
from pycuda.gpuarray import vec

import code.filters
from code.util import ClsMod, argset, launch

class Filter(object):
    def apply(self, fb, gnm, dim, tc, stream=None):
        """
        Queue the application of this filter. When the stream finishes
        executing the last item enqueued by this method, the result must be
        live in the buffer that ``fb.d_front`` points to as of this
        function's return.
        """
        raise NotImplementedError()

class Bilateral(Filter, ClsMod):
    lib = code.filters.bilaterallib
    def __init__(self, directions=8, r=15, sstd=6, cstd=0.05,
                 dstd=1.5, dpow=0.8, gspeed=4.0):
        # TODO: expose these parameters on the genome, or at least on the
        # profile, and set them by a less ugly mechanism
        for n in 'directions r sstd cstd dstd dpow gspeed'.split():
            setattr(self, n, locals()[n])
        super(Bilateral, self).__init__()

    def apply(self, fb, gnm, dim, tc, stream=None):
        # Helper variables and functions to keep it clean
        sb = 16 * dim.astride
        bs = sb * dim.ah
        bl, gr = (32, 8, 1), (dim.astride / 32, dim.ah / 8)

        mkdsc = lambda c: argset(cuda.ArrayDescriptor(), height=dim.ah,
                                 width=dim.astride, num_channels=c,
                                 format=cuda.array_format.FLOAT)
        def mktref(n):
            tref = self.mod.get_texref(n)
            tref.set_filter_mode(cuda.filter_mode.POINT)
            tref.set_address_mode(0, cuda.address_mode.WRAP)
            tref.set_address_mode(1, cuda.address_mode.WRAP)
            return tref

        dsc = mkdsc(4)
        tref = mktref('bilateral_src')
        grad_dsc = mkdsc(1)
        grad_tref = mktref('blur_src')

        for pattern in range(self.directions):
            # Scale spatial parameter so that a "pixel" is equivalent to an
            # actual pixel at 1080p
            sstd = self.sstd * dim.w / 1920.

            tref.set_address_2d(fb.d_front, dsc, sb)

            # Blur density two octaves along sampling vector, ultimately
            # storing in the side buffer
            launch('den_blur', self.mod, stream, bl, gr,
                   fb.d_back, i32(pattern), i32(0), texrefs=[tref])
            grad_tref.set_address_2d(fb.d_back, grad_dsc, sb / 4)
            launch('den_blur_1c', self.mod, stream, bl, gr,
                   fb.d_side, i32(pattern), i32(1), texrefs=[grad_tref])
            grad_tref.set_address_2d(fb.d_side, grad_dsc, sb / 4)

            launch('bilateral', self.mod, stream, bl, gr,
                   fb.d_back, i32(pattern), i32(self.r),
                   f32(sstd), f32(self.cstd), f32(self.dstd),
                   f32(self.dpow), f32(self.gspeed),
                   texrefs=[tref, grad_tref])
            fb.flip()

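# Usage sketch (hypothetical tuning, not part of this commit): the filter
# chain on a Renderer is just a list, so a differently-tuned instance can be
# swapped in, e.g. fewer directions for speed or a larger radius for
# smoother density estimation.
#
#   filts = [Bilateral(directions=4, r=9), Logscale(), ColorClip()]
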
class Logscale(Filter, ClsMod):
    lib = code.filters.logscalelib
    def apply(self, fb, gnm, dim, tc, stream=None):
        """Log-scale in place."""
        k1 = f32(gnm.color.brightness(tc) * 268 / 256)
        # The old definition of area was (w*h/(s*s)). Since the new scale
        # 'ns' is now s/w, the new definition is (w*h/(ns*ns*w*w)), which
        # simplifies to (h/(ns*ns*w)).
        area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w)
        k2 = f32(1.0 / (area * gnm.spp(tc)))
        nbins = dim.ah * dim.astride
        launch('logscale', self.mod, stream, 256, nbins/256,
               fb.d_front, fb.d_front, k1, k2)

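# A quick check of the area algebra in Logscale.apply above (illustrative
# numbers, sketch only): with w, h in pixels and the new scale ns = s / w,
#   w * h / s**2  ==  w * h / (ns * w)**2  ==  h / (ns**2 * w)
# e.g. w, h, s = 1920., 1080., 3840. gives ns = 2.0 and both sides 0.140625.
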
class ColorClip(Filter, ClsMod):
    lib = code.filters.colorcliplib
    def apply(self, fb, gnm, dim, tc, stream=None):
        # TODO: implement integration over cubic splines?
        gam = f32(1 / gnm.color.gamma(tc))
        vib = f32(gnm.color.vibrance(tc))
        hipow = f32(gnm.color.highlight_power(tc))
        lin = f32(gnm.color.gamma_threshold(tc))
        lingam = f32(lin ** (gam-1.0) if lin > 0 else 0)
        bkgd = vec.make_float3(
            gnm.color.background.r(tc),
            gnm.color.background.g(tc),
            gnm.color.background.b(tc))

        nbins = dim.ah * dim.astride
        blocks = int(np.ceil(np.sqrt(nbins / 256.)))
        launch('colorclip', self.mod, stream, 256, (blocks, blocks),
               fb.d_front, gam, vib, hipow, lin, lingam, bkgd, i32(nbins))
49  cuburn/output.py  Normal file
@@ -0,0 +1,49 @@
import numpy as np
from numpy import float32 as f32, int32 as i32

import pycuda.driver as cuda

from code.util import ClsMod, launch
from code.output import f32tou8lib

import scipy.misc

if not hasattr(scipy.misc, 'toimage'):
    raise ImportError("Could not find scipy.misc.toimage. "
                      "Are scipy and PIL installed?")

class Output(object):
    def convert(self, fb, gnm, dim, stream=None):
        """
        Convert a filtered buffer to whatever output format is needed by the
        writer.
        """
        raise NotImplementedError()

    def copy(self, fb, dim, pool, stream=None):
        """
        Schedule a copy from the device buffer to host memory, returning the
        target buffer.
        """
        raise NotImplementedError()

class PILOutput(Output, ClsMod):
    lib = f32tou8lib

    def convert(self, fb, gnm, dim, stream=None):
        launch('f32_to_u8', self.mod, stream,
               (32, 8, 1), (int(np.ceil(dim.w/32.)), int(np.ceil(dim.h/8.))),
               fb.d_rb, fb.d_seeds, fb.d_back, fb.d_front,
               i32(fb.gutter), i32(dim.w), i32(dim.astride), i32(dim.h))

    def copy(self, fb, dim, pool, stream=None):
        h_out = pool.allocate((dim.h, dim.w, 4), 'u1')
        cuda.memcpy_dtoh_async(h_out, fb.d_back, stream)
        return h_out

    @staticmethod
    def save(buf, name, type=None, quality=98):
        if type == 'jpeg' or (type is None and name.endswith('.jpg')):
            buf = buf[:,:,:3]
        img = scipy.misc.toimage(buf, cmin=0, cmax=1)
        img.save(name, type, quality=quality)
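
# Usage sketch (stream and pool names assumed, mirroring RenderManager later
# in this commit): convert in-stream, queue the async readback, and only
# touch h_out once the stream (or an event recorded after the copy) is done.
#
#   out = PILOutput()
#   out.convert(fb, gnm, dim, stream)         # u8 RGBA lands in fb.d_back
#   h_out = out.copy(fb, dim, pool, stream)   # page-locked host target
#   stream.synchronize()
#   PILOutput.save(h_out, 'frame_00001.jpg')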
683  cuburn/render.py
@@ -1,383 +1,372 @@
"""
|
||||
Resources and tools to perform rendering.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
import time as timemod
|
||||
import time
|
||||
import tempfile
|
||||
from collections import namedtuple
|
||||
from itertools import cycle, repeat, chain, izip, imap, ifilter
|
||||
from ctypes import *
|
||||
from cStringIO import StringIO
|
||||
import numpy as np
|
||||
from numpy import float32 as f32, int32 as i32, uint32 as u32, uint64 as u64
|
||||
from scipy import ndimage
|
||||
|
||||
import pycuda.compiler
|
||||
import pycuda.driver as cuda
|
||||
import pycuda.tools
|
||||
|
||||
import cuburn.genome
|
||||
from cuburn import affine
|
||||
from cuburn.code import util, mwc, iter, interp, filtering, sort
|
||||
import filters
|
||||
import output
|
||||
from code import util, mwc, iter, interp, sort
|
||||
from code.util import ClsMod, devlib, filldptrlib, assemble_code, launch
|
||||
|
||||
RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time')
|
||||
Dimensions = namedtuple('Dimensions', 'w h aw ah astride')
|
||||
|
||||
def _sync_stream(dst, src):
    dst.wait_for_event(cuda.Event(cuda.event_flags.DISABLE_TIMING).record(src))

class Framebuffers(object):
    """
    The largest memory allocations, and a stream to serialize their use.

class Renderer(object):
    ``d_front`` and ``d_back`` are separate buffers, each large enough to hold
    four float32 components per pixel (including any gutter pixels added for
    alignment or padding). ``d_side`` is another buffer large enough to hold
    two float32 components per pixel.

    Every user of this set of buffers may use and overwrite the buffers in
    any way, as long as the output for the next stage winds up in the front
    buffer. The front and back buffers can be exchanged by the ``flip()``
    method (which simply exchanges the pointers); while no similar method
    exists for the side buffer, you're free to do the same by taking local
    copies of the references and exchanging them yourself.

    There's one spot in the stream interleaving where the behavior is
    different: the ``Output.convert`` call must store its output to the back
    buffer, which will remain untouched until the dtoh copy of the converted
    buffer is finished. This happens while the ``iter`` kernel of the next
    frame writes to the front and side buffers, which means in practice that
    there's essentially no waiting on any buffers.

    If an output module decides to get all krazy and actually replaces the
    references to the buffers on this object - to, say, implement a temporally
    aware tone-mapping or denoising pass - that's probably okay, but just make
    sure everything ends up where the next stage expects it.
    """
    Control structure for rendering a series of frames.

    # Minimum extension of accumulation buffer beyond output region. Used to
    # alleviate edge effects during filtering. Actual gutter may be larger to
    # accommodate alignment requirements; when it is, that extension will be
    # applied to the lower-right corner of the buffer. This is asymmetrical,
    # but simplifies trimming logic when it's time for that.
    gutter = 10

    @classmethod
    def calc_dim(cls, width, height):
        """
        Given a width and height, return a valid set of dimensions which
        include at least enough gutter to exceed the minimum, and where
        (acc_width % 32) == 0 and (acc_height % 8) == 0.
        """
        awidth = width + 2 * cls.gutter
        aheight = 8 * int(np.ceil((height + 2 * cls.gutter) / 8.))
        astride = 32 * int(np.ceil(awidth / 32.))
        return Dimensions(width, height, awidth, aheight, astride)

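    # A worked example of the padding math above (values computed by hand):
    # at 1080p with the minimum gutter of 10,
    #   awidth  = 1920 + 20             = 1940
    #   aheight = 8 * ceil(1100 / 8.)   = 1104
    #   astride = 32 * ceil(1940 / 32.) = 1952
    # so calc_dim(1920, 1080) returns
    # Dimensions(w=1920, h=1080, aw=1940, ah=1104, astride=1952).
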
    def __init__(self):
        self.stream = cuda.Stream()
        self._clear()

        # These resources rely on the slots/ringbuffer mechanism for sharing,
        # and so can be shared across any number of launches, genomes, and
        # render kernels. Notably, seeds are self-synchronizing, so they're
        # not attached to either stream object.
        self.d_rb = cuda.to_device(np.array([0, 0], dtype=u32))
        seeds = mwc.make_seeds(util.DEFAULT_RB_SIZE * 256)
        self.d_seeds = cuda.to_device(seeds)
        self._len_d_points = util.DEFAULT_RB_SIZE * 256 * 16
        self.d_points = cuda.mem_alloc(self._len_d_points)

    def _clear(self):
        self.nbins = self.d_front = self.d_back = self.d_side = None

    def free(self, stream=None):
        if stream is not None:
            stream.synchronize()
        else:
            # (Assumed: the original line was cut off here; a full context
            # sync is the natural fallback when no stream is given.)
            cuda.Context.synchronize()
        for p in (self.d_front, self.d_back, self.d_side):
            if p is not None:
                p.free()
        self._clear()

    def alloc(self, dim, stream=None):
        """
        Ensure that this object's framebuffers are large enough to handle the
        given dimensions, allocating new ones if not.

        If ``stream`` is not None and a reallocation is necessary, the stream
        will be synchronized before the old buffers are deallocated.
        """
        nbins = dim.ah * dim.astride
        if self.nbins >= nbins: return
        if self.nbins is not None: self.free()
        try:
            self.d_front = cuda.mem_alloc(16 * nbins)
            self.d_back = cuda.mem_alloc(16 * nbins)
            self.d_side = cuda.mem_alloc(8 * nbins)
            self.nbins = nbins
        except cuda.MemoryError, e:
            # If a frame that's too large sneaks by the task distributor, we
            # don't want to kill the server, but we also don't want to leave
            # it stuck without any free memory to complete the next alloc.
            # TODO: measure free mem and only take tasks that fit (but that
            # should be done elsewhere)
            self.free(stream)
            raise e

    def set_dim(self, width, height, stream=None):
        """
        Compute padded dimensions for the given width and height, ensure that
        the buffers are large enough (reallocating if not), and return the
        calculated dimensions.

        Note that the returned dimensions are always the same for a given
        width, height, and minimum gutter, even if the underlying buffers are
        larger due to a previous allocation.
        """
        dim = self.calc_dim(width, height)
        self.alloc(dim, stream)
        return dim

    def flip(self):
        """Flip the front and back buffers."""
        self.d_front, self.d_back = self.d_back, self.d_front

class DevSrc(object):
    """
    The buffers which represent a genome on-device, in the formats needed to
    serve as a source for interpolating temporal samples.
    """

    # Maximum number of knots per parameter. This also covers the maximum
    # number of palettes allowed.
    max_knots = 1 << util.DEFAULT_SEARCH_ROUNDS

    # Maximum number of parameters per genome. This number is exceedingly
    # high, and should never even come close to being hit.
    max_params = 1024

    def __init__(self):
        self.d_times = cuda.mem_alloc(4 * self.max_knots * self.max_params)
        self.d_knots = cuda.mem_alloc(4 * self.max_knots * self.max_params)
        self.d_ptimes = cuda.mem_alloc(4 * self.max_knots)
        self.d_pals = cuda.mem_alloc(4 * 4 * 256 * self.max_knots)

class DevInfo(object):
    """
    The buffers which hold temporal samples on-device, as used by iter.
    """

    # The palette texture/surface covers the color coordinate from [0,1] with
    # equidistant horizontal samples, and spans the temporal range of the
    # frame linearly with this many rows. Increasing these values increases
    # the number of uniquely-dithered samples when using pre-dithered
    # surfaces, as is done in 'atomic' accumulation.
    palette_width = 256     # TODO: make this setting be respected
    palette_height = 64

    # This used to be determined automagically, but simultaneous occupancy
    # and a much smaller block size simplify this.
    ntemporal_samples = 1024

    # Number of iterations to run without writing after generating a new
    # point. This number is currently fixed pretty deeply in the set of magic
    # constants which govern buffer sizes; changing the value here won't
    # actually change the code on the device to do something different.
    # It's here just for documentation purposes.
    fuse = 256

    # The palette texture/surface covers the color coordinate from [0,1] with
    # (for now, a fixed 256) equidistant horizontal samples, and spans the
    # temporal range of the frame linearly with this many rows. Increasing
    # this value increases the number of uniquely-dithered samples when using
    # pre-dithered surfaces.
    palette_height = 64

    # Palette color interpolation mode (see code.interp.Palette)
    palette_interp_mode = 'yuv'

    # Maximum width of DE and other spatial filters, and thus in turn the
    # amount of padding applied. Note that, for now, this must not be changed!
    # The filtering code makes deep assumptions about this value.
    gutter = 10

    # Accumulation mode. Leave it at 'atomic' for now.
    acc_mode = 'atomic'

    # At most this many separate buffers for xforms will be allocated, after
    # which further xforms will wrap to the first when writing. Currently it
    # is compiled in, so power-of-two and no runtime maximization. The current
    # value of 16 fits into a 1GB card at 1080p.
    max_nxf = 1

    # TODO
    chaos_used = False

    cmp_options = ('-use_fast_math', '-maxrregcount', '42')
    keep = False

    def __init__(self):
        self._iter = self.pal = self.src = self.cubin = self.mod = None

        # Ensure class options don't get contaminated on an instance
        self.cmp_options = list(self.cmp_options)

    def compile(self, genome, keep=None, cmp_options=None):
        """
        Compile a kernel capable of rendering every frame in this animation.
        The resulting compiled kernel is stored in the ``cubin`` property;
        the source is available as ``src``, and is also returned for
        inspection and display.

        This operation is idempotent, and has no side effects outside of
        setting properties on this instance (unless there's a compiler error,
        which is a bug); it should therefore be threadsafe as well.
        It is, however, rather slow.
        """
        keep = self.keep if keep is None else keep
        cmp_options = self.cmp_options if cmp_options is None else cmp_options

        self._iter = iter.IterCode(self, genome)
        self._iter.packer.finalize()
        self.pal = interp.Palette(self.palette_interp_mode)
        self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer,
                                      self.pal, self._iter)
        with open(os.path.join(tempfile.gettempdir(), 'kernel.cu'), 'w') as fp:
            fp.write(self.src)
        self.cubin = pycuda.compiler.compile(
            self.src, keep=keep, options=cmp_options,
            cache_dir=False if keep else None)

    def load(self, genome, jit_options=[]):
        if not self.cubin:
            self.compile(genome)
        self.mod = cuda.module_from_buffer(self.cubin, jit_options)
        with open('/tmp/iter_kern.cubin', 'wb') as fp:
            fp.write(self.cubin)
        return self.src

    def render(self, genome, times, width, height, blend=True):
        """
        Render a frame for each timestamp in the iterable value ``times``.
        This function returns a generator that will yield a RenderedImage
        object containing a shared reference to the output buffer for each
        specified frame.

        The returned buffer is page-locked host memory. Between the time a
        buffer is yielded and the time the next frame's results are requested,
        the buffer will not be modified. Thereafter, however, it will be
        overwritten by an asynchronous DMA operation coming from the CUDA
        device. If you hang on to it for longer than one frame, copy it.

        ``genome`` is the genome to be rendered. Successive calls to the
        ``render()`` method on one ``Renderer`` object must use genomes which
        produce identical compiled code, and this will not be verified by the
        renderer. In practice, this means you can alter genome parameter
        values, but the full set of keys must remain identical between runs on
        the same renderer.

        ``times`` is a list of (idx, cen_time) tuples, where ``idx`` is passed
        unmodified in the RenderedImage return value and ``cen_time`` is the
        central time of the current frame in spline-time units. (Any
        clock-time or frame-time units in the genome should be preconverted.)

        If ``blend`` is False, the output buffer will contain unclipped,
        premultiplied RGBA data, without vibrancy, highlight power, or the
        alpha elbow applied.
        """
        r = self.render_gen(genome, width, height, blend=blend)
        next(r)
        return ifilter(None, imap(r.send, chain(times, [None])))

    def render_gen(self, genome, width, height, blend=True):
        """
        Render frames. This method is wrapped by the ``render()`` method; see
        its docstring for warnings and details.

        Instead of passing frame times as an iterable, they are passed
        individually via the ``generator.send()`` method. There is an
        internal pipeline latency of one frame, so the first call to the
        ``send()`` method will return None, the second call will return the
        first frame's result, and so on. To retrieve the last frame in a
        sequence, send ``None``.

        Direct use of this method is useful for implementing render servers.
        """
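        # A sketch of the send() pipeline described above (frame indices and
        # times here are illustrative): each send() returns the previous
        # frame's result, and a final send(None) drains the last one.
        #
        #   gen = renderer.render_gen(genome, 1920, 1080)
        #   next(gen)
        #   assert gen.send((0, 0.5)) is None    # frame 0 queued, none ready
        #   img0 = gen.send((1, 1.5))            # frame 1 queued, frame 0 out
        #   img1 = gen.send(None)                # pipeline drained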
        last_idx = None
        next_frame = yield
        if next_frame is None:
            return

        if not self.mod:
            self.load(genome)

        filt = filtering.Filtering()

        reset_rb_fun = self.mod.get_function("reset_rb")
        packer_fun = self.mod.get_function("interp_iter_params")
        iter_fun = self.mod.get_function("iter")

        # The synchronization model is messy. See helpers/task_model.svg.
        iter_stream = cuda.Stream()
        filt_stream = cuda.Stream()
        if self.acc_mode == 'deferred':
            write_stream = cuda.Stream()
            write_fun = self.mod.get_function("write_shmem")
        else:
            write_stream = iter_stream

        # These events fire when the corresponding buffer is available for
        # reading on the host (i.e. the copy is done). On the first pass, 'a'
        # will be ignored, and subsequently moved to 'b'.
        event_a = cuda.Event().record(filt_stream)
        event_b = None

        awidth = width + 2 * self.gutter
        aheight = 32 * int(np.ceil((height + 2 * self.gutter) / 32.))
        astride = 32 * int(np.ceil(awidth / 32.))
        dim = Dimensions(width, height, awidth, aheight, astride)
        d_acc_size = self.mod.get_global('acc_size')[0]
        cuda.memcpy_htod_async(d_acc_size, u32(list(dim)), write_stream)

        nbins = astride * aheight
        nxf = len(filter(lambda g: g != 'final', genome.xforms))
        nxf = min(nxf, self.max_nxf)
        d_accum = cuda.mem_alloc(16 * nbins * nxf)
        d_out = cuda.mem_alloc(16 * nbins)
        if self.acc_mode == 'atomic':
            d_atom = cuda.mem_alloc(8 * nbins * nxf)
            flush_fun = self.mod.get_function("flush_atom")
        else:
            # d_atom is also used as a scratch buffer during filtering, so we
            # need it at least this size
            d_atom = cuda.mem_alloc(4 * nbins)

        obuf_copy = util.argset(cuda.Memcpy2D(),
            src_y=self.gutter, src_x_in_bytes=16*self.gutter,
            src_pitch=16*astride, dst_pitch=16*width,
            width_in_bytes=16*width, height=height)
        obuf_copy.set_src_device(d_out)
        h_out_a = cuda.pagelocked_empty((height, width, 4), f32)
        h_out_b = cuda.pagelocked_empty((height, width, 4), f32)

        if self.acc_mode == 'deferred':
            # Having a fixed, power-of-two log size makes things much easier
            log_size = 64 << 20
            d_log = cuda.mem_alloc(log_size * 4)
            d_log_sorted = cuda.mem_alloc(log_size * 4)
            sorter = sort.Sorter(log_size)
            # We need to cover each unique tag - address bits 20-23 - with one
            # write block per sort bin. Or something like that.
            nwriteblocks = int(np.ceil(nbins / float(1<<20))) * 256

        # Calculate 'nslots', the number of simultaneous running threads that
        # can be active on the GPU during iteration (and thus the number of
        # slots for loading and storing RNG and point context that will be
        # prepared on the device), and derive 'rb_size', the number of blocks
        # in 'nslots'.
        iter_threads_per_block = 256
        dev_data = pycuda.tools.DeviceData()
        occupancy = pycuda.tools.OccupancyRecord(
            dev_data, iter_threads_per_block,
            iter_fun.shared_size_bytes, iter_fun.num_regs)
        nsms = cuda.Context.get_device().multiprocessor_count
        rb_size = occupancy.warps_per_mp * nsms / (iter_threads_per_block / 32)
        nslots = iter_threads_per_block * rb_size

        # Reset the ringbuffer info for the slots
        reset_rb_fun(np.int32(rb_size), block=(1,1,1))

        d_points = cuda.mem_alloc(nslots * 16)
        # This statement may add extra seeds to simplify palette dithering.
        seeds = mwc.MWC.make_seeds(max(nslots, 256 * self.palette_height))
        d_seeds = cuda.to_device(seeds)

        # We used to auto-calculate this to a multiple of the number of SMs on
        # the device, but since we now use shorter launches and, to a certain
        # extent, allow simultaneous occupancy, that's not as important. The
        # 1024 is a magic constant to ensure reasonable and power-of-two log
        # size for deferred: 256MB / (4B * FUSE * NTHREADS). Enhancements to
        # the sort engine are needed to make this more flexible.
        ntemporal_samples = 1024
        genome_times, genome_knots = self._iter.packer.pack()
        d_genome_times = cuda.to_device(genome_times)
        d_genome_knots = cuda.to_device(genome_knots)
        info_size = 4 * len(self._iter.packer) * ntemporal_samples
        d_infos = cuda.mem_alloc(info_size)

        ptimes, pidxs = zip(*genome.palette_times)
        palint_times = np.empty(len(genome_times[0]), f32)
        palint_times.fill(1e10)
        palint_times[:len(ptimes)] = ptimes
        d_palint_times = cuda.to_device(palint_times)
        pvals = self.pal.prepare([genome.decoded_palettes[i] for i in pidxs])
        d_palint_vals = cuda.to_device(np.concatenate(pvals))

        if self.acc_mode in ('deferred', 'atomic'):
            palette_fun = self.mod.get_function("interp_palette_flat")
            dsc = util.argset(cuda.ArrayDescriptor3D(),
                              height=self.palette_height, width=256, depth=0,
        self.d_params = cuda.mem_alloc(
            self.ntemporal_samples * DevSrc.max_params * 4)
        self.palette_surf_dsc = util.argset(cuda.ArrayDescriptor3D(),
            height=self.palette_height, width=self.palette_width, depth=0,
            format=cuda.array_format.SIGNED_INT32,
            num_channels=2, flags=cuda.array3d_flags.SURFACE_LDST)
            palarray = cuda.Array(dsc)
        self.d_pal_array = cuda.Array(self.palette_surf_dsc)

class Renderer(object):
    def __init__(self, gnm):
        self.packer, self.lib = iter.mkiterlib(gnm)
        cubin = util.compile('iter', assemble_code(self.lib))
        self.mod = cuda.module_from_buffer(cubin)

        # TODO: make these customizable
        self.filts = [ filters.Bilateral()
                     , filters.Logscale()
                     , filters.ColorClip() ]
        self.out = output.PILOutput()

class RenderManager(ClsMod):
    lib = devlib(deps=[interp.palintlib, filldptrlib, iter.flushatomlib])

    def __init__(self):
        super(RenderManager, self).__init__()
        self.pool = pycuda.tools.PageLockedMemoryPool()

        self.fb = Framebuffers()
        self.src_a, self.src_b = DevSrc(), DevSrc()
        self.info_a, self.info_b = DevInfo(), DevInfo()
        self.stream_a, self.stream_b = cuda.Stream(), cuda.Stream()
        self.filt_evt = self.copy_evt = None

    def _copy(self, rdr, gnm):
        """
        Queue a copy of a host genome into a set of device interpolation
        source buffers.

        Note that for now, this is broken! It ignores ``gnm``, and only packs
        the genome that was used when creating the renderer.
        """
        times, knots = rdr.packer.pack(self.pool)
        cuda.memcpy_htod_async(self.src_a.d_times, times, self.stream_a)
        cuda.memcpy_htod_async(self.src_a.d_knots, knots, self.stream_a)

        ptimes, pidxs = zip(*gnm.palette_times)
        palettes = self.pool.allocate((len(ptimes), 256, 4), f32)
        palettes[:] = [gnm.decoded_palettes[i] for i in pidxs]
        palette_times = self.pool.allocate((self.src_a.max_knots,), f32)
        palette_times.fill(1e9)
        palette_times[:len(ptimes)] = ptimes
        cuda.memcpy_htod_async(self.src_a.d_pals, palettes, self.stream_a)
        cuda.memcpy_htod_async(self.src_a.d_ptimes, palette_times,
                               self.stream_a)

        # TODO: use bilerp tex as src for palette interp

    def _interp(self, rdr, gnm, dim, ts, td):
        d_acc_size = rdr.mod.get_global('acc_size')[0]
        p_dim = self.pool.allocate((len(dim),), u32)
        p_dim[:] = dim
        cuda.memcpy_htod_async(d_acc_size, p_dim, self.stream_a)

            tref = self.mod.get_surfref('flatpal')
            tref.set_array(palarray, 0)
        else:
            palette_fun = self.mod.get_function("interp_palette")
            dsc = util.argset(cuda.ArrayDescriptor(),
                              height=self.palette_height, width=256,
                              format=cuda.array_format.UNSIGNED_INT8,
                              num_channels=4)
            d_palmem = cuda.mem_alloc(256 * self.palette_height * 4)
        tref.set_array(self.info_a.d_pal_array, 0)
        launch('interp_palette_flat', self.mod, self.stream_a,
               256, self.info_a.palette_height,
               self.fb.d_rb, self.fb.d_seeds,
               self.src_a.d_ptimes, self.src_a.d_pals,
               f32(ts), f32(td / self.info_a.palette_height))

            tref = self.mod.get_texref('palTex')
            tref.set_address_2d(d_palmem, dsc, 1024)
            tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
            tref.set_filter_mode(cuda.filter_mode.LINEAR)
        nts = self.info_a.ntemporal_samples
        launch('interp_iter_params', rdr.mod, self.stream_a,
               256, np.ceil(nts / 256.),
               self.info_a.d_params, self.src_a.d_times, self.src_a.d_knots,
               f32(ts), f32(td / nts), i32(nts))

        while next_frame is not None:
            # tc, td, ts, te: central, delta, start, end times
            idx, tc = next_frame
            td = genome.adj_frame_width(tc)

    def _print_interp_knots(self, rdr, tsidx=5):
        infos = cuda.from_device_like(self.info_a.d_params,
                                      (tsidx + 1, self.info_a.max_params), f32)
        for i, n in zip(infos[-1], rdr.packer.packed):
            print '%60s %g' % ('_'.join(n), i)

    def _iter(self, rdr, gnm, dim, tc):
        tref = rdr.mod.get_surfref('flatpal')
        tref.set_array(self.info_a.d_pal_array, 0)

        nbins = dim.ah * dim.astride
        fill = lambda b, s, v=i32(0): util.fill_dptr(
            self.mod, b, s, stream=self.stream_a, value=v)
        fill(self.fb.d_front, 4 * nbins)
        fill(self.fb.d_side, 2 * nbins)
        fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan))

        nts = self.info_a.ntemporal_samples
        nsamps = (gnm.spp(tc) * dim.w * dim.h)
        nrounds = int(nsamps / (nts * 256. * 256)) + 1
        launch('iter', rdr.mod, self.stream_a, (32, 8, 1), (nts, nrounds),
               self.fb.d_front, self.fb.d_side,
               self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
               self.info_a.d_params)

        nblocks = int(np.ceil(np.sqrt(dim.ah*dim.astride/256.)))
        launch('flush_atom', self.mod, self.stream_a,
               256, (nblocks, nblocks),
               u64(self.fb.d_front), u64(self.fb.d_side), i32(nbins))

    def queue_frame(self, rdr, gnm, tc, w, h, copy=True):
        """
        Queue one frame for rendering.

        ``rdr`` is a compiled Renderer module. The caller must ensure that
        the module is compatible with the genome data provided.

        ``gnm`` is the genome to be rendered.

        ``tc`` is the center time at which to render.

        ``w``, ``h`` are the width and height of the desired output in px.

        If ``copy`` is False, the genome data will not be recopied for each
        new genome. This function must be called with ``copy=True`` the first
        time a new genome is used, and may be called in that manner
        subsequently without harm. I suspect the performance impact is low,
        so leave ``copy`` set to True every time for now.

        The return value is a 2-tuple ``(evt, h_out)``, where ``evt`` is a
        CUDA event and ``h_out`` is the return value of the output module's
        ``copy`` function. In the typical case, ``h_out`` will be a host
        allocation containing data in an appropriate format for the output
        module's file writer, and ``evt`` indicates when the asynchronous
        DMA copy which will populate ``h_out`` is complete. This can vary
        depending on the output module in use, though.
        """
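        # A usage sketch of the (evt, h_out) contract above (illustrative
        # only; RenderManager.render() below is the real consumer):
        #
        #   evt, h_out = mgr.queue_frame(rdr, gnm, 0.5, 1920, 1080)
        #   while not evt.query():
        #       time.sleep(0.01)                 # h_out now safe to read
        #   output.PILOutput.save(h_out, 'frame_00000.jpg')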
        # Note: we synchronize on the previous stream if buffers need to be
        # reallocated, which implicitly also syncs the current stream.
        dim = self.fb.set_dim(w, h, self.stream_b)

        td = gnm.adj_frame_width(tc)
        ts, te = tc - 0.5 * td, tc + 0.5 * td

            if self.acc_mode in ('deferred', 'atomic'):
                # In this mode, the palette writes to a surface reference, but
                # requires dithering, so we pass it the seeds instead
                arg0 = d_seeds
            else:
                arg0 = d_palmem
            palette_fun(arg0, d_palint_times, d_palint_vals,
                        f32(ts), f32(td / self.palette_height),
                        block=(256,1,1), grid=(self.palette_height,1),
                        stream=write_stream)
        # The stream interleaving here is nontrivial.
        # TODO: update diagram and link to it here
        if copy:
            self.src_a, self.src_b = self.src_b, self.src_a
            self._copy(rdr, gnm)
        self._interp(rdr, gnm, dim, ts, td)
        if self.filt_evt:
            self.stream_a.wait_for_event(self.filt_evt)
        self._iter(rdr, gnm, dim, tc)
        if self.copy_evt:
            self.stream_a.wait_for_event(self.copy_evt)
        for filt in rdr.filts:
            filt.apply(self.fb, gnm, dim, tc, self.stream_a)
        rdr.out.convert(self.fb, gnm, dim, self.stream_a)
        self.filt_evt = cuda.Event().record(self.stream_a)
        h_out = rdr.out.copy(self.fb, dim, self.pool, self.stream_a)
        self.copy_evt = cuda.Event().record(self.stream_a)

            packer_fun(d_infos, d_genome_times, d_genome_knots,
                       f32(ts), f32(td / ntemporal_samples),
                       i32(ntemporal_samples), block=(256,1,1),
                       grid=(int(np.ceil(ntemporal_samples/256.)),1),
                       stream=iter_stream)

            # Reset points so that they will be FUSEd
            util.BaseCode.fill_dptr(self.mod, d_points, 4 * nslots,
                                    iter_stream, f32(np.nan))

            # Get interpolated control points for debugging
            #iter_stream.synchronize()
            #d_temp = cuda.from_device(d_infos,
            #        (ntemporal_samples, len(self._iter.packer)), f32)
            #for i, n in zip(d_temp[5], self._iter.packer.packed):
            #    print '%60s %g' % ('_'.join(n), i)

            util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins * nxf,
                                    write_stream)
            if self.acc_mode == 'atomic':
                util.BaseCode.fill_dptr(self.mod, d_atom, 2 * nbins * nxf,
                                        write_stream)
            nrounds = int( (genome.spp(tc) * width * height)
                           / (ntemporal_samples * 256 * 256) ) + 1
            if self.acc_mode == 'deferred':
                for i in range(nrounds):
                    iter_fun(np.uint64(d_log), d_seeds, d_points, d_infos,
                             block=(32, self._iter.NTHREADS/32, 1),
                             grid=(ntemporal_samples, 1), stream=iter_stream)
                    _sync_stream(write_stream, iter_stream)
                    sorter.sort(d_log_sorted, d_log, log_size, 3, True,
                                stream=write_stream)
                    _sync_stream(iter_stream, write_stream)
                    write_fun(d_accum, d_log_sorted, sorter.dglobal, i32(nbins),
                              block=(1024, 1, 1), grid=(nwriteblocks, 1),
                              stream=write_stream)
            else:
                args = [u64(d_accum), d_seeds, d_points, d_infos]
                if self.acc_mode == 'atomic':
                    args.append(u64(d_atom))
                iter_fun(*args, block=(32, self._iter.NTHREADS/32, 1),
                         grid=(ntemporal_samples, nrounds), stream=iter_stream)
                if self.acc_mode == 'atomic':
                    nblocks = int(np.ceil(np.sqrt(nbins*nxf/float(512))))
                    flush_fun(u64(d_accum), u64(d_atom), i32(nbins*nxf),
                              block=(512, 1, 1), grid=(nblocks, nblocks),
                              stream=iter_stream)

            util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
            _sync_stream(filt_stream, write_stream)
            filt.de(d_out, d_accum, d_atom, genome, dim, tc, nxf,
                    stream=filt_stream)
            _sync_stream(write_stream, filt_stream)
            filt.colorclip(d_out, genome, dim, tc, blend, stream=filt_stream)
            obuf_copy.set_dst_host(h_out_a)
            obuf_copy(filt_stream)

            if event_b:
                while not event_a.query():
                    timemod.sleep(0.01)
                gpu_time = event_a.time_since(event_b)
                result = RenderedImage(h_out_b, last_idx, gpu_time)
            else:
                result = None
            last_idx = idx

            event_a, event_b = cuda.Event().record(filt_stream), event_a
            h_out_a, h_out_b = h_out_b, h_out_a

            # TODO: add ability to flush a frame without breaking the pipe
            next_frame = yield result

        while not event_a.query():
            timemod.sleep(0.001)
        gpu_time = event_a.time_since(event_b)
        yield RenderedImage(h_out_b, last_idx, gpu_time)
        self.info_a, self.info_b = self.info_b, self.info_a
        self.stream_a, self.stream_b = self.stream_b, self.stream_a
        return self.copy_evt, h_out

    def render(self, gnm, times, w, h):
        """
        A port of the old rendering function, retained for backwards
        compatibility. Some of this will be pulled into as-yet-undecided
        methods for more DRY.
        """
        rdr = Renderer(gnm)
        last_evt = cuda.Event().record(self.stream_a)
        last_idx = None
        def wait(): # Times like these where you wish for a macro
            while not last_evt.query():
                time.sleep(0.01)
            gpu_time = last_evt.time_since(two_evts_ago)
            # Return the buffer paired with last_evt, not the frame that was
            # just queued (whose copy may still be in flight).
            return RenderedImage(last_buf, last_idx, gpu_time)
        for idx, tc in times:
            evt, h_buf = self.queue_frame(rdr, gnm, tc, w, h, last_idx is None)
            if last_idx:
                yield wait()
            two_evts_ago, last_evt = last_evt, evt
            last_buf, last_idx = h_buf, idx
        if last_idx:
            yield wait()
37  main.py
@@ -3,8 +3,8 @@
# cuburn, one of a surprisingly large number of ports of the fractal flame
# algorithm to NVIDIA GPUs.
#
# This one is copyright 2010-2011, Steven Robertson <steven@strobe.cc>
# and Eric Reckase <e.reckase@gmail.com>.
# This one is copyright 2010-2012, Steven Robertson <steven@strobe.cc>
# and Erik Reckase <e.reckase@gmail.com>.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 or later
@@ -20,11 +20,9 @@ from subprocess import Popen
from itertools import ifilter

import numpy as np
import Image
import scipy
import pycuda.driver as cuda

from cuburn import genome, render
from cuburn import genome, render, filters, output

profiles = {
    '1080p': dict(fps=24, width=1920, height=1080, quality=3000, skip=0),
@@ -33,11 +31,10 @@ profiles = {
    'preview': dict(fps=24, width=640, height=360, quality=800, skip=1)
}

def save(rframe):
    noalpha = rframe.buf[:,:,:3]
    img = scipy.misc.toimage(noalpha, cmin=0, cmax=1)
    img.save(rframe.idx, quality=98)
    print rframe.idx, rframe.gpu_time
def save(out):
    # Temporary! TODO: fix this
    output.PILOutput.save(out.buf, out.idx)
    print out.idx, out.gpu_time

def main(args, prof):
    import pycuda.autoinit
@@ -53,9 +50,7 @@ def main(args, prof):
    gnm = genome.Genome(gnm)
    err, times = gnm.set_profile(prof)

    anim = render.Renderer()
    anim.compile(gnm, keep=args.keep)
    anim.load(gnm)
    rmgr = render.RenderManager()

    basename = os.path.basename(args.flame.name).rsplit('.', 1)[0] + '_'
    if args.flame.name == '-':
@@ -76,9 +71,13 @@ def main(args, prof):
        if not os.path.isfile(f[0]) or m > os.path.getmtime(f[0]))

    w, h = prof['width'], prof['height']
    gen = anim.render(gnm, frames, w, h)
    gen = rmgr.render(gnm, frames, w, h)

    if not args.gfx:
        for out in gen:
            save(out)
        return

    if args.gfx:
        import pyglet
        window = pyglet.window.Window(w, h, vsync=False)
        image = pyglet.image.CheckerImagePattern().create_image(w, h)
@@ -126,11 +125,7 @@ def main(args, prof):
    pyglet.clock.set_fps_limit(30)
    pyglet.clock.schedule_interval(poll, 1/30.)
    pyglet.app.run()
    else:
        for out in gen:
            save(out)
    if args.sync:
        cuda.Context.synchronize()

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Render fractal flames.')
@@ -150,8 +145,6 @@ if __name__ == "__main__":
    parser.add_argument('--pause', action='store_true',
        help="Don't close the preview window after rendering is finished")

    parser.add_argument('--keep', action='store_true', dest='keep',
        help='Keep compilation directory (disables kernel caching)')
    parser.add_argument('--sync', action='store_true', dest='sync',
        help='Use synchronous launches whenever possible')

66  worker.py
@@ -17,6 +17,8 @@ import redis

from cuburn import render, genome

import pycuda.driver as cuda

pycuda = None

# The default maximum number of waiting jobs. Also used to determine when a
@@ -50,20 +52,6 @@ def get_temperature():
        return out[idx+1:idx+3]
    return ''

def push_frame(r, out):
    if out is None:
        return
    sid, sidx, ftag = out.idx
    # TODO: gotta put this in a module somewhere and make it adjustable
    noalpha = out.buf[:,:,:3]
    img = scipy.misc.toimage(noalpha, cmin=0, cmax=1)
    buf = StringIO()
    img.save(buf, 'jpeg', quality=98)
    buf.seek(0)
    head = ' '.join([sidx, str(out.gpu_time), ftag])
    r.rpush(sid + ':queue', head + '\0' + buf.read())
    print 'Pushed frame: %s' % head

def work(server):
    global pycuda
    import pycuda.autoinit
@@ -80,7 +68,11 @@ def work(server):
        r.expire(wid, 180)
        last_ping = time.time()

    last_pid, last_gid, riter = None, None, None
    idx = evt = buf = None
    last_idx = last_buf = last_evt = two_evts_ago = None
    last_pid = last_gid = rdr = None

    mgr = render.RenderManager()

    while True:
        task = r.blpop('renderpool:' + rev + ':queue', 10)
@@ -90,28 +82,46 @@ def work(server):
            r.expire(wid, 180)
            last_ping = now

        # last_evt will be populated during normal queue operation (when evt
        # contains the most recent event), as well as when the render queue is
        # flushing due to not receiving a new task before the timeout.
        if last_idx is not None:
            while not last_evt.query():
                # This delay could probably be a lot higher with zero impact
                # on throughput for Fermi cards
                time.sleep(0.05)

            sid, sidx, ftag = last_idx
            obuf = StringIO()
            # Save the buffer paired with last_evt/last_idx; the most recent
            # frame's buffer may still be mid-copy at this point.
            rdr.out.save(last_buf, obuf, 'jpeg')
            obuf.seek(0)
            gpu_time = last_evt.time_since(two_evts_ago)
            head = ' '.join([sidx, str(gpu_time), ftag])
            r.rpush(sid + ':queue', head + '\0' + obuf.read())
            print 'Pushed frame: %s' % head

            two_evts_ago, last_evt = last_evt, evt
            last_idx, last_buf = idx, buf

        if not task:
            if riter:
                push_frame(r, riter.send(None))
                riter = None
            idx = evt = buf = None
            continue

        sid, sidx, pid, gid, ftime, ftag = task[1].split(' ', 5)
        if pid != last_pid or gid != last_gid or not riter:
        if pid != last_pid or gid != last_gid or not rdr:
            gnm = genome.Genome(json.loads(r.get(gid)))
            prof = json.loads(r.get(pid))
            gnm.set_profile(prof)
            renderer = render.Renderer()
            renderer.load(gnm)
            rdr = render.Renderer(gnm)

            if riter:
                push_frame(r, riter.send(None))
            if last_evt is None:
                # Create a dummy event for timing
                last_evt = cuda.Event().record(mgr.stream_a)

            riter = renderer.render_gen(gnm, prof['width'], prof['height'])
            next(riter)
            last_pid, last_gid = pid, gid

        push_frame(r, riter.send(((sid, sidx, ftag), float(ftime))))
        copy = last_idx is None
        w, h = prof['width'], prof['height']
        evt, buf = mgr.queue_frame(rdr, gnm, float(ftime), w, h, copy)
        idx = sid, sidx, ftag

def iter_genomes(prof, gpaths, pname='540p'):
    """