mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 11:40:04 -05:00
Sweeping refactor. More bugs undoubtedly remain.
This commit is contained in:
parent
4cfd328f85
commit
60a45c9a20
@ -1,9 +0,0 @@
|
|||||||
"""
|
|
||||||
Contains the PTX fragments which will drive the device, and helper functions
|
|
||||||
to combine those fragments.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import util
|
|
||||||
import mwc
|
|
||||||
import iter
|
|
||||||
|
|
77
cuburn/code/color.py
Normal file
77
cuburn/code/color.py
Normal file
@ -0,0 +1,77 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from util import devlib
|
||||||
|
|
||||||
|
# The JPEG YUV full-range matrix, without bias into the positve regime.
|
||||||
|
# This assumes input color space is CIERGB D65, encoded with gamma 2.2.
|
||||||
|
# Note that some interpolated colors may exceed the sRGB and YUV gamuts.
|
||||||
|
YUV_MATRIX = np.matrix([[ 0.299, 0.587, 0.114],
|
||||||
|
[-0.168736, -0.331264, 0.5],
|
||||||
|
[ 0.5, -0.418688, -0.081312]])
|
||||||
|
|
||||||
|
yuvlib = devlib(defs='''
|
||||||
|
__device__ float3 rgb2yuv(float3 rgb);
|
||||||
|
__device__ float3 yuv2rgb(float3 yuv);
|
||||||
|
''', decls=r'''
|
||||||
|
/* This conversion uses the JPEG full-range standard. Note that UV have range
|
||||||
|
* [-0.5, 0.5], so consider biasing the results. */
|
||||||
|
__device__ float3 rgb2yuv(float3 rgb) {
|
||||||
|
return make_float3(
|
||||||
|
0.299f * rgb.x + 0.587f * rgb.y + 0.114f * rgb.z,
|
||||||
|
-0.168736f * rgb.x - 0.331264f * rgb.y + 0.5f * rgb.z,
|
||||||
|
0.5f * rgb.x - 0.418688f * rgb.y - 0.081312f * rgb.z);
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ float3 yuv2rgb(float3 yuv) {
|
||||||
|
return make_float3(
|
||||||
|
yuv.x + 1.402f * yuv.z,
|
||||||
|
yuv.x - 0.34414f * yuv.y - 0.71414f * yuv.z,
|
||||||
|
yuv.x + 1.772f * yuv.y);
|
||||||
|
}
|
||||||
|
''')
|
||||||
|
|
||||||
|
hsvlib = devlib(decls='''
|
||||||
|
__device__ float3 rgb2hsv(float3 rgb);
|
||||||
|
__device__ float3 hsv2rgb(float3 hsv);
|
||||||
|
''', defs=r'''
|
||||||
|
__device__ float3 rgb2hsv(float3 rgb) {
|
||||||
|
float M = fmaxf(fmaxf(rgb.x, rgb.y), rgb.z);
|
||||||
|
float m = fminf(fminf(rgb.x, rgb.y), rgb.z);
|
||||||
|
float C = M - m;
|
||||||
|
|
||||||
|
float s = M > 0.0f ? C / M : 0.0f;
|
||||||
|
|
||||||
|
float h = 0.0f;
|
||||||
|
if (s != 0.0f) {
|
||||||
|
C = 1.0f / C;
|
||||||
|
float rc = (M - rgb.x) * C;
|
||||||
|
float gc = (M - rgb.y) * C;
|
||||||
|
float bc = (M - rgb.z) * C;
|
||||||
|
|
||||||
|
if (rgb.x == M) h = bc - gc;
|
||||||
|
else if (rgb.y == M) h = 2 + rc - bc;
|
||||||
|
else h = 4 + gc - rc;
|
||||||
|
|
||||||
|
if (h < 0) h += 6.0f;
|
||||||
|
}
|
||||||
|
return make_float3(h, s, M);
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ float3 hsv2rgb(float3 hsv) {
|
||||||
|
float whole = floorf(hsv.x);
|
||||||
|
float frac = hsv.x - whole;
|
||||||
|
float val = hsv.z;
|
||||||
|
float min = val * (1 - hsv.y);
|
||||||
|
float mid = val * (1 - (hsv.y * frac));
|
||||||
|
float alt = val * (1 - (hsv.y * (1 - frac)));
|
||||||
|
|
||||||
|
float3 out;
|
||||||
|
if (whole == 0.0f) { out.x = val; out.y = alt; out.z = min; }
|
||||||
|
else if (whole == 1.0f) { out.x = mid; out.y = val; out.z = min; }
|
||||||
|
else if (whole == 2.0f) { out.x = min; out.y = val; out.z = alt; }
|
||||||
|
else if (whole == 3.0f) { out.x = min; out.y = mid; out.z = val; }
|
||||||
|
else if (whole == 4.0f) { out.x = alt; out.y = min; out.z = val; }
|
||||||
|
else { out.x = val; out.y = min; out.z = mid; }
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
''')
|
@ -1,13 +1,7 @@
|
|||||||
import numpy as np
|
from util import devlib
|
||||||
from numpy import float32 as f32, int32 as i32
|
from color import yuvlib
|
||||||
|
|
||||||
import pycuda.driver as cuda
|
texshearlib = devlib(defs=r'''
|
||||||
import pycuda.compiler
|
|
||||||
from pycuda.gpuarray import vec
|
|
||||||
|
|
||||||
from cuburn.code.util import *
|
|
||||||
|
|
||||||
_CODE = r'''
|
|
||||||
// Filter directions specified in degrees, using image/texture addressing
|
// Filter directions specified in degrees, using image/texture addressing
|
||||||
// [(0,0) is upper left corner, 90 degrees is down].
|
// [(0,0) is upper left corner, 90 degrees is down].
|
||||||
|
|
||||||
@ -22,7 +16,9 @@ __constant__ float2 addressing_patterns[16] = {
|
|||||||
{ 1.0f, -0.333333f}, { 0.333333f, 1.0f}, // 14, 15: -15, 75
|
{ 1.0f, -0.333333f}, { 0.333333f, 1.0f}, // 14, 15: -15, 75
|
||||||
};
|
};
|
||||||
|
|
||||||
// Mon dieu! A C++ feature?
|
// Mon dieu! A C++ feature? Gotta to close the "extern C" added by the compiler.
|
||||||
|
}
|
||||||
|
|
||||||
template <typename T> __device__ T
|
template <typename T> __device__ T
|
||||||
tex_shear(texture<T, cudaTextureType2D> ref, int pattern,
|
tex_shear(texture<T, cudaTextureType2D> ref, int pattern,
|
||||||
float x, float y, float radius) {
|
float x, float y, float radius) {
|
||||||
@ -39,100 +35,14 @@ tex_shear(texture<T, cudaTextureType2D> ref, int pattern,
|
|||||||
}
|
}
|
||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
''')
|
||||||
|
|
||||||
__global__
|
logscalelib = devlib(defs=r'''
|
||||||
void colorclip(float4 *pixbuf, float gamma, float vibrance, float highpow,
|
__global__ void
|
||||||
float linrange, float lingam, float3 bkgd,
|
logscale(float4 *outbuf, const float4 *pixbuf, float k1, float k2) {
|
||||||
int fbsize, int blend_background_color) {
|
|
||||||
int i = threadIdx.x + blockDim.x * (blockIdx.x + gridDim.x * blockIdx.y);
|
|
||||||
if (i >= fbsize) return;
|
|
||||||
|
|
||||||
float4 pix = pixbuf[i];
|
|
||||||
|
|
||||||
if (pix.w <= 0) {
|
|
||||||
pixbuf[i] = make_float4(bkgd.x, bkgd.y, bkgd.z, 0.0f);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
pix.y -= 0.5f * pix.w;
|
|
||||||
pix.z -= 0.5f * pix.w;
|
|
||||||
float3 tmp = yuv2rgb(make_float3(pix.x, pix.y, pix.z));
|
|
||||||
pix.x = tmp.x;
|
|
||||||
pix.y = tmp.y;
|
|
||||||
pix.z = tmp.z;
|
|
||||||
|
|
||||||
pix.x = fmaxf(0.0f, pix.x);
|
|
||||||
pix.y = fmaxf(0.0f, pix.y);
|
|
||||||
pix.z = fmaxf(0.0f, pix.z);
|
|
||||||
|
|
||||||
float4 opix = pix;
|
|
||||||
|
|
||||||
float alpha = powf(pix.w, gamma);
|
|
||||||
if (pix.w < linrange) {
|
|
||||||
float frac = pix.w / linrange;
|
|
||||||
alpha = (1.0f - frac) * pix.w * lingam + frac * alpha;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!blend_background_color) {
|
|
||||||
float ls = alpha / pix.w;
|
|
||||||
pix.x *= ls;
|
|
||||||
pix.y *= ls;
|
|
||||||
pix.z *= ls;
|
|
||||||
pix.w = alpha;
|
|
||||||
pixbuf[i] = pix;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
float ls = vibrance * alpha / pix.w;
|
|
||||||
alpha = fminf(1.0f, fmaxf(0.0f, alpha));
|
|
||||||
|
|
||||||
float maxc = fmaxf(pix.x, fmaxf(pix.y, pix.z));
|
|
||||||
float maxa = maxc * ls;
|
|
||||||
float newls = 1.0f / maxc;
|
|
||||||
|
|
||||||
if (maxa > 1.0f && highpow >= 0.0f) {
|
|
||||||
float lsratio = powf(newls / ls, highpow);
|
|
||||||
pix.x *= newls;
|
|
||||||
pix.y *= newls;
|
|
||||||
pix.z *= newls;
|
|
||||||
|
|
||||||
// Reduce saturation (according to the HSV model) by proportionally
|
|
||||||
// increasing the values of the other colors.
|
|
||||||
pix.x = maxc - (maxc - pix.x) * lsratio;
|
|
||||||
pix.y = maxc - (maxc - pix.y) * lsratio;
|
|
||||||
pix.z = maxc - (maxc - pix.z) * lsratio;
|
|
||||||
} else {
|
|
||||||
float adjhlp = -highpow;
|
|
||||||
if (adjhlp > 1.0f || maxa <= 1.0f) adjhlp = 1.0f;
|
|
||||||
if (maxc > 0.0f) {
|
|
||||||
float adj = ((1.0f - adjhlp) * newls + adjhlp * ls);
|
|
||||||
pix.x *= adj;
|
|
||||||
pix.y *= adj;
|
|
||||||
pix.z *= adj;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pix.x += (1.0f - vibrance) * powf(opix.x, gamma);
|
|
||||||
pix.y += (1.0f - vibrance) * powf(opix.y, gamma);
|
|
||||||
pix.z += (1.0f - vibrance) * powf(opix.z, gamma);
|
|
||||||
|
|
||||||
pix.x += (1.0f - alpha) * bkgd.x;
|
|
||||||
pix.y += (1.0f - alpha) * bkgd.y;
|
|
||||||
pix.z += (1.0f - alpha) * bkgd.z;
|
|
||||||
|
|
||||||
pix.x = fminf(1.0f, pix.x);
|
|
||||||
pix.y = fminf(1.0f, pix.y);
|
|
||||||
pix.z = fminf(1.0f, pix.z);
|
|
||||||
pix.w = alpha;
|
|
||||||
|
|
||||||
pixbuf[i] = pix;
|
|
||||||
}
|
|
||||||
|
|
||||||
__global__
|
|
||||||
void logscale(float4 *outbuf, const float4 *pixbuf, float k1, float k2) {
|
|
||||||
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||||
float4 pix = pixbuf[i];
|
float4 pix = pixbuf[i];
|
||||||
|
|
||||||
// float ls = fmaxf(0, k1 * logf(1.0f + pix.w * pix.w * k2 / (1 + pix.w)) / pix.w);
|
|
||||||
float ls = fmaxf(0, k1 * logf(1.0f + pix.w * k2) / pix.w);
|
float ls = fmaxf(0, k1 * logf(1.0f + pix.w * k2) / pix.w);
|
||||||
pix.x *= ls;
|
pix.x *= ls;
|
||||||
pix.y *= ls;
|
pix.y *= ls;
|
||||||
@ -141,10 +51,12 @@ void logscale(float4 *outbuf, const float4 *pixbuf, float k1, float k2) {
|
|||||||
|
|
||||||
outbuf[i] = pix;
|
outbuf[i] = pix;
|
||||||
}
|
}
|
||||||
|
''')
|
||||||
|
|
||||||
|
fmabuflib = devlib(defs=r'''
|
||||||
// Element-wise computation of ``dst[i]=dst[i]+src[i]*scale``.
|
// Element-wise computation of ``dst[i]=dst[i]+src[i]*scale``.
|
||||||
__global__
|
__global__ void
|
||||||
void fma_buf(float4 *dst, const float4 *src, int astride, float scale) {
|
fma_buf(float4 *dst, const float4 *src, int astride, float scale) {
|
||||||
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
int y = blockIdx.y * blockDim.y + threadIdx.y;
|
int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||||
int i = y * astride + x;
|
int i = y * astride + x;
|
||||||
@ -155,23 +67,16 @@ void fma_buf(float4 *dst, const float4 *src, int astride, float scale) {
|
|||||||
d.w += s.w * scale;
|
d.w += s.w * scale;
|
||||||
dst[i] = d;
|
dst[i] = d;
|
||||||
}
|
}
|
||||||
|
''')
|
||||||
|
|
||||||
texture<float4, cudaTextureType2D> bilateral_src;
|
denblurlib = devlib(decls='''
|
||||||
texture<float, cudaTextureType2D> blur_src;
|
texture<float, cudaTextureType2D> blur_src;
|
||||||
|
|
||||||
__global__ void logscale_den(float *dst, float k2) {
|
|
||||||
int xi = blockIdx.x * blockDim.x + threadIdx.x;
|
|
||||||
int yi = blockIdx.y * blockDim.y + threadIdx.y;
|
|
||||||
float4 pix = tex2D(bilateral_src, xi, yi);
|
|
||||||
float out = logf(1.0f + pix.w * k2);
|
|
||||||
dst[yi * (blockDim.x * gridDim.x) + xi] = out;
|
|
||||||
}
|
|
||||||
|
|
||||||
__constant__ float gauss_coefs[9] = {
|
__constant__ float gauss_coefs[9] = {
|
||||||
0.00443305f, 0.05400558f, 0.24203623f, 0.39905028f,
|
0.00443305f, 0.05400558f, 0.24203623f, 0.39905028f,
|
||||||
0.24203623f, 0.05400558f, 0.00443305f
|
0.24203623f, 0.05400558f, 0.00443305f
|
||||||
};
|
};
|
||||||
|
''', defs=r'''
|
||||||
// Apply a Gaussian-esque blur to the density channel of the texture in
|
// Apply a Gaussian-esque blur to the density channel of the texture in
|
||||||
// ``bilateral_src`` in the horizontal direction, and write it to ``dst``, a
|
// ``bilateral_src`` in the horizontal direction, and write it to ``dst``, a
|
||||||
// one-channel buffer.
|
// one-channel buffer.
|
||||||
@ -203,7 +108,11 @@ __global__ void den_blur_1c(float *dst, int pattern, int upsample) {
|
|||||||
* gauss_coefs[i];
|
* gauss_coefs[i];
|
||||||
dst[yi * (blockDim.x * gridDim.x) + xi] = den;
|
dst[yi * (blockDim.x * gridDim.x) + xi] = den;
|
||||||
}
|
}
|
||||||
|
''')
|
||||||
|
|
||||||
|
bilaterallib = devlib(deps=[logscalelib, texshearlib, denblurlib], decls='''
|
||||||
|
texture<float4, cudaTextureType2D> bilateral_src;
|
||||||
|
''', defs=r'''
|
||||||
/* sstd: spatial standard deviation (Gaussian filter)
|
/* sstd: spatial standard deviation (Gaussian filter)
|
||||||
* cstd: color standard deviation (Gaussian on the range [0, 1], where 1
|
* cstd: color standard deviation (Gaussian on the range [0, 1], where 1
|
||||||
* represents an "opposite" color).
|
* represents an "opposite" color).
|
||||||
@ -215,9 +124,8 @@ __global__ void den_blur_1c(float *dst, int pattern, int upsample) {
|
|||||||
* gspeed: Speed of (exp2f) Gompertz distribution governing how much to
|
* gspeed: Speed of (exp2f) Gompertz distribution governing how much to
|
||||||
* tighten gradients. Zero and negative values OK.
|
* tighten gradients. Zero and negative values OK.
|
||||||
*/
|
*/
|
||||||
|
__global__ void
|
||||||
__global__
|
bilateral(float4 *dst, int pattern, int radius,
|
||||||
void bilateral(float4 *dst, int pattern, int radius,
|
|
||||||
float sstd, float cstd, float dstd, float dpow, float gspeed)
|
float sstd, float cstd, float dstd, float dpow, float gspeed)
|
||||||
{
|
{
|
||||||
int xi = blockIdx.x * blockDim.x + threadIdx.x;
|
int xi = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
@ -317,124 +225,83 @@ void bilateral(float4 *dst, int pattern, int radius,
|
|||||||
const int astride = blockDim.x * gridDim.x;
|
const int astride = blockDim.x * gridDim.x;
|
||||||
dst[yi * astride + xi] = out;
|
dst[yi * astride + xi] = out;
|
||||||
}
|
}
|
||||||
|
''')
|
||||||
|
|
||||||
} // end extern "C"
|
colorcliplib = devlib(deps=[yuvlib], defs=r'''
|
||||||
'''
|
__global__ void
|
||||||
|
colorclip(float4 *pixbuf, float gamma, float vibrance, float highpow,
|
||||||
|
float linrange, float lingam, float3 bkgd, int fbsize)
|
||||||
|
{
|
||||||
|
int i = threadIdx.x + blockDim.x * (blockIdx.x + gridDim.x * blockIdx.y);
|
||||||
|
if (i >= fbsize) return;
|
||||||
|
|
||||||
class Filtering(HunkOCode):
|
float4 pix = pixbuf[i];
|
||||||
mod = None
|
|
||||||
defs = _CODE
|
|
||||||
|
|
||||||
@classmethod
|
if (pix.w <= 0) {
|
||||||
def init_mod(cls):
|
pixbuf[i] = make_float4(bkgd.x, bkgd.y, bkgd.z, 0.0f);
|
||||||
if cls.mod is None:
|
return;
|
||||||
cls.mod = pycuda.compiler.SourceModule(assemble_code(BaseCode, cls),
|
}
|
||||||
options=['-use_fast_math', '-maxrregcount', '32'],
|
pix.y -= 0.5f * pix.w;
|
||||||
no_extern_c=True)
|
pix.z -= 0.5f * pix.w;
|
||||||
|
float3 tmp = yuv2rgb(make_float3(pix.x, pix.y, pix.z));
|
||||||
|
pix.x = tmp.x;
|
||||||
|
pix.y = tmp.y;
|
||||||
|
pix.z = tmp.z;
|
||||||
|
|
||||||
def __init__(self):
|
pix.x = fmaxf(0.0f, pix.x);
|
||||||
self.init_mod()
|
pix.y = fmaxf(0.0f, pix.y);
|
||||||
|
pix.z = fmaxf(0.0f, pix.z);
|
||||||
|
|
||||||
def de(self, ddst, dsrc, dscratch, gnm, dim, tc, nxf, stream=None):
|
float4 opix = pix;
|
||||||
|
|
||||||
# Helper variables and functions to keep it clean
|
float alpha = powf(pix.w, gamma);
|
||||||
sb = 16 * dim.astride
|
if (pix.w < linrange) {
|
||||||
bs = sb * dim.ah
|
float frac = pix.w / linrange;
|
||||||
bl, gr = (32, 8, 1), (dim.astride / 32, dim.ah / 8)
|
alpha = (1.0f - frac) * pix.w * lingam + frac * alpha;
|
||||||
|
}
|
||||||
|
|
||||||
def launch(f, *args, **kwargs):
|
float ls = vibrance * alpha / pix.w;
|
||||||
f(*args, block=bl, grid=gr, stream=stream, **kwargs)
|
alpha = fminf(1.0f, fmaxf(0.0f, alpha));
|
||||||
mkdsc = lambda c: argset(cuda.ArrayDescriptor(), height=dim.ah,
|
|
||||||
width=dim.astride, num_channels=c,
|
|
||||||
format=cuda.array_format.FLOAT)
|
|
||||||
def mktref(n):
|
|
||||||
tref = self.mod.get_texref(n)
|
|
||||||
tref.set_filter_mode(cuda.filter_mode.POINT)
|
|
||||||
tref.set_address_mode(0, cuda.address_mode.WRAP)
|
|
||||||
tref.set_address_mode(1, cuda.address_mode.WRAP)
|
|
||||||
return tref
|
|
||||||
|
|
||||||
dsc = mkdsc(4)
|
float maxc = fmaxf(pix.x, fmaxf(pix.y, pix.z));
|
||||||
tref = mktref('bilateral_src')
|
float maxa = maxc * ls;
|
||||||
grad_dsc = mkdsc(1)
|
float newls = 1.0f / maxc;
|
||||||
grad_tref = mktref('blur_src')
|
|
||||||
|
|
||||||
bilateral, logscale_den, den_blur, den_blur_1c, fma_buf = map(
|
if (maxa > 1.0f && highpow >= 0.0f) {
|
||||||
self.mod.get_function,
|
float lsratio = powf(newls / ls, highpow);
|
||||||
'bilateral logscale_den den_blur den_blur_1c fma_buf'.split())
|
pix.x *= newls;
|
||||||
|
pix.y *= newls;
|
||||||
|
pix.z *= newls;
|
||||||
|
|
||||||
# Number of different shear patterns to use when filtering. Must be
|
// Reduce saturation (according to the HSV model) by proportionally
|
||||||
# even, since we depend on buffer bouncing (but I don't think that it's
|
// increasing the values of the other colors.
|
||||||
# a requirement for the filter itself to get decent results).
|
pix.x = maxc - (maxc - pix.x) * lsratio;
|
||||||
DIRECTIONS = 8
|
pix.y = maxc - (maxc - pix.y) * lsratio;
|
||||||
|
pix.z = maxc - (maxc - pix.z) * lsratio;
|
||||||
|
} else {
|
||||||
|
float adjhlp = -highpow;
|
||||||
|
if (adjhlp > 1.0f || maxa <= 1.0f) adjhlp = 1.0f;
|
||||||
|
if (maxc > 0.0f) {
|
||||||
|
float adj = ((1.0f - adjhlp) * newls + adjhlp * ls);
|
||||||
|
pix.x *= adj;
|
||||||
|
pix.y *= adj;
|
||||||
|
pix.z *= adj;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
def do_bilateral(bsrc, bdst, pattern, r=15, sstd=6, cstd=0.05,
|
pix.x += (1.0f - vibrance) * powf(opix.x, gamma);
|
||||||
dstd=1.5, dpow=0.8, gspeed=4.0):
|
pix.y += (1.0f - vibrance) * powf(opix.y, gamma);
|
||||||
# Scale spatial parameter so that a "pixel" is equivalent to an
|
pix.z += (1.0f - vibrance) * powf(opix.z, gamma);
|
||||||
# actual pixel at 1080p
|
|
||||||
sstd *= dim.w / 1920.
|
|
||||||
|
|
||||||
tref.set_address_2d(bsrc, dsc, sb)
|
pix.x += (1.0f - alpha) * bkgd.x;
|
||||||
|
pix.y += (1.0f - alpha) * bkgd.y;
|
||||||
|
pix.z += (1.0f - alpha) * bkgd.z;
|
||||||
|
|
||||||
# Blur density two octaves along sampling vector, ultimately
|
pix.x = fminf(1.0f, pix.x);
|
||||||
# storing in `dscratch`
|
pix.y = fminf(1.0f, pix.y);
|
||||||
launch(den_blur, np.intp(bdst), i32(pattern), i32(0),
|
pix.z = fminf(1.0f, pix.z);
|
||||||
texrefs=[tref])
|
pix.w = alpha;
|
||||||
grad_tref.set_address_2d(bdst, grad_dsc, sb / 4)
|
|
||||||
launch(den_blur_1c, dscratch, i32(pattern), i32(1),
|
|
||||||
texrefs=[grad_tref])
|
|
||||||
grad_tref.set_address_2d(dscratch, grad_dsc, sb / 4)
|
|
||||||
|
|
||||||
launch(bilateral, np.intp(bdst), i32(pattern), i32(r),
|
pixbuf[i] = pix;
|
||||||
f32(sstd), f32(cstd), f32(dstd), f32(dpow), f32(gspeed),
|
}
|
||||||
texrefs=[tref, grad_tref])
|
''')
|
||||||
|
|
||||||
def do_bilateral_range(bsrc, bdst, npats, *args, **kwargs):
|
|
||||||
for i in range(npats):
|
|
||||||
do_bilateral(bsrc, bdst, i, *args, **kwargs)
|
|
||||||
bdst, bsrc = bsrc, bdst
|
|
||||||
if npats % 2:
|
|
||||||
cuda.memcpy_dtod_async(bdst, bsrc, bs, stream)
|
|
||||||
|
|
||||||
# Filter the first xform, using `ddst` as an intermediate buffer.
|
|
||||||
# End result is the filtered copy in `dsrc`.
|
|
||||||
do_bilateral_range(dsrc, ddst, DIRECTIONS)
|
|
||||||
|
|
||||||
# Filter the remaining xforms, using `ddst` as an intermediate
|
|
||||||
# buffer, then add the result to `dsrc` (at the zero'th xform).
|
|
||||||
for x in range(1, nxf):
|
|
||||||
src = int(dsrc) + x * bs
|
|
||||||
do_bilateral_range(src, ddst, DIRECTIONS)
|
|
||||||
launch(fma_buf, dsrc, np.intp(src), i32(dim.astride), f32(1))
|
|
||||||
|
|
||||||
# Log-scale the accumulated buffer in `dsrc`.
|
|
||||||
k1 = f32(gnm.color.brightness(tc) * 268 / 256)
|
|
||||||
# Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
|
|
||||||
# s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
|
|
||||||
area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w)
|
|
||||||
k2 = f32(1.0 / (area * gnm.spp(tc)))
|
|
||||||
|
|
||||||
nbins = dim.ah * dim.astride
|
|
||||||
logscale = self.mod.get_function("logscale")
|
|
||||||
t = logscale(ddst, dsrc, k1, k2,
|
|
||||||
block=(512, 1, 1), grid=(nbins/512, 1), stream=stream)
|
|
||||||
|
|
||||||
def colorclip(self, dbuf, gnm, dim, tc, blend, stream=None):
|
|
||||||
nbins = dim.ah * dim.astride
|
|
||||||
|
|
||||||
# TODO: implement integration over cubic splines?
|
|
||||||
gam = f32(1 / gnm.color.gamma(tc))
|
|
||||||
vib = f32(gnm.color.vibrance(tc))
|
|
||||||
hipow = f32(gnm.color.highlight_power(tc))
|
|
||||||
lin = f32(gnm.color.gamma_threshold(tc))
|
|
||||||
lingam = f32(lin ** (gam-1.0) if lin > 0 else 0)
|
|
||||||
bkgd = vec.make_float3(
|
|
||||||
gnm.color.background.r(tc),
|
|
||||||
gnm.color.background.g(tc),
|
|
||||||
gnm.color.background.b(tc))
|
|
||||||
|
|
||||||
color_fun = self.mod.get_function("colorclip")
|
|
||||||
blocks = int(np.ceil(np.sqrt(nbins / 256)))
|
|
||||||
color_fun(dbuf, gam, vib, hipow, lin, lingam, bkgd, i32(nbins),
|
|
||||||
i32(blend), block=(256, 1, 1), grid=(blocks, blocks),
|
|
||||||
stream=stream)
|
|
@ -2,8 +2,10 @@ from collections import OrderedDict
|
|||||||
from itertools import cycle
|
from itertools import cycle
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
import tempita
|
import util
|
||||||
from util import HunkOCode, Template, BaseCode, assemble_code
|
from util import Template, assemble_code, devlib, binsearchlib, ringbuflib
|
||||||
|
from color import yuvlib
|
||||||
|
from mwc import mwclib
|
||||||
|
|
||||||
class GenomePackerName(str):
|
class GenomePackerName(str):
|
||||||
"""Class to indicate that a property is precalculated on the device"""
|
"""Class to indicate that a property is precalculated on the device"""
|
||||||
@ -115,17 +117,10 @@ class GenomePackerPrecalc(GenomePackerView):
|
|||||||
def _code(self, code):
|
def _code(self, code):
|
||||||
self.packer.precalc_code.append(code)
|
self.packer.precalc_code.append(code)
|
||||||
|
|
||||||
class GenomePacker(HunkOCode):
|
class GenomePacker(object):
|
||||||
"""
|
"""
|
||||||
Packs a genome for use in iteration.
|
Packs a genome for use in iteration.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# 2^search_rounds is the maximum number of control points, including
|
|
||||||
# endpoints, that can be used in a single genome. It should be okay to
|
|
||||||
# change this number without touching other code, but 32 samples fits
|
|
||||||
# nicely on a single cache line.
|
|
||||||
search_rounds = 5
|
|
||||||
|
|
||||||
def __init__(self, tname):
|
def __init__(self, tname):
|
||||||
"""
|
"""
|
||||||
Create a new DataPacker.
|
Create a new DataPacker.
|
||||||
@ -152,6 +147,7 @@ class GenomePacker(HunkOCode):
|
|||||||
|
|
||||||
self.packed = None
|
self.packed = None
|
||||||
self.genome = None
|
self.genome = None
|
||||||
|
self.search_rounds = util.DEFAULT_SEARCH_ROUNDS
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
assert self._len is not None, 'len() called before finalize()'
|
assert self._len is not None, 'len() called before finalize()'
|
||||||
@ -189,42 +185,41 @@ class GenomePacker(HunkOCode):
|
|||||||
|
|
||||||
self._len = len(self.packed)
|
self._len = len(self.packed)
|
||||||
|
|
||||||
self.decls = self._decls.substitute(
|
decls = self._decls.substitute(packed=self.packed, tname=self.tname)
|
||||||
packed=self.packed, tname=self.tname,
|
defs = self._defs.substitute(
|
||||||
search_rounds=self.search_rounds)
|
|
||||||
self.defs = self._defs.substitute(
|
|
||||||
packed_direct=self.packed_direct, tname=self.tname,
|
packed_direct=self.packed_direct, tname=self.tname,
|
||||||
precalc_code=self.precalc_code,
|
precalc_code=self.precalc_code,
|
||||||
search_rounds=self.search_rounds)
|
search_rounds=self.search_rounds)
|
||||||
|
|
||||||
|
return devlib(deps=[catmullromlib], decls=decls, defs=defs)
|
||||||
|
|
||||||
def pack(self):
|
def pack(self, pool=None):
|
||||||
"""
|
"""
|
||||||
Return a packed copy of the genome ready for uploading to the GPU as a
|
Return a packed copy of the genome ready for uploading to the GPU,
|
||||||
3D NDArray, with the first element being the times and the second
|
as two float32 NDArrays for the knot times and values.
|
||||||
being the values.
|
|
||||||
"""
|
"""
|
||||||
width = 1 << self.search_rounds
|
width = 1 << self.search_rounds
|
||||||
out = np.empty((2, len(self.genome), width), dtype=np.float32)
|
if pool:
|
||||||
# Ensure that unused values at the top are always big (must be >2.0)
|
times = pool.allocate((len(self.genome), width), 'f4')
|
||||||
out[0].fill(1e9)
|
knots = pool.allocate((len(self.genome), width), 'f4')
|
||||||
|
else:
|
||||||
|
times, knots = np.empty((2, len(self.genome), width), 'f4')
|
||||||
|
times.fill(1e9)
|
||||||
|
|
||||||
for idx, gname in enumerate(self.genome):
|
for idx, gname in enumerate(self.genome):
|
||||||
attr = self.ns[gname[0]]
|
attr = self.ns[gname[0]]
|
||||||
for g in gname[1:]:
|
for g in gname[1:]:
|
||||||
attr = getattr(attr, g)
|
attr = getattr(attr, g)
|
||||||
out[0][idx][:len(attr.knots[0])] = attr.knots[0]
|
times[idx,:len(attr.knots[0])] = attr.knots[0]
|
||||||
out[1][idx][:len(attr.knots[1])] = attr.knots[1]
|
knots[idx,:len(attr.knots[1])] = attr.knots[1]
|
||||||
return out
|
return times, knots
|
||||||
|
|
||||||
_defs = Template(r"""
|
_defs = Template(r"""
|
||||||
|
__global__ void interp_{{tname}}(
|
||||||
__global__
|
|
||||||
void interp_{{tname}}(
|
|
||||||
{{tname}}* out,
|
{{tname}}* out,
|
||||||
const float *times, const float *knots,
|
const float *times, const float *knots,
|
||||||
float tstart, float tstep, int maxid
|
float tstart, float tstep, int maxid)
|
||||||
) {
|
{
|
||||||
int id = gtid();
|
int id = gtid();
|
||||||
if (id >= maxid) return;
|
if (id >= maxid) return;
|
||||||
out = &out[id];
|
out = &out[id];
|
||||||
@ -249,7 +244,6 @@ void interp_{{tname}}(
|
|||||||
}
|
}
|
||||||
{{endfor}}
|
{{endfor}}
|
||||||
}
|
}
|
||||||
|
|
||||||
""")
|
""")
|
||||||
|
|
||||||
_decls = Template(r"""
|
_decls = Template(r"""
|
||||||
@ -259,22 +253,13 @@ typedef struct {
|
|||||||
{{endfor}}
|
{{endfor}}
|
||||||
} {{tname}};
|
} {{tname}};
|
||||||
|
|
||||||
/* Search through the fixed-size list 'hay' to find the rightmost index which
|
|
||||||
* contains a value strictly smaller than the input 'needle'. The crazy
|
|
||||||
* bitwise search is just for my own amusement.
|
|
||||||
*/
|
|
||||||
__device__
|
|
||||||
int bitwise_binsearch(const float *hay, float needle) {
|
|
||||||
int lo = 0;
|
|
||||||
|
|
||||||
// TODO: improve efficiency on 64-bit arches
|
""")
|
||||||
{{for i in range(search_rounds-1, -1, -1)}}
|
|
||||||
if (needle > hay[lo + {{1 << i}}])
|
|
||||||
lo += {{1 << i}};
|
|
||||||
{{endfor}}
|
|
||||||
return lo;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
catmullromlib = devlib(deps=[binsearchlib], decls=r'''
|
||||||
|
__device__ __noinline__
|
||||||
|
float catmull_rom(const float *times, const float *knots, float t);
|
||||||
|
''', defs=r'''
|
||||||
__device__ __noinline__
|
__device__ __noinline__
|
||||||
float catmull_rom(const float *times, const float *knots, float t) {
|
float catmull_rom(const float *times, const float *knots, float t) {
|
||||||
int idx = bitwise_binsearch(times, t);
|
int idx = bitwise_binsearch(times, t);
|
||||||
@ -303,67 +288,14 @@ float catmull_rom(const float *times, const float *knots, float t) {
|
|||||||
+ m2 * ( ttt - tt)
|
+ m2 * ( ttt - tt)
|
||||||
+ k2 * (-2.0f*ttt + 3.0f*tt);
|
+ k2 * (-2.0f*ttt + 3.0f*tt);
|
||||||
}
|
}
|
||||||
|
''')
|
||||||
|
|
||||||
__global__
|
palintlib = devlib(deps=[binsearchlib, ringbuflib, yuvlib, mwclib], decls='''
|
||||||
void test_cr(const float *times, const float *knots, const float *t, float *r) {
|
surface<void, cudaSurfaceType2D> flatpal;
|
||||||
int i = threadIdx.x + blockDim.x * blockIdx.x;
|
''', defs=r'''
|
||||||
r[i] = catmull_rom(times, knots, t[i]);
|
__device__ float4
|
||||||
}
|
interp_color(const float *times, const float4 *sources, float time)
|
||||||
""")
|
{
|
||||||
|
|
||||||
class Palette(HunkOCode):
|
|
||||||
# The JPEG YUV full-range matrix, without bias into the positve regime.
|
|
||||||
# This assumes input color space is CIERGB D65, encoded with gamma 2.2.
|
|
||||||
# Note that some interpolated colors may exceed the sRGB and YUV gamuts.
|
|
||||||
YUV = np.matrix([[ 0.299, 0.587, 0.114],
|
|
||||||
[-0.168736, -0.331264, 0.5],
|
|
||||||
[ 0.5, -0.418688, -0.081312]])
|
|
||||||
|
|
||||||
def __init__(self, interp_mode):
|
|
||||||
assert interp_mode in self.modes
|
|
||||||
self.mode = interp_mode
|
|
||||||
self.defs = self._defs.substitute(mode=interp_mode)
|
|
||||||
|
|
||||||
def prepare(self, palettes):
|
|
||||||
"""
|
|
||||||
Produce palettes suitable for uploading to the device. Returns an
|
|
||||||
array of palettes in the same size and shape as the input.
|
|
||||||
|
|
||||||
This function will never modify its argument, but may return it
|
|
||||||
unmodified for certain interpolation modes.
|
|
||||||
"""
|
|
||||||
if self.mode == 'yuvpolar':
|
|
||||||
ys, uvrs, uvts, alphas = zip(*map(self.rgbtoyuvpolar, palettes))
|
|
||||||
# Center all medians as closely to 0 as possible
|
|
||||||
means = np.mean(uvts, axis=1)
|
|
||||||
newmeans = (means + np.pi) % (2 * np.pi) - np.pi
|
|
||||||
uvts = (newmeans - means).reshape((-1, 1)) + uvts
|
|
||||||
zipped = zip(ys, uvrs, uvts, alphas)
|
|
||||||
return np.array(zipped, dtype='f4').transpose((0, 2, 1))
|
|
||||||
return palettes
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def rgbtoyuvpolar(cls, pal):
|
|
||||||
# TODO: premultiply alpha or some nonsense like that?
|
|
||||||
y, u, v = np.array(cls.YUV * pal.T[:3])
|
|
||||||
uvr = np.hypot(u, v)
|
|
||||||
uvt = np.unwrap(np.arctan2(v, u))
|
|
||||||
return y, uvr, uvt, pal.T[3]
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def yuvpolartorgb(cls, y, uvr, uvt, a):
|
|
||||||
u = uvr * np.cos(uvt)
|
|
||||||
v = uvr * np.sin(uvt)
|
|
||||||
r, g, b = np.array(cls.YUV.I * np.array([y, u, v]))
|
|
||||||
# Ensure Fortran order so that the memory gets laid out correctly
|
|
||||||
return np.array([r, g, b, a], order='F').T
|
|
||||||
|
|
||||||
modes = ['hsv', 'yuv', 'yuvpolar']
|
|
||||||
decls = "surface<void, cudaSurfaceType2D> flatpal;\n"
|
|
||||||
|
|
||||||
_defs = Template(r"""
|
|
||||||
__device__
|
|
||||||
float4 interp_color(const float *times, const float4 *sources, float time) {
|
|
||||||
int idx = fmaxf(bitwise_binsearch(times, time) + 1, 1);
|
int idx = fmaxf(bitwise_binsearch(times, time) + 1, 1);
|
||||||
float lf = (times[idx] - time) / (times[idx] - times[idx-1]);
|
float lf = (times[idx] - time) / (times[idx] - times[idx-1]);
|
||||||
float rf = 1.0f - lf;
|
float rf = 1.0f - lf;
|
||||||
@ -375,42 +307,11 @@ float4 interp_color(const float *times, const float4 *sources, float time) {
|
|||||||
float3 l3 = make_float3(left.x, left.y, left.z);
|
float3 l3 = make_float3(left.x, left.y, left.z);
|
||||||
float3 r3 = make_float3(right.x, right.y, right.z);
|
float3 r3 = make_float3(right.x, right.y, right.z);
|
||||||
|
|
||||||
{{if mode == 'hsv'}}
|
|
||||||
float3 lhsv = rgb2hsv(l3);
|
|
||||||
float3 rhsv = rgb2hsv(r3);
|
|
||||||
|
|
||||||
if (fabs(lhsv.x - rhsv.x) > 3.0f)
|
|
||||||
if (lhsv.x < rhsv.x)
|
|
||||||
lhsv.x += 6.0f;
|
|
||||||
else
|
|
||||||
rhsv.x += 6.0f;
|
|
||||||
|
|
||||||
float3 hsv;
|
|
||||||
hsv.x = lhsv.x * lf + rhsv.x * rf;
|
|
||||||
hsv.y = lhsv.y * lf + rhsv.y * rf;
|
|
||||||
hsv.z = lhsv.z * lf + rhsv.z * rf;
|
|
||||||
|
|
||||||
if (hsv.x > 6.0f)
|
|
||||||
hsv.x -= 6.0f;
|
|
||||||
if (hsv.x < 0.0f)
|
|
||||||
hsv.x += 6.0f;
|
|
||||||
|
|
||||||
yuv = rgb2yuv(hsv2rgb(hsv));
|
|
||||||
{{elif mode.startswith('yuv')}}
|
|
||||||
{{if mode == 'yuv'}}
|
|
||||||
float3 lyuv = rgb2yuv(l3);
|
float3 lyuv = rgb2yuv(l3);
|
||||||
float3 ryuv = rgb2yuv(r3);
|
float3 ryuv = rgb2yuv(r3);
|
||||||
yuv.x = lyuv.x * lf + ryuv.x * rf;
|
yuv.x = lyuv.x * lf + ryuv.x * rf;
|
||||||
yuv.y = lyuv.y * lf + ryuv.y * rf;
|
yuv.y = lyuv.y * lf + ryuv.y * rf;
|
||||||
yuv.z = lyuv.z * lf + ryuv.z * rf;
|
yuv.z = lyuv.z * lf + ryuv.z * rf;
|
||||||
{{elif mode == 'yuvpolar'}}
|
|
||||||
yuv.x = l3.x * lf + r3.x * rf;
|
|
||||||
float radius = l3.y * lf + r3.y * rf;
|
|
||||||
float angle = l3.z * lf + r3.z * rf;
|
|
||||||
yuv.y = radius * cosf(angle);
|
|
||||||
yuv.z = radius * sinf(angle);
|
|
||||||
{{endif}}
|
|
||||||
{{endif}}
|
|
||||||
|
|
||||||
yuv.y += 0.5f;
|
yuv.y += 0.5f;
|
||||||
yuv.z += 0.5f;
|
yuv.z += 0.5f;
|
||||||
@ -418,28 +319,13 @@ float4 interp_color(const float *times, const float4 *sources, float time) {
|
|||||||
return make_float4(yuv.x, yuv.y, yuv.z, left.w * lf + right.w * rf);
|
return make_float4(yuv.x, yuv.y, yuv.z, left.w * lf + right.w * rf);
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__
|
__global__ void interp_palette_flat(
|
||||||
void interp_palette(uchar4 *outs,
|
ringbuf *rb, mwc_st *rctxs,
|
||||||
const float *times, const float4 *sources,
|
const float *times, const float4 *sources,
|
||||||
float tstart, float tstep) {
|
float tstart, float tstep)
|
||||||
float time = tstart + blockIdx.x * tstep;
|
{
|
||||||
float4 yuva = interp_color(times, sources, time);
|
mwc_st rctx = rctxs[rb_incr(rb->head, threadIdx.x)];
|
||||||
|
|
||||||
uchar4 out;
|
|
||||||
out.x = yuva.x * 255.0f;
|
|
||||||
out.y = yuva.y * 255.0f;
|
|
||||||
out.z = yuva.z * 255.0f;
|
|
||||||
out.w = yuva.w * 255.0f;
|
|
||||||
outs[blockDim.x * blockIdx.x + threadIdx.x] = out;
|
|
||||||
}
|
|
||||||
|
|
||||||
__global__
|
|
||||||
void interp_palette_flat(mwc_st *rctxs,
|
|
||||||
const float *times, const float4 *sources,
|
|
||||||
float tstart, float tstep) {
|
|
||||||
|
|
||||||
int gid = blockIdx.x * blockDim.x + threadIdx.x;
|
int gid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
mwc_st rctx = rctxs[gid];
|
|
||||||
|
|
||||||
float time = tstart + blockIdx.x * tstep;
|
float time = tstart + blockIdx.x * tstep;
|
||||||
float4 yuva = interp_color(times, sources, time);
|
float4 yuva = interp_color(times, sources, time);
|
||||||
@ -447,17 +333,30 @@ void interp_palette_flat(mwc_st *rctxs,
|
|||||||
// TODO: pack Y at full precision, UV at quarter
|
// TODO: pack Y at full precision, UV at quarter
|
||||||
uint2 out;
|
uint2 out;
|
||||||
|
|
||||||
uint32_t y = min(255, (uint32_t) (yuva.x * 255.0f + 0.49f * mwc_next_11(rctx)));
|
uint32_t y = yuva.x * 255.0f + 0.49f * mwc_next_11(rctx);
|
||||||
uint32_t u = min(255, (uint32_t) (yuva.y * 255.0f + 0.49f * mwc_next_11(rctx)));
|
uint32_t u = yuva.y * 255.0f + 0.49f * mwc_next_11(rctx);
|
||||||
uint32_t v = min(255, (uint32_t) (yuva.z * 255.0f + 0.49f * mwc_next_11(rctx)));
|
uint32_t v = yuva.z * 255.0f + 0.49f * mwc_next_11(rctx);
|
||||||
|
y = min(255, y);
|
||||||
|
u = min(255, u);
|
||||||
|
v = min(255, v);
|
||||||
out.y = (1 << 22) | (y << 4);
|
out.y = (1 << 22) | (y << 4);
|
||||||
out.x = (u << 18) | v;
|
out.x = (u << 18) | v;
|
||||||
|
|
||||||
surf2Dwrite(out, flatpal, 8 * threadIdx.x, blockIdx.x);
|
surf2Dwrite(out, flatpal, 8 * threadIdx.x, blockIdx.x);
|
||||||
rctxs[gid] = rctx;
|
rctxs[rb_incr(rb->tail, threadIdx.x)] = rctx;
|
||||||
}
|
}
|
||||||
""")
|
''')
|
||||||
|
|
||||||
|
testcrlib = devlib(defs=r'''
|
||||||
|
__global__ void
|
||||||
|
test_cr(const float *times, const float *knots, const float *t, float *r) {
|
||||||
|
int i = threadIdx.x + blockDim.x * blockIdx.x;
|
||||||
|
r[i] = catmull_rom(times, knots, t[i]);
|
||||||
|
}
|
||||||
|
''')
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
# Test spline evaluation. This code will probably drift pretty often.
|
||||||
import pycuda.driver as cuda
|
import pycuda.driver as cuda
|
||||||
from pycuda.compiler import SourceModule
|
from pycuda.compiler import SourceModule
|
||||||
import pycuda.autoinit
|
import pycuda.autoinit
|
||||||
|
@ -2,15 +2,16 @@
|
|||||||
The main iteration loop.
|
The main iteration loop.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from cuburn.code import mwc, variations, interp
|
import variations
|
||||||
from cuburn.code.util import *
|
import interp
|
||||||
|
from util import Template, devlib, ringbuflib
|
||||||
|
from mwc import mwclib
|
||||||
|
|
||||||
def precalc_densities(pcp, std_xforms):
|
def precalc_densities(pcp, std_xforms):
|
||||||
# This pattern recurs a few times for precalc segments. Unfortunately,
|
# This pattern recurs a few times for precalc segments. Unfortunately,
|
||||||
# namespace stuff means it's not easy to functionalize this boilerplate
|
# namespace stuff means it's not easy to functionalize this boilerplate
|
||||||
pre_cp = pcp._precalc()
|
pre_cp = pcp._precalc()
|
||||||
pre_cp._code(Template(r"""
|
pre_cp._code(Template(r"""
|
||||||
|
|
||||||
float sum = 0.0f;
|
float sum = 0.0f;
|
||||||
|
|
||||||
{{for n in std_xforms}}
|
{{for n in std_xforms}}
|
||||||
@ -30,7 +31,6 @@ def precalc_densities(pcp, std_xforms):
|
|||||||
def precalc_chaos(pcp, std_xforms):
|
def precalc_chaos(pcp, std_xforms):
|
||||||
pre_cp = pcp._precalc()
|
pre_cp = pcp._precalc()
|
||||||
pre_cp._code(Template("""
|
pre_cp._code(Template("""
|
||||||
|
|
||||||
float sum, rsum;
|
float sum, rsum;
|
||||||
|
|
||||||
{{for p in std_xforms}}
|
{{for p in std_xforms}}
|
||||||
@ -50,7 +50,6 @@ def precalc_chaos(pcp, std_xforms):
|
|||||||
{{endfor}}
|
{{endfor}}
|
||||||
|
|
||||||
{{endfor}}
|
{{endfor}}
|
||||||
|
|
||||||
""", name='precalc_chaos').substitute(locals()))
|
""", name='precalc_chaos').substitute(locals()))
|
||||||
|
|
||||||
def precalc_camera(pcam):
|
def precalc_camera(pcam):
|
||||||
@ -64,7 +63,6 @@ def precalc_camera(pcam):
|
|||||||
# . matrix([X],[Y],[1]);
|
# . matrix([X],[Y],[1]);
|
||||||
|
|
||||||
pre_cam._code(Template(r"""
|
pre_cam._code(Template(r"""
|
||||||
|
|
||||||
float rot = {{pre_cam.rotation}} * M_PI / 180.0f;
|
float rot = {{pre_cam.rotation}} * M_PI / 180.0f;
|
||||||
float rotsin = sin(rot), rotcos = cos(rot);
|
float rotsin = sin(rot), rotcos = cos(rot);
|
||||||
float cenx = {{pre_cam.center.x}}, ceny = {{pre_cam.center.y}};
|
float cenx = {{pre_cam.center.x}}, ceny = {{pre_cam.center.y}};
|
||||||
@ -79,13 +77,11 @@ def precalc_camera(pcam):
|
|||||||
{{pre_cam._set('yy')}} = scale * rotcos;
|
{{pre_cam._set('yy')}} = scale * rotcos;
|
||||||
{{pre_cam._set('yo')}} = scale * -(rotsin * cenx + rotcos * ceny)
|
{{pre_cam._set('yo')}} = scale * -(rotsin * cenx + rotcos * ceny)
|
||||||
+ 0.5f * acc_size.aheight;
|
+ 0.5f * acc_size.aheight;
|
||||||
|
""", 'precalc_camera').substitute(locals()))
|
||||||
""").substitute(locals()))
|
|
||||||
|
|
||||||
def precalc_xf_affine(px):
|
def precalc_xf_affine(px):
|
||||||
pre = px._precalc()
|
pre = px._precalc()
|
||||||
pre._code(Template(r"""
|
pre._code(Template(r"""
|
||||||
|
|
||||||
float pri = {{pre.angle}} * M_PI / 180.0f;
|
float pri = {{pre.angle}} * M_PI / 180.0f;
|
||||||
float spr = {{pre.spread}} * M_PI / 180.0f;
|
float spr = {{pre.spread}} * M_PI / 180.0f;
|
||||||
|
|
||||||
@ -98,27 +94,22 @@ def precalc_xf_affine(px):
|
|||||||
{{pre._set('yy')}} = magy * sin(pri+spr);
|
{{pre._set('yy')}} = magy * sin(pri+spr);
|
||||||
{{pre._set('xo')}} = {{pre.offset.x}};
|
{{pre._set('xo')}} = {{pre.offset.x}};
|
||||||
{{pre._set('yo')}} = -{{pre.offset.y}};
|
{{pre._set('yo')}} = -{{pre.offset.y}};
|
||||||
|
""", 'precalc_xf_affine').substitute(locals()))
|
||||||
|
|
||||||
""").substitute(locals()))
|
def apply_affine(x, y, xo, yo, packer):
|
||||||
|
return Template("""
|
||||||
|
{{xo}} = {{packer.xx}} * {{x}} + {{packer.xy}} * {{y}} + {{packer.xo}};
|
||||||
|
{{yo}} = {{packer.yx}} * {{x}} + {{packer.yy}} * {{y}} + {{packer.yo}};
|
||||||
|
""", 'apply_affine').substitute(locals())
|
||||||
|
|
||||||
class IterCode(HunkOCode):
|
# The number of threads per block used in the iteration function. Don't change
|
||||||
# The number of threads per block
|
# it lightly; the code may depend on it in unusual ways.
|
||||||
NTHREADS = 256
|
NTHREADS = 256
|
||||||
|
|
||||||
def __init__(self, info, genome):
|
iter_decls = """
|
||||||
self.packer = interp.GenomePacker('iter_params')
|
|
||||||
self.pcp = self.packer.view('params', genome, 'cp')
|
|
||||||
|
|
||||||
iterbody = self._iterbody(info, genome)
|
|
||||||
bodies = [self._xfbody(i,x) for i,x in sorted(genome.xforms.items())]
|
|
||||||
bodies.append(iterbody)
|
|
||||||
self.defs = '\n'.join(bodies)
|
|
||||||
|
|
||||||
decls = """
|
|
||||||
// Note: for normalized lookups, uchar4 actually returns floats
|
// Note: for normalized lookups, uchar4 actually returns floats
|
||||||
texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> palTex;
|
texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> palTex;
|
||||||
__shared__ iter_params params;
|
__shared__ iter_params params;
|
||||||
__device__ int rb_head, rb_tail, rb_size;
|
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
uint32_t width;
|
uint32_t width;
|
||||||
@ -128,12 +119,9 @@ typedef struct {
|
|||||||
uint32_t astride;
|
uint32_t astride;
|
||||||
} acc_size_t;
|
} acc_size_t;
|
||||||
__constant__ acc_size_t acc_size;
|
__constant__ acc_size_t acc_size;
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def _xfbody(self, xfid, xform):
|
iter_xf_body_code = r"""
|
||||||
px = self.pcp.xforms[xfid]
|
|
||||||
tmpl = Template(r"""
|
|
||||||
__device__
|
__device__
|
||||||
void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
|
void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
|
||||||
float tx, ty;
|
float tx, ty;
|
||||||
@ -162,44 +150,30 @@ void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
|
|||||||
float csp = {{px.color_speed}};
|
float csp = {{px.color_speed}};
|
||||||
color = color * (1.0f - csp) + {{px.color}} * csp;
|
color = color * (1.0f - csp) + {{px.color}} * csp;
|
||||||
};
|
};
|
||||||
""", 'apply_xf_'+xfid)
|
"""
|
||||||
|
|
||||||
|
def iter_xf_body(pcp, xfid, xform):
|
||||||
|
px = pcp.xforms[xfid]
|
||||||
|
tmpl = Template(iter_xf_body_code, 'apply_xf_'+xfid)
|
||||||
|
|
||||||
g = dict(globals())
|
g = dict(globals())
|
||||||
g.update(locals())
|
g.update(locals())
|
||||||
return tmpl.substitute(g)
|
return tmpl.substitute(g)
|
||||||
|
|
||||||
def _iterbody(self, info, genome):
|
iter_body_code = r'''
|
||||||
tmpl = Template(r'''
|
__global__ void
|
||||||
|
iter(uint64_t out_ptr, uint64_t atom_ptr,
|
||||||
__global__ void reset_rb(int size) {
|
ringbuf *rb, mwc_st *msts, float4 *points,
|
||||||
rb_head = rb_tail = 0;
|
const iter_params *all_params)
|
||||||
rb_size = size;
|
{
|
||||||
}
|
|
||||||
|
|
||||||
__global__
|
|
||||||
void iter(
|
|
||||||
uint64_t out_ptr,
|
|
||||||
mwc_st *msts,
|
|
||||||
float4 *points,
|
|
||||||
const iter_params *all_params
|
|
||||||
{{if info.acc_mode == 'atomic'}}
|
|
||||||
, uint64_t atom_ptr
|
|
||||||
{{endif}}
|
|
||||||
) {
|
|
||||||
const iter_params *global_params = &(all_params[blockIdx.x]);
|
|
||||||
|
|
||||||
|
|
||||||
// load params to shared memory cooperatively
|
// load params to shared memory cooperatively
|
||||||
|
const iter_params *global_params = &(all_params[blockIdx.x]);
|
||||||
for (int i = threadIdx.y * blockDim.x + threadIdx.x;
|
for (int i = threadIdx.y * blockDim.x + threadIdx.x;
|
||||||
i < (sizeof(iter_params) / 4); i += blockDim.x * blockDim.y)
|
i < (sizeof(iter_params) / 4); i += blockDim.x * blockDim.y)
|
||||||
reinterpret_cast<float*>(¶ms)[i] =
|
reinterpret_cast<float*>(¶ms)[i] =
|
||||||
reinterpret_cast<const float*>(global_params)[i];
|
reinterpret_cast<const float*>(global_params)[i];
|
||||||
|
|
||||||
__shared__ int rb_idx;
|
int this_rb_idx = rb_incr(rb->head, blockDim.x * threadIdx.y + threadIdx.x);
|
||||||
if (threadIdx.y == 1 && threadIdx.x == 1)
|
|
||||||
rb_idx = 32 * blockDim.y * (atomicAdd(&rb_head, 1) % rb_size);
|
|
||||||
|
|
||||||
__syncthreads();
|
|
||||||
int this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;
|
|
||||||
mwc_st rctx = msts[this_rb_idx];
|
mwc_st rctx = msts[this_rb_idx];
|
||||||
|
|
||||||
{{precalc_camera(pcp.camera)}}
|
{{precalc_camera(pcp.camera)}}
|
||||||
@ -209,22 +183,15 @@ void iter(
|
|||||||
{{pcp.camera.yo}} += ditherwidth * mwc_next_11(rctx);
|
{{pcp.camera.yo}} += ditherwidth * mwc_next_11(rctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
{{if info.acc_mode == 'global'}}
|
|
||||||
__shared__ float time_frac;
|
|
||||||
time_frac = blockIdx.x / (float) gridDim.x;
|
|
||||||
{{else}}
|
|
||||||
{{if info.acc_mode == 'atomic'}}
|
|
||||||
// TODO: spare the register, reuse at call site?
|
// TODO: spare the register, reuse at call site?
|
||||||
int time = blockIdx.x >> 4;
|
int time = blockIdx.x >> 4;
|
||||||
{{endif}}
|
|
||||||
float color_dither = 0.49f * mwc_next_11(rctx);
|
float color_dither = 0.49f * mwc_next_11(rctx);
|
||||||
{{endif}}
|
|
||||||
|
|
||||||
// TODO: 4th channel unused. Kill or use for something helpful
|
// TODO: 4th channel unused. Kill or use for something helpful
|
||||||
float4 old_point = points[this_rb_idx];
|
float4 old_point = points[this_rb_idx];
|
||||||
float x = old_point.x, y = old_point.y, color = old_point.z;
|
float x = old_point.x, y = old_point.y, color = old_point.z;
|
||||||
|
|
||||||
{{if not info.chaos_used}}
|
{{if not chaos_used}}
|
||||||
// Shared memory size can be reduced by a factor of four using a slower
|
// Shared memory size can be reduced by a factor of four using a slower
|
||||||
// 4-stage reduce, but on Fermi hardware shmem use isn't a bottleneck
|
// 4-stage reduce, but on Fermi hardware shmem use isn't a bottleneck
|
||||||
__shared__ float swap[{{4*NTHREADS}}];
|
__shared__ float swap[{{4*NTHREADS}}];
|
||||||
@ -250,7 +217,7 @@ void iter(
|
|||||||
// TODO: link up with FUSE, etc
|
// TODO: link up with FUSE, etc
|
||||||
for (int round = 0; round < 256; round++) {
|
for (int round = 0; round < 256; round++) {
|
||||||
|
|
||||||
{{if info.chaos_used}}
|
{{if chaos_used}}
|
||||||
|
|
||||||
{{precalc_chaos(pcp, std_xforms)}}
|
{{precalc_chaos(pcp, std_xforms)}}
|
||||||
|
|
||||||
@ -315,16 +282,7 @@ void iter(
|
|||||||
|
|
||||||
{{endif}}
|
{{endif}}
|
||||||
|
|
||||||
{{if info.acc_mode == 'deferred'}}
|
|
||||||
int tid = threadIdx.y * 32 + threadIdx.x;
|
|
||||||
int offset = 4 * (256 * (256 * blockIdx.x + round) + tid);
|
|
||||||
int *log = reinterpret_cast<int*>(out_ptr + offset);
|
|
||||||
{{endif}}
|
|
||||||
|
|
||||||
if (fuse) {
|
if (fuse) {
|
||||||
{{if info.acc_mode == 'deferred'}}
|
|
||||||
*log = 0xffffffff;
|
|
||||||
{{endif}}
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -346,15 +304,11 @@ void iter(
|
|||||||
uint32_t ix = trunca(cx), iy = trunca(cy);
|
uint32_t ix = trunca(cx), iy = trunca(cy);
|
||||||
|
|
||||||
if (ix >= acc_size.astride || iy >= acc_size.aheight) {
|
if (ix >= acc_size.astride || iy >= acc_size.aheight) {
|
||||||
{{if info.acc_mode == 'deferred'}}
|
|
||||||
*log = 0xffffffff;
|
|
||||||
{{endif}}
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t ibase = (last_xf_used % {{info.max_nxf}}) * acc_size.aheight;
|
uint32_t i = iy * acc_size.astride + ix;
|
||||||
uint32_t i = (ibase + iy) * acc_size.astride + ix;
|
|
||||||
{{if info.acc_mode == 'atomic'}}
|
|
||||||
asm volatile ({{crep("""
|
asm volatile ({{crep("""
|
||||||
{
|
{
|
||||||
// To prevent overflow, we need to flush each pixel before the density
|
// To prevent overflow, we need to flush each pixel before the density
|
||||||
@ -432,38 +386,45 @@ oflow_end:
|
|||||||
""")}} :: "f"(cc), "f"(color_dither), "r"(time), "r"(i),
|
""")}} :: "f"(cc), "f"(color_dither), "r"(time), "r"(i),
|
||||||
"l"(atom_ptr), "f"(cosel[threadIdx.y + {{NWARPS}}]),
|
"l"(atom_ptr), "f"(cosel[threadIdx.y + {{NWARPS}}]),
|
||||||
"l"(out_ptr));
|
"l"(out_ptr));
|
||||||
{{endif}}
|
|
||||||
|
|
||||||
{{if info.acc_mode == 'global'}}
|
|
||||||
float4 outcol = tex2D(palTex, cc, time_frac);
|
|
||||||
float4 *accbuf = reinterpret_cast<float4*>(out_ptr + (16*i));
|
|
||||||
float4 pix = *accbuf;
|
|
||||||
pix.x += outcol.x;
|
|
||||||
pix.y += outcol.y;
|
|
||||||
pix.z += outcol.z;
|
|
||||||
pix.w += 1.0f;
|
|
||||||
*accbuf = pix;
|
|
||||||
{{elif info.acc_mode == 'deferred'}}
|
|
||||||
// 'color' gets the top 8 bits. TODO: add dithering via precalc.
|
|
||||||
uint32_t icolor = fminf(1.0f, cc) * 255.0f + color_dither;
|
|
||||||
asm("bfi.b32 %0, %1, %0, 24, 8;" : "+r"(i) : "r"(icolor));
|
|
||||||
*log = i;
|
|
||||||
{{endif}}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (threadIdx.x == 0 && threadIdx.y == 0)
|
this_rb_idx = rb_incr(rb->tail, blockDim.x * threadIdx.y + threadIdx.x);
|
||||||
rb_idx = 32 * blockDim.y * (atomicAdd(&rb_tail, 1) % rb_size);
|
|
||||||
__syncthreads();
|
|
||||||
this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;
|
|
||||||
|
|
||||||
points[this_rb_idx] = make_float4(x, y, color, 0.0f);
|
points[this_rb_idx] = make_float4(x, y, color, 0.0f);
|
||||||
msts[this_rb_idx] = rctx;
|
msts[this_rb_idx] = rctx;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
'''
|
||||||
|
|
||||||
{{if info.acc_mode == 'atomic'}}
|
def iter_body(cp, pcp):
|
||||||
__global__
|
# For legacy reasons, 'cp' is used here instead of 'genome'.
|
||||||
void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
|
tmpl = Template(iter_body_code, 'iter_body')
|
||||||
|
NWARPS = NTHREADS / 32
|
||||||
|
std_xforms = [n for n in sorted(cp.xforms) if n != 'final']
|
||||||
|
|
||||||
|
# TODO: detect this properly and use it
|
||||||
|
chaos_used = False
|
||||||
|
|
||||||
|
vars = globals()
|
||||||
|
vars.update(locals())
|
||||||
|
return tmpl.substitute(vars)
|
||||||
|
|
||||||
|
def mkiterlib(genome):
|
||||||
|
packer = interp.GenomePacker('iter_params')
|
||||||
|
pcp = packer.view('params', genome, 'cp')
|
||||||
|
|
||||||
|
iterbody = iter_body(genome, pcp)
|
||||||
|
bodies = [iter_xf_body(pcp, i, x) for i, x in sorted(genome.xforms.items())]
|
||||||
|
bodies.append(iterbody)
|
||||||
|
packer_lib = packer.finalize()
|
||||||
|
|
||||||
|
lib = devlib(deps=[packer_lib, mwclib, ringbuflib],
|
||||||
|
# We grab the surf decl from palintlib as well
|
||||||
|
decls=iter_decls + interp.palintlib.decls,
|
||||||
|
defs='\n'.join(bodies))
|
||||||
|
return packer, lib
|
||||||
|
|
||||||
|
flushatomlib = devlib(defs=Template(r'''
|
||||||
|
__global__ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
|
||||||
int i = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
|
int i = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
|
||||||
if (i >= nbins) return;
|
if (i >= nbins) return;
|
||||||
asm volatile ({{crep("""
|
asm volatile ({{crep("""
|
||||||
@ -499,168 +460,4 @@ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
|
|||||||
}
|
}
|
||||||
""")}} :: "r"(i), "l"(atom_ptr), "l"(out_ptr));
|
""")}} :: "r"(i), "l"(atom_ptr), "l"(out_ptr));
|
||||||
}
|
}
|
||||||
|
''', 'flush_atom').substitute())
|
||||||
{{endif}}
|
|
||||||
|
|
||||||
{{if info.acc_mode == 'deferred'}}
|
|
||||||
|
|
||||||
// Block size, shared accumulation bits, shared accumulation width.
|
|
||||||
#define BS 1024
|
|
||||||
#define SHAB 12
|
|
||||||
#define SHAW (1<<SHAB)
|
|
||||||
|
|
||||||
// Read the point log, accumulate in shared memory, and write the results.
|
|
||||||
// This kernel is to be launched with one block for every 4,096 addresses to
|
|
||||||
// be processed, and will handle those addresses.
|
|
||||||
//
|
|
||||||
// log_bounds is an array mapping radix values to the first index in the log
|
|
||||||
// with that radix position. For performance reasons in other parts of the
|
|
||||||
// code, the radix may actually include bits within the lower SHAB part of the
|
|
||||||
// address, or it might not cover the first few bits after the SHAB part;
|
|
||||||
// log_bounds_shift covers that. glob_addr_bits specifies the number of bits
|
|
||||||
// above SHAB which are address bits.
|
|
||||||
|
|
||||||
__global__ void
|
|
||||||
__launch_bounds__(BS, 1)
|
|
||||||
write_shmem(
|
|
||||||
float4 *acc,
|
|
||||||
const uint32_t *log,
|
|
||||||
const uint32_t *log_bounds,
|
|
||||||
uint32_t nbins
|
|
||||||
) {
|
|
||||||
const int tid = threadIdx.x;
|
|
||||||
const int bid = blockIdx.x;
|
|
||||||
|
|
||||||
// These two accumulators, used in write_shmem, hold {density, red} and
|
|
||||||
// {green, blue} values as packed u16 pairs. The fixed size represents
|
|
||||||
// 4,096 pixels in the accumulator.
|
|
||||||
__shared__ uint32_t s_acc[SHAW*2];
|
|
||||||
|
|
||||||
int idx = tid;
|
|
||||||
for (int i = 0; i < (SHAW * 2 / BS); i++) {
|
|
||||||
s_acc[idx] = 0;
|
|
||||||
idx += BS;
|
|
||||||
}
|
|
||||||
__syncthreads();
|
|
||||||
|
|
||||||
// Shut the compiler up
|
|
||||||
idx = s_acc[0];
|
|
||||||
|
|
||||||
// log_bounds[] holds inclusive prefix sums, so that log_bounds[0] is the
|
|
||||||
// largest index with radix 0, and so on.
|
|
||||||
int lb_idx_hi = bid & 0xff;
|
|
||||||
int lb_idx_lo = lb_idx_hi - 1;
|
|
||||||
|
|
||||||
int idx_lo;
|
|
||||||
if (lb_idx_lo > 0) idx_lo = log_bounds[lb_idx_lo] & ~(BS - 1);
|
|
||||||
else idx_lo = 0;
|
|
||||||
int idx_hi = (log_bounds[lb_idx_hi] & ~(BS - 1)) + BS;
|
|
||||||
|
|
||||||
float rnrounds = {{'%d.0f' % info.palette_height}} / (idx_hi - idx_lo);
|
|
||||||
float time = tid * rnrounds;
|
|
||||||
float time_step = BS * rnrounds;
|
|
||||||
|
|
||||||
int magic = ((blockIdx.x & 0xff) << 3) + ((blockIdx.x & 0xf00) << 12);
|
|
||||||
int magic_mask = 0xf007f8;
|
|
||||||
|
|
||||||
for (int i = idx_lo + tid; i < idx_hi; i += BS) {
|
|
||||||
int entry = log[i];
|
|
||||||
|
|
||||||
asm volatile ({{crep("""
|
|
||||||
{
|
|
||||||
.reg .pred q;
|
|
||||||
.reg .u32 shoff, color, time, d, r, g, b, hi, lo, hiw, low, tmp;
|
|
||||||
.reg .u64 ptr;
|
|
||||||
.reg .f32 rf, gf, bf, df, rg, gg, dg, bg;
|
|
||||||
|
|
||||||
// TODO: opacity
|
|
||||||
and.b32 tmp, %0, %4;
|
|
||||||
setp.eq.u32 q, tmp, %3;
|
|
||||||
@!q bra before_sync;
|
|
||||||
|
|
||||||
and.b32 shoff, %0, 0xff800;
|
|
||||||
shr.b32 shoff, shoff, 5;
|
|
||||||
bfi.b32 shoff, %0, shoff, 3, 3;
|
|
||||||
|
|
||||||
bfe.u32 color, %0, 24, 8;
|
|
||||||
shl.b32 color, color, 3;
|
|
||||||
cvt.rni.u32.f32 time, %1;
|
|
||||||
|
|
||||||
suld.b.2d.v2.b32.clamp {lo, hi}, [flatpal, {color, time}];
|
|
||||||
ld.shared.v2.u32 {low, hiw}, [shoff];
|
|
||||||
add.cc.u32 lo, lo, low;
|
|
||||||
addc.u32 hi, hi, hiw;
|
|
||||||
setp.hs.u32 q, hi, (1023 << 22);
|
|
||||||
@q bra oflow_sync;
|
|
||||||
st.shared.v2.u32 [shoff], {hi, lo};
|
|
||||||
before_sync:
|
|
||||||
bar.sync 0;
|
|
||||||
bra oflow_write_end;
|
|
||||||
oflow_sync:
|
|
||||||
st.shared.v2.u32 [shoff], {0, 0};
|
|
||||||
|
|
||||||
// TODO: opacity
|
|
||||||
bfi.b32 shoff, %0, 0, 4, 24;
|
|
||||||
cvt.u64.u32 ptr, shoff;
|
|
||||||
add.u64 ptr, ptr, %2;
|
|
||||||
ld.global.v4.f32 {dg,bg,gg,rg}, [ptr];
|
|
||||||
bar.sync 0;
|
|
||||||
|
|
||||||
bfe.u32 r, hi, 4, 18;
|
|
||||||
bfe.u32 g, lo, 18, 14;
|
|
||||||
bfi.b32 g, hi, g, 14, 4;
|
|
||||||
and.b32 b, lo, ((1<<18)-1);
|
|
||||||
cvt.rn.f32.u32 rf, r;
|
|
||||||
cvt.rn.f32.u32 gf, g;
|
|
||||||
cvt.rn.f32.u32 bf, b;
|
|
||||||
fma.rn.ftz.f32 rf, rf, (1.0/255.0), rg;
|
|
||||||
fma.rn.ftz.f32 gf, gf, (1.0/255.0), gg;
|
|
||||||
fma.rn.ftz.f32 bf, bf, (1.0/255.0), bg;
|
|
||||||
add.f32 df, df, dg;
|
|
||||||
st.global.v4.f32 [ptr], {df,bf,gf,rf};
|
|
||||||
|
|
||||||
oflow_write_end:
|
|
||||||
}
|
|
||||||
""")}} :: "r"(entry), "f"(time), "l"(acc), "r"(magic), "r"(magic_mask));
|
|
||||||
time += time_step;
|
|
||||||
}
|
|
||||||
|
|
||||||
__syncthreads();
|
|
||||||
|
|
||||||
|
|
||||||
idx = tid;
|
|
||||||
int glo_idx = magic | (((idx << 8) | idx) & 0xff807);
|
|
||||||
|
|
||||||
for (int i = 0; i < (SHAW / BS) && glo_idx < nbins; i++) {
|
|
||||||
int d, r, g, b;
|
|
||||||
float4 pix = acc[glo_idx];
|
|
||||||
asm({{crep("""
|
|
||||||
{
|
|
||||||
.reg .u32 hi, lo;
|
|
||||||
ld.shared.v2.u32 {lo, hi}, [%4];
|
|
||||||
shr.u32 %0, hi, 22;
|
|
||||||
bfe.u32 %1, hi, 4, 18;
|
|
||||||
bfe.u32 %2, lo, 18, 14;
|
|
||||||
bfi.b32 %2, hi, %2, 14, 4;
|
|
||||||
and.b32 %3, lo, ((1<<18)-1);
|
|
||||||
}
|
|
||||||
""")}} : "=r"(d), "=r"(r), "=r"(g), "=r"(b) : "r"(idx*8));
|
|
||||||
pix.x += r / 255.0f;
|
|
||||||
pix.y += g / 255.0f;
|
|
||||||
pix.z += b / 255.0f;
|
|
||||||
pix.w += d;
|
|
||||||
acc[glo_idx] = pix;
|
|
||||||
idx += BS;
|
|
||||||
glo_idx += (BS << 8);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
{{endif}}
|
|
||||||
''', 'iter_kern')
|
|
||||||
return tmpl.substitute(
|
|
||||||
info = info,
|
|
||||||
cp = genome,
|
|
||||||
pcp = self.pcp,
|
|
||||||
NTHREADS = self.NTHREADS,
|
|
||||||
NWARPS = self.NTHREADS / 32,
|
|
||||||
std_xforms = [n for n in sorted(genome.xforms) if n != 'final'],
|
|
||||||
**globals())
|
|
||||||
|
@ -6,18 +6,53 @@ import os
|
|||||||
import warnings
|
import warnings
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from util import *
|
from util import devlib, assemble_code
|
||||||
|
|
||||||
class MWC(HunkOCode):
|
# Keeping this live in the module isn't necessary, but loading the mults
|
||||||
decls = """
|
# can be surprisingly slow.
|
||||||
|
mults = None
|
||||||
|
|
||||||
|
def load_mults():
|
||||||
|
pfpath = os.path.join(os.path.dirname(__file__), 'primes.bin')
|
||||||
|
if os.path.isfile(pfpath):
|
||||||
|
with open(pfpath) as fp:
|
||||||
|
return np.frombuffer(fp.read(), dtype='<u4')
|
||||||
|
|
||||||
|
warnings.warn('primes.bin not found, trying to download it')
|
||||||
|
import bz2, urllib2
|
||||||
|
ufp = urllib2.urlopen('http://aduro.strobe.cc/primes.diff.bin.bz2')
|
||||||
|
diffs = np.frombuffer(bz2.decompress(ufp.read()), dtype='<u2')
|
||||||
|
mults = np.cumsum(-np.array(diffs, dtype='<u4'), dtype='<u4')
|
||||||
|
with open(pfpath, 'wb') as fp:
|
||||||
|
fp.write(mults)
|
||||||
|
return mults
|
||||||
|
|
||||||
|
def make_seeds(nthreads, host_seed=None):
|
||||||
|
global mults
|
||||||
|
if mults is None:
|
||||||
|
mults = load_mults()
|
||||||
|
if host_seed:
|
||||||
|
rand = np.random.RandomState(host_seed)
|
||||||
|
else:
|
||||||
|
rand = np.random
|
||||||
|
|
||||||
|
# Create the seed structures. TODO: check that struct is 4-byte aligned
|
||||||
|
seeds = np.empty((3, nthreads), dtype=np.uint32, order='F')
|
||||||
|
seeds[0][:] = mults[:nthreads]
|
||||||
|
|
||||||
|
# Excludes 0xffffffff for 32-bit compatibility with laziness
|
||||||
|
seeds[1] = rand.randint(1, 0x7fffffff, size=nthreads)
|
||||||
|
seeds[2] = rand.randint(1, 0x7fffffff, size=nthreads)
|
||||||
|
|
||||||
|
return seeds
|
||||||
|
|
||||||
|
mwclib = devlib(decls=r'''
|
||||||
typedef struct {
|
typedef struct {
|
||||||
uint32_t mul;
|
uint32_t mul;
|
||||||
uint32_t state;
|
uint32_t state;
|
||||||
uint32_t carry;
|
uint32_t carry;
|
||||||
} mwc_st;
|
} mwc_st;
|
||||||
"""
|
''', defs=r'''
|
||||||
|
|
||||||
defs = r"""
|
|
||||||
__device__ uint32_t mwc_next(mwc_st &st) {
|
__device__ uint32_t mwc_next(mwc_st &st) {
|
||||||
asm("{\n\t"
|
asm("{\n\t"
|
||||||
".reg .u32 tmp;\n\t"
|
".reg .u32 tmp;\n\t"
|
||||||
@ -40,46 +75,9 @@ __device__ float mwc_next_11(mwc_st &st) {
|
|||||||
: "=f"(ret) : "r"(val));
|
: "=f"(ret) : "r"(val));
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
"""
|
''')
|
||||||
|
|
||||||
@staticmethod
|
mwctestlib = devlib(deps=[mwclib], defs="""
|
||||||
def load_mults():
|
|
||||||
pfpath = os.path.join(os.path.dirname(__file__), 'primes.bin')
|
|
||||||
if os.path.isfile(pfpath):
|
|
||||||
with open(pfpath) as fp:
|
|
||||||
return np.frombuffer(fp.read(), dtype='<u4')
|
|
||||||
|
|
||||||
warnings.warn('primes.bin not found, trying to download it')
|
|
||||||
import bz2, urllib2
|
|
||||||
ufp = urllib2.urlopen('http://aduro.strobe.cc/primes.diff.bin.bz2')
|
|
||||||
diffs = np.frombuffer(bz2.decompress(ufp.read()), dtype='<u2')
|
|
||||||
mults = np.cumsum(-np.array(diffs, dtype='<u4'), dtype='<u4')
|
|
||||||
with open(pfpath, 'wb') as fp:
|
|
||||||
fp.write(mults)
|
|
||||||
return mults
|
|
||||||
|
|
||||||
mults = None
|
|
||||||
@classmethod
|
|
||||||
def make_seeds(cls, nthreads, host_seed=None):
|
|
||||||
if cls.mults is None:
|
|
||||||
cls.mults = cls.load_mults()
|
|
||||||
if host_seed:
|
|
||||||
rand = np.random.RandomState(host_seed)
|
|
||||||
else:
|
|
||||||
rand = np.random
|
|
||||||
|
|
||||||
# Create the seed structures. TODO: check that struct is 4-byte aligned
|
|
||||||
seeds = np.empty((3, nthreads), dtype=np.uint32, order='F')
|
|
||||||
seeds[0][:] = cls.mults[:nthreads]
|
|
||||||
|
|
||||||
# Excludes 0xffffffff for 32-bit compatibility with laziness
|
|
||||||
seeds[1] = rand.randint(1, 0x7fffffff, size=nthreads)
|
|
||||||
seeds[2] = rand.randint(1, 0x7fffffff, size=nthreads)
|
|
||||||
|
|
||||||
return seeds
|
|
||||||
|
|
||||||
class MWCTest(HunkOCode):
|
|
||||||
defs = """
|
|
||||||
__global__ void test_mwc(mwc_st *msts, uint64_t *sums, float nrounds) {
|
__global__ void test_mwc(mwc_st *msts, uint64_t *sums, float nrounds) {
|
||||||
mwc_st rctx = msts[gtid()];
|
mwc_st rctx = msts[gtid()];
|
||||||
uint64_t sum = 0;
|
uint64_t sum = 0;
|
||||||
@ -87,19 +85,18 @@ __global__ void test_mwc(mwc_st *msts, uint64_t *sums, float nrounds) {
|
|||||||
sums[gtid()] = sum;
|
sums[gtid()] = sum;
|
||||||
msts[gtid()] = rctx;
|
msts[gtid()] = rctx;
|
||||||
}
|
}
|
||||||
"""
|
""")
|
||||||
|
|
||||||
@classmethod
|
def test_mwc(rounds=5000, nblocks=64, blockwidth=512):
|
||||||
def test_mwc(cls, rounds=5000, nblocks=64, blockwidth=512):
|
|
||||||
import pycuda.driver as cuda
|
import pycuda.driver as cuda
|
||||||
from pycuda.compiler import SourceModule
|
from pycuda.compiler import SourceModule
|
||||||
import time
|
import time
|
||||||
|
|
||||||
nthreads = blockwidth * nblocks
|
nthreads = blockwidth * nblocks
|
||||||
seeds = MWC.make_seeds(nthreads, host_seed = 5)
|
seeds = make_seeds(nthreads, host_seed=42)
|
||||||
dseeds = cuda.to_device(seeds)
|
dseeds = cuda.to_device(seeds)
|
||||||
|
|
||||||
mod = SourceModule(assemble_code(BaseCode, MWC, cls))
|
mod = SourceModule(assemble_code(mwctestlib))
|
||||||
|
|
||||||
for trial in range(2):
|
for trial in range(2):
|
||||||
print "Trial %d, on CPU: " % trial,
|
print "Trial %d, on CPU: " % trial,
|
||||||
@ -133,5 +130,4 @@ __global__ void test_mwc(mwc_st *msts, uint64_t *sums, float nrounds) {
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import pycuda.autoinit
|
import pycuda.autoinit
|
||||||
MWCTest.test_mwc()
|
test_mwc()
|
||||||
|
|
||||||
|
34
cuburn/code/output.py
Normal file
34
cuburn/code/output.py
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
from util import devlib, ringbuflib
|
||||||
|
from mwc import mwclib
|
||||||
|
|
||||||
|
f32tou8lib = devlib(deps=[ringbuflib, mwclib], defs=r'''
|
||||||
|
// Perform a conversion from float32 values to uint8 ones, applying
|
||||||
|
// pixel- and channel-independent dithering to reduce suprathreshold banding
|
||||||
|
// artifacts. Clamps values larger than 1.0f.
|
||||||
|
// TODO: move to a separate module?
|
||||||
|
// TODO: less ineffecient mwc_st handling?
|
||||||
|
__global__ void f32_to_u8(
|
||||||
|
ringbuf *rb, mwc_st *rctxs, uchar4 *dst, const float4 *src,
|
||||||
|
int gutter, int dstride, int sstride, int height)
|
||||||
|
{
|
||||||
|
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||||
|
if (x > dstride || y > height) return;
|
||||||
|
int isrc = sstride * (y + gutter) + x + gutter;
|
||||||
|
|
||||||
|
int tid = blockDim.x * threadIdx.y + threadIdx.x;
|
||||||
|
mwc_st rctx = rctxs[rb_incr(rb->head, tid)];
|
||||||
|
|
||||||
|
float4 in = src[isrc];
|
||||||
|
uchar4 out = make_uchar4(
|
||||||
|
fminf(1.0f, in.x) * 255.0f + 0.49f * mwc_next_11(rctx),
|
||||||
|
fminf(1.0f, in.y) * 255.0f + 0.49f * mwc_next_11(rctx),
|
||||||
|
fminf(1.0f, in.z) * 255.0f + 0.49f * mwc_next_11(rctx),
|
||||||
|
fminf(1.0f, in.w) * 255.0f + 0.49f * mwc_next_11(rctx)
|
||||||
|
);
|
||||||
|
|
||||||
|
int idst = dstride * y + x;
|
||||||
|
dst[idst] = out;
|
||||||
|
rctxs[rb_incr(rb->head, tid)] = rctx;
|
||||||
|
}
|
||||||
|
''')
|
@ -1,55 +1,119 @@
|
|||||||
"""
|
"""
|
||||||
Provides tools and miscellaneous functions for building device code.
|
Provides tools and miscellaneous functions for building device code.
|
||||||
"""
|
"""
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
from collections import namedtuple
|
||||||
|
|
||||||
|
import pycuda.driver as cuda
|
||||||
|
import pycuda.compiler
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import tempita
|
import tempita
|
||||||
|
|
||||||
def crep(s):
|
|
||||||
"""Escape for PTX assembly"""
|
|
||||||
if isinstance(s, unicode):
|
|
||||||
s = s.encode('utf-8')
|
|
||||||
return '"%s"' % s.encode("string_escape")
|
|
||||||
|
|
||||||
def argset(obj, **kwargs):
|
def argset(obj, **kwargs):
|
||||||
|
"""
|
||||||
|
Allow an object with many properties to be set using one call.
|
||||||
|
|
||||||
|
>>> x = argset(X(), a=1, b=2)
|
||||||
|
>>> x.a
|
||||||
|
1
|
||||||
|
"""
|
||||||
for k, v in kwargs.items():
|
for k, v in kwargs.items():
|
||||||
setattr(obj, k, v)
|
setattr(obj, k, v)
|
||||||
return obj
|
return obj
|
||||||
|
|
||||||
|
def launch(name, mod, stream, block, grid, *args, **kwargs):
|
||||||
|
"""
|
||||||
|
Oft-used kernel launch helper. Provides a nice boost in readability of
|
||||||
|
densely-packed asynchronous code launches.
|
||||||
|
"""
|
||||||
|
fun = mod.get_function(name)
|
||||||
|
if isinstance(block, (int, np.number)):
|
||||||
|
block = (int(block), 1, 1)
|
||||||
|
if isinstance(grid, (int, np.number)):
|
||||||
|
grid = (int(grid), 1)
|
||||||
|
fun(*args, block=block, grid=grid, stream=stream, **kwargs)
|
||||||
|
|
||||||
|
def crep(s):
|
||||||
|
"""Multiline literal escape for inline PTX assembly."""
|
||||||
|
if isinstance(s, unicode):
|
||||||
|
s = s.encode('utf-8')
|
||||||
|
return '"%s"' % s.encode("string_escape")
|
||||||
|
|
||||||
class Template(tempita.Template):
|
class Template(tempita.Template):
|
||||||
|
"""
|
||||||
|
A Tempita template with extra stuff in the default namespace.
|
||||||
|
"""
|
||||||
default_namespace = tempita.Template.default_namespace.copy()
|
default_namespace = tempita.Template.default_namespace.copy()
|
||||||
Template.default_namespace.update({'np': np, 'crep': crep})
|
Template.default_namespace.update({'np': np, 'crep': crep})
|
||||||
|
|
||||||
class HunkOCode(object):
|
# Passive container for device code.
|
||||||
"""An apparently passive container for device code."""
|
DevLib = namedtuple('DevLib', 'deps headers decls defs')
|
||||||
# Use property objects to make these dynamic
|
|
||||||
headers = ''
|
|
||||||
decls = ''
|
|
||||||
defs = ''
|
|
||||||
|
|
||||||
def assemble_code(*sections):
|
def devlib(deps=(), headers='', decls='', defs=''):
|
||||||
return ''.join([''.join([getattr(sect, kind) for sect in sections])
|
"""Create a library of device code."""
|
||||||
for kind in ['headers', 'decls', 'defs']])
|
# This exists because namedtuple doesn't support default args
|
||||||
|
return DevLib(deps, headers, decls, defs)
|
||||||
|
|
||||||
def apply_affine(x, y, xo, yo, packer):
|
def assemble_code(*libs):
|
||||||
return Template("""
|
seen = set()
|
||||||
{{xo}} = {{packer.xx}} * {{x}} + {{packer.xy}} * {{y}} + {{packer.xo}};
|
out = []
|
||||||
{{yo}} = {{packer.yx}} * {{x}} + {{packer.yy}} * {{y}} + {{packer.yo}};
|
def go(lib):
|
||||||
""").substitute(locals())
|
map(go, lib.deps)
|
||||||
|
code = lib[1:]
|
||||||
|
if code not in seen:
|
||||||
|
seen.add(code)
|
||||||
|
out.append(code)
|
||||||
|
go(stdlib)
|
||||||
|
map(go, libs)
|
||||||
|
return ''.join(sum(zip(*out), ()))
|
||||||
|
|
||||||
class BaseCode(HunkOCode):
|
DEFAULT_CMP_OPTIONS = ('-use_fast_math', '-maxrregcount', '42')
|
||||||
headers = """
|
DEFAULT_SAVE_KERNEL = True
|
||||||
|
def compile(name, src, opts=DEFAULT_CMP_OPTIONS, save=DEFAULT_SAVE_KERNEL):
|
||||||
|
"""
|
||||||
|
Compile a module. Returns a copy of the source (for inspection or
|
||||||
|
display) and the compiled cubin.
|
||||||
|
"""
|
||||||
|
dir = tempfile.gettempdir()
|
||||||
|
if save:
|
||||||
|
with open(os.path.join(dir, name + '_kern.cu'), 'w') as fp:
|
||||||
|
fp.write(src)
|
||||||
|
cubin = pycuda.compiler.compile(src, options=list(opts))
|
||||||
|
if save:
|
||||||
|
with open(os.path.join(dir, name + '_kern.cubin'), 'w') as fp:
|
||||||
|
fp.write(cubin)
|
||||||
|
return cubin
|
||||||
|
|
||||||
|
class ClsMod(object):
|
||||||
|
"""
|
||||||
|
Convenience class or mixin that automatically compiles and loads a module
|
||||||
|
once per class, saving potentially expensive code regeneration. Only use
|
||||||
|
if your class does not employ run-time code generation.
|
||||||
|
"""
|
||||||
|
mod = None
|
||||||
|
# Supply your own DevLib on this property
|
||||||
|
lib = None
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
super(ClsMod, self).__init__()
|
||||||
|
self.load()
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def load(cls, name=None):
|
||||||
|
if cls.mod is None:
|
||||||
|
if name is None:
|
||||||
|
name = cls.__name__.lower()
|
||||||
|
cubin = compile(name, assemble_code(cls.lib))
|
||||||
|
cls.mod = cuda.module_from_buffer(cubin)
|
||||||
|
|
||||||
|
# This lib is included with *every* assembled module. It contains no global
|
||||||
|
# functions, so it shouldn't slow down compilation time too much.
|
||||||
|
stdlib = devlib(headers="""
|
||||||
#include<cuda.h>
|
#include<cuda.h>
|
||||||
#include<stdint.h>
|
#include<stdint.h>
|
||||||
#include<stdio.h>
|
#include<stdio.h>
|
||||||
"""
|
""", decls=r"""
|
||||||
|
|
||||||
decls = """
|
|
||||||
float3 rgb2hsv(float3 rgb);
|
|
||||||
float3 hsv2rgb(float3 hsv);
|
|
||||||
"""
|
|
||||||
|
|
||||||
defs = Template(r"""
|
|
||||||
#undef M_E
|
#undef M_E
|
||||||
#undef M_LOG2E
|
#undef M_LOG2E
|
||||||
#undef M_LOG10E
|
#undef M_LOG10E
|
||||||
@ -84,33 +148,79 @@ float3 hsv2rgb(float3 hsv);
|
|||||||
#define bfe_decl(d, s, o, w) \
|
#define bfe_decl(d, s, o, w) \
|
||||||
int d; \
|
int d; \
|
||||||
bfe(d, s, o, w)
|
bfe(d, s, o, w)
|
||||||
|
""", defs=r'''
|
||||||
// TODO: use launch parameter preconfig to eliminate unnecessary parts
|
__device__ uint32_t gtid() {
|
||||||
__device__
|
|
||||||
uint32_t gtid() {
|
|
||||||
return threadIdx.x + blockDim.x *
|
return threadIdx.x + blockDim.x *
|
||||||
(threadIdx.y + blockDim.y *
|
(threadIdx.y + blockDim.y *
|
||||||
(threadIdx.z + blockDim.z *
|
(threadIdx.z + blockDim.z *
|
||||||
(blockIdx.x + (gridDim.x * blockIdx.y))));
|
(blockIdx.x + (gridDim.x * blockIdx.y))));
|
||||||
}
|
}
|
||||||
|
|
||||||
__device__
|
__device__ uint32_t trunca(float f) {
|
||||||
uint32_t trunca(float f) {
|
|
||||||
// truncate as used in address calculations. note the use of a signed
|
// truncate as used in address calculations. note the use of a signed
|
||||||
// conversion is intentional here (simplifies image bounds checking).
|
// conversion is intentional here (simplifies image bounds checking).
|
||||||
uint32_t ret;
|
uint32_t ret;
|
||||||
asm("cvt.rni.s32.f32 %0, %1;" : "=r"(ret) : "f"(f));
|
asm("cvt.rni.s32.f32 %0, %1;" : "=r"(ret) : "f"(f));
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
''')
|
||||||
|
|
||||||
__global__
|
def mkbinsearchlib(rounds):
|
||||||
void fill_dptr(uint32_t* dptr, int size, uint32_t value) {
|
"""
|
||||||
|
Search through the fixed-size list 'hay' to find the rightmost index which
|
||||||
|
contains a value strictly smaller than the input 'needle'. The list must
|
||||||
|
be exactly '2^rounds' long, although padding at the top with very large
|
||||||
|
numbers or even +inf effectively shortens it.
|
||||||
|
"""
|
||||||
|
# TODO: this doesn't optimize well on a 64-bit arch, not that it's a
|
||||||
|
# performance-critical chunk of code or anything
|
||||||
|
src = Template(r'''
|
||||||
|
__device__ int bitwise_binsearch(const float *hay, float needle) {
|
||||||
|
int lo = 0;
|
||||||
|
|
||||||
|
// TODO: improve efficiency on 64-bit arches
|
||||||
|
{{for i in range(search_rounds-1, -1, -1)}}
|
||||||
|
if (needle > hay[lo + {{1 << i}}])
|
||||||
|
lo += {{1 << i}};
|
||||||
|
{{endfor}}
|
||||||
|
return lo;
|
||||||
|
}
|
||||||
|
''', 'bitwise_binsearch')
|
||||||
|
return devlib(defs=src.substitute(search_rounds=rounds))
|
||||||
|
|
||||||
|
# 2^search_rounds is the maximum number of knots allowed in a single spline.
|
||||||
|
# This includes the four required knots, so a 5 round search supports 28
|
||||||
|
# interior knots in the domain (0, 1). 2^5 fits nicely on a single cache line.
|
||||||
|
DEFAULT_SEARCH_ROUNDS = 5
|
||||||
|
binsearchlib = mkbinsearchlib(DEFAULT_SEARCH_ROUNDS)
|
||||||
|
|
||||||
|
|
||||||
|
filldptrlib = devlib(defs=r'''
|
||||||
|
__global__ void
|
||||||
|
fill_dptr(uint32_t* dptr, int size, uint32_t value) {
|
||||||
int i = (gridDim.x * blockIdx.y + blockIdx.x) * blockDim.x + threadIdx.x;
|
int i = (gridDim.x * blockIdx.y + blockIdx.x) * blockDim.x + threadIdx.x;
|
||||||
if (i < size) {
|
if (i < size) {
|
||||||
dptr[i] = value;
|
dptr[i] = value;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
''')
|
||||||
|
def fill_dptr(mod, dptr, size, stream=None, value=np.uint32(0)):
|
||||||
|
"""
|
||||||
|
A memory zeroer which can be embedded in a stream, unlike the various
|
||||||
|
memset routines. Size is the number of 4-byte words in the pointer;
|
||||||
|
value is the word to fill it with. If value is not an np.uint32, it
|
||||||
|
will be coerced to a buffer and the first four bytes taken.
|
||||||
|
"""
|
||||||
|
if not isinstance(value, np.uint32):
|
||||||
|
if isinstance(value, int):
|
||||||
|
value = np.uint32(value)
|
||||||
|
else:
|
||||||
|
value = np.frombuffer(buffer(value), np.uint32)[0]
|
||||||
|
blocks = int(np.ceil(np.sqrt(size / 1024.)))
|
||||||
|
launch('fill_dptr', mod, stream, (1024, 1, 1), (blocks, blocks),
|
||||||
|
dptr, np.int32(size), value)
|
||||||
|
|
||||||
|
writehalflib = devlib(defs=r'''
|
||||||
/* read_half and write_half decode and encode, respectively, two
|
/* read_half and write_half decode and encode, respectively, two
|
||||||
* floating-point values from a 32-bit value (typed as a 'float' for
|
* floating-point values from a 32-bit value (typed as a 'float' for
|
||||||
* convenience but not really). The values are packed into u16s as a
|
* convenience but not really). The values are packed into u16s as a
|
||||||
@ -122,8 +232,8 @@ void fill_dptr(uint32_t* dptr, int size, uint32_t value) {
|
|||||||
* approach when the alpha channel is present.
|
* approach when the alpha channel is present.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
__device__
|
__device__ void
|
||||||
void read_half(float &x, float &y, float xy, float den) {
|
read_half(float &x, float &y, float xy, float den) {
|
||||||
asm("\n\t{"
|
asm("\n\t{"
|
||||||
"\n\t .reg .u16 x, y;"
|
"\n\t .reg .u16 x, y;"
|
||||||
"\n\t .reg .f32 rc;"
|
"\n\t .reg .f32 rc;"
|
||||||
@ -137,8 +247,8 @@ void read_half(float &x, float &y, float xy, float den) {
|
|||||||
: "=f"(x), "=f"(y) : "f"(xy), "f"(den));
|
: "=f"(x), "=f"(y) : "f"(xy), "f"(den));
|
||||||
}
|
}
|
||||||
|
|
||||||
__device__
|
__device__ void
|
||||||
void write_half(float &xy, float x, float y, float den) {
|
write_half(float &xy, float x, float y, float den) {
|
||||||
asm("\n\t{"
|
asm("\n\t{"
|
||||||
"\n\t .reg .u16 x, y;"
|
"\n\t .reg .u16 x, y;"
|
||||||
"\n\t .reg .f32 rc, xf, yf;"
|
"\n\t .reg .f32 rc, xf, yf;"
|
||||||
@ -152,87 +262,41 @@ void write_half(float &xy, float x, float y, float den) {
|
|||||||
"\n\t}"
|
"\n\t}"
|
||||||
: "=f"(xy) : "f"(x), "f"(y), "f"(den));
|
: "=f"(xy) : "f"(x), "f"(y), "f"(den));
|
||||||
}
|
}
|
||||||
|
''')
|
||||||
|
|
||||||
|
|
||||||
/* This conversion uses the JPEG full-range standard. Note that UV have range
|
def mkringbuflib(rb_size):
|
||||||
* [-0.5, 0.5], so consider biasing the results. */
|
|
||||||
__device__
|
|
||||||
float3 rgb2yuv(float3 rgb) {
|
|
||||||
return make_float3(
|
|
||||||
0.299f * rgb.x + 0.587f * rgb.y + 0.114f * rgb.z,
|
|
||||||
-0.168736f * rgb.x - 0.331264f * rgb.y + 0.5f * rgb.z,
|
|
||||||
0.5f * rgb.x - 0.418688f * rgb.y - 0.081312f * rgb.z);
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__
|
|
||||||
float3 yuv2rgb(float3 yuv) {
|
|
||||||
return make_float3(
|
|
||||||
yuv.x + 1.402f * yuv.z,
|
|
||||||
yuv.x - 0.34414f * yuv.y - 0.71414f * yuv.z,
|
|
||||||
yuv.x + 1.772f * yuv.y);
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__
|
|
||||||
float3 rgb2hsv(float3 rgb) {
|
|
||||||
float M = fmaxf(fmaxf(rgb.x, rgb.y), rgb.z);
|
|
||||||
float m = fminf(fminf(rgb.x, rgb.y), rgb.z);
|
|
||||||
float C = M - m;
|
|
||||||
|
|
||||||
float s = M > 0.0f ? C / M : 0.0f;
|
|
||||||
|
|
||||||
float h = 0.0f;
|
|
||||||
if (s != 0.0f) {
|
|
||||||
C = 1.0f / C;
|
|
||||||
float rc = (M - rgb.x) * C;
|
|
||||||
float gc = (M - rgb.y) * C;
|
|
||||||
float bc = (M - rgb.z) * C;
|
|
||||||
|
|
||||||
if (rgb.x == M) h = bc - gc;
|
|
||||||
else if (rgb.y == M) h = 2 + rc - bc;
|
|
||||||
else h = 4 + gc - rc;
|
|
||||||
|
|
||||||
if (h < 0) h += 6.0f;
|
|
||||||
}
|
|
||||||
return make_float3(h, s, M);
|
|
||||||
}
|
|
||||||
|
|
||||||
__device__
|
|
||||||
float3 hsv2rgb(float3 hsv) {
|
|
||||||
|
|
||||||
float whole = floorf(hsv.x);
|
|
||||||
float frac = hsv.x - whole;
|
|
||||||
float val = hsv.z;
|
|
||||||
float min = val * (1 - hsv.y);
|
|
||||||
float mid = val * (1 - (hsv.y * frac));
|
|
||||||
float alt = val * (1 - (hsv.y * (1 - frac)));
|
|
||||||
|
|
||||||
float3 out;
|
|
||||||
if (whole == 0.0f) { out.x = val; out.y = alt; out.z = min; }
|
|
||||||
else if (whole == 1.0f) { out.x = mid; out.y = val; out.z = min; }
|
|
||||||
else if (whole == 2.0f) { out.x = min; out.y = val; out.z = alt; }
|
|
||||||
else if (whole == 3.0f) { out.x = min; out.y = mid; out.z = val; }
|
|
||||||
else if (whole == 4.0f) { out.x = alt; out.y = min; out.z = val; }
|
|
||||||
else { out.x = val; out.y = min; out.z = mid; }
|
|
||||||
return out;
|
|
||||||
}
|
|
||||||
""").substitute()
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def fill_dptr(mod, dptr, size, stream=None, value=np.uint32(0)):
|
|
||||||
"""
|
"""
|
||||||
A memory zeroer which can be embedded in a stream, unlike the various
|
A ringbuffer for access to shared resources.
|
||||||
memset routines. Size is the number of 4-byte words in the pointer;
|
|
||||||
value is the word to fill it with. If value is not an np.uint32, it
|
Some components, such as the MWC contexts, are expensive to generate, and
|
||||||
will be coerced to a buffer and the first four bytes taken.
|
have no affinity to a particular block. Rather than maintain a separate
|
||||||
|
copy of each of these objects for every thread block in a launch, we want
|
||||||
|
to only keep enough copies of these resources around to service every
|
||||||
|
thread block that could possibly be active simultaneously on one card,
|
||||||
|
which is often considerably smaller. This class provides a simple
|
||||||
|
ringbuffer type and an increment function, used in a couple places to
|
||||||
|
implement this kind of sharing.
|
||||||
"""
|
"""
|
||||||
fill = mod.get_function("fill_dptr")
|
return devlib(headers="#define RB_SIZE_MASK %d" % (rb_size - 1), decls='''
|
||||||
if not isinstance(value, np.uint32):
|
typedef struct {
|
||||||
if isinstance(value, int):
|
int head;
|
||||||
value = np.uint32(value)
|
int tail;
|
||||||
else:
|
} ringbuf;
|
||||||
value = np.frombuffer(buffer(value), np.uint32)[0]
|
''', defs=r'''
|
||||||
blocks = int(np.ceil(np.sqrt(size / 1024 + 1)))
|
__shared__ int rb_idx;
|
||||||
fill(dptr, np.int32(size), value, stream=stream,
|
__device__ int rb_incr(int &rb_base, int tidx) {
|
||||||
block=(1024, 1, 1), grid=(blocks, blocks, 1))
|
if (threadIdx.y == 1 && threadIdx.x == 1)
|
||||||
|
rb_idx = 256 * (atomicAdd(&rb_base, 1) & RB_SIZE_MASK);
|
||||||
|
__syncthreads();
|
||||||
|
return rb_idx + tidx;
|
||||||
|
}
|
||||||
|
''')
|
||||||
|
|
||||||
|
# For now, the number of entries is fixed to a value known to work on all
|
||||||
|
# Fermi cards. Autodetection, or perhaps just a global limit increase, will be
|
||||||
|
# done when I get my hands on a Kepler device. The fixed size assumes blocks
|
||||||
|
# of 256 threads, although even at that size there are pathological cases that
|
||||||
|
# could break the assumption.
|
||||||
|
DEFAULT_RB_SIZE = 64
|
||||||
|
ringbuflib = mkringbuflib(DEFAULT_RB_SIZE)
|
||||||
|
105
cuburn/filters.py
Normal file
105
cuburn/filters.py
Normal file
@ -0,0 +1,105 @@
|
|||||||
|
import numpy as np
|
||||||
|
from numpy import float32 as f32, int32 as i32
|
||||||
|
|
||||||
|
import pycuda.driver as cuda
|
||||||
|
import pycuda.compiler
|
||||||
|
from pycuda.gpuarray import vec
|
||||||
|
|
||||||
|
import code.filters
|
||||||
|
from code.util import ClsMod, argset, launch
|
||||||
|
|
||||||
|
class Filter(object):
|
||||||
|
def apply(self, fb, gnm, dim, tc, stream=None):
|
||||||
|
"""
|
||||||
|
Queue the application of this filter. When the live stream finishes
|
||||||
|
executing the last item enqueued by this method, the result must be
|
||||||
|
live in the buffer pointed to by ``fb.d_front`` at the return of this
|
||||||
|
function.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
class Bilateral(Filter, ClsMod):
|
||||||
|
lib = code.filters.bilaterallib
|
||||||
|
def __init__(self, directions=8, r=15, sstd=6, cstd=0.05,
|
||||||
|
dstd=1.5, dpow=0.8, gspeed=4.0):
|
||||||
|
# TODO: expose these parameters on the genome, or at least on the
|
||||||
|
# profile, and set them by a less ugly mechanism
|
||||||
|
for n in 'directions r sstd cstd dstd dpow gspeed'.split():
|
||||||
|
setattr(self, n, locals()[n])
|
||||||
|
super(Bilateral, self).__init__()
|
||||||
|
|
||||||
|
def apply(self, fb, gnm, dim, tc, stream=None):
|
||||||
|
# Helper variables and functions to keep it clean
|
||||||
|
sb = 16 * dim.astride
|
||||||
|
bs = sb * dim.ah
|
||||||
|
bl, gr = (32, 8, 1), (dim.astride / 32, dim.ah / 8)
|
||||||
|
|
||||||
|
mkdsc = lambda c: argset(cuda.ArrayDescriptor(), height=dim.ah,
|
||||||
|
width=dim.astride, num_channels=c,
|
||||||
|
format=cuda.array_format.FLOAT)
|
||||||
|
def mktref(n):
|
||||||
|
tref = self.mod.get_texref(n)
|
||||||
|
tref.set_filter_mode(cuda.filter_mode.POINT)
|
||||||
|
tref.set_address_mode(0, cuda.address_mode.WRAP)
|
||||||
|
tref.set_address_mode(1, cuda.address_mode.WRAP)
|
||||||
|
return tref
|
||||||
|
|
||||||
|
dsc = mkdsc(4)
|
||||||
|
tref = mktref('bilateral_src')
|
||||||
|
grad_dsc = mkdsc(1)
|
||||||
|
grad_tref = mktref('blur_src')
|
||||||
|
|
||||||
|
for pattern in range(self.directions):
|
||||||
|
# Scale spatial parameter so that a "pixel" is equivalent to an
|
||||||
|
# actual pixel at 1080p
|
||||||
|
sstd = self.sstd * dim.w / 1920.
|
||||||
|
|
||||||
|
tref.set_address_2d(fb.d_front, dsc, sb)
|
||||||
|
|
||||||
|
# Blur density two octaves along sampling vector, ultimately
|
||||||
|
# storing in the side buffer
|
||||||
|
launch('den_blur', self.mod, stream, bl, gr,
|
||||||
|
fb.d_back, i32(pattern), i32(0), texrefs=[tref])
|
||||||
|
grad_tref.set_address_2d(fb.d_back, grad_dsc, sb / 4)
|
||||||
|
launch('den_blur_1c', self.mod, stream, bl, gr,
|
||||||
|
fb.d_side, i32(pattern), i32(1), texrefs=[grad_tref])
|
||||||
|
grad_tref.set_address_2d(fb.d_side, grad_dsc, sb / 4)
|
||||||
|
|
||||||
|
launch('bilateral', self.mod, stream, bl, gr,
|
||||||
|
fb.d_back, i32(pattern), i32(self.r),
|
||||||
|
f32(sstd), f32(self.cstd), f32(self.dstd),
|
||||||
|
f32(self.dpow), f32(self.gspeed),
|
||||||
|
texrefs=[tref, grad_tref])
|
||||||
|
fb.flip()
|
||||||
|
|
||||||
|
class Logscale(Filter, ClsMod):
|
||||||
|
lib = code.filters.logscalelib
|
||||||
|
def apply(self, fb, gnm, dim, tc, stream=None):
|
||||||
|
"""Log-scale in place."""
|
||||||
|
k1 = f32(gnm.color.brightness(tc) * 268 / 256)
|
||||||
|
# Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
|
||||||
|
# s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
|
||||||
|
area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w)
|
||||||
|
k2 = f32(1.0 / (area * gnm.spp(tc)))
|
||||||
|
nbins = dim.ah * dim.astride
|
||||||
|
launch('logscale', self.mod, stream, 256, nbins/256,
|
||||||
|
fb.d_front, fb.d_front, k1, k2)
|
||||||
|
|
||||||
|
class ColorClip(Filter, ClsMod):
|
||||||
|
lib = code.filters.colorcliplib
|
||||||
|
def apply(self, fb, gnm, dim, tc, stream=None):
|
||||||
|
# TODO: implement integration over cubic splines?
|
||||||
|
gam = f32(1 / gnm.color.gamma(tc))
|
||||||
|
vib = f32(gnm.color.vibrance(tc))
|
||||||
|
hipow = f32(gnm.color.highlight_power(tc))
|
||||||
|
lin = f32(gnm.color.gamma_threshold(tc))
|
||||||
|
lingam = f32(lin ** (gam-1.0) if lin > 0 else 0)
|
||||||
|
bkgd = vec.make_float3(
|
||||||
|
gnm.color.background.r(tc),
|
||||||
|
gnm.color.background.g(tc),
|
||||||
|
gnm.color.background.b(tc))
|
||||||
|
|
||||||
|
nbins = dim.ah * dim.astride
|
||||||
|
blocks = int(np.ceil(np.sqrt(nbins / 256.)))
|
||||||
|
launch('colorclip', self.mod, stream, 256, (blocks, blocks),
|
||||||
|
fb.d_front, gam, vib, hipow, lin, lingam, bkgd, i32(nbins))
|
49
cuburn/output.py
Normal file
49
cuburn/output.py
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
import numpy as np
|
||||||
|
from numpy import float32 as f32, int32 as i32
|
||||||
|
|
||||||
|
import pycuda.driver as cuda
|
||||||
|
|
||||||
|
from code.util import ClsMod, launch
|
||||||
|
from code.output import f32tou8lib
|
||||||
|
|
||||||
|
import scipy.misc
|
||||||
|
|
||||||
|
if not hasattr(scipy.misc, 'toimage'):
|
||||||
|
raise ImportError("Could not find scipy.misc.toimage. "
|
||||||
|
"Are scipy and PIL installed?")
|
||||||
|
|
||||||
|
class Output(object):
|
||||||
|
def convert(self, fb, gnm, dim, stream=None):
|
||||||
|
"""
|
||||||
|
Convert a filtered buffer to whatever output format is needed by the
|
||||||
|
writer.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def copy(self, fb, dim, pool, stream=None):
|
||||||
|
"""
|
||||||
|
Schedule a copy from the device buffer to host memory, returning the
|
||||||
|
target buffer.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
class PILOutput(Output, ClsMod):
|
||||||
|
lib = f32tou8lib
|
||||||
|
|
||||||
|
def convert(self, fb, gnm, dim, stream=None):
|
||||||
|
launch('f32_to_u8', self.mod, stream,
|
||||||
|
(32, 8, 1), (int(np.ceil(dim.w/32.)), int(np.ceil(dim.h/8.))),
|
||||||
|
fb.d_rb, fb.d_seeds, fb.d_back, fb.d_front,
|
||||||
|
i32(fb.gutter), i32(dim.w), i32(dim.astride), i32(dim.h))
|
||||||
|
|
||||||
|
def copy(self, fb, dim, pool, stream=None):
|
||||||
|
h_out = pool.allocate((dim.h, dim.w, 4), 'u1')
|
||||||
|
cuda.memcpy_dtoh_async(h_out, fb.d_back, stream)
|
||||||
|
return h_out
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def save(buf, name, type=None, quality=98):
|
||||||
|
if type == 'jpeg' or (type is None and name.endswith('.jpg')):
|
||||||
|
buf = buf[:,:,:3]
|
||||||
|
img = scipy.misc.toimage(buf, cmin=0, cmax=1)
|
||||||
|
img.save(name, type, quality=quality)
|
683
cuburn/render.py
683
cuburn/render.py
@ -1,383 +1,372 @@
|
|||||||
|
"""
|
||||||
|
Resources and tools to perform rendering.
|
||||||
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import re
|
import re
|
||||||
import time as timemod
|
import time
|
||||||
import tempfile
|
import tempfile
|
||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
from itertools import cycle, repeat, chain, izip, imap, ifilter
|
|
||||||
from ctypes import *
|
|
||||||
from cStringIO import StringIO
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from numpy import float32 as f32, int32 as i32, uint32 as u32, uint64 as u64
|
from numpy import float32 as f32, int32 as i32, uint32 as u32, uint64 as u64
|
||||||
from scipy import ndimage
|
|
||||||
|
|
||||||
import pycuda.compiler
|
|
||||||
import pycuda.driver as cuda
|
import pycuda.driver as cuda
|
||||||
import pycuda.tools
|
import pycuda.tools
|
||||||
|
|
||||||
import cuburn.genome
|
import filters
|
||||||
from cuburn import affine
|
import output
|
||||||
from cuburn.code import util, mwc, iter, interp, filtering, sort
|
from code import util, mwc, iter, interp, sort
|
||||||
|
from code.util import ClsMod, devlib, filldptrlib, assemble_code, launch
|
||||||
|
|
||||||
RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time')
|
RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time')
|
||||||
Dimensions = namedtuple('Dimensions', 'w h aw ah astride')
|
Dimensions = namedtuple('Dimensions', 'w h aw ah astride')
|
||||||
|
|
||||||
def _sync_stream(dst, src):
|
class Framebuffers(object):
|
||||||
dst.wait_for_event(cuda.Event(cuda.event_flags.DISABLE_TIMING).record(src))
|
"""
|
||||||
|
The largest memory allocations, and a stream to serialize their use.
|
||||||
|
|
||||||
class Renderer(object):
|
``d_front`` and ``d_back`` are separate buffers, each large enough to hold
|
||||||
|
four float32 components per pixel (including any gutter pixels added for
|
||||||
|
alignment or padding). ``d_side`` is another buffer large enough to hold
|
||||||
|
two float32 components per pixel.
|
||||||
|
|
||||||
|
Every user of this set of buffers may use and overwrite the buffers in
|
||||||
|
any way, as long as the output for the next stage winds up in the front
|
||||||
|
buffer. The front and back buffers can be exchanged by the ``flip()``
|
||||||
|
method (which simply exchanges the pointers); while no similar method
|
||||||
|
exists for the side buffer, you're free to do the same by taking local
|
||||||
|
copies of the references and exchanging them yourself.
|
||||||
|
|
||||||
|
There's one spot in the stream interleaving where the behavior is
|
||||||
|
different: the ``Output.convert`` call must store its output to the back
|
||||||
|
buffer, which will remain untouched until the dtoh copy of the converted
|
||||||
|
buffer is finished. This happens while the ``iter`` kernel of the next
|
||||||
|
frame writes to the front and side buffers, which means in practice that
|
||||||
|
there's essentially no waiting on any buffers.
|
||||||
|
|
||||||
|
If an output module decides to get all krazy and actually replaces the
|
||||||
|
references to the buffers on this object - to, say, implement a temporally
|
||||||
|
aware tone-mapping or denoising pass - that's probably okay, but just make
|
||||||
|
sure it appears like it's expected to.
|
||||||
"""
|
"""
|
||||||
Control structure for rendering a series of frames.
|
|
||||||
|
# Minimum extension of accumulation buffer beyond output region. Used to
|
||||||
|
# alleviate edge effects during filtering. Actual gutter may be larger to
|
||||||
|
# accomodate alignment requirements; when it is, that extension will be
|
||||||
|
# applied to the lower-right corner of the buffer. This is asymmetrical,
|
||||||
|
# but simplifies trimming logic when it's time for that.
|
||||||
|
gutter = 10
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def calc_dim(cls, width, height):
|
||||||
"""
|
"""
|
||||||
|
Given a width and height, return a valid set of dimensions which
|
||||||
|
include at least enough gutter to exceed the minimum, and where
|
||||||
|
(acc_width % 32) == 0 and (acc_height % 8) == 0.
|
||||||
|
"""
|
||||||
|
awidth = width + 2 * cls.gutter
|
||||||
|
aheight = 8 * int(np.ceil((height + 2 * cls.gutter) / 8.))
|
||||||
|
astride = 32 * int(np.ceil(awidth / 32.))
|
||||||
|
return Dimensions(width, height, awidth, aheight, astride)
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.stream = cuda.Stream()
|
||||||
|
self._clear()
|
||||||
|
|
||||||
|
# These resources rely on the slots/ringbuffer mechanism for sharing,
|
||||||
|
# and so can be shared across any number of launches, genomes, and
|
||||||
|
# render kernels. Notably, seeds are self-synchronizing, so they're not
|
||||||
|
# attached to either stream object.
|
||||||
|
self.d_rb = cuda.to_device(np.array([0, 0], dtype=u32))
|
||||||
|
seeds = mwc.make_seeds(util.DEFAULT_RB_SIZE * 256)
|
||||||
|
self.d_seeds = cuda.to_device(seeds)
|
||||||
|
self._len_d_points = util.DEFAULT_RB_SIZE * 256 * 16
|
||||||
|
self.d_points = cuda.mem_alloc(self._len_d_points)
|
||||||
|
|
||||||
|
def _clear(self):
|
||||||
|
self.nbins = self.d_front = self.d_back = self.d_side = None
|
||||||
|
|
||||||
|
def free(self, stream=None):
|
||||||
|
if stream is not None:
|
||||||
|
stream.synchronize()
|
||||||
|
else:
|
||||||
|
cu
|
||||||
|
for p in (self.d_front, self.d_back, self.d_side):
|
||||||
|
if p is not None:
|
||||||
|
p.free()
|
||||||
|
self._clear()
|
||||||
|
|
||||||
|
def alloc(self, dim, stream=None):
|
||||||
|
"""
|
||||||
|
Ensure that this object's framebuffers are large enough to handle the
|
||||||
|
given dimensions, allocating new ones if not.
|
||||||
|
|
||||||
|
If ``stream`` is not None and a reallocation is necessary, the stream
|
||||||
|
will be synchronized before the old buffers are deallocated.
|
||||||
|
"""
|
||||||
|
nbins = dim.ah * dim.astride
|
||||||
|
if self.nbins >= nbins: return
|
||||||
|
if self.nbins is not None: self.free()
|
||||||
|
try:
|
||||||
|
self.d_front = cuda.mem_alloc(16 * nbins)
|
||||||
|
self.d_back = cuda.mem_alloc(16 * nbins)
|
||||||
|
self.d_side = cuda.mem_alloc(8 * nbins)
|
||||||
|
self.nbins = nbins
|
||||||
|
except cuda.MemoryError, e:
|
||||||
|
# If a frame that's too large sneaks by the task distributor, we
|
||||||
|
# don't want to kill the server, but we also don't want to leave
|
||||||
|
# it stuck without any free memory to complete the next alloc.
|
||||||
|
# TODO: measure free mem and only take tasks that fit (but that
|
||||||
|
# should be done elsewhere)
|
||||||
|
self.free(stream)
|
||||||
|
raise e
|
||||||
|
|
||||||
|
def set_dim(self, width, height, stream=None):
|
||||||
|
"""
|
||||||
|
Compute padded dimensions for given width and height, ensure that the
|
||||||
|
buffers are large enough (and reallocate if not), and return the
|
||||||
|
calculated dimensions.
|
||||||
|
|
||||||
|
Note that the returned dimensions are always the same for a given
|
||||||
|
width, height, and minimum gutter, even if the underlying buffers are
|
||||||
|
larger due to a previous allocation.
|
||||||
|
"""
|
||||||
|
dim = self.calc_dim(width, height)
|
||||||
|
self.alloc(dim, stream)
|
||||||
|
return dim
|
||||||
|
|
||||||
|
def flip(self):
|
||||||
|
"""Flip the front and back buffers."""
|
||||||
|
self.d_front, self.d_back = self.d_back, self.d_front
|
||||||
|
|
||||||
|
class DevSrc(object):
|
||||||
|
"""
|
||||||
|
The buffers which represent a genome on-device, in the formats needed to
|
||||||
|
serve as a source for interpolating temporal samples.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Maximum number of knots per parameter. This also covers the maximum
|
||||||
|
# number of palettes allowed.
|
||||||
|
max_knots = 1 << util.DEFAULT_SEARCH_ROUNDS
|
||||||
|
|
||||||
|
# Maximum number of parameters per genome. This number is exceedingly
|
||||||
|
# high, and should never even come close to being hit.
|
||||||
|
max_params = 1024
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.d_times = cuda.mem_alloc(4 * self.max_knots * self.max_params)
|
||||||
|
self.d_knots = cuda.mem_alloc(4 * self.max_knots * self.max_params)
|
||||||
|
self.d_ptimes = cuda.mem_alloc(4 * self.max_knots)
|
||||||
|
self.d_pals = cuda.mem_alloc(4 * 4 * 256 * self.max_knots)
|
||||||
|
|
||||||
|
class DevInfo(object):
|
||||||
|
"""
|
||||||
|
The buffers which hold temporal samples on-device, as used by iter.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# The palette texture/surface covers the color coordinate from [0,1] with
|
||||||
|
# equidistant horizontal samples, and spans the temporal range of the
|
||||||
|
# frame linearly with this many rows. Increasing these values increases the
|
||||||
|
# number of uniquely-dithered samples when using pre-dithered surfaces, as
|
||||||
|
# is done in 'atomic' accumulation.
|
||||||
|
palette_width = 256 # TODO: make this setting be respected
|
||||||
|
palette_height = 64
|
||||||
|
|
||||||
|
# This used to be determined automagically, but simultaneous occupancy
|
||||||
|
# and a much smaller block size simplifies this.
|
||||||
|
ntemporal_samples = 1024
|
||||||
|
|
||||||
# Number of iterations to iterate without write after generating a new
|
# Number of iterations to iterate without write after generating a new
|
||||||
# point. This number is currently fixed pretty deeply in the set of magic
|
# point. This number is currently fixed pretty deeply in the set of magic
|
||||||
# constants which govern buffer sizes; changing the value here won't
|
# constants which govern buffer sizes; changing the value here won't
|
||||||
# actually change the code on the device to do something different.
|
# actually change the code on the device to do something different.
|
||||||
|
# It's here just for documentation purposes.
|
||||||
fuse = 256
|
fuse = 256
|
||||||
|
|
||||||
# The palette texture/surface covers the color coordinate from [0,1] with
|
|
||||||
# (for now, a fixed 256) equidistant horizontal samples, and spans the
|
|
||||||
# temporal range of the frame linearly with this many rows. Increasing
|
|
||||||
# this value increases the number of uniquely-dithered samples when using
|
|
||||||
# pre-dithered surfaces.
|
|
||||||
palette_height = 64
|
|
||||||
|
|
||||||
# Palette color interpolation mode (see code.interp.Palette)
|
|
||||||
palette_interp_mode = 'yuv'
|
|
||||||
|
|
||||||
# Maximum width of DE and other spatial filters, and thus in turn the
|
|
||||||
# amount of padding applied. Note that, for now, this must not be changed!
|
|
||||||
# The filtering code makes deep assumptions about this value.
|
|
||||||
gutter = 10
|
|
||||||
|
|
||||||
# Accumulation mode. Leave it at 'atomic' for now.
|
|
||||||
acc_mode = 'atomic'
|
|
||||||
|
|
||||||
# At most this many separate buffers for xforms will be allocated, after
|
|
||||||
# which further xforms will wrap to the first when writing. Currently it
|
|
||||||
# is compiled in, so power-of-two and no runtime maximization. Current
|
|
||||||
# value of 16 fits into a 1GB card at 1080p.
|
|
||||||
max_nxf = 1
|
|
||||||
|
|
||||||
# TODO
|
|
||||||
chaos_used = False
|
|
||||||
|
|
||||||
cmp_options = ('-use_fast_math', '-maxrregcount', '42')
|
|
||||||
keep = False
|
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self._iter = self.pal = self.src = self.cubin = self.mod = None
|
self.d_params = cuda.mem_alloc(
|
||||||
|
self.ntemporal_samples * DevSrc.max_params * 4)
|
||||||
# Ensure class options don't get contaminated on an instance
|
self.palette_surf_dsc = util.argset(cuda.ArrayDescriptor3D(),
|
||||||
self.cmp_options = list(self.cmp_options)
|
height=self.palette_height, width=self.palette_width, depth=0,
|
||||||
|
|
||||||
def compile(self, genome, keep=None, cmp_options=None):
|
|
||||||
"""
|
|
||||||
Compile a kernel capable of rendering every frame in this animation.
|
|
||||||
The resulting compiled kernel is stored in the ``cubin`` property;
|
|
||||||
the source is available as ``src``, and is also returned for
|
|
||||||
inspection and display.
|
|
||||||
|
|
||||||
This operation is idempotent, and has no side effects outside of
|
|
||||||
setting properties on this instance (unless there's a compiler error,
|
|
||||||
which is a bug); it should therefore be threadsafe as well.
|
|
||||||
It is, however, rather slow.
|
|
||||||
"""
|
|
||||||
keep = self.keep if keep is None else keep
|
|
||||||
cmp_options = self.cmp_options if cmp_options is None else cmp_options
|
|
||||||
|
|
||||||
self._iter = iter.IterCode(self, genome)
|
|
||||||
self._iter.packer.finalize()
|
|
||||||
self.pal = interp.Palette(self.palette_interp_mode)
|
|
||||||
self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer,
|
|
||||||
self.pal, self._iter)
|
|
||||||
with open(os.path.join(tempfile.gettempdir(), 'kernel.cu'), 'w') as fp:
|
|
||||||
fp.write(self.src)
|
|
||||||
self.cubin = pycuda.compiler.compile(
|
|
||||||
self.src, keep=keep, options=cmp_options,
|
|
||||||
cache_dir=False if keep else None)
|
|
||||||
|
|
||||||
def load(self, genome, jit_options=[]):
|
|
||||||
if not self.cubin:
|
|
||||||
self.compile(genome)
|
|
||||||
self.mod = cuda.module_from_buffer(self.cubin, jit_options)
|
|
||||||
with open('/tmp/iter_kern.cubin', 'wb') as fp:
|
|
||||||
fp.write(self.cubin)
|
|
||||||
return self.src
|
|
||||||
|
|
||||||
def render(self, genome, times, width, height, blend=True):
|
|
||||||
"""
|
|
||||||
Render a frame for each timestamp in the iterable value ``times``. This
|
|
||||||
function returns a generator that will yield a RenderedImage object
|
|
||||||
containing a shared reference to the output buffer for each specified
|
|
||||||
frame.
|
|
||||||
|
|
||||||
The returned buffer is page-locked host memory. Between the time a
|
|
||||||
buffer is yielded and the time the next frame's results are requested,
|
|
||||||
the buffer will not be modified. Thereafter, however, it will be
|
|
||||||
overwritten by an asynchronous DMA operation coming from the CUDA
|
|
||||||
device. If you hang on to it for longer than one frame, copy it.
|
|
||||||
|
|
||||||
``genome`` is the genome to be rendered. Successive calls to the
|
|
||||||
`render()` method on one ``Renderer`` object must use genomes which
|
|
||||||
produce identical compiled code, and this will not be verified by the
|
|
||||||
renderer. In practice, this means you can alter genome parameter
|
|
||||||
values, but the full set of keys must remain identical between runs on
|
|
||||||
the same renderer.
|
|
||||||
|
|
||||||
``times`` is a list of (idx, cen_time) tuples, where ``idx`` is passed
|
|
||||||
unmodified in the RenderedImage return value and ``cen_time`` is the
|
|
||||||
central time of the current frame in spline-time units. (Any
|
|
||||||
clock-time or frame-time units in the genome should be preconverted.)
|
|
||||||
|
|
||||||
If ``blend`` is False, the output buffer will contain unclipped,
|
|
||||||
premultiplied RGBA data, without vibrancy, highlight power, or the
|
|
||||||
alpha elbow applied.
|
|
||||||
"""
|
|
||||||
r = self.render_gen(genome, width, height, blend=blend)
|
|
||||||
next(r)
|
|
||||||
return ifilter(None, imap(r.send, chain(times, [None])))
|
|
||||||
|
|
||||||
def render_gen(self, genome, width, height, blend=True):
|
|
||||||
"""
|
|
||||||
Render frames. This method is wrapped by the ``render()`` method; see
|
|
||||||
its docstring for warnings and details.
|
|
||||||
|
|
||||||
Instead of passing frame times as an iterable, they are passed
|
|
||||||
individually via the ``generator.send()`` method. There is an
|
|
||||||
internal pipeline latency of one frame, so the first call to the
|
|
||||||
``send()`` method will return None, the second call will return the
|
|
||||||
first frame's result, and so on. To retrieve the last frame in a
|
|
||||||
sequence, send ``None``.
|
|
||||||
|
|
||||||
Direct use of this method is useful for implementing render servers.
|
|
||||||
"""
|
|
||||||
|
|
||||||
last_idx = None
|
|
||||||
next_frame = yield
|
|
||||||
if next_frame is None:
|
|
||||||
return
|
|
||||||
|
|
||||||
if not self.mod:
|
|
||||||
self.load(genome)
|
|
||||||
|
|
||||||
filt = filtering.Filtering()
|
|
||||||
|
|
||||||
reset_rb_fun = self.mod.get_function("reset_rb")
|
|
||||||
packer_fun = self.mod.get_function("interp_iter_params")
|
|
||||||
iter_fun = self.mod.get_function("iter")
|
|
||||||
|
|
||||||
# The synchronization model is messy. See helpers/task_model.svg.
|
|
||||||
iter_stream = cuda.Stream()
|
|
||||||
filt_stream = cuda.Stream()
|
|
||||||
if self.acc_mode == 'deferred':
|
|
||||||
write_stream = cuda.Stream()
|
|
||||||
write_fun = self.mod.get_function("write_shmem")
|
|
||||||
else:
|
|
||||||
write_stream = iter_stream
|
|
||||||
|
|
||||||
# These events fire when the corresponding buffer is available for
|
|
||||||
# reading on the host (i.e. the copy is done). On the first pass, 'a'
|
|
||||||
# will be ignored, and subsequently moved to 'b'.
|
|
||||||
event_a = cuda.Event().record(filt_stream)
|
|
||||||
event_b = None
|
|
||||||
|
|
||||||
awidth = width + 2 * self.gutter
|
|
||||||
aheight = 32 * int(np.ceil((height + 2 * self.gutter) / 32.))
|
|
||||||
astride = 32 * int(np.ceil(awidth / 32.))
|
|
||||||
dim = Dimensions(width, height, awidth, aheight, astride)
|
|
||||||
d_acc_size = self.mod.get_global('acc_size')[0]
|
|
||||||
cuda.memcpy_htod_async(d_acc_size, u32(list(dim)), write_stream)
|
|
||||||
|
|
||||||
nbins = astride * aheight
|
|
||||||
nxf = len(filter(lambda g: g != 'final', genome.xforms))
|
|
||||||
nxf = min(nxf, self.max_nxf)
|
|
||||||
d_accum = cuda.mem_alloc(16 * nbins * nxf)
|
|
||||||
d_out = cuda.mem_alloc(16 * nbins)
|
|
||||||
if self.acc_mode == 'atomic':
|
|
||||||
d_atom = cuda.mem_alloc(8 * nbins * nxf)
|
|
||||||
flush_fun = self.mod.get_function("flush_atom")
|
|
||||||
else:
|
|
||||||
# d_atom is also used as a scratch buffer during filtering, so we
|
|
||||||
# need it at least this size
|
|
||||||
d_atom = cuda.mem_alloc(4 * nbins)
|
|
||||||
|
|
||||||
obuf_copy = util.argset(cuda.Memcpy2D(),
|
|
||||||
src_y=self.gutter, src_x_in_bytes=16*self.gutter,
|
|
||||||
src_pitch=16*astride, dst_pitch=16*width,
|
|
||||||
width_in_bytes=16*width, height=height)
|
|
||||||
obuf_copy.set_src_device(d_out)
|
|
||||||
h_out_a = cuda.pagelocked_empty((height, width, 4), f32)
|
|
||||||
h_out_b = cuda.pagelocked_empty((height, width, 4), f32)
|
|
||||||
|
|
||||||
if self.acc_mode == 'deferred':
|
|
||||||
# Having a fixed, power-of-two log size makes things much easier
|
|
||||||
log_size = 64 << 20
|
|
||||||
d_log = cuda.mem_alloc(log_size * 4)
|
|
||||||
d_log_sorted = cuda.mem_alloc(log_size * 4)
|
|
||||||
sorter = sort.Sorter(log_size)
|
|
||||||
# We need to cover each unique tag - address bits 20-23 - with one
|
|
||||||
# write block per sort bin. Or somethinig like that.
|
|
||||||
nwriteblocks = int(np.ceil(nbins / float(1<<20))) * 256
|
|
||||||
|
|
||||||
# Calculate 'nslots', the number of simultaneous running threads that
|
|
||||||
# can be active on the GPU during iteration (and thus the number of
|
|
||||||
# slots for loading and storing RNG and point context that will be
|
|
||||||
# prepared on the device), and derive 'rb_size', the number of blocks in
|
|
||||||
# 'nslots'.
|
|
||||||
iter_threads_per_block = 256
|
|
||||||
dev_data = pycuda.tools.DeviceData()
|
|
||||||
occupancy = pycuda.tools.OccupancyRecord(
|
|
||||||
dev_data, iter_threads_per_block,
|
|
||||||
iter_fun.shared_size_bytes, iter_fun.num_regs)
|
|
||||||
nsms = cuda.Context.get_device().multiprocessor_count
|
|
||||||
rb_size = occupancy.warps_per_mp * nsms / (iter_threads_per_block / 32)
|
|
||||||
nslots = iter_threads_per_block * rb_size
|
|
||||||
|
|
||||||
# Reset the ringbuffer info for the slots
|
|
||||||
reset_rb_fun(np.int32(rb_size), block=(1,1,1))
|
|
||||||
|
|
||||||
d_points = cuda.mem_alloc(nslots * 16)
|
|
||||||
# This statement may add extra seeds to simplify palette dithering.
|
|
||||||
seeds = mwc.MWC.make_seeds(max(nslots, 256 * self.palette_height))
|
|
||||||
d_seeds = cuda.to_device(seeds)
|
|
||||||
|
|
||||||
# We used to auto-calculate this to a multiple of the number of SMs on
|
|
||||||
# the device, but since we now use shorter launches and, to a certain
|
|
||||||
# extent, allow simultaneous occupancy, that's not as important. The
|
|
||||||
# 1024 is a magic constant to ensure reasonable and power-of-two log
|
|
||||||
# size for deferred: 256MB / (4B * FUSE * NTHREADS). Enhancements to
|
|
||||||
# the sort engine are needed to make this more flexible.
|
|
||||||
ntemporal_samples = 1024
|
|
||||||
genome_times, genome_knots = self._iter.packer.pack()
|
|
||||||
d_genome_times = cuda.to_device(genome_times)
|
|
||||||
d_genome_knots = cuda.to_device(genome_knots)
|
|
||||||
info_size = 4 * len(self._iter.packer) * ntemporal_samples
|
|
||||||
d_infos = cuda.mem_alloc(info_size)
|
|
||||||
|
|
||||||
ptimes, pidxs = zip(*genome.palette_times)
|
|
||||||
palint_times = np.empty(len(genome_times[0]), f32)
|
|
||||||
palint_times.fill(1e10)
|
|
||||||
palint_times[:len(ptimes)] = ptimes
|
|
||||||
d_palint_times = cuda.to_device(palint_times)
|
|
||||||
pvals = self.pal.prepare([genome.decoded_palettes[i] for i in pidxs])
|
|
||||||
d_palint_vals = cuda.to_device(np.concatenate(pvals))
|
|
||||||
|
|
||||||
if self.acc_mode in ('deferred', 'atomic'):
|
|
||||||
palette_fun = self.mod.get_function("interp_palette_flat")
|
|
||||||
dsc = util.argset(cuda.ArrayDescriptor3D(),
|
|
||||||
height=self.palette_height, width=256, depth=0,
|
|
||||||
format=cuda.array_format.SIGNED_INT32,
|
format=cuda.array_format.SIGNED_INT32,
|
||||||
num_channels=2, flags=cuda.array3d_flags.SURFACE_LDST)
|
num_channels=2, flags=cuda.array3d_flags.SURFACE_LDST)
|
||||||
palarray = cuda.Array(dsc)
|
self.d_pal_array = cuda.Array(self.palette_surf_dsc)
|
||||||
|
|
||||||
|
class Renderer(object):
|
||||||
|
def __init__(self, gnm):
|
||||||
|
self.packer, self.lib = iter.mkiterlib(gnm)
|
||||||
|
cubin = util.compile('iter', assemble_code(self.lib))
|
||||||
|
self.mod = cuda.module_from_buffer(cubin)
|
||||||
|
|
||||||
|
# TODO: make these customizable
|
||||||
|
self.filts = [ filters.Bilateral()
|
||||||
|
, filters.Logscale()
|
||||||
|
, filters.ColorClip() ]
|
||||||
|
self.out = output.PILOutput()
|
||||||
|
|
||||||
|
class RenderManager(ClsMod):
|
||||||
|
lib = devlib(deps=[interp.palintlib, filldptrlib, iter.flushatomlib])
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
super(RenderManager, self).__init__()
|
||||||
|
self.pool = pycuda.tools.PageLockedMemoryPool()
|
||||||
|
|
||||||
|
self.fb = Framebuffers()
|
||||||
|
self.src_a, self.src_b = DevSrc(), DevSrc()
|
||||||
|
self.info_a, self.info_b = DevInfo(), DevInfo()
|
||||||
|
self.stream_a, self.stream_b = cuda.Stream(), cuda.Stream()
|
||||||
|
self.filt_evt = self.copy_evt = None
|
||||||
|
|
||||||
|
def _copy(self, rdr, gnm):
|
||||||
|
"""
|
||||||
|
Queue a copy of a host genome into a set of device interpolation source
|
||||||
|
buffers.
|
||||||
|
|
||||||
|
Note that for now, this is broken! It ignores ``gnm``, and only packs
|
||||||
|
the genome that was used when creating the renderer.
|
||||||
|
"""
|
||||||
|
times, knots = rdr.packer.pack(self.pool)
|
||||||
|
cuda.memcpy_htod_async(self.src_a.d_times, times, self.stream_a)
|
||||||
|
cuda.memcpy_htod_async(self.src_a.d_knots, knots, self.stream_a)
|
||||||
|
|
||||||
|
ptimes, pidxs = zip(*gnm.palette_times)
|
||||||
|
palettes = self.pool.allocate((len(ptimes), 256, 4), f32)
|
||||||
|
palettes[:] = [gnm.decoded_palettes[i] for i in pidxs]
|
||||||
|
palette_times = self.pool.allocate((self.src_a.max_knots,), f32)
|
||||||
|
palette_times.fill(1e9)
|
||||||
|
palette_times[:len(ptimes)] = ptimes
|
||||||
|
cuda.memcpy_htod_async(self.src_a.d_pals, palettes, self.stream_a)
|
||||||
|
cuda.memcpy_htod_async(self.src_a.d_ptimes, palette_times,
|
||||||
|
self.stream_a)
|
||||||
|
|
||||||
|
# TODO: use bilerp tex as src for palette interp
|
||||||
|
|
||||||
|
def _interp(self, rdr, gnm, dim, ts, td):
|
||||||
|
d_acc_size = rdr.mod.get_global('acc_size')[0]
|
||||||
|
p_dim = self.pool.allocate((len(dim),), u32)
|
||||||
|
p_dim[:] = dim
|
||||||
|
cuda.memcpy_htod_async(d_acc_size, p_dim, self.stream_a)
|
||||||
|
|
||||||
tref = self.mod.get_surfref('flatpal')
|
tref = self.mod.get_surfref('flatpal')
|
||||||
tref.set_array(palarray, 0)
|
tref.set_array(self.info_a.d_pal_array, 0)
|
||||||
else:
|
launch('interp_palette_flat', self.mod, self.stream_a,
|
||||||
palette_fun = self.mod.get_function("interp_palette")
|
256, self.info_a.palette_height,
|
||||||
dsc = util.argset(cuda.ArrayDescriptor(),
|
self.fb.d_rb, self.fb.d_seeds,
|
||||||
height=self.palette_height, width=256,
|
self.src_a.d_ptimes, self.src_a.d_pals,
|
||||||
format=cuda.array_format.UNSIGNED_INT8,
|
f32(ts), f32(td / self.info_a.palette_height))
|
||||||
num_channels=4)
|
|
||||||
d_palmem = cuda.mem_alloc(256 * self.palette_height * 4)
|
|
||||||
|
|
||||||
tref = self.mod.get_texref('palTex')
|
nts = self.info_a.ntemporal_samples
|
||||||
tref.set_address_2d(d_palmem, dsc, 1024)
|
launch('interp_iter_params', rdr.mod, self.stream_a,
|
||||||
tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
|
256, np.ceil(nts / 256.),
|
||||||
tref.set_filter_mode(cuda.filter_mode.LINEAR)
|
self.info_a.d_params, self.src_a.d_times, self.src_a.d_knots,
|
||||||
|
f32(ts), f32(td / nts), i32(nts))
|
||||||
|
|
||||||
while next_frame is not None:
|
def _print_interp_knots(self, rdr, tsidx=5):
|
||||||
# tc, td, ts, te: central, delta, start, end times
|
infos = cuda.from_device_like(self.info_a.d_params,
|
||||||
idx, tc = next_frame
|
(tsidx + 1, self.info_a.max_params), f32)
|
||||||
td = genome.adj_frame_width(tc)
|
for i, n in zip(infos[-1], rdr.packer.packed):
|
||||||
|
print '%60s %g' % ('_'.join(n), i)
|
||||||
|
|
||||||
|
def _iter(self, rdr, gnm, dim, tc):
|
||||||
|
tref = rdr.mod.get_surfref('flatpal')
|
||||||
|
tref.set_array(self.info_a.d_pal_array, 0)
|
||||||
|
|
||||||
|
nbins = dim.ah * dim.astride
|
||||||
|
fill = lambda b, s, v=i32(0): util.fill_dptr(
|
||||||
|
self.mod, b, s, stream=self.stream_a, value=v)
|
||||||
|
fill(self.fb.d_front, 4 * nbins)
|
||||||
|
fill(self.fb.d_side, 2 * nbins)
|
||||||
|
fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan))
|
||||||
|
|
||||||
|
nts = self.info_a.ntemporal_samples
|
||||||
|
nsamps = (gnm.spp(tc) * dim.w * dim.h)
|
||||||
|
nrounds = int(nsamps / (nts * 256. * 256)) + 1
|
||||||
|
launch('iter', rdr.mod, self.stream_a, (32, 8, 1), (nts, nrounds),
|
||||||
|
self.fb.d_front, self.fb.d_side,
|
||||||
|
self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
|
||||||
|
self.info_a.d_params)
|
||||||
|
|
||||||
|
nblocks = int(np.ceil(np.sqrt(dim.ah*dim.astride/256.)))
|
||||||
|
launch('flush_atom', self.mod, self.stream_a,
|
||||||
|
256, (nblocks, nblocks),
|
||||||
|
u64(self.fb.d_front), u64(self.fb.d_side), i32(nbins))
|
||||||
|
|
||||||
|
def queue_frame(self, rdr, gnm, tc, w, h, copy=True):
|
||||||
|
"""
|
||||||
|
Queue one frame for rendering.
|
||||||
|
|
||||||
|
``rdr`` is a compiled Renderer module. Caller must ensure that the
|
||||||
|
module is compatible with the genome data provided.
|
||||||
|
|
||||||
|
``gnm`` is the genome to be rendered.
|
||||||
|
|
||||||
|
``tc`` is the center time at which to render.
|
||||||
|
|
||||||
|
``w``, ``h`` are the width and height of the desired output in px.
|
||||||
|
|
||||||
|
If ``copy`` is False, the genome data will not be recopied for each
|
||||||
|
new genome. This function must be called with ``copy=True`` the first
|
||||||
|
time a new genome is used, and may be called in that manner
|
||||||
|
subsequently without harm. I suspect the performance impact is low, so
|
||||||
|
leave ``copy`` to True every time for now.
|
||||||
|
|
||||||
|
The return value is a 2-tuple ``(evt, h_out)``, where ``evt`` is a
|
||||||
|
CUDA event and ``h_out`` is the return value of the output module's
|
||||||
|
``copy`` function. In the typical case, ``h_out`` will be a host
|
||||||
|
allocation containing data in an appropriate format for the output
|
||||||
|
module's file writer, and ``evt`` indicates when the asynchronous
|
||||||
|
DMA copy which will populate ``h_out`` is complete. This can vary
|
||||||
|
depending on the output module in use, though.
|
||||||
|
"""
|
||||||
|
# Note: we synchronize on the previous stream if buffers need to be
|
||||||
|
# reallocated, which implicitly also syncs the current stream.
|
||||||
|
dim = self.fb.set_dim(w, h, self.stream_b)
|
||||||
|
|
||||||
|
td = gnm.adj_frame_width(tc)
|
||||||
ts, te = tc - 0.5 * td, tc + 0.5 * td
|
ts, te = tc - 0.5 * td, tc + 0.5 * td
|
||||||
|
|
||||||
if self.acc_mode in ('deferred', 'atomic'):
|
# The stream interleaving here is nontrivial.
|
||||||
# In this mode, the palette writes to a surface reference, but
|
# TODO: update diagram and link to it here
|
||||||
# requires dithering, so we pass it the seeds instead
|
if copy:
|
||||||
arg0 = d_seeds
|
self.src_a, self.src_b = self.src_b, self.src_a
|
||||||
else:
|
self._copy(rdr, gnm)
|
||||||
arg0 = d_palmem
|
self._interp(rdr, gnm, dim, ts, td)
|
||||||
palette_fun(arg0, d_palint_times, d_palint_vals,
|
if self.filt_evt:
|
||||||
f32(ts), f32(td / self.palette_height),
|
self.stream_a.wait_for_event(self.filt_evt)
|
||||||
block=(256,1,1), grid=(self.palette_height,1),
|
self._iter(rdr, gnm, dim, tc)
|
||||||
stream=write_stream)
|
if self.copy_evt:
|
||||||
|
self.stream_a.wait_for_event(self.copy_evt)
|
||||||
|
for filt in rdr.filts:
|
||||||
|
filt.apply(self.fb, gnm, dim, tc, self.stream_a)
|
||||||
|
rdr.out.convert(self.fb, gnm, dim, self.stream_a)
|
||||||
|
self.filt_evt = cuda.Event().record(self.stream_a)
|
||||||
|
h_out = rdr.out.copy(self.fb, dim, self.pool, self.stream_a)
|
||||||
|
self.copy_evt = cuda.Event().record(self.stream_a)
|
||||||
|
|
||||||
packer_fun(d_infos, d_genome_times, d_genome_knots,
|
self.info_a, self.info_b = self.info_b, self.info_a
|
||||||
f32(ts), f32(td / ntemporal_samples),
|
self.stream_a, self.stream_b = self.stream_b, self.stream_a
|
||||||
i32(ntemporal_samples), block=(256,1,1),
|
return self.copy_evt, h_out
|
||||||
grid=(int(np.ceil(ntemporal_samples/256.)),1),
|
|
||||||
stream=iter_stream)
|
|
||||||
|
|
||||||
# Reset points so that they will be FUSEd
|
|
||||||
util.BaseCode.fill_dptr(self.mod, d_points, 4 * nslots,
|
|
||||||
iter_stream, f32(np.nan))
|
|
||||||
|
|
||||||
# Get interpolated control points for debugging
|
|
||||||
#iter_stream.synchronize()
|
|
||||||
#d_temp = cuda.from_device(d_infos,
|
|
||||||
#(ntemporal_samples, len(self._iter.packer)), f32)
|
|
||||||
#for i, n in zip(d_temp[5], self._iter.packer.packed):
|
|
||||||
#print '%60s %g' % ('_'.join(n), i)
|
|
||||||
|
|
||||||
util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins * nxf,
|
|
||||||
write_stream)
|
|
||||||
if self.acc_mode == 'atomic':
|
|
||||||
util.BaseCode.fill_dptr(self.mod, d_atom, 2 * nbins * nxf,
|
|
||||||
write_stream)
|
|
||||||
nrounds = int( (genome.spp(tc) * width * height)
|
|
||||||
/ (ntemporal_samples * 256 * 256) ) + 1
|
|
||||||
if self.acc_mode == 'deferred':
|
|
||||||
for i in range(nrounds):
|
|
||||||
iter_fun(np.uint64(d_log), d_seeds, d_points, d_infos,
|
|
||||||
block=(32, self._iter.NTHREADS/32, 1),
|
|
||||||
grid=(ntemporal_samples, 1), stream=iter_stream)
|
|
||||||
_sync_stream(write_stream, iter_stream)
|
|
||||||
sorter.sort(d_log_sorted, d_log, log_size, 3, True,
|
|
||||||
stream=write_stream)
|
|
||||||
_sync_stream(iter_stream, write_stream)
|
|
||||||
write_fun(d_accum, d_log_sorted, sorter.dglobal, i32(nbins),
|
|
||||||
block=(1024, 1, 1), grid=(nwriteblocks, 1),
|
|
||||||
stream=write_stream)
|
|
||||||
else:
|
|
||||||
args = [u64(d_accum), d_seeds, d_points, d_infos]
|
|
||||||
if self.acc_mode == 'atomic':
|
|
||||||
args.append(u64(d_atom))
|
|
||||||
iter_fun(*args, block=(32, self._iter.NTHREADS/32, 1),
|
|
||||||
grid=(ntemporal_samples, nrounds), stream=iter_stream)
|
|
||||||
if self.acc_mode == 'atomic':
|
|
||||||
nblocks = int(np.ceil(np.sqrt(nbins*nxf/float(512))))
|
|
||||||
flush_fun(u64(d_accum), u64(d_atom), i32(nbins*nxf),
|
|
||||||
block=(512, 1, 1), grid=(nblocks, nblocks),
|
|
||||||
stream=iter_stream)
|
|
||||||
|
|
||||||
util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
|
|
||||||
_sync_stream(filt_stream, write_stream)
|
|
||||||
filt.de(d_out, d_accum, d_atom, genome, dim, tc, nxf,
|
|
||||||
stream=filt_stream)
|
|
||||||
_sync_stream(write_stream, filt_stream)
|
|
||||||
filt.colorclip(d_out, genome, dim, tc, blend, stream=filt_stream)
|
|
||||||
obuf_copy.set_dst_host(h_out_a)
|
|
||||||
obuf_copy(filt_stream)
|
|
||||||
|
|
||||||
if event_b:
|
|
||||||
while not event_a.query():
|
|
||||||
timemod.sleep(0.01)
|
|
||||||
gpu_time = event_a.time_since(event_b)
|
|
||||||
result = RenderedImage(h_out_b, last_idx, gpu_time)
|
|
||||||
else:
|
|
||||||
result = None
|
|
||||||
last_idx = idx
|
|
||||||
|
|
||||||
event_a, event_b = cuda.Event().record(filt_stream), event_a
|
|
||||||
h_out_a, h_out_b = h_out_b, h_out_a
|
|
||||||
|
|
||||||
# TODO: add ability to flush a frame without breaking the pipe
|
|
||||||
next_frame = yield result
|
|
||||||
|
|
||||||
while not event_a.query():
|
|
||||||
timemod.sleep(0.001)
|
|
||||||
gpu_time = event_a.time_since(event_b)
|
|
||||||
yield RenderedImage(h_out_b, last_idx, gpu_time)
|
|
||||||
|
|
||||||
|
def render(self, gnm, times, w, h):
|
||||||
|
"""
|
||||||
|
A port of the old rendering function, retained for backwards
|
||||||
|
compatibility. Some of this will be pulled into as-yet-undecided
|
||||||
|
methods for more DRY.
|
||||||
|
"""
|
||||||
|
rdr = Renderer(gnm)
|
||||||
|
last_evt = cuda.Event().record(self.stream_a)
|
||||||
|
last_idx = None
|
||||||
|
def wait(): # Times like these where you wish for a macro
|
||||||
|
while not last_evt.query():
|
||||||
|
time.sleep(0.01)
|
||||||
|
gpu_time = last_evt.time_since(two_evts_ago)
|
||||||
|
return RenderedImage(h_buf, idx, gpu_time)
|
||||||
|
for idx, tc in times:
|
||||||
|
evt, h_buf = self.queue_frame(rdr, gnm, tc, w, h, last_idx is None)
|
||||||
|
if last_idx:
|
||||||
|
yield wait()
|
||||||
|
two_evts_ago, last_evt = last_evt, evt
|
||||||
|
last_buf, last_idx = h_buf, idx
|
||||||
|
if last_idx:
|
||||||
|
yield wait()
|
||||||
|
37
main.py
37
main.py
@ -3,8 +3,8 @@
|
|||||||
# cuburn, one of a surprisingly large number of ports of the fractal flame
|
# cuburn, one of a surprisingly large number of ports of the fractal flame
|
||||||
# algorithm to NVIDIA GPUs.
|
# algorithm to NVIDIA GPUs.
|
||||||
#
|
#
|
||||||
# This one is copyright 2010-2011, Steven Robertson <steven@strobe.cc>
|
# This one is copyright 2010-2012, Steven Robertson <steven@strobe.cc>
|
||||||
# and Eric Reckase <e.reckase@gmail.com>.
|
# and Erik Reckase <e.reckase@gmail.com>.
|
||||||
#
|
#
|
||||||
# This program is free software; you can redistribute it and/or modify
|
# This program is free software; you can redistribute it and/or modify
|
||||||
# it under the terms of the GNU General Public License version 2 or later
|
# it under the terms of the GNU General Public License version 2 or later
|
||||||
@ -20,11 +20,9 @@ from subprocess import Popen
|
|||||||
from itertools import ifilter
|
from itertools import ifilter
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import Image
|
|
||||||
import scipy
|
|
||||||
import pycuda.driver as cuda
|
import pycuda.driver as cuda
|
||||||
|
|
||||||
from cuburn import genome, render
|
from cuburn import genome, render, filters, output
|
||||||
|
|
||||||
profiles = {
|
profiles = {
|
||||||
'1080p': dict(fps=24, width=1920, height=1080, quality=3000, skip=0),
|
'1080p': dict(fps=24, width=1920, height=1080, quality=3000, skip=0),
|
||||||
@ -33,11 +31,10 @@ profiles = {
|
|||||||
'preview': dict(fps=24, width=640, height=360, quality=800, skip=1)
|
'preview': dict(fps=24, width=640, height=360, quality=800, skip=1)
|
||||||
}
|
}
|
||||||
|
|
||||||
def save(rframe):
|
def save(out):
|
||||||
noalpha = rframe.buf[:,:,:3]
|
# Temporary! TODO: fix this
|
||||||
img = scipy.misc.toimage(noalpha, cmin=0, cmax=1)
|
output.PILOutput.save(out.buf, out.idx)
|
||||||
img.save(rframe.idx, quality=98)
|
print out.idx, out.gpu_time
|
||||||
print rframe.idx, rframe.gpu_time
|
|
||||||
|
|
||||||
def main(args, prof):
|
def main(args, prof):
|
||||||
import pycuda.autoinit
|
import pycuda.autoinit
|
||||||
@ -53,9 +50,7 @@ def main(args, prof):
|
|||||||
gnm = genome.Genome(gnm)
|
gnm = genome.Genome(gnm)
|
||||||
err, times = gnm.set_profile(prof)
|
err, times = gnm.set_profile(prof)
|
||||||
|
|
||||||
anim = render.Renderer()
|
rmgr = render.RenderManager()
|
||||||
anim.compile(gnm, keep=args.keep)
|
|
||||||
anim.load(gnm)
|
|
||||||
|
|
||||||
basename = os.path.basename(args.flame.name).rsplit('.', 1)[0] + '_'
|
basename = os.path.basename(args.flame.name).rsplit('.', 1)[0] + '_'
|
||||||
if args.flame.name == '-':
|
if args.flame.name == '-':
|
||||||
@ -76,9 +71,13 @@ def main(args, prof):
|
|||||||
if not os.path.isfile(f[0]) or m > os.path.getmtime(f[0]))
|
if not os.path.isfile(f[0]) or m > os.path.getmtime(f[0]))
|
||||||
|
|
||||||
w, h = prof['width'], prof['height']
|
w, h = prof['width'], prof['height']
|
||||||
gen = anim.render(gnm, frames, w, h)
|
gen = rmgr.render(gnm, frames, w, h)
|
||||||
|
|
||||||
|
if not args.gfx:
|
||||||
|
for out in gen:
|
||||||
|
save(out)
|
||||||
|
return
|
||||||
|
|
||||||
if args.gfx:
|
|
||||||
import pyglet
|
import pyglet
|
||||||
window = pyglet.window.Window(w, h, vsync=False)
|
window = pyglet.window.Window(w, h, vsync=False)
|
||||||
image = pyglet.image.CheckerImagePattern().create_image(w, h)
|
image = pyglet.image.CheckerImagePattern().create_image(w, h)
|
||||||
@ -126,11 +125,7 @@ def main(args, prof):
|
|||||||
pyglet.clock.set_fps_limit(30)
|
pyglet.clock.set_fps_limit(30)
|
||||||
pyglet.clock.schedule_interval(poll, 1/30.)
|
pyglet.clock.schedule_interval(poll, 1/30.)
|
||||||
pyglet.app.run()
|
pyglet.app.run()
|
||||||
else:
|
|
||||||
for out in gen:
|
|
||||||
save(out)
|
|
||||||
if args.sync:
|
|
||||||
cuda.Context.synchronize()
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser(description='Render fractal flames.')
|
parser = argparse.ArgumentParser(description='Render fractal flames.')
|
||||||
@ -150,8 +145,6 @@ if __name__ == "__main__":
|
|||||||
parser.add_argument('--pause', action='store_true',
|
parser.add_argument('--pause', action='store_true',
|
||||||
help="Don't close the preview window after rendering is finished")
|
help="Don't close the preview window after rendering is finished")
|
||||||
|
|
||||||
parser.add_argument('--keep', action='store_true', dest='keep',
|
|
||||||
help='Keep compilation directory (disables kernel caching)')
|
|
||||||
parser.add_argument('--sync', action='store_true', dest='sync',
|
parser.add_argument('--sync', action='store_true', dest='sync',
|
||||||
help='Use synchronous launches whenever possible')
|
help='Use synchronous launches whenever possible')
|
||||||
|
|
||||||
|
66
worker.py
66
worker.py
@ -17,6 +17,8 @@ import redis
|
|||||||
|
|
||||||
from cuburn import render, genome
|
from cuburn import render, genome
|
||||||
|
|
||||||
|
import pycuda.driver as cuda
|
||||||
|
|
||||||
pycuda = None
|
pycuda = None
|
||||||
|
|
||||||
# The default maximum number of waiting jobs. Also used to determine when a
|
# The default maximum number of waiting jobs. Also used to determine when a
|
||||||
@ -50,20 +52,6 @@ def get_temperature():
|
|||||||
return out[idx+1:idx+3]
|
return out[idx+1:idx+3]
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
def push_frame(r, out):
|
|
||||||
if out is None:
|
|
||||||
return
|
|
||||||
sid, sidx, ftag = out.idx
|
|
||||||
# TODO: gotta put this in a module somewhere and make it adjustable
|
|
||||||
noalpha = out.buf[:,:,:3]
|
|
||||||
img = scipy.misc.toimage(noalpha, cmin=0, cmax=1)
|
|
||||||
buf = StringIO()
|
|
||||||
img.save(buf, 'jpeg', quality=98)
|
|
||||||
buf.seek(0)
|
|
||||||
head = ' '.join([sidx, str(out.gpu_time), ftag])
|
|
||||||
r.rpush(sid + ':queue', head + '\0' + buf.read())
|
|
||||||
print 'Pushed frame: %s' % head
|
|
||||||
|
|
||||||
def work(server):
|
def work(server):
|
||||||
global pycuda
|
global pycuda
|
||||||
import pycuda.autoinit
|
import pycuda.autoinit
|
||||||
@ -80,7 +68,11 @@ def work(server):
|
|||||||
r.expire(wid, 180)
|
r.expire(wid, 180)
|
||||||
last_ping = time.time()
|
last_ping = time.time()
|
||||||
|
|
||||||
last_pid, last_gid, riter = None, None, None
|
idx = evt = buf = None
|
||||||
|
last_idx = last_buf = last_evt = two_evts_ago = None
|
||||||
|
last_pid = last_gid = rdr = None
|
||||||
|
|
||||||
|
mgr = render.RenderManager()
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
task = r.blpop('renderpool:' + rev + ':queue', 10)
|
task = r.blpop('renderpool:' + rev + ':queue', 10)
|
||||||
@ -90,28 +82,46 @@ def work(server):
|
|||||||
r.expire(wid, 180)
|
r.expire(wid, 180)
|
||||||
last_ping = now
|
last_ping = now
|
||||||
|
|
||||||
|
# last_evt will be populated during normal queue operation (when evt
|
||||||
|
# contains the most recent event), as well as when the render queue is
|
||||||
|
# flushing due to not receiving a new task before the timeout.
|
||||||
|
if last_idx is not None:
|
||||||
|
while not last_evt.query():
|
||||||
|
# This delay could probably be a lot higher with zero impact
|
||||||
|
# on throughput for Fermi cards
|
||||||
|
time.sleep(0.05)
|
||||||
|
|
||||||
|
sid, sidx, ftag = last_idx
|
||||||
|
obuf = StringIO()
|
||||||
|
rdr.out.save(buf, obuf, 'jpeg')
|
||||||
|
obuf.seek(0)
|
||||||
|
gpu_time = last_evt.time_since(two_evts_ago)
|
||||||
|
head = ' '.join([sidx, str(gpu_time), ftag])
|
||||||
|
r.rpush(sid + ':queue', head + '\0' + obuf.read())
|
||||||
|
print 'Pushed frame: %s' % head
|
||||||
|
|
||||||
|
two_evts_ago, last_evt = last_evt, evt
|
||||||
|
last_idx, last_buf = idx, buf
|
||||||
|
|
||||||
if not task:
|
if not task:
|
||||||
if riter:
|
idx = evt = buf = None
|
||||||
push_frame(r, riter.send(None))
|
|
||||||
riter = None
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
sid, sidx, pid, gid, ftime, ftag = task[1].split(' ', 5)
|
sid, sidx, pid, gid, ftime, ftag = task[1].split(' ', 5)
|
||||||
if pid != last_pid or gid != last_gid or not riter:
|
if pid != last_pid or gid != last_gid or not rdr:
|
||||||
gnm = genome.Genome(json.loads(r.get(gid)))
|
gnm = genome.Genome(json.loads(r.get(gid)))
|
||||||
prof = json.loads(r.get(pid))
|
prof = json.loads(r.get(pid))
|
||||||
gnm.set_profile(prof)
|
gnm.set_profile(prof)
|
||||||
renderer = render.Renderer()
|
rdr = render.Renderer(gnm)
|
||||||
renderer.load(gnm)
|
|
||||||
|
|
||||||
if riter:
|
if last_evt is None:
|
||||||
push_frame(r, riter.send(None))
|
# Create a dummy event for timing
|
||||||
|
last_evt = cuda.Event().record(mgr.stream_a)
|
||||||
|
|
||||||
riter = renderer.render_gen(gnm, prof['width'], prof['height'])
|
copy = last_idx is None
|
||||||
next(riter)
|
w, h = prof['width'], prof['height']
|
||||||
last_pid, last_gid = pid, gid
|
evt, buf = mgr.queue_frame(rdr, gnm, float(ftime), w, h, copy)
|
||||||
|
idx = sid, sidx, ftag
|
||||||
push_frame(r, riter.send(((sid, sidx, ftag), float(ftime))))
|
|
||||||
|
|
||||||
def iter_genomes(prof, gpaths, pname='540p'):
|
def iter_genomes(prof, gpaths, pname='540p'):
|
||||||
"""
|
"""
|
||||||
|
Loading…
Reference in New Issue
Block a user