Sweeping refactor. More bugs undoubtedly remain.

Steven Robertson 2012-02-14 07:40:58 -08:00
parent 4cfd328f85
commit 60a45c9a20
13 changed files with 1170 additions and 1299 deletions

@@ -1,9 +0,0 @@
-"""
-Contains the PTX fragments which will drive the device, and helper functions
-to combine those fragments.
-"""
-import util
-import mwc
-import iter

cuburn/code/color.py (new file, +77 lines)

@@ -0,0 +1,77 @@
import numpy as np
from util import devlib
# The JPEG YUV full-range matrix, without bias into the positive regime.
# This assumes input color space is CIERGB D65, encoded with gamma 2.2.
# Note that some interpolated colors may exceed the sRGB and YUV gamuts.
YUV_MATRIX = np.matrix([[ 0.299, 0.587, 0.114],
[-0.168736, -0.331264, 0.5],
[ 0.5, -0.418688, -0.081312]])
yuvlib = devlib(decls='''
__device__ float3 rgb2yuv(float3 rgb);
__device__ float3 yuv2rgb(float3 yuv);
''', defs=r'''
/* This conversion uses the JPEG full-range standard. Note that UV have range
* [-0.5, 0.5], so consider biasing the results. */
__device__ float3 rgb2yuv(float3 rgb) {
return make_float3(
0.299f * rgb.x + 0.587f * rgb.y + 0.114f * rgb.z,
-0.168736f * rgb.x - 0.331264f * rgb.y + 0.5f * rgb.z,
0.5f * rgb.x - 0.418688f * rgb.y - 0.081312f * rgb.z);
}
__device__ float3 yuv2rgb(float3 yuv) {
return make_float3(
yuv.x + 1.402f * yuv.z,
yuv.x - 0.34414f * yuv.y - 0.71414f * yuv.z,
yuv.x + 1.772f * yuv.y);
}
''')
hsvlib = devlib(decls='''
__device__ float3 rgb2hsv(float3 rgb);
__device__ float3 hsv2rgb(float3 hsv);
''', defs=r'''
__device__ float3 rgb2hsv(float3 rgb) {
float M = fmaxf(fmaxf(rgb.x, rgb.y), rgb.z);
float m = fminf(fminf(rgb.x, rgb.y), rgb.z);
float C = M - m;
float s = M > 0.0f ? C / M : 0.0f;
float h = 0.0f;
if (s != 0.0f) {
C = 1.0f / C;
float rc = (M - rgb.x) * C;
float gc = (M - rgb.y) * C;
float bc = (M - rgb.z) * C;
if (rgb.x == M) h = bc - gc;
else if (rgb.y == M) h = 2 + rc - bc;
else h = 4 + gc - rc;
if (h < 0) h += 6.0f;
}
return make_float3(h, s, M);
}
__device__ float3 hsv2rgb(float3 hsv) {
float whole = floorf(hsv.x);
float frac = hsv.x - whole;
float val = hsv.z;
float min = val * (1 - hsv.y);
float mid = val * (1 - (hsv.y * frac));
float alt = val * (1 - (hsv.y * (1 - frac)));
float3 out;
if (whole == 0.0f) { out.x = val; out.y = alt; out.z = min; }
else if (whole == 1.0f) { out.x = mid; out.y = val; out.z = min; }
else if (whole == 2.0f) { out.x = min; out.y = val; out.z = alt; }
else if (whole == 3.0f) { out.x = min; out.y = mid; out.z = val; }
else if (whole == 4.0f) { out.x = alt; out.y = min; out.z = val; }
else { out.x = val; out.y = min; out.z = mid; }
return out;
}
''')
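
A quick host-side sanity check (an illustration, not part of the commit): under the full-range JPEG matrix above, pure white should map to Y=1 with zero chroma, since the first row sums to one and the other two rows sum to zero.

    if __name__ == '__main__':
        # White maps to Y=1, U=V=0; the device rgb2yuv() uses the same
        # coefficients as YUV_MATRIX.
        print YUV_MATRIX * np.matrix([[1.0], [1.0], [1.0]])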

@@ -1,13 +1,7 @@
-import numpy as np
-from numpy import float32 as f32, int32 as i32
-import pycuda.driver as cuda
-import pycuda.compiler
-from pycuda.gpuarray import vec
-from cuburn.code.util import *
-
-_CODE = r'''
+from util import devlib
+from color import yuvlib
+
+texshearlib = devlib(defs=r'''
 // Filter directions specified in degrees, using image/texture addressing
 // [(0,0) is upper left corner, 90 degrees is down].
@@ -22,7 +16,9 @@ __constant__ float2 addressing_patterns[16] = {
     { 1.0f, -0.333333f}, { 0.333333f,  1.0f}, // 14, 15: -15, 75
 };

-// Mon dieu! A C++ feature?
+// Mon dieu! A C++ feature? Gotta close the "extern C" added by the compiler.
+}
 template <typename T> __device__ T
 tex_shear(texture<T, cudaTextureType2D> ref, int pattern,
           float x, float y, float radius) {
@@ -39,100 +35,14 @@ tex_shear(texture<T, cudaTextureType2D> ref, int pattern,
 }

 extern "C" {
+''')

-__global__
-void colorclip(float4 *pixbuf, float gamma, float vibrance, float highpow,
-               float linrange, float lingam, float3 bkgd,
-               int fbsize, int blend_background_color) {
-    int i = threadIdx.x + blockDim.x * (blockIdx.x + gridDim.x * blockIdx.y);
-    if (i >= fbsize) return;
-
-    float4 pix = pixbuf[i];
-
-    if (pix.w <= 0) {
-        pixbuf[i] = make_float4(bkgd.x, bkgd.y, bkgd.z, 0.0f);
-        return;
-    }
-    pix.y -= 0.5f * pix.w;
-    pix.z -= 0.5f * pix.w;
-    float3 tmp = yuv2rgb(make_float3(pix.x, pix.y, pix.z));
-    pix.x = tmp.x;
-    pix.y = tmp.y;
-    pix.z = tmp.z;
-
-    pix.x = fmaxf(0.0f, pix.x);
-    pix.y = fmaxf(0.0f, pix.y);
-    pix.z = fmaxf(0.0f, pix.z);
-
-    float4 opix = pix;
-
-    float alpha = powf(pix.w, gamma);
-    if (pix.w < linrange) {
-        float frac = pix.w / linrange;
-        alpha = (1.0f - frac) * pix.w * lingam + frac * alpha;
-    }
-
-    if (!blend_background_color) {
-        float ls = alpha / pix.w;
-        pix.x *= ls;
-        pix.y *= ls;
-        pix.z *= ls;
-        pix.w = alpha;
-        pixbuf[i] = pix;
-        return;
-    }
-
-    float ls = vibrance * alpha / pix.w;
-    alpha = fminf(1.0f, fmaxf(0.0f, alpha));
-
-    float maxc = fmaxf(pix.x, fmaxf(pix.y, pix.z));
-    float maxa = maxc * ls;
-    float newls = 1.0f / maxc;
-
-    if (maxa > 1.0f && highpow >= 0.0f) {
-        float lsratio = powf(newls / ls, highpow);
-        pix.x *= newls;
-        pix.y *= newls;
-        pix.z *= newls;
-
-        // Reduce saturation (according to the HSV model) by proportionally
-        // increasing the values of the other colors.
-        pix.x = maxc - (maxc - pix.x) * lsratio;
-        pix.y = maxc - (maxc - pix.y) * lsratio;
-        pix.z = maxc - (maxc - pix.z) * lsratio;
-    } else {
-        float adjhlp = -highpow;
-        if (adjhlp > 1.0f || maxa <= 1.0f) adjhlp = 1.0f;
-        if (maxc > 0.0f) {
-            float adj = ((1.0f - adjhlp) * newls + adjhlp * ls);
-            pix.x *= adj;
-            pix.y *= adj;
-            pix.z *= adj;
-        }
-    }
-
-    pix.x += (1.0f - vibrance) * powf(opix.x, gamma);
-    pix.y += (1.0f - vibrance) * powf(opix.y, gamma);
-    pix.z += (1.0f - vibrance) * powf(opix.z, gamma);
-
-    pix.x += (1.0f - alpha) * bkgd.x;
-    pix.y += (1.0f - alpha) * bkgd.y;
-    pix.z += (1.0f - alpha) * bkgd.z;
-
-    pix.x = fminf(1.0f, pix.x);
-    pix.y = fminf(1.0f, pix.y);
-    pix.z = fminf(1.0f, pix.z);
-    pix.w = alpha;
-
-    pixbuf[i] = pix;
-}
-
-__global__
-void logscale(float4 *outbuf, const float4 *pixbuf, float k1, float k2) {
+logscalelib = devlib(defs=r'''
+__global__ void
+logscale(float4 *outbuf, const float4 *pixbuf, float k1, float k2) {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
     float4 pix = pixbuf[i];

-    // float ls = fmaxf(0, k1 * logf(1.0f + pix.w * pix.w * k2 / (1 + pix.w)) / pix.w);
     float ls = fmaxf(0, k1 * logf(1.0f + pix.w * k2) / pix.w);
     pix.x *= ls;
     pix.y *= ls;
@@ -141,10 +51,12 @@ void logscale(float4 *outbuf, const float4 *pixbuf, float k1, float k2) {
     outbuf[i] = pix;
 }
+''')

+fmabuflib = devlib(defs=r'''
 // Element-wise computation of ``dst[i]=dst[i]+src[i]*scale``.
-__global__
-void fma_buf(float4 *dst, const float4 *src, int astride, float scale) {
+__global__ void
+fma_buf(float4 *dst, const float4 *src, int astride, float scale) {
     int x = blockIdx.x * blockDim.x + threadIdx.x;
     int y = blockIdx.y * blockDim.y + threadIdx.y;
     int i = y * astride + x;
@@ -155,23 +67,16 @@ void fma_buf(float4 *dst, const float4 *src, int astride, float scale) {
     d.w += s.w * scale;
     dst[i] = d;
 }
+''')

-texture<float4, cudaTextureType2D> bilateral_src;
+denblurlib = devlib(decls='''
 texture<float, cudaTextureType2D> blur_src;

-__global__ void logscale_den(float *dst, float k2) {
-    int xi = blockIdx.x * blockDim.x + threadIdx.x;
-    int yi = blockIdx.y * blockDim.y + threadIdx.y;
-    float4 pix = tex2D(bilateral_src, xi, yi);
-    float out = logf(1.0f + pix.w * k2);
-    dst[yi * (blockDim.x * gridDim.x) + xi] = out;
-}
-
 __constant__ float gauss_coefs[9] = {
     0.00443305f, 0.05400558f, 0.24203623f, 0.39905028f,
     0.24203623f, 0.05400558f, 0.00443305f
 };
+''', defs=r'''
 // Apply a Gaussian-esque blur to the density channel of the texture in
 // ``bilateral_src`` in the horizontal direction, and write it to ``dst``, a
 // one-channel buffer.
@@ -203,7 +108,11 @@ __global__ void den_blur_1c(float *dst, int pattern, int upsample) {
         * gauss_coefs[i];
     dst[yi * (blockDim.x * gridDim.x) + xi] = den;
 }
+''')

+bilaterallib = devlib(deps=[logscalelib, texshearlib, denblurlib], decls='''
+texture<float4, cudaTextureType2D> bilateral_src;
+''', defs=r'''
 /* sstd: spatial standard deviation (Gaussian filter)
  * cstd: color standard deviation (Gaussian on the range [0, 1], where 1
  *       represents an "opposite" color).
@@ -215,10 +124,9 @@ __global__ void den_blur_1c(float *dst, int pattern, int upsample) {
  * gspeed: Speed of (exp2f) Gompertz distribution governing how much to
  *         tighten gradients. Zero and negative values OK.
  */
-
-__global__
-void bilateral(float4 *dst, int pattern, int radius,
-               float sstd, float cstd, float dstd, float dpow, float gspeed)
+__global__ void
+bilateral(float4 *dst, int pattern, int radius,
+          float sstd, float cstd, float dstd, float dpow, float gspeed)
 {
     int xi = blockIdx.x * blockDim.x + threadIdx.x;
     int yi = blockIdx.y * blockDim.y + threadIdx.y;
@@ -317,124 +225,83 @@ void bilateral(float4 *dst, int pattern, int radius,
     const int astride = blockDim.x * gridDim.x;
     dst[yi * astride + xi] = out;
 }
+''')

-} // end extern "C"
-'''
+colorcliplib = devlib(deps=[yuvlib], defs=r'''
+__global__ void
+colorclip(float4 *pixbuf, float gamma, float vibrance, float highpow,
+          float linrange, float lingam, float3 bkgd, int fbsize)
+{
+    int i = threadIdx.x + blockDim.x * (blockIdx.x + gridDim.x * blockIdx.y);
+    if (i >= fbsize) return;

-class Filtering(HunkOCode):
-    mod = None
-    defs = _CODE
+    float4 pix = pixbuf[i];

-    @classmethod
-    def init_mod(cls):
-        if cls.mod is None:
-            cls.mod = pycuda.compiler.SourceModule(assemble_code(BaseCode, cls),
-                    options=['-use_fast_math', '-maxrregcount', '32'],
-                    no_extern_c=True)
+    if (pix.w <= 0) {
+        pixbuf[i] = make_float4(bkgd.x, bkgd.y, bkgd.z, 0.0f);
+        return;
+    }
+    pix.y -= 0.5f * pix.w;
+    pix.z -= 0.5f * pix.w;
+    float3 tmp = yuv2rgb(make_float3(pix.x, pix.y, pix.z));
+    pix.x = tmp.x;
+    pix.y = tmp.y;
+    pix.z = tmp.z;

-    def __init__(self):
-        self.init_mod()
+    pix.x = fmaxf(0.0f, pix.x);
+    pix.y = fmaxf(0.0f, pix.y);
+    pix.z = fmaxf(0.0f, pix.z);

-    def de(self, ddst, dsrc, dscratch, gnm, dim, tc, nxf, stream=None):
-        # Helper variables and functions to keep it clean
-        sb = 16 * dim.astride
-        bs = sb * dim.ah
-        bl, gr = (32, 8, 1), (dim.astride / 32, dim.ah / 8)
+    float4 opix = pix;
+    float alpha = powf(pix.w, gamma);
+    if (pix.w < linrange) {
+        float frac = pix.w / linrange;
+        alpha = (1.0f - frac) * pix.w * lingam + frac * alpha;
+    }

-        def launch(f, *args, **kwargs):
-            f(*args, block=bl, grid=gr, stream=stream, **kwargs)
-        mkdsc = lambda c: argset(cuda.ArrayDescriptor(), height=dim.ah,
-                                 width=dim.astride, num_channels=c,
-                                 format=cuda.array_format.FLOAT)
-        def mktref(n):
-            tref = self.mod.get_texref(n)
-            tref.set_filter_mode(cuda.filter_mode.POINT)
-            tref.set_address_mode(0, cuda.address_mode.WRAP)
-            tref.set_address_mode(1, cuda.address_mode.WRAP)
-            return tref
+    float ls = vibrance * alpha / pix.w;
+    alpha = fminf(1.0f, fmaxf(0.0f, alpha));

-        dsc = mkdsc(4)
-        tref = mktref('bilateral_src')
-        grad_dsc = mkdsc(1)
-        grad_tref = mktref('blur_src')
-        bilateral, logscale_den, den_blur, den_blur_1c, fma_buf = map(
-                self.mod.get_function,
-                'bilateral logscale_den den_blur den_blur_1c fma_buf'.split())
+    float maxc = fmaxf(pix.x, fmaxf(pix.y, pix.z));
+    float maxa = maxc * ls;
+    float newls = 1.0f / maxc;

-        # Number of different shear patterns to use when filtering. Must be
-        # even, since we depend on buffer bouncing (but I don't think that it's
-        # a requirement for the filter itself to get decent results).
-        DIRECTIONS = 8
+    if (maxa > 1.0f && highpow >= 0.0f) {
+        float lsratio = powf(newls / ls, highpow);
+        pix.x *= newls;
+        pix.y *= newls;
+        pix.z *= newls;
+        // Reduce saturation (according to the HSV model) by proportionally
+        // increasing the values of the other colors.
+        pix.x = maxc - (maxc - pix.x) * lsratio;
+        pix.y = maxc - (maxc - pix.y) * lsratio;
+        pix.z = maxc - (maxc - pix.z) * lsratio;
+    } else {
+        float adjhlp = -highpow;
+        if (adjhlp > 1.0f || maxa <= 1.0f) adjhlp = 1.0f;
+        if (maxc > 0.0f) {
+            float adj = ((1.0f - adjhlp) * newls + adjhlp * ls);
+            pix.x *= adj;
+            pix.y *= adj;
+            pix.z *= adj;
+        }
+    }

-        def do_bilateral(bsrc, bdst, pattern, r=15, sstd=6, cstd=0.05,
-                         dstd=1.5, dpow=0.8, gspeed=4.0):
-            # Scale spatial parameter so that a "pixel" is equivalent to an
-            # actual pixel at 1080p
-            sstd *= dim.w / 1920.
-
-            tref.set_address_2d(bsrc, dsc, sb)
-
-            # Blur density two octaves along sampling vector, ultimately
-            # storing in `dscratch`
-            launch(den_blur, np.intp(bdst), i32(pattern), i32(0),
-                   texrefs=[tref])
-            grad_tref.set_address_2d(bdst, grad_dsc, sb / 4)
-            launch(den_blur_1c, dscratch, i32(pattern), i32(1),
-                   texrefs=[grad_tref])
-            grad_tref.set_address_2d(dscratch, grad_dsc, sb / 4)
-
-            launch(bilateral, np.intp(bdst), i32(pattern), i32(r),
-                   f32(sstd), f32(cstd), f32(dstd), f32(dpow), f32(gspeed),
-                   texrefs=[tref, grad_tref])
-
-        def do_bilateral_range(bsrc, bdst, npats, *args, **kwargs):
-            for i in range(npats):
-                do_bilateral(bsrc, bdst, i, *args, **kwargs)
-                bdst, bsrc = bsrc, bdst
-            if npats % 2:
-                cuda.memcpy_dtod_async(bdst, bsrc, bs, stream)
-
-        # Filter the first xform, using `ddst` as an intermediate buffer.
-        # End result is the filtered copy in `dsrc`.
-        do_bilateral_range(dsrc, ddst, DIRECTIONS)
-
-        # Filter the remaining xforms, using `ddst` as an intermediate
-        # buffer, then add the result to `dsrc` (at the zero'th xform).
-        for x in range(1, nxf):
-            src = int(dsrc) + x * bs
-            do_bilateral_range(src, ddst, DIRECTIONS)
-            launch(fma_buf, dsrc, np.intp(src), i32(dim.astride), f32(1))
-
-        # Log-scale the accumulated buffer in `dsrc`.
-        k1 = f32(gnm.color.brightness(tc) * 268 / 256)
-        # Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
-        # s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
-        area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w)
-        k2 = f32(1.0 / (area * gnm.spp(tc)))
-
-        nbins = dim.ah * dim.astride
-        logscale = self.mod.get_function("logscale")
-        t = logscale(ddst, dsrc, k1, k2,
-                block=(512, 1, 1), grid=(nbins/512, 1), stream=stream)
-
-    def colorclip(self, dbuf, gnm, dim, tc, blend, stream=None):
-        nbins = dim.ah * dim.astride
-
-        # TODO: implement integration over cubic splines?
-        gam = f32(1 / gnm.color.gamma(tc))
-        vib = f32(gnm.color.vibrance(tc))
-        hipow = f32(gnm.color.highlight_power(tc))
-        lin = f32(gnm.color.gamma_threshold(tc))
-        lingam = f32(lin ** (gam-1.0) if lin > 0 else 0)
-        bkgd = vec.make_float3(
-                gnm.color.background.r(tc),
-                gnm.color.background.g(tc),
-                gnm.color.background.b(tc))
-
-        color_fun = self.mod.get_function("colorclip")
-        blocks = int(np.ceil(np.sqrt(nbins / 256)))
-        color_fun(dbuf, gam, vib, hipow, lin, lingam, bkgd, i32(nbins),
-                  i32(blend), block=(256, 1, 1), grid=(blocks, blocks),
-                  stream=stream)
+    pix.x += (1.0f - vibrance) * powf(opix.x, gamma);
+    pix.y += (1.0f - vibrance) * powf(opix.y, gamma);
+    pix.z += (1.0f - vibrance) * powf(opix.z, gamma);

+    pix.x += (1.0f - alpha) * bkgd.x;
+    pix.y += (1.0f - alpha) * bkgd.y;
+    pix.z += (1.0f - alpha) * bkgd.z;

+    pix.x = fminf(1.0f, pix.x);
+    pix.y = fminf(1.0f, pix.y);
+    pix.z = fminf(1.0f, pix.z);
+    pix.w = alpha;

+    pixbuf[i] = pix;
+}
+''')

@@ -2,8 +2,10 @@ from collections import OrderedDict
 from itertools import cycle

 import numpy as np
-import tempita

-from util import HunkOCode, Template, BaseCode, assemble_code
+import util
+from util import Template, assemble_code, devlib, binsearchlib, ringbuflib
+from color import yuvlib
+from mwc import mwclib

 class GenomePackerName(str):
     """Class to indicate that a property is precalculated on the device"""
@@ -115,17 +117,10 @@ class GenomePackerPrecalc(GenomePackerView):
     def _code(self, code):
         self.packer.precalc_code.append(code)

-class GenomePacker(HunkOCode):
+class GenomePacker(object):
     """
     Packs a genome for use in iteration.
     """
-    # 2^search_rounds is the maximum number of control points, including
-    # endpoints, that can be used in a single genome. It should be okay to
-    # change this number without touching other code, but 32 samples fits
-    # nicely on a single cache line.
-    search_rounds = 5
-
     def __init__(self, tname):
         """
         Create a new DataPacker.
@@ -152,6 +147,7 @@ class GenomePacker(HunkOCode):
         self.packed = None
         self.genome = None
+        self.search_rounds = util.DEFAULT_SEARCH_ROUNDS

     def __len__(self):
         assert self._len is not None, 'len() called before finalize()'
@@ -189,42 +185,41 @@ class GenomePacker(HunkOCode):
         self._len = len(self.packed)

-        self.decls = self._decls.substitute(
-                packed=self.packed, tname=self.tname,
-                search_rounds=self.search_rounds)
-        self.defs = self._defs.substitute(
+        decls = self._decls.substitute(packed=self.packed, tname=self.tname)
+        defs = self._defs.substitute(
                 packed_direct=self.packed_direct, tname=self.tname,
                 precalc_code=self.precalc_code,
                 search_rounds=self.search_rounds)
+        return devlib(deps=[catmullromlib], decls=decls, defs=defs)

-    def pack(self):
+    def pack(self, pool=None):
         """
-        Return a packed copy of the genome ready for uploading to the GPU as a
-        3D NDArray, with the first element being the times and the second
-        being the values.
+        Return a packed copy of the genome ready for uploading to the GPU,
+        as two float32 NDArrays for the knot times and values.
         """
         width = 1 << self.search_rounds
-        out = np.empty((2, len(self.genome), width), dtype=np.float32)
-        # Ensure that unused values at the top are always big (must be >2.0)
-        out[0].fill(1e9)
+        if pool:
+            times = pool.allocate((len(self.genome), width), 'f4')
+            knots = pool.allocate((len(self.genome), width), 'f4')
+        else:
+            times, knots = np.empty((2, len(self.genome), width), 'f4')
+        times.fill(1e9)

         for idx, gname in enumerate(self.genome):
             attr = self.ns[gname[0]]
             for g in gname[1:]:
                 attr = getattr(attr, g)
-            out[0][idx][:len(attr.knots[0])] = attr.knots[0]
-            out[1][idx][:len(attr.knots[1])] = attr.knots[1]
-        return out
+            times[idx,:len(attr.knots[0])] = attr.knots[0]
+            knots[idx,:len(attr.knots[1])] = attr.knots[1]
+        return times, knots
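For illustration only (not from the commit): a spline whose knot times are [-2, 0, 1, 3] packs into a 32-wide row, with the 1e9 padding keeping the binary search inside the valid prefix.

    width = 1 << 5                          # util.DEFAULT_SEARCH_ROUNDS
    times = np.empty(width, 'f4')
    times.fill(1e9)                         # pad unused slots high
    times[:4] = [-2.0, 0.0, 1.0, 3.0]       # knot times, endpoints included
    knots = np.empty(width, 'f4')
    knots[:4] = [0.5, 0.5, 0.7, 0.7]        # knot values; slots past 4 unused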
     _defs = Template(r"""
-__global__
-void interp_{{tname}}(
+__global__ void interp_{{tname}}(
         {{tname}}* out,
         const float *times, const float *knots,
-        float tstart, float tstep, int maxid
-) {
+        float tstart, float tstep, int maxid)
+{
     int id = gtid();
     if (id >= maxid) return;
     out = &out[id];
@@ -249,7 +244,6 @@ void interp_{{tname}}(
     }
     {{endfor}}
 }
-
 """)

     _decls = Template(r"""
@@ -259,22 +253,13 @@ typedef struct {
 {{endfor}}
 } {{tname}};

-/* Search through the fixed-size list 'hay' to find the rightmost index which
- * contains a value strictly smaller than the input 'needle'. The crazy
- * bitwise search is just for my own amusement.
- */
-__device__
-int bitwise_binsearch(const float *hay, float needle) {
-    int lo = 0;
-    // TODO: improve efficiency on 64-bit arches
-    {{for i in range(search_rounds-1, -1, -1)}}
-    if (needle > hay[lo + {{1 << i}}])
-        lo += {{1 << i}};
-    {{endfor}}
-    return lo;
-}
+""")

+catmullromlib = devlib(deps=[binsearchlib], decls=r'''
+__device__ __noinline__
+float catmull_rom(const float *times, const float *knots, float t);
+''', defs=r'''
 __device__ __noinline__
 float catmull_rom(const float *times, const float *knots, float t) {
     int idx = bitwise_binsearch(times, t);
@@ -303,67 +288,14 @@ float catmull_rom(const float *times, const float *knots, float t) {
             + m2 * (  ttt - tt)
             + k2 * (-2.0f*ttt + 3.0f*tt);
 }
+''')
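As a host-side reference (a sketch, not part of the commit; the device code's exact tangent scaling should be checked against it), the standard nonuniform Catmull-Rom evaluation over a Hermite basis looks like:

    def catmull_rom_ref(times, knots, t):
        # Rightmost knot strictly smaller than t, as bitwise_binsearch does
        idx = np.searchsorted(times, t) - 1
        t0, t1, t2, t3 = times[idx-1:idx+3]
        k0, k1, k2, k3 = knots[idx-1:idx+3]
        # Tangents, scaled into the normalized domain of the segment
        m1 = (k2 - k0) / (t2 - t0) * (t2 - t1)
        m2 = (k3 - k1) / (t3 - t1) * (t2 - t1)
        s = (t - t1) / (t2 - t1)
        ss, sss = s * s, s * s * s
        return (k1 * ( 2.0*sss - 3.0*ss + 1.0) + m1 * (sss - 2.0*ss + s)
              + m2 * (sss - ss) + k2 * (-2.0*sss + 3.0*ss))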
-__global__
-void test_cr(const float *times, const float *knots, const float *t, float *r) {
-    int i = threadIdx.x + blockDim.x * blockIdx.x;
-    r[i] = catmull_rom(times, knots, t[i]);
-}
-""")
-
-class Palette(HunkOCode):
-    # The JPEG YUV full-range matrix, without bias into the positve regime.
-    # This assumes input color space is CIERGB D65, encoded with gamma 2.2.
-    # Note that some interpolated colors may exceed the sRGB and YUV gamuts.
-    YUV = np.matrix([[ 0.299,     0.587,     0.114],
-                     [-0.168736, -0.331264,  0.5],
-                     [ 0.5,      -0.418688, -0.081312]])
-
-    def __init__(self, interp_mode):
-        assert interp_mode in self.modes
-        self.mode = interp_mode
-        self.defs = self._defs.substitute(mode=interp_mode)
-
-    def prepare(self, palettes):
-        """
-        Produce palettes suitable for uploading to the device. Returns an
-        array of palettes in the same size and shape as the input.
-
-        This function will never modify its argument, but may return it
-        unmodified for certain interpolation modes.
-        """
-        if self.mode == 'yuvpolar':
-            ys, uvrs, uvts, alphas = zip(*map(self.rgbtoyuvpolar, palettes))
-            # Center all medians as closely to 0 as possible
-            means = np.mean(uvts, axis=1)
-            newmeans = (means + np.pi) % (2 * np.pi) - np.pi
-            uvts = (newmeans - means).reshape((-1, 1)) + uvts
-            zipped = zip(ys, uvrs, uvts, alphas)
-            return np.array(zipped, dtype='f4').transpose((0, 2, 1))
-        return palettes
-
-    @classmethod
-    def rgbtoyuvpolar(cls, pal):
-        # TODO: premultiply alpha or some nonsense like that?
-        y, u, v = np.array(cls.YUV * pal.T[:3])
-        uvr = np.hypot(u, v)
-        uvt = np.unwrap(np.arctan2(v, u))
-        return y, uvr, uvt, pal.T[3]
-
-    @classmethod
-    def yuvpolartorgb(cls, y, uvr, uvt, a):
-        u = uvr * np.cos(uvt)
-        v = uvr * np.sin(uvt)
-        r, g, b = np.array(cls.YUV.I * np.array([y, u, v]))
-        # Ensure Fortran order so that the memory gets laid out correctly
-        return np.array([r, g, b, a], order='F').T
-
-    modes = ['hsv', 'yuv', 'yuvpolar']
-
-    decls = "surface<void, cudaSurfaceType2D> flatpal;\n"
-
-    _defs = Template(r"""
-__device__
-float4 interp_color(const float *times, const float4 *sources, float time) {
+palintlib = devlib(deps=[binsearchlib, ringbuflib, yuvlib, mwclib], decls='''
+surface<void, cudaSurfaceType2D> flatpal;
+''', defs=r'''
+__device__ float4
+interp_color(const float *times, const float4 *sources, float time)
+{
     int idx = fmaxf(bitwise_binsearch(times, time) + 1, 1);
     float lf = (times[idx] - time) / (times[idx] - times[idx-1]);
     float rf = 1.0f - lf;
@@ -375,42 +307,11 @@ float4 interp_color(const float *times, const float4 *sources, float time) {
     float3 l3 = make_float3(left.x, left.y, left.z);
     float3 r3 = make_float3(right.x, right.y, right.z);

-    {{if mode == 'hsv'}}
-    float3 lhsv = rgb2hsv(l3);
-    float3 rhsv = rgb2hsv(r3);
-
-    if (fabs(lhsv.x - rhsv.x) > 3.0f)
-        if (lhsv.x < rhsv.x)
-            lhsv.x += 6.0f;
-        else
-            rhsv.x += 6.0f;
-
-    float3 hsv;
-    hsv.x = lhsv.x * lf + rhsv.x * rf;
-    hsv.y = lhsv.y * lf + rhsv.y * rf;
-    hsv.z = lhsv.z * lf + rhsv.z * rf;
-
-    if (hsv.x > 6.0f)
-        hsv.x -= 6.0f;
-    if (hsv.x < 0.0f)
-        hsv.x += 6.0f;
-
-    yuv = rgb2yuv(hsv2rgb(hsv));
-    {{elif mode.startswith('yuv')}}
-    {{if mode == 'yuv'}}
     float3 lyuv = rgb2yuv(l3);
     float3 ryuv = rgb2yuv(r3);
     yuv.x = lyuv.x * lf + ryuv.x * rf;
     yuv.y = lyuv.y * lf + ryuv.y * rf;
     yuv.z = lyuv.z * lf + ryuv.z * rf;
-    {{elif mode == 'yuvpolar'}}
-    yuv.x = l3.x * lf + r3.x * rf;
-    float radius = l3.y * lf + r3.y * rf;
-    float angle  = l3.z * lf + r3.z * rf;
-    yuv.y = radius * cosf(angle);
-    yuv.z = radius * sinf(angle);
-    {{endif}}
-    {{endif}}

     yuv.y += 0.5f;
     yuv.z += 0.5f;
@@ -418,28 +319,13 @@ float4 interp_color(const float *times, const float4 *sources, float time) {
     return make_float4(yuv.x, yuv.y, yuv.z, left.w * lf + right.w * rf);
 }

-__global__
-void interp_palette(uchar4 *outs,
-        const float *times, const float4 *sources,
-        float tstart, float tstep) {
-    float time = tstart + blockIdx.x * tstep;
-    float4 yuva = interp_color(times, sources, time);
-
-    uchar4 out;
-    out.x = yuva.x * 255.0f;
-    out.y = yuva.y * 255.0f;
-    out.z = yuva.z * 255.0f;
-    out.w = yuva.w * 255.0f;
-    outs[blockDim.x * blockIdx.x + threadIdx.x] = out;
-}
-
-__global__
-void interp_palette_flat(mwc_st *rctxs,
-        const float *times, const float4 *sources,
-        float tstart, float tstep) {
+__global__ void interp_palette_flat(
+        ringbuf *rb, mwc_st *rctxs,
+        const float *times, const float4 *sources,
+        float tstart, float tstep)
+{
+    mwc_st rctx = rctxs[rb_incr(rb->head, threadIdx.x)];
     int gid = blockIdx.x * blockDim.x + threadIdx.x;
-    mwc_st rctx = rctxs[gid];

     float time = tstart + blockIdx.x * tstep;
     float4 yuva = interp_color(times, sources, time);
@@ -447,17 +333,30 @@ void interp_palette_flat(mwc_st *rctxs,
     // TODO: pack Y at full precision, UV at quarter
     uint2 out;

-    uint32_t y = min(255, (uint32_t) (yuva.x * 255.0f + 0.49f * mwc_next_11(rctx)));
-    uint32_t u = min(255, (uint32_t) (yuva.y * 255.0f + 0.49f * mwc_next_11(rctx)));
-    uint32_t v = min(255, (uint32_t) (yuva.z * 255.0f + 0.49f * mwc_next_11(rctx)));
+    uint32_t y = yuva.x * 255.0f + 0.49f * mwc_next_11(rctx);
+    uint32_t u = yuva.y * 255.0f + 0.49f * mwc_next_11(rctx);
+    uint32_t v = yuva.z * 255.0f + 0.49f * mwc_next_11(rctx);
+    y = min(255, y);
+    u = min(255, u);
+    v = min(255, v);
     out.y = (1 << 22) | (y << 4);
     out.x = (u << 18) | v;
     surf2Dwrite(out, flatpal, 8 * threadIdx.x, blockIdx.x);
-    rctxs[gid] = rctx;
+    rctxs[rb_incr(rb->tail, threadIdx.x)] = rctx;
 }
-""")
+''')

+testcrlib = devlib(defs=r'''
+__global__ void
+test_cr(const float *times, const float *knots, const float *t, float *r) {
+    int i = threadIdx.x + blockDim.x * blockIdx.x;
+    r[i] = catmull_rom(times, knots, t[i]);
+}
+''')

 if __name__ == "__main__":
+    # Test spline evaluation. This code will probably drift pretty often.
     import pycuda.driver as cuda
     from pycuda.compiler import SourceModule
     import pycuda.autoinit
@@ -2,15 +2,16 @@
 The main iteration loop.
 """

-from cuburn.code import mwc, variations, interp
-from cuburn.code.util import *
+import variations
+import interp
+from util import Template, devlib, ringbuflib
+from mwc import mwclib

 def precalc_densities(pcp, std_xforms):
     # This pattern recurs a few times for precalc segments. Unfortunately,
     # namespace stuff means it's not easy to functionalize this boilerplate
     pre_cp = pcp._precalc()
     pre_cp._code(Template(r"""
     float sum = 0.0f;

     {{for n in std_xforms}}
@@ -30,7 +31,6 @@ def precalc_densities(pcp, std_xforms):

 def precalc_chaos(pcp, std_xforms):
     pre_cp = pcp._precalc()
     pre_cp._code(Template("""
     float sum, rsum;

     {{for p in std_xforms}}
@@ -50,7 +50,6 @@ def precalc_chaos(pcp, std_xforms):
     {{endfor}}
     {{endfor}}
     """, name='precalc_chaos').substitute(locals()))

 def precalc_camera(pcam):
@@ -64,7 +63,6 @@ def precalc_camera(pcam):
     #   . matrix([X],[Y],[1]);
     pre_cam._code(Template(r"""
     float rot = {{pre_cam.rotation}} * M_PI / 180.0f;
     float rotsin = sin(rot), rotcos = cos(rot);
     float cenx = {{pre_cam.center.x}}, ceny = {{pre_cam.center.y}};
@@ -79,13 +77,11 @@ def precalc_camera(pcam):
     {{pre_cam._set('yy')}} = scale * rotcos;
     {{pre_cam._set('yo')}} = scale * -(rotsin * cenx + rotcos * ceny)
                            + 0.5f * acc_size.aheight;
-    """).substitute(locals()))
+    """, 'precalc_camera').substitute(locals()))

 def precalc_xf_affine(px):
     pre = px._precalc()
     pre._code(Template(r"""
     float pri = {{pre.angle}} * M_PI / 180.0f;
     float spr = {{pre.spread}} * M_PI / 180.0f;
@@ -98,27 +94,22 @@ def precalc_xf_affine(px):
     {{pre._set('yy')}} = magy * sin(pri+spr);
     {{pre._set('xo')}} = {{pre.offset.x}};
     {{pre._set('yo')}} = -{{pre.offset.y}};
-    """).substitute(locals()))
+    """, 'precalc_xf_affine').substitute(locals()))

+def apply_affine(x, y, xo, yo, packer):
+    return Template("""
+    {{xo}} = {{packer.xx}} * {{x}} + {{packer.xy}} * {{y}} + {{packer.xo}};
+    {{yo}} = {{packer.yx}} * {{x}} + {{packer.yy}} * {{y}} + {{packer.yo}};
+    """, 'apply_affine').substitute(locals())

-class IterCode(HunkOCode):
-    # The number of threads per block
-    NTHREADS = 256
+# The number of threads per block used in the iteration function. Don't change
+# it lightly; the code may depend on it in unusual ways.
+NTHREADS = 256

-    def __init__(self, info, genome):
-        self.packer = interp.GenomePacker('iter_params')
-        self.pcp = self.packer.view('params', genome, 'cp')
-        iterbody = self._iterbody(info, genome)
-        bodies = [self._xfbody(i,x) for i,x in sorted(genome.xforms.items())]
-        bodies.append(iterbody)
-        self.defs = '\n'.join(bodies)
-
-    decls = """
+iter_decls = """
 // Note: for normalized lookups, uchar4 actually returns floats
 texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> palTex;
 __shared__ iter_params params;
-__device__ int rb_head, rb_tail, rb_size;

 typedef struct {
     uint32_t width;
@@ -128,12 +119,9 @@ typedef struct {
     uint32_t astride;
 } acc_size_t;
 __constant__ acc_size_t acc_size;
 """

-    def _xfbody(self, xfid, xform):
-        px = self.pcp.xforms[xfid]
-        tmpl = Template(r"""
+iter_xf_body_code = r"""
 __device__
 void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
     float tx, ty;
@@ -162,44 +150,30 @@ void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
     float csp = {{px.color_speed}};
     color = color * (1.0f - csp) + {{px.color}} * csp;
 };
-""", 'apply_xf_'+xfid)
-        g = dict(globals())
-        g.update(locals())
-        return tmpl.substitute(g)
+"""

-    def _iterbody(self, info, genome):
-        tmpl = Template(r'''
-__global__ void reset_rb(int size) {
-    rb_head = rb_tail = 0;
-    rb_size = size;
-}
-
-__global__
-void iter(
-        uint64_t out_ptr,
-        mwc_st *msts,
-        float4 *points,
-        const iter_params *all_params
-{{if info.acc_mode == 'atomic'}}
-        , uint64_t atom_ptr
-{{endif}}
-) {
-    const iter_params *global_params = &(all_params[blockIdx.x]);
+def iter_xf_body(pcp, xfid, xform):
+    px = pcp.xforms[xfid]
+    tmpl = Template(iter_xf_body_code, 'apply_xf_'+xfid)
+    g = dict(globals())
+    g.update(locals())
+    return tmpl.substitute(g)

+iter_body_code = r'''
+__global__ void
+iter(uint64_t out_ptr, uint64_t atom_ptr,
+     ringbuf *rb, mwc_st *msts, float4 *points,
+     const iter_params *all_params)
+{
     // load params to shared memory cooperatively
+    const iter_params *global_params = &(all_params[blockIdx.x]);
     for (int i = threadIdx.y * blockDim.x + threadIdx.x;
          i < (sizeof(iter_params) / 4); i += blockDim.x * blockDim.y)
         reinterpret_cast<float*>(&params)[i] =
             reinterpret_cast<const float*>(global_params)[i];

-    __shared__ int rb_idx;
-    if (threadIdx.y == 1 && threadIdx.x == 1)
-        rb_idx = 32 * blockDim.y * (atomicAdd(&rb_head, 1) % rb_size);
-    __syncthreads();
-    int this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;
+    int this_rb_idx = rb_incr(rb->head, blockDim.x * threadIdx.y + threadIdx.x);
     mwc_st rctx = msts[this_rb_idx];

     {{precalc_camera(pcp.camera)}}
@@ -209,22 +183,15 @@ void iter(
         {{pcp.camera.yo}} += ditherwidth * mwc_next_11(rctx);
     }

-{{if info.acc_mode == 'global'}}
-    __shared__ float time_frac;
-    time_frac = blockIdx.x / (float) gridDim.x;
-{{else}}
-    {{if info.acc_mode == 'atomic'}}
     // TODO: spare the register, reuse at call site?
     int time = blockIdx.x >> 4;
-    {{endif}}
     float color_dither = 0.49f * mwc_next_11(rctx);
-{{endif}}

     // TODO: 4th channel unused. Kill or use for something helpful
     float4 old_point = points[this_rb_idx];
     float x = old_point.x, y = old_point.y, color = old_point.z;

-{{if not info.chaos_used}}
+{{if not chaos_used}}
     // Shared memory size can be reduced by a factor of four using a slower
     // 4-stage reduce, but on Fermi hardware shmem use isn't a bottleneck
     __shared__ float swap[{{4*NTHREADS}}];
@@ -250,7 +217,7 @@ void iter(
     // TODO: link up with FUSE, etc
     for (int round = 0; round < 256; round++) {

-        {{if info.chaos_used}}
+        {{if chaos_used}}
         {{precalc_chaos(pcp, std_xforms)}}
@@ -315,16 +282,7 @@ void iter(
         {{endif}}

-        {{if info.acc_mode == 'deferred'}}
-        int tid = threadIdx.y * 32 + threadIdx.x;
-        int offset = 4 * (256 * (256 * blockIdx.x + round) + tid);
-        int *log = reinterpret_cast<int*>(out_ptr + offset);
-        {{endif}}
-
         if (fuse) {
-            {{if info.acc_mode == 'deferred'}}
-            *log = 0xffffffff;
-            {{endif}}
             continue;
         }
@@ -346,15 +304,11 @@ void iter(
         uint32_t ix = trunca(cx), iy = trunca(cy);

         if (ix >= acc_size.astride || iy >= acc_size.aheight) {
-            {{if info.acc_mode == 'deferred'}}
-            *log = 0xffffffff;
-            {{endif}}
             continue;
         }

-        uint32_t ibase = (last_xf_used % {{info.max_nxf}}) * acc_size.aheight;
-        uint32_t i = (ibase + iy) * acc_size.astride + ix;
+        uint32_t i = iy * acc_size.astride + ix;

-{{if info.acc_mode == 'atomic'}}
         asm volatile ({{crep("""
 {
     // To prevent overflow, we need to flush each pixel before the density
@@ -432,38 +386,45 @@ oflow_end:
 """)}} :: "f"(cc), "f"(color_dither), "r"(time), "r"(i),
           "l"(atom_ptr), "f"(cosel[threadIdx.y + {{NWARPS}}]),
           "l"(out_ptr));
-{{endif}}
-{{if info.acc_mode == 'global'}}
-        float4 outcol = tex2D(palTex, cc, time_frac);
-        float4 *accbuf = reinterpret_cast<float4*>(out_ptr + (16*i));
-        float4 pix = *accbuf;
-        pix.x += outcol.x;
-        pix.y += outcol.y;
-        pix.z += outcol.z;
-        pix.w += 1.0f;
-        *accbuf = pix;
-{{elif info.acc_mode == 'deferred'}}
-        // 'color' gets the top 8 bits. TODO: add dithering via precalc.
-        uint32_t icolor = fminf(1.0f, cc) * 255.0f + color_dither;
-        asm("bfi.b32 %0, %1, %0, 24, 8;" : "+r"(i) : "r"(icolor));
-        *log = i;
-{{endif}}
     }

-    if (threadIdx.x == 0 && threadIdx.y == 0)
-        rb_idx = 32 * blockDim.y * (atomicAdd(&rb_tail, 1) % rb_size);
-    __syncthreads();
-    this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;
+    this_rb_idx = rb_incr(rb->tail, blockDim.x * threadIdx.y + threadIdx.x);
     points[this_rb_idx] = make_float4(x, y, color, 0.0f);
     msts[this_rb_idx] = rctx;
     return;
 }
+'''

-{{if info.acc_mode == 'atomic'}}
-__global__
-void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
+def iter_body(cp, pcp):
+    # For legacy reasons, 'cp' is used here instead of 'genome'.
+    tmpl = Template(iter_body_code, 'iter_body')
+    NWARPS = NTHREADS / 32
+    std_xforms = [n for n in sorted(cp.xforms) if n != 'final']
+    # TODO: detect this properly and use it
+    chaos_used = False
+    vars = globals()
+    vars.update(locals())
+    return tmpl.substitute(vars)

+def mkiterlib(genome):
+    packer = interp.GenomePacker('iter_params')
+    pcp = packer.view('params', genome, 'cp')
+    iterbody = iter_body(genome, pcp)
+    bodies = [iter_xf_body(pcp, i, x) for i, x in sorted(genome.xforms.items())]
+    bodies.append(iterbody)
+    packer_lib = packer.finalize()
+    lib = devlib(deps=[packer_lib, mwclib, ringbuflib],
+                 # We grab the surf decl from palintlib as well
+                 decls=iter_decls + interp.palintlib.decls,
+                 defs='\n'.join(bodies))
+    return packer, lib
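Hypothetical usage (the names come from this commit, but this snippet does not appear in the source): build the iteration library for a genome and compile it with the helpers from util.

    from util import compile, assemble_code
    import pycuda.driver as cuda

    packer, lib = mkiterlib(genome)            # 'genome' supplied by the caller
    cubin = compile('iter', assemble_code(lib))
    mod = cuda.module_from_buffer(cubin)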
+flushatomlib = devlib(defs=Template(r'''
+__global__ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
     int i = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
     if (i >= nbins) return;
     asm volatile ({{crep("""
@@ -499,168 +460,4 @@ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
 }
 """)}} :: "r"(i), "l"(atom_ptr), "l"(out_ptr));
 }
+''', 'flush_atom').substitute())
-{{endif}}
-
-{{if info.acc_mode == 'deferred'}}
-// Block size, shared accumulation bits, shared accumulation width.
-#define BS 1024
-#define SHAB 12
-#define SHAW (1<<SHAB)
-
-// Read the point log, accumulate in shared memory, and write the results.
-// This kernel is to be launched with one block for every 4,096 addresses to
-// be processed, and will handle those addresses.
-//
-// log_bounds is an array mapping radix values to the first index in the log
-// with that radix position. For performance reasons in other parts of the
-// code, the radix may actually include bits within the lower SHAB part of the
-// address, or it might not cover the first few bits after the SHAB part;
-// log_bounds_shift covers that. glob_addr_bits specifies the number of bits
-// above SHAB which are address bits.
-__global__ void
-__launch_bounds__(BS, 1)
-write_shmem(
-        float4 *acc,
-        const uint32_t *log,
-        const uint32_t *log_bounds,
-        uint32_t nbins
-) {
-    const int tid = threadIdx.x;
-    const int bid = blockIdx.x;
-
-    // These two accumulators, used in write_shmem, hold {density, red} and
-    // {green, blue} values as packed u16 pairs. The fixed size represents
-    // 4,096 pixels in the accumulator.
-    __shared__ uint32_t s_acc[SHAW*2];
-
-    int idx = tid;
-    for (int i = 0; i < (SHAW * 2 / BS); i++) {
-        s_acc[idx] = 0;
-        idx += BS;
-    }
-    __syncthreads();
-
-    // Shut the compiler up
-    idx = s_acc[0];
-
-    // log_bounds[] holds inclusive prefix sums, so that log_bounds[0] is the
-    // largest index with radix 0, and so on.
-    int lb_idx_hi = bid & 0xff;
-    int lb_idx_lo = lb_idx_hi - 1;
-
-    int idx_lo;
-    if (lb_idx_lo > 0) idx_lo = log_bounds[lb_idx_lo] & ~(BS - 1);
-    else idx_lo = 0;
-    int idx_hi = (log_bounds[lb_idx_hi] & ~(BS - 1)) + BS;
-
-    float rnrounds = {{'%d.0f' % info.palette_height}} / (idx_hi - idx_lo);
-    float time = tid * rnrounds;
-    float time_step = BS * rnrounds;
-
-    int magic = ((blockIdx.x & 0xff) << 3) + ((blockIdx.x & 0xf00) << 12);
-    int magic_mask = 0xf007f8;
-
-    for (int i = idx_lo + tid; i < idx_hi; i += BS) {
-        int entry = log[i];
-
-        asm volatile ({{crep("""
-{
-    .reg .pred q;
-    .reg .u32 shoff, color, time, d, r, g, b, hi, lo, hiw, low, tmp;
-    .reg .u64 ptr;
-    .reg .f32 rf, gf, bf, df, rg, gg, dg, bg;
-
-    // TODO: opacity
-    and.b32 tmp, %0, %4;
-    setp.eq.u32 q, tmp, %3;
-@!q bra before_sync;
-
-    and.b32 shoff, %0, 0xff800;
-    shr.b32 shoff, shoff, 5;
-    bfi.b32 shoff, %0, shoff, 3, 3;
-
-    bfe.u32 color, %0, 24, 8;
-    shl.b32 color, color, 3;
-    cvt.rni.u32.f32 time, %1;
-    suld.b.2d.v2.b32.clamp {lo, hi}, [flatpal, {color, time}];
-
-    ld.shared.v2.u32 {low, hiw}, [shoff];
-    add.cc.u32 lo, lo, low;
-    addc.u32 hi, hi, hiw;
-    setp.hs.u32 q, hi, (1023 << 22);
-@q  bra oflow_sync;
-    st.shared.v2.u32 [shoff], {hi, lo};
-
-before_sync:
-    bar.sync 0;
-    bra oflow_write_end;
-
-oflow_sync:
-    st.shared.v2.u32 [shoff], {0, 0};
-
-    // TODO: opacity
-    bfi.b32 shoff, %0, 0, 4, 24;
-    cvt.u64.u32 ptr, shoff;
-    add.u64 ptr, ptr, %2;
-    ld.global.v4.f32 {dg,bg,gg,rg}, [ptr];
-    bar.sync 0;
-
-    bfe.u32 r, hi, 4, 18;
-    bfe.u32 g, lo, 18, 14;
-    bfi.b32 g, hi, g, 14, 4;
-    and.b32 b, lo, ((1<<18)-1);
-
-    cvt.rn.f32.u32 rf, r;
-    cvt.rn.f32.u32 gf, g;
-    cvt.rn.f32.u32 bf, b;
-
-    fma.rn.ftz.f32 rf, rf, (1.0/255.0), rg;
-    fma.rn.ftz.f32 gf, gf, (1.0/255.0), gg;
-    fma.rn.ftz.f32 bf, bf, (1.0/255.0), bg;
-    add.f32 df, df, dg;
-
-    st.global.v4.f32 [ptr], {df,bf,gf,rf};
-
-oflow_write_end:
-}
-        """)}} :: "r"(entry), "f"(time), "l"(acc), "r"(magic), "r"(magic_mask));
-        time += time_step;
-    }
-
-    __syncthreads();
-
-    idx = tid;
-    int glo_idx = magic | (((idx << 8) | idx) & 0xff807);
-
-    for (int i = 0; i < (SHAW / BS) && glo_idx < nbins; i++) {
-        int d, r, g, b;
-        float4 pix = acc[glo_idx];
-        asm({{crep("""
-{
-    .reg .u32 hi, lo;
-    ld.shared.v2.u32 {lo, hi}, [%4];
-    shr.u32 %0, hi, 22;
-    bfe.u32 %1, hi, 4, 18;
-    bfe.u32 %2, lo, 18, 14;
-    bfi.b32 %2, hi, %2, 14, 4;
-    and.b32 %3, lo, ((1<<18)-1);
-}
-        """)}} : "=r"(d), "=r"(r), "=r"(g), "=r"(b) : "r"(idx*8));
-        pix.x += r / 255.0f;
-        pix.y += g / 255.0f;
-        pix.z += b / 255.0f;
-        pix.w += d;
-        acc[glo_idx] = pix;
-        idx += BS;
-        glo_idx += (BS << 8);
-    }
-}
-{{endif}}
-''', 'iter_kern')
-
-        return tmpl.substitute(
-                info = info,
-                cp = genome,
-                pcp = self.pcp,
-                NTHREADS = self.NTHREADS,
-                NWARPS = self.NTHREADS / 32,
-                std_xforms = [n for n in sorted(genome.xforms) if n != 'final'],
-                **globals())
@@ -6,18 +6,53 @@ import os
 import warnings

 import numpy as np

-from util import *
+from util import devlib, assemble_code

-class MWC(HunkOCode):
-    decls = """
+# Keeping this live in the module isn't necessary, but loading the mults
+# can be surprisingly slow.
+mults = None

+def load_mults():
+    pfpath = os.path.join(os.path.dirname(__file__), 'primes.bin')
+    if os.path.isfile(pfpath):
+        with open(pfpath) as fp:
+            return np.frombuffer(fp.read(), dtype='<u4')
+    warnings.warn('primes.bin not found, trying to download it')
+    import bz2, urllib2
+    ufp = urllib2.urlopen('http://aduro.strobe.cc/primes.diff.bin.bz2')
+    diffs = np.frombuffer(bz2.decompress(ufp.read()), dtype='<u2')
+    mults = np.cumsum(-np.array(diffs, dtype='<u4'), dtype='<u4')
+    with open(pfpath, 'wb') as fp:
+        fp.write(mults)
+    return mults

+def make_seeds(nthreads, host_seed=None):
+    global mults
+    if mults is None:
+        mults = load_mults()
+    if host_seed:
+        rand = np.random.RandomState(host_seed)
+    else:
+        rand = np.random

+    # Create the seed structures. TODO: check that struct is 4-byte aligned
+    seeds = np.empty((3, nthreads), dtype=np.uint32, order='F')
+    seeds[0][:] = mults[:nthreads]

+    # Excludes 0xffffffff for 32-bit compatibility with laziness
+    seeds[1] = rand.randint(1, 0x7fffffff, size=nthreads)
+    seeds[2] = rand.randint(1, 0x7fffffff, size=nthreads)

+    return seeds

+mwclib = devlib(decls=r'''
 typedef struct {
     uint32_t mul;
     uint32_t state;
     uint32_t carry;
 } mwc_st;
-"""
-
-    defs = r"""
+''', defs=r'''
 __device__ uint32_t mwc_next(mwc_st &st) {
     asm("{\n\t"
         ".reg .u32 tmp;\n\t"
@@ -40,46 +75,9 @@ __device__ float mwc_next_11(mwc_st &st) {
         : "=f"(ret) : "r"(val));
     return ret;
 }
-"""
+''')
-    @staticmethod
-    def load_mults():
-        pfpath = os.path.join(os.path.dirname(__file__), 'primes.bin')
-        if os.path.isfile(pfpath):
-            with open(pfpath) as fp:
-                return np.frombuffer(fp.read(), dtype='<u4')
-        warnings.warn('primes.bin not found, trying to download it')
-        import bz2, urllib2
-        ufp = urllib2.urlopen('http://aduro.strobe.cc/primes.diff.bin.bz2')
-        diffs = np.frombuffer(bz2.decompress(ufp.read()), dtype='<u2')
-        mults = np.cumsum(-np.array(diffs, dtype='<u4'), dtype='<u4')
-        with open(pfpath, 'wb') as fp:
-            fp.write(mults)
-        return mults
-
-    mults = None
-
-    @classmethod
-    def make_seeds(cls, nthreads, host_seed=None):
-        if cls.mults is None:
-            cls.mults = cls.load_mults()
-        if host_seed:
-            rand = np.random.RandomState(host_seed)
-        else:
-            rand = np.random
-
-        # Create the seed structures. TODO: check that struct is 4-byte aligned
-        seeds = np.empty((3, nthreads), dtype=np.uint32, order='F')
-        seeds[0][:] = cls.mults[:nthreads]
-
-        # Excludes 0xffffffff for 32-bit compatibility with laziness
-        seeds[1] = rand.randint(1, 0x7fffffff, size=nthreads)
-        seeds[2] = rand.randint(1, 0x7fffffff, size=nthreads)
-
-        return seeds
-
-class MWCTest(HunkOCode):
-    defs = """
+mwctestlib = devlib(deps=[mwclib], defs="""
 __global__ void test_mwc(mwc_st *msts, uint64_t *sums, float nrounds) {
     mwc_st rctx = msts[gtid()];
     uint64_t sum = 0;
@@ -87,51 +85,49 @@ __global__ void test_mwc(mwc_st *msts, uint64_t *sums, float nrounds) {
     sums[gtid()] = sum;
     msts[gtid()] = rctx;
 }
-"""
+""")

-    @classmethod
-    def test_mwc(cls, rounds=5000, nblocks=64, blockwidth=512):
+def test_mwc(rounds=5000, nblocks=64, blockwidth=512):
     import pycuda.driver as cuda
     from pycuda.compiler import SourceModule
     import time

     nthreads = blockwidth * nblocks
-    seeds = MWC.make_seeds(nthreads, host_seed = 5)
+    seeds = make_seeds(nthreads, host_seed=42)
     dseeds = cuda.to_device(seeds)
-    mod = SourceModule(assemble_code(BaseCode, MWC, cls))
+    mod = SourceModule(assemble_code(mwctestlib))

     for trial in range(2):
         print "Trial %d, on CPU: " % trial,
         sums = np.zeros(nthreads, dtype=np.uint64)
         ctime = time.time()
         mults = seeds[0].astype(np.uint64)
         states = seeds[1]
         carries = seeds[2]
         for i in range(rounds):
             step = np.frombuffer((mults * states + carries).data,
                     dtype=np.uint32).reshape((2, nthreads), order='F')
             states[:] = step[0]
             carries[:] = step[1]
             sums += states
         ctime = time.time() - ctime
         print "Took %g seconds." % ctime

         print "Trial %d, on device: " % trial,
         dsums = cuda.mem_alloc(8*nthreads)
         fun = mod.get_function("test_mwc")
         dtime = fun(dseeds, dsums, np.float32(rounds),
                     block=(blockwidth,1,1), grid=(nblocks,1),
                     time_kernel=True)
         print "Took %g seconds." % dtime
         dsums = cuda.from_device(dsums, nthreads, np.uint64)
         if not np.all(np.equal(sums, dsums)):
             print "Sum discrepancy!"
             print sums
             print dsums

 if __name__ == "__main__":
     import pycuda.autoinit
-    MWCTest.test_mwc()
+    test_mwc()
cuburn/code/output.py (new file, +34 lines)

@@ -0,0 +1,34 @@
from util import devlib, ringbuflib
from mwc import mwclib
f32tou8lib = devlib(deps=[ringbuflib, mwclib], defs=r'''
// Perform a conversion from float32 values to uint8 ones, applying
// pixel- and channel-independent dithering to reduce suprathreshold banding
// artifacts. Clamps values larger than 1.0f.
// TODO: move to a separate module?
// TODO: less inefficient mwc_st handling?
__global__ void f32_to_u8(
ringbuf *rb, mwc_st *rctxs, uchar4 *dst, const float4 *src,
int gutter, int dstride, int sstride, int height)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= dstride || y >= height) return;
int isrc = sstride * (y + gutter) + x + gutter;
int tid = blockDim.x * threadIdx.y + threadIdx.x;
mwc_st rctx = rctxs[rb_incr(rb->head, tid)];
float4 in = src[isrc];
uchar4 out = make_uchar4(
fminf(1.0f, in.x) * 255.0f + 0.49f * mwc_next_11(rctx),
fminf(1.0f, in.y) * 255.0f + 0.49f * mwc_next_11(rctx),
fminf(1.0f, in.z) * 255.0f + 0.49f * mwc_next_11(rctx),
fminf(1.0f, in.w) * 255.0f + 0.49f * mwc_next_11(rctx)
);
int idst = dstride * y + x;
dst[idst] = out;
    rctxs[rb_incr(rb->tail, tid)] = rctx;
}
''')
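
A NumPy sketch of the same quantization (illustration only, not from the source; the device code truncates after adding up to +/-0.49 of uniform noise per channel, and only clamps from above):

    import numpy as np

    def f32_to_u8_ref(src, rand=np.random):
        # Clamp to [0, 1], scale to bytes, and dither before rounding down.
        dithered = np.minimum(src, 1.0) * 255.0 \
                 + 0.49 * rand.uniform(-1, 1, src.shape)
        return np.clip(dithered, 0, 255).astype(np.uint8)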

@@ -1,55 +1,119 @@
 """
 Provides tools and miscellaneous functions for building device code.
 """
+import os
+import tempfile
+from collections import namedtuple

+import pycuda.driver as cuda
+import pycuda.compiler
 import numpy as np
 import tempita

-def crep(s):
-    """Escape for PTX assembly"""
-    if isinstance(s, unicode):
-        s = s.encode('utf-8')
-    return '"%s"' % s.encode("string_escape")
-
 def argset(obj, **kwargs):
+    """
+    Allow an object with many properties to be set using one call.
+
+    >>> x = argset(X(), a=1, b=2)
+    >>> x.a
+    1
+    """
     for k, v in kwargs.items():
         setattr(obj, k, v)
     return obj

+def launch(name, mod, stream, block, grid, *args, **kwargs):
+    """
+    Oft-used kernel launch helper. Provides a nice boost in readability of
+    densely-packed asynchronous code launches.
+    """
+    fun = mod.get_function(name)
+    if isinstance(block, (int, np.number)):
+        block = (int(block), 1, 1)
+    if isinstance(grid, (int, np.number)):
+        grid = (int(grid), 1)
+    fun(*args, block=block, grid=grid, stream=stream, **kwargs)

+def crep(s):
+    """Multiline literal escape for inline PTX assembly."""
+    if isinstance(s, unicode):
+        s = s.encode('utf-8')
+    return '"%s"' % s.encode("string_escape")

 class Template(tempita.Template):
+    """
+    A Tempita template with extra stuff in the default namespace.
+    """
     default_namespace = tempita.Template.default_namespace.copy()
 Template.default_namespace.update({'np': np, 'crep': crep})

-class HunkOCode(object):
-    """An apparently passive container for device code."""
-    # Use property objects to make these dynamic
-    headers = ''
-    decls = ''
-    defs = ''
+# Passive container for device code.
+DevLib = namedtuple('DevLib', 'deps headers decls defs')

-def assemble_code(*sections):
-    return ''.join([''.join([getattr(sect, kind) for sect in sections])
-                    for kind in ['headers', 'decls', 'defs']])
+def devlib(deps=(), headers='', decls='', defs=''):
+    """Create a library of device code."""
+    # This exists because namedtuple doesn't support default args
+    return DevLib(deps, headers, decls, defs)

-def apply_affine(x, y, xo, yo, packer):
-    return Template("""
-    {{xo}} = {{packer.xx}} * {{x}} + {{packer.xy}} * {{y}} + {{packer.xo}};
-    {{yo}} = {{packer.yx}} * {{x}} + {{packer.yy}} * {{y}} + {{packer.yo}};
-    """).substitute(locals())
+def assemble_code(*libs):
+    seen = set()
+    out = []
+    def go(lib):
+        map(go, lib.deps)
+        code = lib[1:]
+        if code not in seen:
+            seen.add(code)
+            out.append(code)
+    go(stdlib)
+    map(go, libs)
+    return ''.join(sum(zip(*out), ()))
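A hypothetical composition (not from the source) showing how dependencies assemble: each library is emitted once, dependencies first, with all headers, then all decls, then all defs grouped in that order.

    constlib = devlib(decls='__constant__ float two_pi;\n')
    sinlib = devlib(deps=[constlib], defs='''
    __device__ float sin_turns(float x) { return sinf(x * two_pi); }
    ''')
    src = assemble_code(sinlib)   # stdlib, then constlib's decl, then the def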
-class BaseCode(HunkOCode):
-    headers = """
+DEFAULT_CMP_OPTIONS = ('-use_fast_math', '-maxrregcount', '42')
+DEFAULT_SAVE_KERNEL = True

+def compile(name, src, opts=DEFAULT_CMP_OPTIONS, save=DEFAULT_SAVE_KERNEL):
+    """
+    Compile a module and return the cubin. If 'save' is set, a copy of the
+    source and the cubin are written to the temp directory for inspection
+    or display.
+    """
+    dir = tempfile.gettempdir()
+    if save:
+        with open(os.path.join(dir, name + '_kern.cu'), 'w') as fp:
+            fp.write(src)
+    cubin = pycuda.compiler.compile(src, options=list(opts))
+    if save:
+        with open(os.path.join(dir, name + '_kern.cubin'), 'w') as fp:
+            fp.write(cubin)
+    return cubin

+class ClsMod(object):
+    """
+    Convenience class or mixin that automatically compiles and loads a module
+    once per class, saving potentially expensive code regeneration. Only use
+    if your class does not employ run-time code generation.
+    """
+    mod = None
+    # Supply your own DevLib on this property
+    lib = None

+    def __init__(self):
+        super(ClsMod, self).__init__()
+        self.load()

+    @classmethod
+    def load(cls, name=None):
+        if cls.mod is None:
+            if name is None:
+                name = cls.__name__.lower()
+            cubin = compile(name, assemble_code(cls.lib))
+            cls.mod = cuda.module_from_buffer(cubin)
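A hypothetical subclass (not in the source): set 'lib' and the cubin is built once per class, on first instantiation.

    class FillDptr(ClsMod):
        lib = filldptrlib               # defined later in this module
        def go(self, dptr, size, stream=None):
            # self.mod was compiled and loaded by ClsMod.__init__
            fill_dptr(self.mod, dptr, size, stream)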
# This lib is included with *every* assembled module. It contains no global
# functions, so it shouldn't slow down compilation time too much.
stdlib = devlib(headers="""
#include<cuda.h> #include<cuda.h>
#include<stdint.h> #include<stdint.h>
#include<stdio.h> #include<stdio.h>
""" """, decls=r"""
decls = """
float3 rgb2hsv(float3 rgb);
float3 hsv2rgb(float3 hsv);
"""
defs = Template(r"""
#undef M_E #undef M_E
#undef M_LOG2E #undef M_LOG2E
#undef M_LOG10E #undef M_LOG10E
@ -84,33 +148,79 @@ float3 hsv2rgb(float3 hsv);
#define bfe_decl(d, s, o, w) \ #define bfe_decl(d, s, o, w) \
int d; \ int d; \
bfe(d, s, o, w) bfe(d, s, o, w)
""", defs=r'''
// TODO: use launch parameter preconfig to eliminate unnecessary parts __device__ uint32_t gtid() {
__device__
uint32_t gtid() {
return threadIdx.x + blockDim.x * return threadIdx.x + blockDim.x *
(threadIdx.y + blockDim.y * (threadIdx.y + blockDim.y *
(threadIdx.z + blockDim.z * (threadIdx.z + blockDim.z *
(blockIdx.x + (gridDim.x * blockIdx.y)))); (blockIdx.x + (gridDim.x * blockIdx.y))));
} }
__device__ __device__ uint32_t trunca(float f) {
uint32_t trunca(float f) {
// truncate as used in address calculations. note the use of a signed // truncate as used in address calculations. note the use of a signed
// conversion is intentional here (simplifies image bounds checking). // conversion is intentional here (simplifies image bounds checking).
uint32_t ret; uint32_t ret;
asm("cvt.rni.s32.f32 %0, %1;" : "=r"(ret) : "f"(f)); asm("cvt.rni.s32.f32 %0, %1;" : "=r"(ret) : "f"(f));
return ret; return ret;
} }
''')
def mkbinsearchlib(rounds):
    """
    Search through the fixed-size list 'hay' to find the rightmost index
    which contains a value strictly smaller than the input 'needle'. The
    list must be exactly '2^rounds' long, although padding at the top with
    very large numbers or even +inf effectively shortens it.
    """
    # TODO: this doesn't optimize well on a 64-bit arch, not that it's a
    # performance-critical chunk of code or anything
    src = Template(r'''
__device__ int bitwise_binsearch(const float *hay, float needle) {
    int lo = 0;
    {{for i in range(search_rounds-1, -1, -1)}}
    if (needle > hay[lo + {{1 << i}}])
        lo += {{1 << i}};
    {{endfor}}
    return lo;
}
''', 'bitwise_binsearch')
    return devlib(defs=src.substitute(search_rounds=rounds))
# 2^search_rounds is the maximum number of knots allowed in a single spline.
# This includes the four required knots, so a 5 round search supports 28
# interior knots in the domain (0, 1). 2^5 fits nicely on a single cache line.
DEFAULT_SEARCH_ROUNDS = 5
binsearchlib = mkbinsearchlib(DEFAULT_SEARCH_ROUNDS)
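# Host-side reference sketch of the generated search (illustrative only,
# assuming the same padded-to-2**rounds 'hay' layout): it returns the
# rightmost index holding a value strictly smaller than 'needle'.
def _binsearch_host(hay, needle, rounds=DEFAULT_SEARCH_ROUNDS):
    lo = 0
    for i in range(rounds - 1, -1, -1):
        if needle > hay[lo + (1 << i)]:
            lo += 1 << i
    return lo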
filldptrlib = devlib(defs=r'''
__global__ void
fill_dptr(uint32_t* dptr, int size, uint32_t value) {
    int i = (gridDim.x * blockIdx.y + blockIdx.x) * blockDim.x + threadIdx.x;
    if (i < size) {
        dptr[i] = value;
    }
}
''')
def fill_dptr(mod, dptr, size, stream=None, value=np.uint32(0)):
"""
A memory zeroer which can be embedded in a stream, unlike the various
memset routines. Size is the number of 4-byte words in the pointer;
value is the word to fill it with. If value is not an np.uint32, it
will be coerced to a buffer and the first four bytes taken.
"""
if not isinstance(value, np.uint32):
if isinstance(value, int):
value = np.uint32(value)
else:
value = np.frombuffer(buffer(value), np.uint32)[0]
blocks = int(np.ceil(np.sqrt(size / 1024.)))
launch('fill_dptr', mod, stream, (1024, 1, 1), (blocks, blocks),
dptr, np.int32(size), value)
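# Hedged usage sketch tying ClsMod and fill_dptr together (names below are
# illustrative only, and an active CUDA context is required, so it stays
# commented):
#
#   class Zeroer(ClsMod):
#       lib = filldptrlib
#   z = Zeroer()
#   d_buf = cuda.mem_alloc(4 * 4096)
#   fill_dptr(z.mod, d_buf, 4096)                            # zero the words
#   fill_dptr(z.mod, d_buf, 4096, value=np.float32(np.nan))  # or NaN-fill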
writehalflib = devlib(defs=r'''
/* read_half and write_half decode and encode, respectively, two /* read_half and write_half decode and encode, respectively, two
* floating-point values from a 32-bit value (typed as a 'float' for * floating-point values from a 32-bit value (typed as a 'float' for
* convenience but not really). The values are packed into u16s as a * convenience but not really). The values are packed into u16s as a
@ -122,8 +232,8 @@ void fill_dptr(uint32_t* dptr, int size, uint32_t value) {
* approach when the alpha channel is present. * approach when the alpha channel is present.
*/ */
__device__ __device__ void
void read_half(float &x, float &y, float xy, float den) { read_half(float &x, float &y, float xy, float den) {
asm("\n\t{" asm("\n\t{"
"\n\t .reg .u16 x, y;" "\n\t .reg .u16 x, y;"
"\n\t .reg .f32 rc;" "\n\t .reg .f32 rc;"
@ -137,8 +247,8 @@ void read_half(float &x, float &y, float xy, float den) {
: "=f"(x), "=f"(y) : "f"(xy), "f"(den)); : "=f"(x), "=f"(y) : "f"(xy), "f"(den));
} }
__device__ __device__ void
void write_half(float &xy, float x, float y, float den) { write_half(float &xy, float x, float y, float den) {
asm("\n\t{" asm("\n\t{"
"\n\t .reg .u16 x, y;" "\n\t .reg .u16 x, y;"
"\n\t .reg .f32 rc, xf, yf;" "\n\t .reg .f32 rc, xf, yf;"
@ -152,87 +262,41 @@ void write_half(float &xy, float x, float y, float den) {
"\n\t}" "\n\t}"
: "=f"(xy) : "f"(x), "f"(y), "f"(den)); : "=f"(xy) : "f"(x), "f"(y), "f"(den));
} }
''')
def mkringbuflib(rb_size):
    """
    A ringbuffer for access to shared resources.

    Some components, such as the MWC contexts, are expensive to generate,
    and have no affinity to a particular block. Rather than maintain a
    separate copy of each of these objects for every thread block in a
    launch, we want to keep only enough copies of these resources around to
    service every thread block that could possibly be active simultaneously
    on one card, which is often considerably smaller. This function provides
    a simple ringbuffer type and an increment function, used in a couple of
    places to implement this kind of sharing.
    """
    return devlib(headers="#define RB_SIZE_MASK %d" % (rb_size - 1), decls='''
typedef struct {
    int head;
    int tail;
} ringbuf;
''', defs=r'''
__shared__ int rb_idx;
__device__ int rb_incr(int &rb_base, int tidx) {
    if (threadIdx.y == 1 && threadIdx.x == 1)
        rb_idx = 256 * (atomicAdd(&rb_base, 1) & RB_SIZE_MASK);
    __syncthreads();
    return rb_idx + tidx;
}
''')

# For now, the number of entries is fixed to a value known to work on all
# Fermi cards. Autodetection, or perhaps just a global limit increase, will
# be done when I get my hands on a Kepler device. The fixed size assumes
# blocks of 256 threads, although even at that size there are pathological
# cases that could break the assumption.
DEFAULT_RB_SIZE = 64
ringbuflib = mkringbuflib(DEFAULT_RB_SIZE)
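# Host-side provisioning sketch, mirroring what render.Framebuffers does:
# one MWC state per slot in the ring, plus the two-int ringbuf header that
# rb_incr advances on the device (commented; requires a CUDA context):
#
#   d_rb = cuda.to_device(np.array([0, 0], dtype=np.uint32))  # head, tail
#   d_seeds = cuda.to_device(mwc.make_seeds(DEFAULT_RB_SIZE * 256))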

cuburn/filters.py Normal file
@ -0,0 +1,105 @@
import numpy as np
from numpy import float32 as f32, int32 as i32
import pycuda.driver as cuda
import pycuda.compiler
from pycuda.gpuarray import vec
import code.filters
from code.util import ClsMod, argset, launch
class Filter(object):
def apply(self, fb, gnm, dim, tc, stream=None):
"""
Queue the application of this filter. When the live stream finishes
executing the last item enqueued by this method, the result must be
live in the buffer pointed to by ``fb.d_front`` at the return of this
function.
"""
raise NotImplementedError()
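# A minimal sketch of the contract above (hypothetical filter, not part of
# this commit): write front -> back on the given stream, then flip() so the
# result is reachable through ``fb.d_front`` once the stream drains. The
# 'invert' kernel and its devlib are assumed purely for illustration.
#
#   class Invert(Filter, ClsMod):
#       lib = code.filters.invertlib
#       def apply(self, fb, gnm, dim, tc, stream=None):
#           nbins = dim.ah * dim.astride
#           launch('invert', self.mod, stream, 256, nbins / 256,
#                  fb.d_back, fb.d_front, i32(nbins))
#           fb.flip()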
class Bilateral(Filter, ClsMod):
lib = code.filters.bilaterallib
def __init__(self, directions=8, r=15, sstd=6, cstd=0.05,
dstd=1.5, dpow=0.8, gspeed=4.0):
# TODO: expose these parameters on the genome, or at least on the
# profile, and set them by a less ugly mechanism
for n in 'directions r sstd cstd dstd dpow gspeed'.split():
setattr(self, n, locals()[n])
super(Bilateral, self).__init__()
def apply(self, fb, gnm, dim, tc, stream=None):
# Helper variables and functions to keep it clean
sb = 16 * dim.astride
bs = sb * dim.ah
bl, gr = (32, 8, 1), (dim.astride / 32, dim.ah / 8)
mkdsc = lambda c: argset(cuda.ArrayDescriptor(), height=dim.ah,
width=dim.astride, num_channels=c,
format=cuda.array_format.FLOAT)
def mktref(n):
tref = self.mod.get_texref(n)
tref.set_filter_mode(cuda.filter_mode.POINT)
tref.set_address_mode(0, cuda.address_mode.WRAP)
tref.set_address_mode(1, cuda.address_mode.WRAP)
return tref
dsc = mkdsc(4)
tref = mktref('bilateral_src')
grad_dsc = mkdsc(1)
grad_tref = mktref('blur_src')
for pattern in range(self.directions):
# Scale spatial parameter so that a "pixel" is equivalent to an
# actual pixel at 1080p
sstd = self.sstd * dim.w / 1920.
tref.set_address_2d(fb.d_front, dsc, sb)
# Blur density two octaves along sampling vector, ultimately
# storing in the side buffer
launch('den_blur', self.mod, stream, bl, gr,
fb.d_back, i32(pattern), i32(0), texrefs=[tref])
grad_tref.set_address_2d(fb.d_back, grad_dsc, sb / 4)
launch('den_blur_1c', self.mod, stream, bl, gr,
fb.d_side, i32(pattern), i32(1), texrefs=[grad_tref])
grad_tref.set_address_2d(fb.d_side, grad_dsc, sb / 4)
launch('bilateral', self.mod, stream, bl, gr,
fb.d_back, i32(pattern), i32(self.r),
f32(sstd), f32(self.cstd), f32(self.dstd),
f32(self.dpow), f32(self.gspeed),
texrefs=[tref, grad_tref])
fb.flip()
class Logscale(Filter, ClsMod):
lib = code.filters.logscalelib
def apply(self, fb, gnm, dim, tc, stream=None):
"""Log-scale in place."""
k1 = f32(gnm.color.brightness(tc) * 268 / 256)
# Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
# s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w)
k2 = f32(1.0 / (area * gnm.spp(tc)))
nbins = dim.ah * dim.astride
launch('logscale', self.mod, stream, 256, nbins/256,
fb.d_front, fb.d_front, k1, k2)
class ColorClip(Filter, ClsMod):
lib = code.filters.colorcliplib
def apply(self, fb, gnm, dim, tc, stream=None):
# TODO: implement integration over cubic splines?
gam = f32(1 / gnm.color.gamma(tc))
vib = f32(gnm.color.vibrance(tc))
hipow = f32(gnm.color.highlight_power(tc))
lin = f32(gnm.color.gamma_threshold(tc))
lingam = f32(lin ** (gam-1.0) if lin > 0 else 0)
bkgd = vec.make_float3(
gnm.color.background.r(tc),
gnm.color.background.g(tc),
gnm.color.background.b(tc))
nbins = dim.ah * dim.astride
blocks = int(np.ceil(np.sqrt(nbins / 256.)))
launch('colorclip', self.mod, stream, 256, (blocks, blocks),
fb.d_front, gam, vib, hipow, lin, lingam, bkgd, i32(nbins))
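# Why ``lingam``: below the gamma threshold ``lin``, the clip can ramp
# linearly with slope lin**(gam-1), which meets the pure power curve exactly
# at the threshold. (Exactly how the kernel applies the elbow is an
# assumption here, but the algebra holds.)
#
#   gam, lin = 0.25, 0.01            # genome gamma 4.0, threshold 0.01
#   lingam = lin ** (gam - 1.0)      # == lin**gam / lin
#   lin * lingam == lin ** gam       # the two segments meet at x == lin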

cuburn/output.py Normal file
@ -0,0 +1,49 @@
import numpy as np
from numpy import float32 as f32, int32 as i32
import pycuda.driver as cuda
from code.util import ClsMod, launch
from code.output import f32tou8lib
import scipy.misc
if not hasattr(scipy.misc, 'toimage'):
raise ImportError("Could not find scipy.misc.toimage. "
"Are scipy and PIL installed?")
class Output(object):
def convert(self, fb, gnm, dim, stream=None):
"""
Convert a filtered buffer to whatever output format is needed by the
writer.
"""
raise NotImplementedError()
def copy(self, fb, dim, pool, stream=None):
"""
Schedule a copy from the device buffer to host memory, returning the
target buffer.
"""
raise NotImplementedError()
class PILOutput(Output, ClsMod):
lib = f32tou8lib
def convert(self, fb, gnm, dim, stream=None):
launch('f32_to_u8', self.mod, stream,
(32, 8, 1), (int(np.ceil(dim.w/32.)), int(np.ceil(dim.h/8.))),
fb.d_rb, fb.d_seeds, fb.d_back, fb.d_front,
i32(fb.gutter), i32(dim.w), i32(dim.astride), i32(dim.h))
def copy(self, fb, dim, pool, stream=None):
h_out = pool.allocate((dim.h, dim.w, 4), 'u1')
cuda.memcpy_dtoh_async(h_out, fb.d_back, stream)
return h_out
@staticmethod
def save(buf, name, type=None, quality=98):
if type == 'jpeg' or (type is None and name.endswith('.jpg')):
buf = buf[:,:,:3]
img = scipy.misc.toimage(buf, cmin=0, cmax=1)
img.save(name, type, quality=quality)
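# Hedged round-trip sketch (names illustrative; assumes a filtered frame in
# fb.d_front and an active stream): convert packs the f32 front buffer into
# u8 in the back buffer, copy schedules the async dtoh transfer, and save
# writes the file once the stream has drained.
#
#   out = PILOutput()
#   out.convert(fb, gnm, dim, stream)
#   h_buf = out.copy(fb, dim, pool, stream)
#   stream.synchronize()
#   PILOutput.save(h_buf, 'frame_00001.jpg')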

cuburn/render.py
@ -1,383 +1,372 @@
"""
Resources and tools to perform rendering.
"""
import os
import sys
import re
import time
import tempfile
from collections import namedtuple

import numpy as np
from numpy import float32 as f32, int32 as i32, uint32 as u32, uint64 as u64

import pycuda.driver as cuda
import pycuda.tools

import filters
import output
from code import util, mwc, iter, interp, sort
from code.util import ClsMod, devlib, filldptrlib, assemble_code, launch

RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time')
Dimensions = namedtuple('Dimensions', 'w h aw ah astride')
class Framebuffers(object):
    """
    The largest memory allocations, and a stream to serialize their use.

    ``d_front`` and ``d_back`` are separate buffers, each large enough to
    hold four float32 components per pixel (including any gutter pixels
    added for alignment or padding). ``d_side`` is another buffer large
    enough to hold two float32 components per pixel.

    Every user of this set of buffers may use and overwrite the buffers in
    any way, as long as the output for the next stage winds up in the front
    buffer. The front and back buffers can be exchanged by the ``flip()``
    method (which simply exchanges the pointers); while no similar method
    exists for the side buffer, you're free to do the same by taking local
    copies of the references and exchanging them yourself.

    There's one spot in the stream interleaving where the behavior is
    different: the ``Output.convert`` call must store its output to the back
    buffer, which will remain untouched until the dtoh copy of the converted
    buffer is finished. This happens while the ``iter`` kernel of the next
    frame writes to the front and side buffers, which means in practice that
    there's essentially no waiting on any buffers.

    If an output module decides to get all krazy and actually replaces the
    references to the buffers on this object - to, say, implement a
    temporally-aware tone-mapping or denoising pass - that's probably okay,
    but just make sure it appears like it's expected to.
    """
    # Minimum extension of accumulation buffer beyond output region. Used to
    # alleviate edge effects during filtering. Actual gutter may be larger
    # to accommodate alignment requirements; when it is, that extension will
    # be applied to the lower-right corner of the buffer. This is
    # asymmetrical, but simplifies trimming logic when it's time for that.
    gutter = 10

    @classmethod
    def calc_dim(cls, width, height):
        """
        Given a width and height, return a valid set of dimensions which
        include at least enough gutter to exceed the minimum, and where
        (astride % 32) == 0 and (acc_height % 8) == 0.
        """
        awidth = width + 2 * cls.gutter
        aheight = 8 * int(np.ceil((height + 2 * cls.gutter) / 8.))
        astride = 32 * int(np.ceil(awidth / 32.))
        return Dimensions(width, height, awidth, aheight, astride)
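    # Worked example (informative only): calc_dim(1920, 1080) yields
    #   awidth  = 1920 + 20             = 1940
    #   aheight = 8 * ceil(1100 / 8.)   = 1104   (4 extra rows of gutter)
    #   astride = 32 * ceil(1940 / 32.) = 1952   (12 extra px of stride)
    #   -> Dimensions(w=1920, h=1080, aw=1940, ah=1104, astride=1952)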
def __init__(self):
self.stream = cuda.Stream()
self._clear()
# These resources rely on the slots/ringbuffer mechanism for sharing,
# and so can be shared across any number of launches, genomes, and
# render kernels. Notably, seeds are self-synchronizing, so they're not
# attached to either stream object.
self.d_rb = cuda.to_device(np.array([0, 0], dtype=u32))
seeds = mwc.make_seeds(util.DEFAULT_RB_SIZE * 256)
self.d_seeds = cuda.to_device(seeds)
self._len_d_points = util.DEFAULT_RB_SIZE * 256 * 16
self.d_points = cuda.mem_alloc(self._len_d_points)
def _clear(self):
self.nbins = self.d_front = self.d_back = self.d_side = None
    def free(self, stream=None):
        if stream is not None:
            stream.synchronize()
        else:
            cuda.Context.synchronize()
for p in (self.d_front, self.d_back, self.d_side):
if p is not None:
p.free()
self._clear()
def alloc(self, dim, stream=None):
"""
Ensure that this object's framebuffers are large enough to handle the
given dimensions, allocating new ones if not.
If ``stream`` is not None and a reallocation is necessary, the stream
will be synchronized before the old buffers are deallocated.
"""
nbins = dim.ah * dim.astride
if self.nbins >= nbins: return
if self.nbins is not None: self.free()
try:
self.d_front = cuda.mem_alloc(16 * nbins)
self.d_back = cuda.mem_alloc(16 * nbins)
self.d_side = cuda.mem_alloc(8 * nbins)
self.nbins = nbins
except cuda.MemoryError, e:
# If a frame that's too large sneaks by the task distributor, we
# don't want to kill the server, but we also don't want to leave
# it stuck without any free memory to complete the next alloc.
# TODO: measure free mem and only take tasks that fit (but that
# should be done elsewhere)
self.free(stream)
raise e
def set_dim(self, width, height, stream=None):
"""
Compute padded dimensions for given width and height, ensure that the
buffers are large enough (and reallocate if not), and return the
calculated dimensions.
Note that the returned dimensions are always the same for a given
width, height, and minimum gutter, even if the underlying buffers are
larger due to a previous allocation.
"""
dim = self.calc_dim(width, height)
self.alloc(dim, stream)
return dim
def flip(self):
"""Flip the front and back buffers."""
self.d_front, self.d_back = self.d_back, self.d_front
class DevSrc(object):
""" """
The buffers which represent a genome on-device, in the formats needed to
serve as a source for interpolating temporal samples.
"""
# Maximum number of knots per parameter. This also covers the maximum
# number of palettes allowed.
max_knots = 1 << util.DEFAULT_SEARCH_ROUNDS
# Maximum number of parameters per genome. This number is exceedingly
# high, and should never even come close to being hit.
max_params = 1024
def __init__(self):
self.d_times = cuda.mem_alloc(4 * self.max_knots * self.max_params)
self.d_knots = cuda.mem_alloc(4 * self.max_knots * self.max_params)
self.d_ptimes = cuda.mem_alloc(4 * self.max_knots)
self.d_pals = cuda.mem_alloc(4 * 4 * 256 * self.max_knots)
class DevInfo(object):
"""
The buffers which hold temporal samples on-device, as used by iter.
"""
# The palette texture/surface covers the color coordinate from [0,1] with
# equidistant horizontal samples, and spans the temporal range of the
# frame linearly with this many rows. Increasing these values increases the
# number of uniquely-dithered samples when using pre-dithered surfaces, as
# is done in 'atomic' accumulation.
palette_width = 256 # TODO: make this setting be respected
palette_height = 64
# This used to be determined automagically, but simultaneous occupancy
# and a much smaller block size simplifies this.
ntemporal_samples = 1024
    # Number of iterations to iterate without write after generating a new
    # point. This number is currently fixed pretty deeply in the set of magic
    # constants which govern buffer sizes; changing the value here won't
    # actually change the code on the device to do something different.
    # It's here just for documentation purposes.
    fuse = 256

    def __init__(self):
        self.d_params = cuda.mem_alloc(
                self.ntemporal_samples * DevSrc.max_params * 4)
        self.palette_surf_dsc = util.argset(cuda.ArrayDescriptor3D(),
                height=self.palette_height, width=self.palette_width,
                depth=0, format=cuda.array_format.SIGNED_INT32,
                num_channels=2, flags=cuda.array3d_flags.SURFACE_LDST)
        self.d_pal_array = cuda.Array(self.palette_surf_dsc)
class Renderer(object):
    def __init__(self, gnm):
        self.packer, self.lib = iter.mkiterlib(gnm)
        cubin = util.compile('iter', assemble_code(self.lib))
        self.mod = cuda.module_from_buffer(cubin)

        # TODO: make these customizable
        self.filts = [ filters.Bilateral()
                     , filters.Logscale()
                     , filters.ColorClip() ]
        self.out = output.PILOutput()

class RenderManager(ClsMod):
    lib = devlib(deps=[interp.palintlib, filldptrlib, iter.flushatomlib])

    def __init__(self):
        super(RenderManager, self).__init__()
        self.pool = pycuda.tools.PageLockedMemoryPool()
        self.fb = Framebuffers()
        self.src_a, self.src_b = DevSrc(), DevSrc()
        self.info_a, self.info_b = DevInfo(), DevInfo()
        self.stream_a, self.stream_b = cuda.Stream(), cuda.Stream()
        self.filt_evt = self.copy_evt = None
    def _copy(self, rdr, gnm):
        """
        Queue a copy of a host genome into a set of device interpolation
        source buffers.

        Note that for now, this is broken! It ignores ``gnm``, and only
        packs the genome that was used when creating the renderer.
        """
        times, knots = rdr.packer.pack(self.pool)
        cuda.memcpy_htod_async(self.src_a.d_times, times, self.stream_a)
        cuda.memcpy_htod_async(self.src_a.d_knots, knots, self.stream_a)

        ptimes, pidxs = zip(*gnm.palette_times)
        palettes = self.pool.allocate((len(ptimes), 256, 4), f32)
        palettes[:] = [gnm.decoded_palettes[i] for i in pidxs]
        palette_times = self.pool.allocate((self.src_a.max_knots,), f32)
        palette_times.fill(1e9)
        palette_times[:len(ptimes)] = ptimes
        cuda.memcpy_htod_async(self.src_a.d_pals, palettes, self.stream_a)
        cuda.memcpy_htod_async(self.src_a.d_ptimes, palette_times,
                               self.stream_a)

        # TODO: use bilerp tex as src for palette interp
    def _interp(self, rdr, gnm, dim, ts, td):
        d_acc_size = rdr.mod.get_global('acc_size')[0]
        p_dim = self.pool.allocate((len(dim),), u32)
        p_dim[:] = dim
        cuda.memcpy_htod_async(d_acc_size, p_dim, self.stream_a)

        tref = self.mod.get_surfref('flatpal')
        tref.set_array(self.info_a.d_pal_array, 0)
        launch('interp_palette_flat', self.mod, self.stream_a,
                256, self.info_a.palette_height,
                self.fb.d_rb, self.fb.d_seeds,
                self.src_a.d_ptimes, self.src_a.d_pals,
                f32(ts), f32(td / self.info_a.palette_height))

        nts = self.info_a.ntemporal_samples
        launch('interp_iter_params', rdr.mod, self.stream_a,
                256, np.ceil(nts / 256.),
                self.info_a.d_params, self.src_a.d_times, self.src_a.d_knots,
                f32(ts), f32(td / nts), i32(nts))

    def _print_interp_knots(self, rdr, tsidx=5):
        infos = cuda.from_device(self.info_a.d_params,
                (tsidx + 1, self.info_a.max_params), f32)
        for i, n in zip(infos[-1], rdr.packer.packed):
            print '%60s %g' % ('_'.join(n), i)
def _iter(self, rdr, gnm, dim, tc):
tref = rdr.mod.get_surfref('flatpal')
tref.set_array(self.info_a.d_pal_array, 0)
nbins = dim.ah * dim.astride
fill = lambda b, s, v=i32(0): util.fill_dptr(
self.mod, b, s, stream=self.stream_a, value=v)
fill(self.fb.d_front, 4 * nbins)
fill(self.fb.d_side, 2 * nbins)
fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan))
nts = self.info_a.ntemporal_samples
nsamps = (gnm.spp(tc) * dim.w * dim.h)
nrounds = int(nsamps / (nts * 256. * 256)) + 1
launch('iter', rdr.mod, self.stream_a, (32, 8, 1), (nts, nrounds),
self.fb.d_front, self.fb.d_side,
self.fb.d_rb, self.fb.d_seeds, self.fb.d_points,
self.info_a.d_params)
nblocks = int(np.ceil(np.sqrt(dim.ah*dim.astride/256.)))
launch('flush_atom', self.mod, self.stream_a,
256, (nblocks, nblocks),
u64(self.fb.d_front), u64(self.fb.d_side), i32(nbins))
    def queue_frame(self, rdr, gnm, tc, w, h, copy=True):
        """
        Queue one frame for rendering.

        ``rdr`` is a compiled Renderer module. The caller must ensure that
        the module is compatible with the genome data provided.

        ``gnm`` is the genome to be rendered.

        ``tc`` is the center time at which to render.

        ``w``, ``h`` are the width and height of the desired output in px.

        If ``copy`` is False, the genome data will not be recopied for each
        new genome. This function must be called with ``copy=True`` the
        first time a new genome is used, and may be called in that manner
        subsequently without harm. I suspect the performance impact is low,
        so leave ``copy`` set to True every time for now.

        The return value is a 2-tuple ``(evt, h_out)``, where ``evt`` is a
        CUDA event and ``h_out`` is the return value of the output module's
        ``copy`` function. In the typical case, ``h_out`` will be a host
        allocation containing data in an appropriate format for the output
        module's file writer, and ``evt`` indicates when the asynchronous
        DMA copy which will populate ``h_out`` is complete. This can vary
        depending on the output module in use, though.
        """
        # Note: we synchronize on the previous stream if buffers need to be
        # reallocated, which implicitly also syncs the current stream.
        dim = self.fb.set_dim(w, h, self.stream_b)

        td = gnm.adj_frame_width(tc)
        ts, te = tc - 0.5 * td, tc + 0.5 * td

        # The stream interleaving here is nontrivial.
        # TODO: update diagram and link to it here
        if copy:
            self.src_a, self.src_b = self.src_b, self.src_a
            self._copy(rdr, gnm)
        self._interp(rdr, gnm, dim, ts, td)

        if self.filt_evt:
            self.stream_a.wait_for_event(self.filt_evt)
        self._iter(rdr, gnm, dim, tc)

        if self.copy_evt:
            self.stream_a.wait_for_event(self.copy_evt)
        for filt in rdr.filts:
            filt.apply(self.fb, gnm, dim, tc, self.stream_a)
        rdr.out.convert(self.fb, gnm, dim, self.stream_a)
        self.filt_evt = cuda.Event().record(self.stream_a)
        h_out = rdr.out.copy(self.fb, dim, self.pool, self.stream_a)
        self.copy_evt = cuda.Event().record(self.stream_a)

        self.info_a, self.info_b = self.info_b, self.info_a
        self.stream_a, self.stream_b = self.stream_b, self.stream_a
        return self.copy_evt, h_out
    def render(self, gnm, times, w, h):
        """
        A port of the old rendering function, retained for backwards
        compatibility. Some of this will be pulled into as-yet-undecided
        methods for more DRY.
        """
        rdr = Renderer(gnm)
        last_evt = cuda.Event().record(self.stream_a)
        last_idx = None
        def wait():  # Times like these where you wish for a macro
            while not last_evt.query():
                time.sleep(0.01)
            gpu_time = last_evt.time_since(two_evts_ago)
            return RenderedImage(last_buf, last_idx, gpu_time)
        for idx, tc in times:
            evt, h_buf = self.queue_frame(rdr, gnm, tc, w, h,
                                          last_idx is None)
            if last_idx:
                yield wait()
            two_evts_ago, last_evt = last_evt, evt
            last_buf, last_idx = h_buf, idx
        if last_idx:
            yield wait()

main.py
@ -3,8 +3,8 @@
# cuburn, one of a surprisingly large number of ports of the fractal flame
# algorithm to NVIDIA GPUs.
#
# This one is copyright 2010-2012, Steven Robertson <steven@strobe.cc>
# and Erik Reckase <e.reckase@gmail.com>.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 or later
@ -20,11 +20,9 @@ from subprocess import Popen
from itertools import ifilter

import numpy as np
import pycuda.driver as cuda

from cuburn import genome, render, filters, output
profiles = {
    '1080p': dict(fps=24, width=1920, height=1080, quality=3000, skip=0),
@ -33,11 +31,10 @@ profiles = {
    'preview': dict(fps=24, width=640, height=360, quality=800, skip=1)
}
def save(out):
    # Temporary! TODO: fix this
    output.PILOutput.save(out.buf, out.idx)
    print out.idx, out.gpu_time
def main(args, prof):
    import pycuda.autoinit
@ -53,9 +50,7 @@ def main(args, prof):
    gnm = genome.Genome(gnm)
    err, times = gnm.set_profile(prof)
    rmgr = render.RenderManager()

    basename = os.path.basename(args.flame.name).rsplit('.', 1)[0] + '_'
    if args.flame.name == '-':
@ -76,61 +71,61 @@ def main(args, prof):
                 if not os.path.isfile(f[0]) or m > os.path.getmtime(f[0]))

    w, h = prof['width'], prof['height']
    gen = rmgr.render(gnm, frames, w, h)
    if not args.gfx:
        for out in gen:
            save(out)
        return
import pyglet
window = pyglet.window.Window(w, h, vsync=False)
image = pyglet.image.CheckerImagePattern().create_image(w, h)
label = pyglet.text.Label('Rendering first frame', x=5, y=h-5,
width=w, anchor_y='top', font_size=16,
bold=True, multiline=True)
@window.event
def on_draw():
window.clear()
image.texture.blit(0, 0)
label.draw()
@window.event
def on_key_press(sym, mod):
if sym == pyglet.window.key.Q:
pyglet.app.exit()
@window.event
def on_mouse_motion(x, y, dx, dy):
pass
last_time = [time.time()]
def poll(dt):
out = next(gen, False)
if out is False:
if args.pause:
label.text = "Done. ('q' to quit)"
#pyglet.clock.unschedule(poll)
else:
pyglet.app.exit()
elif out is not None:
real_dt = time.time() - last_time[0]
last_time[0] = time.time()
save(out)
            imgbuf = out.buf.flatten()
image.set_data('RGBA', -w*4, imgbuf.tostring())
label.text = '%s (%g fps)' % (out.idx, 1./real_dt)
else:
label.text += '.'
if args.sync:
cuda.Context.synchronize()
pyglet.clock.set_fps_limit(30)
pyglet.clock.schedule_interval(poll, 1/30.)
pyglet.app.run()
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Render fractal flames.')
@ -150,8 +145,6 @@ if __name__ == "__main__":
    parser.add_argument('--pause', action='store_true',
            help="Don't close the preview window after rendering is finished")
    parser.add_argument('--sync', action='store_true', dest='sync',
            help='Use synchronous launches whenever possible')


@ -17,6 +17,8 @@ import redis
from cuburn import render, genome
import pycuda.driver as cuda

pycuda = None
# The default maximum number of waiting jobs. Also used to determine when a # The default maximum number of waiting jobs. Also used to determine when a
@ -50,20 +52,6 @@ def get_temperature():
            return out[idx+1:idx+3]
    return ''
def work(server):
    global pycuda
    import pycuda.autoinit
@ -80,7 +68,11 @@ def work(server):
    r.expire(wid, 180)
    last_ping = time.time()

    idx = evt = buf = None
    last_idx = last_buf = last_evt = two_evts_ago = None
    last_pid = last_gid = rdr = None
    mgr = render.RenderManager()

    while True:
        task = r.blpop('renderpool:' + rev + ':queue', 10)
@ -90,28 +82,46 @@ def work(server):
            r.expire(wid, 180)
            last_ping = now
        # last_evt will be populated during normal queue operation (when evt
        # contains the most recent event), as well as when the render queue
        # is flushing due to not receiving a new task before the timeout.
        if last_idx is not None:
            while not last_evt.query():
                # This delay could probably be a lot higher with zero impact
                # on throughput for Fermi cards
                time.sleep(0.05)

            sid, sidx, ftag = last_idx
            obuf = StringIO()
            rdr.out.save(buf, obuf, 'jpeg')
            obuf.seek(0)
            gpu_time = last_evt.time_since(two_evts_ago)
            head = ' '.join([sidx, str(gpu_time), ftag])
            r.rpush(sid + ':queue', head + '\0' + obuf.read())
            print 'Pushed frame: %s' % head

        two_evts_ago, last_evt = last_evt, evt
        last_idx, last_buf = idx, buf

        if not task:
            idx = evt = buf = None
            continue

        sid, sidx, pid, gid, ftime, ftag = task[1].split(' ', 5)
        if pid != last_pid or gid != last_gid or not rdr:
            gnm = genome.Genome(json.loads(r.get(gid)))
            prof = json.loads(r.get(pid))
            gnm.set_profile(prof)
            rdr = render.Renderer(gnm)
            last_pid, last_gid = pid, gid

        if last_evt is None:
            # Create a dummy event for timing
            last_evt = cuda.Event().record(mgr.stream_a)

        copy = last_idx is None
        w, h = prof['width'], prof['height']
        evt, buf = mgr.queue_frame(rdr, gnm, float(ftime), w, h, copy)
        idx = sid, sidx, ftag
def iter_genomes(prof, gpaths, pname='540p'):
    """