From 60a45c9a209058d1a896d15e99f9c78dd2e6c865 Mon Sep 17 00:00:00 2001 From: Steven Robertson Date: Tue, 14 Feb 2012 07:40:58 -0800 Subject: [PATCH] Sweeping refactor. More bugs undoubtedly remain. --- cuburn/code/__init__.py | 9 - cuburn/code/color.py | 77 +++ cuburn/code/{filtering.py => filters.py} | 313 ++++------- cuburn/code/interp.py | 219 ++------ cuburn/code/iter.py | 339 +++--------- cuburn/code/mwc.py | 164 +++--- cuburn/code/output.py | 34 ++ cuburn/code/util.py | 308 ++++++----- cuburn/filters.py | 105 ++++ cuburn/output.py | 49 ++ cuburn/render.py | 659 +++++++++++------------ main.py | 127 +++-- worker.py | 66 ++- 13 files changed, 1170 insertions(+), 1299 deletions(-) create mode 100644 cuburn/code/color.py rename cuburn/code/{filtering.py => filters.py} (60%) create mode 100644 cuburn/code/output.py create mode 100644 cuburn/filters.py create mode 100644 cuburn/output.py diff --git a/cuburn/code/__init__.py b/cuburn/code/__init__.py index 949d19a..e69de29 100644 --- a/cuburn/code/__init__.py +++ b/cuburn/code/__init__.py @@ -1,9 +0,0 @@ -""" -Contains the PTX fragments which will drive the device, and helper functions -to combine those fragments. -""" - -import util -import mwc -import iter - diff --git a/cuburn/code/color.py b/cuburn/code/color.py new file mode 100644 index 0000000..046bc84 --- /dev/null +++ b/cuburn/code/color.py @@ -0,0 +1,77 @@ +import numpy as np + +from util import devlib + +# The JPEG YUV full-range matrix, without bias into the positve regime. +# This assumes input color space is CIERGB D65, encoded with gamma 2.2. +# Note that some interpolated colors may exceed the sRGB and YUV gamuts. +YUV_MATRIX = np.matrix([[ 0.299, 0.587, 0.114], + [-0.168736, -0.331264, 0.5], + [ 0.5, -0.418688, -0.081312]]) + +yuvlib = devlib(defs=''' +__device__ float3 rgb2yuv(float3 rgb); +__device__ float3 yuv2rgb(float3 yuv); +''', decls=r''' +/* This conversion uses the JPEG full-range standard. Note that UV have range + * [-0.5, 0.5], so consider biasing the results. */ +__device__ float3 rgb2yuv(float3 rgb) { + return make_float3( + 0.299f * rgb.x + 0.587f * rgb.y + 0.114f * rgb.z, + -0.168736f * rgb.x - 0.331264f * rgb.y + 0.5f * rgb.z, + 0.5f * rgb.x - 0.418688f * rgb.y - 0.081312f * rgb.z); +} + +__device__ float3 yuv2rgb(float3 yuv) { + return make_float3( + yuv.x + 1.402f * yuv.z, + yuv.x - 0.34414f * yuv.y - 0.71414f * yuv.z, + yuv.x + 1.772f * yuv.y); +} +''') + +hsvlib = devlib(decls=''' +__device__ float3 rgb2hsv(float3 rgb); +__device__ float3 hsv2rgb(float3 hsv); +''', defs=r''' +__device__ float3 rgb2hsv(float3 rgb) { + float M = fmaxf(fmaxf(rgb.x, rgb.y), rgb.z); + float m = fminf(fminf(rgb.x, rgb.y), rgb.z); + float C = M - m; + + float s = M > 0.0f ? 
C / M : 0.0f; + + float h = 0.0f; + if (s != 0.0f) { + C = 1.0f / C; + float rc = (M - rgb.x) * C; + float gc = (M - rgb.y) * C; + float bc = (M - rgb.z) * C; + + if (rgb.x == M) h = bc - gc; + else if (rgb.y == M) h = 2 + rc - bc; + else h = 4 + gc - rc; + + if (h < 0) h += 6.0f; + } + return make_float3(h, s, M); +} + +__device__ float3 hsv2rgb(float3 hsv) { + float whole = floorf(hsv.x); + float frac = hsv.x - whole; + float val = hsv.z; + float min = val * (1 - hsv.y); + float mid = val * (1 - (hsv.y * frac)); + float alt = val * (1 - (hsv.y * (1 - frac))); + + float3 out; + if (whole == 0.0f) { out.x = val; out.y = alt; out.z = min; } + else if (whole == 1.0f) { out.x = mid; out.y = val; out.z = min; } + else if (whole == 2.0f) { out.x = min; out.y = val; out.z = alt; } + else if (whole == 3.0f) { out.x = min; out.y = mid; out.z = val; } + else if (whole == 4.0f) { out.x = alt; out.y = min; out.z = val; } + else { out.x = val; out.y = min; out.z = mid; } + return out; +} +''') diff --git a/cuburn/code/filtering.py b/cuburn/code/filters.py similarity index 60% rename from cuburn/code/filtering.py rename to cuburn/code/filters.py index 6bcba88..ae2d44f 100644 --- a/cuburn/code/filtering.py +++ b/cuburn/code/filters.py @@ -1,13 +1,7 @@ -import numpy as np -from numpy import float32 as f32, int32 as i32 +from util import devlib +from color import yuvlib -import pycuda.driver as cuda -import pycuda.compiler -from pycuda.gpuarray import vec - -from cuburn.code.util import * - -_CODE = r''' +texshearlib = devlib(defs=r''' // Filter directions specified in degrees, using image/texture addressing // [(0,0) is upper left corner, 90 degrees is down]. @@ -22,7 +16,9 @@ __constant__ float2 addressing_patterns[16] = { { 1.0f, -0.333333f}, { 0.333333f, 1.0f}, // 14, 15: -15, 75 }; -// Mon dieu! A C++ feature? +// Mon dieu! A C++ feature? Gotta to close the "extern C" added by the compiler. +} + template __device__ T tex_shear(texture ref, int pattern, float x, float y, float radius) { @@ -39,100 +35,14 @@ tex_shear(texture ref, int pattern, } extern "C" { +''') -__global__ -void colorclip(float4 *pixbuf, float gamma, float vibrance, float highpow, - float linrange, float lingam, float3 bkgd, - int fbsize, int blend_background_color) { - int i = threadIdx.x + blockDim.x * (blockIdx.x + gridDim.x * blockIdx.y); - if (i >= fbsize) return; - - float4 pix = pixbuf[i]; - - if (pix.w <= 0) { - pixbuf[i] = make_float4(bkgd.x, bkgd.y, bkgd.z, 0.0f); - return; - } - pix.y -= 0.5f * pix.w; - pix.z -= 0.5f * pix.w; - float3 tmp = yuv2rgb(make_float3(pix.x, pix.y, pix.z)); - pix.x = tmp.x; - pix.y = tmp.y; - pix.z = tmp.z; - - pix.x = fmaxf(0.0f, pix.x); - pix.y = fmaxf(0.0f, pix.y); - pix.z = fmaxf(0.0f, pix.z); - - float4 opix = pix; - - float alpha = powf(pix.w, gamma); - if (pix.w < linrange) { - float frac = pix.w / linrange; - alpha = (1.0f - frac) * pix.w * lingam + frac * alpha; - } - - if (!blend_background_color) { - float ls = alpha / pix.w; - pix.x *= ls; - pix.y *= ls; - pix.z *= ls; - pix.w = alpha; - pixbuf[i] = pix; - return; - } - - float ls = vibrance * alpha / pix.w; - alpha = fminf(1.0f, fmaxf(0.0f, alpha)); - - float maxc = fmaxf(pix.x, fmaxf(pix.y, pix.z)); - float maxa = maxc * ls; - float newls = 1.0f / maxc; - - if (maxa > 1.0f && highpow >= 0.0f) { - float lsratio = powf(newls / ls, highpow); - pix.x *= newls; - pix.y *= newls; - pix.z *= newls; - - // Reduce saturation (according to the HSV model) by proportionally - // increasing the values of the other colors. 
- pix.x = maxc - (maxc - pix.x) * lsratio; - pix.y = maxc - (maxc - pix.y) * lsratio; - pix.z = maxc - (maxc - pix.z) * lsratio; - } else { - float adjhlp = -highpow; - if (adjhlp > 1.0f || maxa <= 1.0f) adjhlp = 1.0f; - if (maxc > 0.0f) { - float adj = ((1.0f - adjhlp) * newls + adjhlp * ls); - pix.x *= adj; - pix.y *= adj; - pix.z *= adj; - } - } - - pix.x += (1.0f - vibrance) * powf(opix.x, gamma); - pix.y += (1.0f - vibrance) * powf(opix.y, gamma); - pix.z += (1.0f - vibrance) * powf(opix.z, gamma); - - pix.x += (1.0f - alpha) * bkgd.x; - pix.y += (1.0f - alpha) * bkgd.y; - pix.z += (1.0f - alpha) * bkgd.z; - - pix.x = fminf(1.0f, pix.x); - pix.y = fminf(1.0f, pix.y); - pix.z = fminf(1.0f, pix.z); - pix.w = alpha; - - pixbuf[i] = pix; -} - -__global__ -void logscale(float4 *outbuf, const float4 *pixbuf, float k1, float k2) { +logscalelib = devlib(defs=r''' +__global__ void +logscale(float4 *outbuf, const float4 *pixbuf, float k1, float k2) { int i = blockDim.x * blockIdx.x + threadIdx.x; float4 pix = pixbuf[i]; - // float ls = fmaxf(0, k1 * logf(1.0f + pix.w * pix.w * k2 / (1 + pix.w)) / pix.w); float ls = fmaxf(0, k1 * logf(1.0f + pix.w * k2) / pix.w); pix.x *= ls; pix.y *= ls; @@ -141,10 +51,12 @@ void logscale(float4 *outbuf, const float4 *pixbuf, float k1, float k2) { outbuf[i] = pix; } +''') +fmabuflib = devlib(defs=r''' // Element-wise computation of ``dst[i]=dst[i]+src[i]*scale``. -__global__ -void fma_buf(float4 *dst, const float4 *src, int astride, float scale) { +__global__ void +fma_buf(float4 *dst, const float4 *src, int astride, float scale) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; int i = y * astride + x; @@ -155,23 +67,16 @@ void fma_buf(float4 *dst, const float4 *src, int astride, float scale) { d.w += s.w * scale; dst[i] = d; } +''') -texture bilateral_src; +denblurlib = devlib(decls=''' texture blur_src; -__global__ void logscale_den(float *dst, float k2) { - int xi = blockIdx.x * blockDim.x + threadIdx.x; - int yi = blockIdx.y * blockDim.y + threadIdx.y; - float4 pix = tex2D(bilateral_src, xi, yi); - float out = logf(1.0f + pix.w * k2); - dst[yi * (blockDim.x * gridDim.x) + xi] = out; -} - __constant__ float gauss_coefs[9] = { 0.00443305f, 0.05400558f, 0.24203623f, 0.39905028f, 0.24203623f, 0.05400558f, 0.00443305f }; - +''', defs=r''' // Apply a Gaussian-esque blur to the density channel of the texture in // ``bilateral_src`` in the horizontal direction, and write it to ``dst``, a // one-channel buffer. @@ -203,7 +108,11 @@ __global__ void den_blur_1c(float *dst, int pattern, int upsample) { * gauss_coefs[i]; dst[yi * (blockDim.x * gridDim.x) + xi] = den; } +''') +bilaterallib = devlib(deps=[logscalelib, texshearlib, denblurlib], decls=''' +texture bilateral_src; +''', defs=r''' /* sstd: spatial standard deviation (Gaussian filter) * cstd: color standard deviation (Gaussian on the range [0, 1], where 1 * represents an "opposite" color). @@ -215,10 +124,9 @@ __global__ void den_blur_1c(float *dst, int pattern, int upsample) { * gspeed: Speed of (exp2f) Gompertz distribution governing how much to * tighten gradients. Zero and negative values OK. 
*/ - -__global__ -void bilateral(float4 *dst, int pattern, int radius, - float sstd, float cstd, float dstd, float dpow, float gspeed) +__global__ void +bilateral(float4 *dst, int pattern, int radius, + float sstd, float cstd, float dstd, float dpow, float gspeed) { int xi = blockIdx.x * blockDim.x + threadIdx.x; int yi = blockIdx.y * blockDim.y + threadIdx.y; @@ -317,124 +225,83 @@ void bilateral(float4 *dst, int pattern, int radius, const int astride = blockDim.x * gridDim.x; dst[yi * astride + xi] = out; } +''') -} // end extern "C" -''' +colorcliplib = devlib(deps=[yuvlib], defs=r''' +__global__ void +colorclip(float4 *pixbuf, float gamma, float vibrance, float highpow, + float linrange, float lingam, float3 bkgd, int fbsize) +{ + int i = threadIdx.x + blockDim.x * (blockIdx.x + gridDim.x * blockIdx.y); + if (i >= fbsize) return; -class Filtering(HunkOCode): - mod = None - defs = _CODE + float4 pix = pixbuf[i]; - @classmethod - def init_mod(cls): - if cls.mod is None: - cls.mod = pycuda.compiler.SourceModule(assemble_code(BaseCode, cls), - options=['-use_fast_math', '-maxrregcount', '32'], - no_extern_c=True) + if (pix.w <= 0) { + pixbuf[i] = make_float4(bkgd.x, bkgd.y, bkgd.z, 0.0f); + return; + } + pix.y -= 0.5f * pix.w; + pix.z -= 0.5f * pix.w; + float3 tmp = yuv2rgb(make_float3(pix.x, pix.y, pix.z)); + pix.x = tmp.x; + pix.y = tmp.y; + pix.z = tmp.z; - def __init__(self): - self.init_mod() + pix.x = fmaxf(0.0f, pix.x); + pix.y = fmaxf(0.0f, pix.y); + pix.z = fmaxf(0.0f, pix.z); - def de(self, ddst, dsrc, dscratch, gnm, dim, tc, nxf, stream=None): + float4 opix = pix; - # Helper variables and functions to keep it clean - sb = 16 * dim.astride - bs = sb * dim.ah - bl, gr = (32, 8, 1), (dim.astride / 32, dim.ah / 8) + float alpha = powf(pix.w, gamma); + if (pix.w < linrange) { + float frac = pix.w / linrange; + alpha = (1.0f - frac) * pix.w * lingam + frac * alpha; + } - def launch(f, *args, **kwargs): - f(*args, block=bl, grid=gr, stream=stream, **kwargs) - mkdsc = lambda c: argset(cuda.ArrayDescriptor(), height=dim.ah, - width=dim.astride, num_channels=c, - format=cuda.array_format.FLOAT) - def mktref(n): - tref = self.mod.get_texref(n) - tref.set_filter_mode(cuda.filter_mode.POINT) - tref.set_address_mode(0, cuda.address_mode.WRAP) - tref.set_address_mode(1, cuda.address_mode.WRAP) - return tref + float ls = vibrance * alpha / pix.w; + alpha = fminf(1.0f, fmaxf(0.0f, alpha)); - dsc = mkdsc(4) - tref = mktref('bilateral_src') - grad_dsc = mkdsc(1) - grad_tref = mktref('blur_src') + float maxc = fmaxf(pix.x, fmaxf(pix.y, pix.z)); + float maxa = maxc * ls; + float newls = 1.0f / maxc; - bilateral, logscale_den, den_blur, den_blur_1c, fma_buf = map( - self.mod.get_function, - 'bilateral logscale_den den_blur den_blur_1c fma_buf'.split()) + if (maxa > 1.0f && highpow >= 0.0f) { + float lsratio = powf(newls / ls, highpow); + pix.x *= newls; + pix.y *= newls; + pix.z *= newls; - # Number of different shear patterns to use when filtering. Must be - # even, since we depend on buffer bouncing (but I don't think that it's - # a requirement for the filter itself to get decent results). - DIRECTIONS = 8 + // Reduce saturation (according to the HSV model) by proportionally + // increasing the values of the other colors. 
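+        // Since maxa > 1 and highpow >= 0 on this branch, lsratio lies in
+        // (0, 1], so each channel is pulled toward maxc instead of clipping.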
+ pix.x = maxc - (maxc - pix.x) * lsratio; + pix.y = maxc - (maxc - pix.y) * lsratio; + pix.z = maxc - (maxc - pix.z) * lsratio; + } else { + float adjhlp = -highpow; + if (adjhlp > 1.0f || maxa <= 1.0f) adjhlp = 1.0f; + if (maxc > 0.0f) { + float adj = ((1.0f - adjhlp) * newls + adjhlp * ls); + pix.x *= adj; + pix.y *= adj; + pix.z *= adj; + } + } - def do_bilateral(bsrc, bdst, pattern, r=15, sstd=6, cstd=0.05, - dstd=1.5, dpow=0.8, gspeed=4.0): - # Scale spatial parameter so that a "pixel" is equivalent to an - # actual pixel at 1080p - sstd *= dim.w / 1920. + pix.x += (1.0f - vibrance) * powf(opix.x, gamma); + pix.y += (1.0f - vibrance) * powf(opix.y, gamma); + pix.z += (1.0f - vibrance) * powf(opix.z, gamma); - tref.set_address_2d(bsrc, dsc, sb) + pix.x += (1.0f - alpha) * bkgd.x; + pix.y += (1.0f - alpha) * bkgd.y; + pix.z += (1.0f - alpha) * bkgd.z; - # Blur density two octaves along sampling vector, ultimately - # storing in `dscratch` - launch(den_blur, np.intp(bdst), i32(pattern), i32(0), - texrefs=[tref]) - grad_tref.set_address_2d(bdst, grad_dsc, sb / 4) - launch(den_blur_1c, dscratch, i32(pattern), i32(1), - texrefs=[grad_tref]) - grad_tref.set_address_2d(dscratch, grad_dsc, sb / 4) + pix.x = fminf(1.0f, pix.x); + pix.y = fminf(1.0f, pix.y); + pix.z = fminf(1.0f, pix.z); + pix.w = alpha; - launch(bilateral, np.intp(bdst), i32(pattern), i32(r), - f32(sstd), f32(cstd), f32(dstd), f32(dpow), f32(gspeed), - texrefs=[tref, grad_tref]) - - def do_bilateral_range(bsrc, bdst, npats, *args, **kwargs): - for i in range(npats): - do_bilateral(bsrc, bdst, i, *args, **kwargs) - bdst, bsrc = bsrc, bdst - if npats % 2: - cuda.memcpy_dtod_async(bdst, bsrc, bs, stream) - - # Filter the first xform, using `ddst` as an intermediate buffer. - # End result is the filtered copy in `dsrc`. - do_bilateral_range(dsrc, ddst, DIRECTIONS) - - # Filter the remaining xforms, using `ddst` as an intermediate - # buffer, then add the result to `dsrc` (at the zero'th xform). - for x in range(1, nxf): - src = int(dsrc) + x * bs - do_bilateral_range(src, ddst, DIRECTIONS) - launch(fma_buf, dsrc, np.intp(src), i32(dim.astride), f32(1)) - - # Log-scale the accumulated buffer in `dsrc`. - k1 = f32(gnm.color.brightness(tc) * 268 / 256) - # Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now - # s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w)) - area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w) - k2 = f32(1.0 / (area * gnm.spp(tc))) - - nbins = dim.ah * dim.astride - logscale = self.mod.get_function("logscale") - t = logscale(ddst, dsrc, k1, k2, - block=(512, 1, 1), grid=(nbins/512, 1), stream=stream) - - def colorclip(self, dbuf, gnm, dim, tc, blend, stream=None): - nbins = dim.ah * dim.astride - - # TODO: implement integration over cubic splines? 
- gam = f32(1 / gnm.color.gamma(tc)) - vib = f32(gnm.color.vibrance(tc)) - hipow = f32(gnm.color.highlight_power(tc)) - lin = f32(gnm.color.gamma_threshold(tc)) - lingam = f32(lin ** (gam-1.0) if lin > 0 else 0) - bkgd = vec.make_float3( - gnm.color.background.r(tc), - gnm.color.background.g(tc), - gnm.color.background.b(tc)) - - color_fun = self.mod.get_function("colorclip") - blocks = int(np.ceil(np.sqrt(nbins / 256))) - color_fun(dbuf, gam, vib, hipow, lin, lingam, bkgd, i32(nbins), - i32(blend), block=(256, 1, 1), grid=(blocks, blocks), - stream=stream) + pixbuf[i] = pix; +} +''') diff --git a/cuburn/code/interp.py b/cuburn/code/interp.py index e3a29a7..2c1a2e4 100644 --- a/cuburn/code/interp.py +++ b/cuburn/code/interp.py @@ -2,8 +2,10 @@ from collections import OrderedDict from itertools import cycle import numpy as np -import tempita -from util import HunkOCode, Template, BaseCode, assemble_code +import util +from util import Template, assemble_code, devlib, binsearchlib, ringbuflib +from color import yuvlib +from mwc import mwclib class GenomePackerName(str): """Class to indicate that a property is precalculated on the device""" @@ -115,17 +117,10 @@ class GenomePackerPrecalc(GenomePackerView): def _code(self, code): self.packer.precalc_code.append(code) -class GenomePacker(HunkOCode): +class GenomePacker(object): """ Packs a genome for use in iteration. """ - - # 2^search_rounds is the maximum number of control points, including - # endpoints, that can be used in a single genome. It should be okay to - # change this number without touching other code, but 32 samples fits - # nicely on a single cache line. - search_rounds = 5 - def __init__(self, tname): """ Create a new DataPacker. @@ -152,6 +147,7 @@ class GenomePacker(HunkOCode): self.packed = None self.genome = None + self.search_rounds = util.DEFAULT_SEARCH_ROUNDS def __len__(self): assert self._len is not None, 'len() called before finalize()' @@ -189,42 +185,41 @@ class GenomePacker(HunkOCode): self._len = len(self.packed) - self.decls = self._decls.substitute( - packed=self.packed, tname=self.tname, - search_rounds=self.search_rounds) - self.defs = self._defs.substitute( + decls = self._decls.substitute(packed=self.packed, tname=self.tname) + defs = self._defs.substitute( packed_direct=self.packed_direct, tname=self.tname, precalc_code=self.precalc_code, search_rounds=self.search_rounds) + return devlib(deps=[catmullromlib], decls=decls, defs=defs) - def pack(self): + def pack(self, pool=None): """ - Return a packed copy of the genome ready for uploading to the GPU as a - 3D NDArray, with the first element being the times and the second - being the values. + Return a packed copy of the genome ready for uploading to the GPU, + as two float32 NDArrays for the knot times and values. 
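+        Both arrays have shape (len(self.genome), 2**search_rounds). The
+        times array is pre-filled with 1e9 so that unused knot slots sort
+        after every valid time and the device binary search never selects
+        them.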
""" width = 1 << self.search_rounds - out = np.empty((2, len(self.genome), width), dtype=np.float32) - # Ensure that unused values at the top are always big (must be >2.0) - out[0].fill(1e9) + if pool: + times = pool.allocate((len(self.genome), width), 'f4') + knots = pool.allocate((len(self.genome), width), 'f4') + else: + times, knots = np.empty((2, len(self.genome), width), 'f4') + times.fill(1e9) for idx, gname in enumerate(self.genome): attr = self.ns[gname[0]] for g in gname[1:]: attr = getattr(attr, g) - out[0][idx][:len(attr.knots[0])] = attr.knots[0] - out[1][idx][:len(attr.knots[1])] = attr.knots[1] - return out + times[idx,:len(attr.knots[0])] = attr.knots[0] + knots[idx,:len(attr.knots[1])] = attr.knots[1] + return times, knots _defs = Template(r""" - -__global__ -void interp_{{tname}}( +__global__ void interp_{{tname}}( {{tname}}* out, const float *times, const float *knots, - float tstart, float tstep, int maxid -) { + float tstart, float tstep, int maxid) +{ int id = gtid(); if (id >= maxid) return; out = &out[id]; @@ -249,7 +244,6 @@ void interp_{{tname}}( } {{endfor}} } - """) _decls = Template(r""" @@ -259,22 +253,13 @@ typedef struct { {{endfor}} } {{tname}}; -/* Search through the fixed-size list 'hay' to find the rightmost index which - * contains a value strictly smaller than the input 'needle'. The crazy - * bitwise search is just for my own amusement. - */ -__device__ -int bitwise_binsearch(const float *hay, float needle) { - int lo = 0; - // TODO: improve efficiency on 64-bit arches - {{for i in range(search_rounds-1, -1, -1)}} - if (needle > hay[lo + {{1 << i}}]) - lo += {{1 << i}}; - {{endfor}} - return lo; -} +""") +catmullromlib = devlib(deps=[binsearchlib], decls=r''' +__device__ __noinline__ +float catmull_rom(const float *times, const float *knots, float t); +''', defs=r''' __device__ __noinline__ float catmull_rom(const float *times, const float *knots, float t) { int idx = bitwise_binsearch(times, t); @@ -303,67 +288,14 @@ float catmull_rom(const float *times, const float *knots, float t) { + m2 * ( ttt - tt) + k2 * (-2.0f*ttt + 3.0f*tt); } +''') -__global__ -void test_cr(const float *times, const float *knots, const float *t, float *r) { - int i = threadIdx.x + blockDim.x * blockIdx.x; - r[i] = catmull_rom(times, knots, t[i]); -} -""") - -class Palette(HunkOCode): - # The JPEG YUV full-range matrix, without bias into the positve regime. - # This assumes input color space is CIERGB D65, encoded with gamma 2.2. - # Note that some interpolated colors may exceed the sRGB and YUV gamuts. - YUV = np.matrix([[ 0.299, 0.587, 0.114], - [-0.168736, -0.331264, 0.5], - [ 0.5, -0.418688, -0.081312]]) - - def __init__(self, interp_mode): - assert interp_mode in self.modes - self.mode = interp_mode - self.defs = self._defs.substitute(mode=interp_mode) - - def prepare(self, palettes): - """ - Produce palettes suitable for uploading to the device. Returns an - array of palettes in the same size and shape as the input. - - This function will never modify its argument, but may return it - unmodified for certain interpolation modes. 
- """ - if self.mode == 'yuvpolar': - ys, uvrs, uvts, alphas = zip(*map(self.rgbtoyuvpolar, palettes)) - # Center all medians as closely to 0 as possible - means = np.mean(uvts, axis=1) - newmeans = (means + np.pi) % (2 * np.pi) - np.pi - uvts = (newmeans - means).reshape((-1, 1)) + uvts - zipped = zip(ys, uvrs, uvts, alphas) - return np.array(zipped, dtype='f4').transpose((0, 2, 1)) - return palettes - - @classmethod - def rgbtoyuvpolar(cls, pal): - # TODO: premultiply alpha or some nonsense like that? - y, u, v = np.array(cls.YUV * pal.T[:3]) - uvr = np.hypot(u, v) - uvt = np.unwrap(np.arctan2(v, u)) - return y, uvr, uvt, pal.T[3] - - @classmethod - def yuvpolartorgb(cls, y, uvr, uvt, a): - u = uvr * np.cos(uvt) - v = uvr * np.sin(uvt) - r, g, b = np.array(cls.YUV.I * np.array([y, u, v])) - # Ensure Fortran order so that the memory gets laid out correctly - return np.array([r, g, b, a], order='F').T - - modes = ['hsv', 'yuv', 'yuvpolar'] - decls = "surface flatpal;\n" - - _defs = Template(r""" -__device__ -float4 interp_color(const float *times, const float4 *sources, float time) { +palintlib = devlib(deps=[binsearchlib, ringbuflib, yuvlib, mwclib], decls=''' +surface flatpal; +''', defs=r''' +__device__ float4 +interp_color(const float *times, const float4 *sources, float time) +{ int idx = fmaxf(bitwise_binsearch(times, time) + 1, 1); float lf = (times[idx] - time) / (times[idx] - times[idx-1]); float rf = 1.0f - lf; @@ -375,42 +307,11 @@ float4 interp_color(const float *times, const float4 *sources, float time) { float3 l3 = make_float3(left.x, left.y, left.z); float3 r3 = make_float3(right.x, right.y, right.z); -{{if mode == 'hsv'}} - float3 lhsv = rgb2hsv(l3); - float3 rhsv = rgb2hsv(r3); - - if (fabs(lhsv.x - rhsv.x) > 3.0f) - if (lhsv.x < rhsv.x) - lhsv.x += 6.0f; - else - rhsv.x += 6.0f; - - float3 hsv; - hsv.x = lhsv.x * lf + rhsv.x * rf; - hsv.y = lhsv.y * lf + rhsv.y * rf; - hsv.z = lhsv.z * lf + rhsv.z * rf; - - if (hsv.x > 6.0f) - hsv.x -= 6.0f; - if (hsv.x < 0.0f) - hsv.x += 6.0f; - - yuv = rgb2yuv(hsv2rgb(hsv)); -{{elif mode.startswith('yuv')}} - {{if mode == 'yuv'}} float3 lyuv = rgb2yuv(l3); float3 ryuv = rgb2yuv(r3); yuv.x = lyuv.x * lf + ryuv.x * rf; yuv.y = lyuv.y * lf + ryuv.y * rf; yuv.z = lyuv.z * lf + ryuv.z * rf; - {{elif mode == 'yuvpolar'}} - yuv.x = l3.x * lf + r3.x * rf; - float radius = l3.y * lf + r3.y * rf; - float angle = l3.z * lf + r3.z * rf; - yuv.y = radius * cosf(angle); - yuv.z = radius * sinf(angle); - {{endif}} -{{endif}} yuv.y += 0.5f; yuv.z += 0.5f; @@ -418,28 +319,13 @@ float4 interp_color(const float *times, const float4 *sources, float time) { return make_float4(yuv.x, yuv.y, yuv.z, left.w * lf + right.w * rf); } -__global__ -void interp_palette(uchar4 *outs, +__global__ void interp_palette_flat( + ringbuf *rb, mwc_st *rctxs, const float *times, const float4 *sources, - float tstart, float tstep) { - float time = tstart + blockIdx.x * tstep; - float4 yuva = interp_color(times, sources, time); - - uchar4 out; - out.x = yuva.x * 255.0f; - out.y = yuva.y * 255.0f; - out.z = yuva.z * 255.0f; - out.w = yuva.w * 255.0f; - outs[blockDim.x * blockIdx.x + threadIdx.x] = out; -} - -__global__ -void interp_palette_flat(mwc_st *rctxs, - const float *times, const float4 *sources, - float tstart, float tstep) { - + float tstart, float tstep) +{ + mwc_st rctx = rctxs[rb_incr(rb->head, threadIdx.x)]; int gid = blockIdx.x * blockDim.x + threadIdx.x; - mwc_st rctx = rctxs[gid]; float time = tstart + blockIdx.x * tstep; float4 yuva = interp_color(times, 
sources, time); @@ -447,17 +333,30 @@ void interp_palette_flat(mwc_st *rctxs, // TODO: pack Y at full precision, UV at quarter uint2 out; - uint32_t y = min(255, (uint32_t) (yuva.x * 255.0f + 0.49f * mwc_next_11(rctx))); - uint32_t u = min(255, (uint32_t) (yuva.y * 255.0f + 0.49f * mwc_next_11(rctx))); - uint32_t v = min(255, (uint32_t) (yuva.z * 255.0f + 0.49f * mwc_next_11(rctx))); + uint32_t y = yuva.x * 255.0f + 0.49f * mwc_next_11(rctx); + uint32_t u = yuva.y * 255.0f + 0.49f * mwc_next_11(rctx); + uint32_t v = yuva.z * 255.0f + 0.49f * mwc_next_11(rctx); + y = min(255, y); + u = min(255, u); + v = min(255, v); out.y = (1 << 22) | (y << 4); out.x = (u << 18) | v; + surf2Dwrite(out, flatpal, 8 * threadIdx.x, blockIdx.x); - rctxs[gid] = rctx; + rctxs[rb_incr(rb->tail, threadIdx.x)] = rctx; } -""") +''') + +testcrlib = devlib(defs=r''' +__global__ void +test_cr(const float *times, const float *knots, const float *t, float *r) { + int i = threadIdx.x + blockDim.x * blockIdx.x; + r[i] = catmull_rom(times, knots, t[i]); +} +''') if __name__ == "__main__": + # Test spline evaluation. This code will probably drift pretty often. import pycuda.driver as cuda from pycuda.compiler import SourceModule import pycuda.autoinit diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py index 28e2eb7..2f3df96 100644 --- a/cuburn/code/iter.py +++ b/cuburn/code/iter.py @@ -2,15 +2,16 @@ The main iteration loop. """ -from cuburn.code import mwc, variations, interp -from cuburn.code.util import * +import variations +import interp +from util import Template, devlib, ringbuflib +from mwc import mwclib def precalc_densities(pcp, std_xforms): # This pattern recurs a few times for precalc segments. Unfortunately, # namespace stuff means it's not easy to functionalize this boilerplate pre_cp = pcp._precalc() pre_cp._code(Template(r""" - float sum = 0.0f; {{for n in std_xforms}} @@ -30,7 +31,6 @@ def precalc_densities(pcp, std_xforms): def precalc_chaos(pcp, std_xforms): pre_cp = pcp._precalc() pre_cp._code(Template(""" - float sum, rsum; {{for p in std_xforms}} @@ -50,7 +50,6 @@ def precalc_chaos(pcp, std_xforms): {{endfor}} {{endfor}} - """, name='precalc_chaos').substitute(locals())) def precalc_camera(pcam): @@ -64,7 +63,6 @@ def precalc_camera(pcam): # . matrix([X],[Y],[1]); pre_cam._code(Template(r""" - float rot = {{pre_cam.rotation}} * M_PI / 180.0f; float rotsin = sin(rot), rotcos = cos(rot); float cenx = {{pre_cam.center.x}}, ceny = {{pre_cam.center.y}}; @@ -79,13 +77,11 @@ def precalc_camera(pcam): {{pre_cam._set('yy')}} = scale * rotcos; {{pre_cam._set('yo')}} = scale * -(rotsin * cenx + rotcos * ceny) + 0.5f * acc_size.aheight; - - """).substitute(locals())) + """, 'precalc_camera').substitute(locals())) def precalc_xf_affine(px): pre = px._precalc() pre._code(Template(r""" - float pri = {{pre.angle}} * M_PI / 180.0f; float spr = {{pre.spread}} * M_PI / 180.0f; @@ -98,27 +94,22 @@ def precalc_xf_affine(px): {{pre._set('yy')}} = magy * sin(pri+spr); {{pre._set('xo')}} = {{pre.offset.x}}; {{pre._set('yo')}} = -{{pre.offset.y}}; + """, 'precalc_xf_affine').substitute(locals())) - """).substitute(locals())) +def apply_affine(x, y, xo, yo, packer): + return Template(""" + {{xo}} = {{packer.xx}} * {{x}} + {{packer.xy}} * {{y}} + {{packer.xo}}; + {{yo}} = {{packer.yx}} * {{x}} + {{packer.yy}} * {{y}} + {{packer.yo}}; + """, 'apply_affine').substitute(locals()) -class IterCode(HunkOCode): - # The number of threads per block - NTHREADS = 256 +# The number of threads per block used in the iteration function. 
Don't change +# it lightly; the code may depend on it in unusual ways. +NTHREADS = 256 - def __init__(self, info, genome): - self.packer = interp.GenomePacker('iter_params') - self.pcp = self.packer.view('params', genome, 'cp') - - iterbody = self._iterbody(info, genome) - bodies = [self._xfbody(i,x) for i,x in sorted(genome.xforms.items())] - bodies.append(iterbody) - self.defs = '\n'.join(bodies) - - decls = """ +iter_decls = """ // Note: for normalized lookups, uchar4 actually returns floats texture palTex; __shared__ iter_params params; -__device__ int rb_head, rb_tail, rb_size; typedef struct { uint32_t width; @@ -128,12 +119,9 @@ typedef struct { uint32_t astride; } acc_size_t; __constant__ acc_size_t acc_size; - """ - def _xfbody(self, xfid, xform): - px = self.pcp.xforms[xfid] - tmpl = Template(r""" +iter_xf_body_code = r""" __device__ void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) { float tx, ty; @@ -162,44 +150,30 @@ void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) { float csp = {{px.color_speed}}; color = color * (1.0f - csp) + {{px.color}} * csp; }; -""", 'apply_xf_'+xfid) - g = dict(globals()) - g.update(locals()) - return tmpl.substitute(g) +""" - def _iterbody(self, info, genome): - tmpl = Template(r''' - -__global__ void reset_rb(int size) { - rb_head = rb_tail = 0; - rb_size = size; -} - -__global__ -void iter( - uint64_t out_ptr, - mwc_st *msts, - float4 *points, - const iter_params *all_params -{{if info.acc_mode == 'atomic'}} - , uint64_t atom_ptr -{{endif}} -) { - const iter_params *global_params = &(all_params[blockIdx.x]); +def iter_xf_body(pcp, xfid, xform): + px = pcp.xforms[xfid] + tmpl = Template(iter_xf_body_code, 'apply_xf_'+xfid) + g = dict(globals()) + g.update(locals()) + return tmpl.substitute(g) +iter_body_code = r''' +__global__ void +iter(uint64_t out_ptr, uint64_t atom_ptr, + ringbuf *rb, mwc_st *msts, float4 *points, + const iter_params *all_params) +{ // load params to shared memory cooperatively + const iter_params *global_params = &(all_params[blockIdx.x]); for (int i = threadIdx.y * blockDim.x + threadIdx.x; i < (sizeof(iter_params) / 4); i += blockDim.x * blockDim.y) reinterpret_cast(¶ms)[i] = reinterpret_cast(global_params)[i]; - __shared__ int rb_idx; - if (threadIdx.y == 1 && threadIdx.x == 1) - rb_idx = 32 * blockDim.y * (atomicAdd(&rb_head, 1) % rb_size); - - __syncthreads(); - int this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y; + int this_rb_idx = rb_incr(rb->head, blockDim.x * threadIdx.y + threadIdx.x); mwc_st rctx = msts[this_rb_idx]; {{precalc_camera(pcp.camera)}} @@ -209,22 +183,15 @@ void iter( {{pcp.camera.yo}} += ditherwidth * mwc_next_11(rctx); } -{{if info.acc_mode == 'global'}} - __shared__ float time_frac; - time_frac = blockIdx.x / (float) gridDim.x; -{{else}} - {{if info.acc_mode == 'atomic'}} // TODO: spare the register, reuse at call site? int time = blockIdx.x >> 4; - {{endif}} float color_dither = 0.49f * mwc_next_11(rctx); -{{endif}} // TODO: 4th channel unused. 
Kill or use for something helpful float4 old_point = points[this_rb_idx]; float x = old_point.x, y = old_point.y, color = old_point.z; -{{if not info.chaos_used}} +{{if not chaos_used}} // Shared memory size can be reduced by a factor of four using a slower // 4-stage reduce, but on Fermi hardware shmem use isn't a bottleneck __shared__ float swap[{{4*NTHREADS}}]; @@ -250,7 +217,7 @@ void iter( // TODO: link up with FUSE, etc for (int round = 0; round < 256; round++) { -{{if info.chaos_used}} +{{if chaos_used}} {{precalc_chaos(pcp, std_xforms)}} @@ -315,16 +282,7 @@ void iter( {{endif}} -{{if info.acc_mode == 'deferred'}} - int tid = threadIdx.y * 32 + threadIdx.x; - int offset = 4 * (256 * (256 * blockIdx.x + round) + tid); - int *log = reinterpret_cast(out_ptr + offset); -{{endif}} - if (fuse) { -{{if info.acc_mode == 'deferred'}} - *log = 0xffffffff; -{{endif}} continue; } @@ -346,15 +304,11 @@ void iter( uint32_t ix = trunca(cx), iy = trunca(cy); if (ix >= acc_size.astride || iy >= acc_size.aheight) { -{{if info.acc_mode == 'deferred'}} - *log = 0xffffffff; -{{endif}} continue; } - uint32_t ibase = (last_xf_used % {{info.max_nxf}}) * acc_size.aheight; - uint32_t i = (ibase + iy) * acc_size.astride + ix; -{{if info.acc_mode == 'atomic'}} + uint32_t i = iy * acc_size.astride + ix; + asm volatile ({{crep(""" { // To prevent overflow, we need to flush each pixel before the density @@ -432,38 +386,45 @@ oflow_end: """)}} :: "f"(cc), "f"(color_dither), "r"(time), "r"(i), "l"(atom_ptr), "f"(cosel[threadIdx.y + {{NWARPS}}]), "l"(out_ptr)); -{{endif}} - -{{if info.acc_mode == 'global'}} - float4 outcol = tex2D(palTex, cc, time_frac); - float4 *accbuf = reinterpret_cast(out_ptr + (16*i)); - float4 pix = *accbuf; - pix.x += outcol.x; - pix.y += outcol.y; - pix.z += outcol.z; - pix.w += 1.0f; - *accbuf = pix; -{{elif info.acc_mode == 'deferred'}} - // 'color' gets the top 8 bits. TODO: add dithering via precalc. - uint32_t icolor = fminf(1.0f, cc) * 255.0f + color_dither; - asm("bfi.b32 %0, %1, %0, 24, 8;" : "+r"(i) : "r"(icolor)); - *log = i; -{{endif}} } - if (threadIdx.x == 0 && threadIdx.y == 0) - rb_idx = 32 * blockDim.y * (atomicAdd(&rb_tail, 1) % rb_size); - __syncthreads(); - this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y; - + this_rb_idx = rb_incr(rb->tail, blockDim.x * threadIdx.y + threadIdx.x); points[this_rb_idx] = make_float4(x, y, color, 0.0f); msts[this_rb_idx] = rctx; return; } +''' -{{if info.acc_mode == 'atomic'}} -__global__ -void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) { +def iter_body(cp, pcp): + # For legacy reasons, 'cp' is used here instead of 'genome'. 
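+    # The template specializes the kernel to this genome: the sorted list
+    # of standard xforms, the NTHREADS/NWARPS constants, and the chaos_used
+    # flag are all baked into the emitted CUDA source.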
+ tmpl = Template(iter_body_code, 'iter_body') + NWARPS = NTHREADS / 32 + std_xforms = [n for n in sorted(cp.xforms) if n != 'final'] + + # TODO: detect this properly and use it + chaos_used = False + + vars = globals() + vars.update(locals()) + return tmpl.substitute(vars) + +def mkiterlib(genome): + packer = interp.GenomePacker('iter_params') + pcp = packer.view('params', genome, 'cp') + + iterbody = iter_body(genome, pcp) + bodies = [iter_xf_body(pcp, i, x) for i, x in sorted(genome.xforms.items())] + bodies.append(iterbody) + packer_lib = packer.finalize() + + lib = devlib(deps=[packer_lib, mwclib, ringbuflib], + # We grab the surf decl from palintlib as well + decls=iter_decls + interp.palintlib.decls, + defs='\n'.join(bodies)) + return packer, lib + +flushatomlib = devlib(defs=Template(r''' +__global__ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) { int i = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; if (i >= nbins) return; asm volatile ({{crep(""" @@ -499,168 +460,4 @@ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) { } """)}} :: "r"(i), "l"(atom_ptr), "l"(out_ptr)); } - -{{endif}} - -{{if info.acc_mode == 'deferred'}} - -// Block size, shared accumulation bits, shared accumulation width. -#define BS 1024 -#define SHAB 12 -#define SHAW (1< 0) idx_lo = log_bounds[lb_idx_lo] & ~(BS - 1); - else idx_lo = 0; - int idx_hi = (log_bounds[lb_idx_hi] & ~(BS - 1)) + BS; - - float rnrounds = {{'%d.0f' % info.palette_height}} / (idx_hi - idx_lo); - float time = tid * rnrounds; - float time_step = BS * rnrounds; - - int magic = ((blockIdx.x & 0xff) << 3) + ((blockIdx.x & 0xf00) << 12); - int magic_mask = 0xf007f8; - - for (int i = idx_lo + tid; i < idx_hi; i += BS) { - int entry = log[i]; - - asm volatile ({{crep(""" -{ - .reg .pred q; - .reg .u32 shoff, color, time, d, r, g, b, hi, lo, hiw, low, tmp; - .reg .u64 ptr; - .reg .f32 rf, gf, bf, df, rg, gg, dg, bg; - - // TODO: opacity - and.b32 tmp, %0, %4; - setp.eq.u32 q, tmp, %3; -@!q bra before_sync; - - and.b32 shoff, %0, 0xff800; - shr.b32 shoff, shoff, 5; - bfi.b32 shoff, %0, shoff, 3, 3; - - bfe.u32 color, %0, 24, 8; - shl.b32 color, color, 3; - cvt.rni.u32.f32 time, %1; - - suld.b.2d.v2.b32.clamp {lo, hi}, [flatpal, {color, time}]; - ld.shared.v2.u32 {low, hiw}, [shoff]; - add.cc.u32 lo, lo, low; - addc.u32 hi, hi, hiw; - setp.hs.u32 q, hi, (1023 << 22); -@q bra oflow_sync; - st.shared.v2.u32 [shoff], {hi, lo}; -before_sync: - bar.sync 0; - bra oflow_write_end; -oflow_sync: - st.shared.v2.u32 [shoff], {0, 0}; - - // TODO: opacity - bfi.b32 shoff, %0, 0, 4, 24; - cvt.u64.u32 ptr, shoff; - add.u64 ptr, ptr, %2; - ld.global.v4.f32 {dg,bg,gg,rg}, [ptr]; - bar.sync 0; - - bfe.u32 r, hi, 4, 18; - bfe.u32 g, lo, 18, 14; - bfi.b32 g, hi, g, 14, 4; - and.b32 b, lo, ((1<<18)-1); - cvt.rn.f32.u32 rf, r; - cvt.rn.f32.u32 gf, g; - cvt.rn.f32.u32 bf, b; - fma.rn.ftz.f32 rf, rf, (1.0/255.0), rg; - fma.rn.ftz.f32 gf, gf, (1.0/255.0), gg; - fma.rn.ftz.f32 bf, bf, (1.0/255.0), bg; - add.f32 df, df, dg; - st.global.v4.f32 [ptr], {df,bf,gf,rf}; - -oflow_write_end: -} - """)}} :: "r"(entry), "f"(time), "l"(acc), "r"(magic), "r"(magic_mask)); - time += time_step; - } - - __syncthreads(); - - - idx = tid; - int glo_idx = magic | (((idx << 8) | idx) & 0xff807); - - for (int i = 0; i < (SHAW / BS) && glo_idx < nbins; i++) { - int d, r, g, b; - float4 pix = acc[glo_idx]; - asm({{crep(""" -{ - .reg .u32 hi, lo; - ld.shared.v2.u32 {lo, hi}, [%4]; - shr.u32 %0, hi, 22; - bfe.u32 %1, hi, 4, 18; - bfe.u32 %2, lo, 
18, 14; - bfi.b32 %2, hi, %2, 14, 4; - and.b32 %3, lo, ((1<<18)-1); -} - """)}} : "=r"(d), "=r"(r), "=r"(g), "=r"(b) : "r"(idx*8)); - pix.x += r / 255.0f; - pix.y += g / 255.0f; - pix.z += b / 255.0f; - pix.w += d; - acc[glo_idx] = pix; - idx += BS; - glo_idx += (BS << 8); - } -} -{{endif}} -''', 'iter_kern') - return tmpl.substitute( - info = info, - cp = genome, - pcp = self.pcp, - NTHREADS = self.NTHREADS, - NWARPS = self.NTHREADS / 32, - std_xforms = [n for n in sorted(genome.xforms) if n != 'final'], - **globals()) +''', 'flush_atom').substitute()) diff --git a/cuburn/code/mwc.py b/cuburn/code/mwc.py index 5b07d92..2790029 100644 --- a/cuburn/code/mwc.py +++ b/cuburn/code/mwc.py @@ -6,18 +6,53 @@ import os import warnings import numpy as np -from util import * +from util import devlib, assemble_code -class MWC(HunkOCode): - decls = """ +# Keeping this live in the module isn't necessary, but loading the mults +# can be surprisingly slow. +mults = None + +def load_mults(): + pfpath = os.path.join(os.path.dirname(__file__), 'primes.bin') + if os.path.isfile(pfpath): + with open(pfpath) as fp: + return np.frombuffer(fp.read(), dtype=' dstride || y > height) return; + int isrc = sstride * (y + gutter) + x + gutter; + + int tid = blockDim.x * threadIdx.y + threadIdx.x; + mwc_st rctx = rctxs[rb_incr(rb->head, tid)]; + + float4 in = src[isrc]; + uchar4 out = make_uchar4( + fminf(1.0f, in.x) * 255.0f + 0.49f * mwc_next_11(rctx), + fminf(1.0f, in.y) * 255.0f + 0.49f * mwc_next_11(rctx), + fminf(1.0f, in.z) * 255.0f + 0.49f * mwc_next_11(rctx), + fminf(1.0f, in.w) * 255.0f + 0.49f * mwc_next_11(rctx) + ); + + int idst = dstride * y + x; + dst[idst] = out; + rctxs[rb_incr(rb->head, tid)] = rctx; +} +''') diff --git a/cuburn/code/util.py b/cuburn/code/util.py index 1bd1614..062b751 100644 --- a/cuburn/code/util.py +++ b/cuburn/code/util.py @@ -1,55 +1,119 @@ """ Provides tools and miscellaneous functions for building device code. """ +import os +import tempfile +from collections import namedtuple +import pycuda.driver as cuda +import pycuda.compiler import numpy as np import tempita -def crep(s): - """Escape for PTX assembly""" - if isinstance(s, unicode): - s = s.encode('utf-8') - return '"%s"' % s.encode("string_escape") - def argset(obj, **kwargs): + """ + Allow an object with many properties to be set using one call. + + >>> x = argset(X(), a=1, b=2) + >>> x.a + 1 + """ for k, v in kwargs.items(): setattr(obj, k, v) return obj +def launch(name, mod, stream, block, grid, *args, **kwargs): + """ + Oft-used kernel launch helper. Provides a nice boost in readability of + densely-packed asynchronous code launches. + """ + fun = mod.get_function(name) + if isinstance(block, (int, np.number)): + block = (int(block), 1, 1) + if isinstance(grid, (int, np.number)): + grid = (int(grid), 1) + fun(*args, block=block, grid=grid, stream=stream, **kwargs) + +def crep(s): + """Multiline literal escape for inline PTX assembly.""" + if isinstance(s, unicode): + s = s.encode('utf-8') + return '"%s"' % s.encode("string_escape") + class Template(tempita.Template): + """ + A Tempita template with extra stuff in the default namespace. + """ default_namespace = tempita.Template.default_namespace.copy() Template.default_namespace.update({'np': np, 'crep': crep}) -class HunkOCode(object): - """An apparently passive container for device code.""" - # Use property objects to make these dynamic - headers = '' - decls = '' - defs = '' +# Passive container for device code. 
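+# A minimal usage sketch (`foolib` is hypothetical): libraries chain through
+# `deps`, and assemble_code() walks the graph depth-first, keeping the first
+# copy of each lib and emitting all headers, then all decls, then all defs:
+#
+#   foolib = devlib(deps=[yuvlib], defs='__global__ void foo() { ... }')
+#   mod = cuda.module_from_buffer(compile('foo', assemble_code(foolib)))
+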
+DevLib = namedtuple('DevLib', 'deps headers decls defs') -def assemble_code(*sections): - return ''.join([''.join([getattr(sect, kind) for sect in sections]) - for kind in ['headers', 'decls', 'defs']]) +def devlib(deps=(), headers='', decls='', defs=''): + """Create a library of device code.""" + # This exists because namedtuple doesn't support default args + return DevLib(deps, headers, decls, defs) -def apply_affine(x, y, xo, yo, packer): - return Template(""" - {{xo}} = {{packer.xx}} * {{x}} + {{packer.xy}} * {{y}} + {{packer.xo}}; - {{yo}} = {{packer.yx}} * {{x}} + {{packer.yy}} * {{y}} + {{packer.yo}}; - """).substitute(locals()) +def assemble_code(*libs): + seen = set() + out = [] + def go(lib): + map(go, lib.deps) + code = lib[1:] + if code not in seen: + seen.add(code) + out.append(code) + go(stdlib) + map(go, libs) + return ''.join(sum(zip(*out), ())) -class BaseCode(HunkOCode): - headers = """ +DEFAULT_CMP_OPTIONS = ('-use_fast_math', '-maxrregcount', '42') +DEFAULT_SAVE_KERNEL = True +def compile(name, src, opts=DEFAULT_CMP_OPTIONS, save=DEFAULT_SAVE_KERNEL): + """ + Compile a module. Returns a copy of the source (for inspection or + display) and the compiled cubin. + """ + dir = tempfile.gettempdir() + if save: + with open(os.path.join(dir, name + '_kern.cu'), 'w') as fp: + fp.write(src) + cubin = pycuda.compiler.compile(src, options=list(opts)) + if save: + with open(os.path.join(dir, name + '_kern.cubin'), 'w') as fp: + fp.write(cubin) + return cubin + +class ClsMod(object): + """ + Convenience class or mixin that automatically compiles and loads a module + once per class, saving potentially expensive code regeneration. Only use + if your class does not employ run-time code generation. + """ + mod = None + # Supply your own DevLib on this property + lib = None + + def __init__(self): + super(ClsMod, self).__init__() + self.load() + + @classmethod + def load(cls, name=None): + if cls.mod is None: + if name is None: + name = cls.__name__.lower() + cubin = compile(name, assemble_code(cls.lib)) + cls.mod = cuda.module_from_buffer(cubin) + +# This lib is included with *every* assembled module. It contains no global +# functions, so it shouldn't slow down compilation time too much. +stdlib = devlib(headers=""" #include #include #include -""" - - decls = """ -float3 rgb2hsv(float3 rgb); -float3 hsv2rgb(float3 hsv); -""" - - defs = Template(r""" +""", decls=r""" #undef M_E #undef M_LOG2E #undef M_LOG10E @@ -84,33 +148,79 @@ float3 hsv2rgb(float3 hsv); #define bfe_decl(d, s, o, w) \ int d; \ bfe(d, s, o, w) - -// TODO: use launch parameter preconfig to eliminate unnecessary parts -__device__ -uint32_t gtid() { +""", defs=r''' +__device__ uint32_t gtid() { return threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * (threadIdx.z + blockDim.z * (blockIdx.x + (gridDim.x * blockIdx.y)))); } -__device__ -uint32_t trunca(float f) { +__device__ uint32_t trunca(float f) { // truncate as used in address calculations. note the use of a signed // conversion is intentional here (simplifies image bounds checking). uint32_t ret; asm("cvt.rni.s32.f32 %0, %1;" : "=r"(ret) : "f"(f)); return ret; } +''') -__global__ -void fill_dptr(uint32_t* dptr, int size, uint32_t value) { +def mkbinsearchlib(rounds): + """ + Search through the fixed-size list 'hay' to find the rightmost index which + contains a value strictly smaller than the input 'needle'. The list must + be exactly '2^rounds' long, although padding at the top with very large + numbers or even +inf effectively shortens it. 
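+    With the default five rounds, for example, the unrolled device code
+    probes offsets 16, 8, 4, 2 and 1, so every lookup costs exactly five
+    comparisons.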
+ """ + # TODO: this doesn't optimize well on a 64-bit arch, not that it's a + # performance-critical chunk of code or anything + src = Template(r''' +__device__ int bitwise_binsearch(const float *hay, float needle) { + int lo = 0; + + // TODO: improve efficiency on 64-bit arches + {{for i in range(search_rounds-1, -1, -1)}} + if (needle > hay[lo + {{1 << i}}]) + lo += {{1 << i}}; + {{endfor}} + return lo; +} +''', 'bitwise_binsearch') + return devlib(defs=src.substitute(search_rounds=rounds)) + +# 2^search_rounds is the maximum number of knots allowed in a single spline. +# This includes the four required knots, so a 5 round search supports 28 +# interior knots in the domain (0, 1). 2^5 fits nicely on a single cache line. +DEFAULT_SEARCH_ROUNDS = 5 +binsearchlib = mkbinsearchlib(DEFAULT_SEARCH_ROUNDS) + + +filldptrlib = devlib(defs=r''' +__global__ void +fill_dptr(uint32_t* dptr, int size, uint32_t value) { int i = (gridDim.x * blockIdx.y + blockIdx.x) * blockDim.x + threadIdx.x; if (i < size) { dptr[i] = value; } } +''') +def fill_dptr(mod, dptr, size, stream=None, value=np.uint32(0)): + """ + A memory zeroer which can be embedded in a stream, unlike the various + memset routines. Size is the number of 4-byte words in the pointer; + value is the word to fill it with. If value is not an np.uint32, it + will be coerced to a buffer and the first four bytes taken. + """ + if not isinstance(value, np.uint32): + if isinstance(value, int): + value = np.uint32(value) + else: + value = np.frombuffer(buffer(value), np.uint32)[0] + blocks = int(np.ceil(np.sqrt(size / 1024.))) + launch('fill_dptr', mod, stream, (1024, 1, 1), (blocks, blocks), + dptr, np.int32(size), value) +writehalflib = devlib(defs=r''' /* read_half and write_half decode and encode, respectively, two * floating-point values from a 32-bit value (typed as a 'float' for * convenience but not really). The values are packed into u16s as a @@ -122,8 +232,8 @@ void fill_dptr(uint32_t* dptr, int size, uint32_t value) { * approach when the alpha channel is present. */ -__device__ -void read_half(float &x, float &y, float xy, float den) { +__device__ void +read_half(float &x, float &y, float xy, float den) { asm("\n\t{" "\n\t .reg .u16 x, y;" "\n\t .reg .f32 rc;" @@ -137,8 +247,8 @@ void read_half(float &x, float &y, float xy, float den) { : "=f"(x), "=f"(y) : "f"(xy), "f"(den)); } -__device__ -void write_half(float &xy, float x, float y, float den) { +__device__ void +write_half(float &xy, float x, float y, float den) { asm("\n\t{" "\n\t .reg .u16 x, y;" "\n\t .reg .f32 rc, xf, yf;" @@ -152,87 +262,41 @@ void write_half(float &xy, float x, float y, float den) { "\n\t}" : "=f"(xy) : "f"(x), "f"(y), "f"(den)); } +''') -/* This conversion uses the JPEG full-range standard. Note that UV have range - * [-0.5, 0.5], so consider biasing the results. */ -__device__ -float3 rgb2yuv(float3 rgb) { - return make_float3( - 0.299f * rgb.x + 0.587f * rgb.y + 0.114f * rgb.z, - -0.168736f * rgb.x - 0.331264f * rgb.y + 0.5f * rgb.z, - 0.5f * rgb.x - 0.418688f * rgb.y - 0.081312f * rgb.z); +def mkringbuflib(rb_size): + """ + A ringbuffer for access to shared resources. + + Some components, such as the MWC contexts, are expensive to generate, and + have no affinity to a particular block. 
Rather than maintain a separate + copy of each of these objects for every thread block in a launch, we want + to only keep enough copies of these resources around to service every + thread block that could possibly be active simultaneously on one card, + which is often considerably smaller. This class provides a simple + ringbuffer type and an increment function, used in a couple places to + implement this kind of sharing. + """ + return devlib(headers="#define RB_SIZE_MASK %d" % (rb_size - 1), decls=''' +typedef struct { + int head; + int tail; +} ringbuf; +''', defs=r''' +__shared__ int rb_idx; +__device__ int rb_incr(int &rb_base, int tidx) { + if (threadIdx.y == 1 && threadIdx.x == 1) + rb_idx = 256 * (atomicAdd(&rb_base, 1) & RB_SIZE_MASK); + __syncthreads(); + return rb_idx + tidx; } +''') -__device__ -float3 yuv2rgb(float3 yuv) { - return make_float3( - yuv.x + 1.402f * yuv.z, - yuv.x - 0.34414f * yuv.y - 0.71414f * yuv.z, - yuv.x + 1.772f * yuv.y); -} - -__device__ -float3 rgb2hsv(float3 rgb) { - float M = fmaxf(fmaxf(rgb.x, rgb.y), rgb.z); - float m = fminf(fminf(rgb.x, rgb.y), rgb.z); - float C = M - m; - - float s = M > 0.0f ? C / M : 0.0f; - - float h = 0.0f; - if (s != 0.0f) { - C = 1.0f / C; - float rc = (M - rgb.x) * C; - float gc = (M - rgb.y) * C; - float bc = (M - rgb.z) * C; - - if (rgb.x == M) h = bc - gc; - else if (rgb.y == M) h = 2 + rc - bc; - else h = 4 + gc - rc; - - if (h < 0) h += 6.0f; - } - return make_float3(h, s, M); -} - -__device__ -float3 hsv2rgb(float3 hsv) { - - float whole = floorf(hsv.x); - float frac = hsv.x - whole; - float val = hsv.z; - float min = val * (1 - hsv.y); - float mid = val * (1 - (hsv.y * frac)); - float alt = val * (1 - (hsv.y * (1 - frac))); - - float3 out; - if (whole == 0.0f) { out.x = val; out.y = alt; out.z = min; } - else if (whole == 1.0f) { out.x = mid; out.y = val; out.z = min; } - else if (whole == 2.0f) { out.x = min; out.y = val; out.z = alt; } - else if (whole == 3.0f) { out.x = min; out.y = mid; out.z = val; } - else if (whole == 4.0f) { out.x = alt; out.y = min; out.z = val; } - else { out.x = val; out.y = min; out.z = mid; } - return out; -} -""").substitute() - - @staticmethod - def fill_dptr(mod, dptr, size, stream=None, value=np.uint32(0)): - """ - A memory zeroer which can be embedded in a stream, unlike the various - memset routines. Size is the number of 4-byte words in the pointer; - value is the word to fill it with. If value is not an np.uint32, it - will be coerced to a buffer and the first four bytes taken. - """ - fill = mod.get_function("fill_dptr") - if not isinstance(value, np.uint32): - if isinstance(value, int): - value = np.uint32(value) - else: - value = np.frombuffer(buffer(value), np.uint32)[0] - blocks = int(np.ceil(np.sqrt(size / 1024 + 1))) - fill(dptr, np.int32(size), value, stream=stream, - block=(1024, 1, 1), grid=(blocks, blocks, 1)) - - +# For now, the number of entries is fixed to a value known to work on all +# Fermi cards. Autodetection, or perhaps just a global limit increase, will be +# done when I get my hands on a Kepler device. The fixed size assumes blocks +# of 256 threads, although even at that size there are pathological cases that +# could break the assumption. 
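+# rb_incr masks its counter with RB_SIZE_MASK (rb_size - 1), so the size
+# must be a power of two. The usual pattern brackets a kernel launch: claim
+# a slot from the head counter on entry, and return state through the tail
+# counter on exit, as the iter kernel does:
+#
+#   mwc_st rctx = rctxs[rb_incr(rb->head, tid)];
+#   ...
+#   rctxs[rb_incr(rb->tail, tid)] = rctx;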
+DEFAULT_RB_SIZE = 64 +ringbuflib = mkringbuflib(DEFAULT_RB_SIZE) diff --git a/cuburn/filters.py b/cuburn/filters.py new file mode 100644 index 0000000..d081b8f --- /dev/null +++ b/cuburn/filters.py @@ -0,0 +1,105 @@ +import numpy as np +from numpy import float32 as f32, int32 as i32 + +import pycuda.driver as cuda +import pycuda.compiler +from pycuda.gpuarray import vec + +import code.filters +from code.util import ClsMod, argset, launch + +class Filter(object): + def apply(self, fb, gnm, dim, tc, stream=None): + """ + Queue the application of this filter. When the live stream finishes + executing the last item enqueued by this method, the result must be + live in the buffer pointed to by ``fb.d_front`` at the return of this + function. + """ + raise NotImplementedError() + +class Bilateral(Filter, ClsMod): + lib = code.filters.bilaterallib + def __init__(self, directions=8, r=15, sstd=6, cstd=0.05, + dstd=1.5, dpow=0.8, gspeed=4.0): + # TODO: expose these parameters on the genome, or at least on the + # profile, and set them by a less ugly mechanism + for n in 'directions r sstd cstd dstd dpow gspeed'.split(): + setattr(self, n, locals()[n]) + super(Bilateral, self).__init__() + + def apply(self, fb, gnm, dim, tc, stream=None): + # Helper variables and functions to keep it clean + sb = 16 * dim.astride + bs = sb * dim.ah + bl, gr = (32, 8, 1), (dim.astride / 32, dim.ah / 8) + + mkdsc = lambda c: argset(cuda.ArrayDescriptor(), height=dim.ah, + width=dim.astride, num_channels=c, + format=cuda.array_format.FLOAT) + def mktref(n): + tref = self.mod.get_texref(n) + tref.set_filter_mode(cuda.filter_mode.POINT) + tref.set_address_mode(0, cuda.address_mode.WRAP) + tref.set_address_mode(1, cuda.address_mode.WRAP) + return tref + + dsc = mkdsc(4) + tref = mktref('bilateral_src') + grad_dsc = mkdsc(1) + grad_tref = mktref('blur_src') + + for pattern in range(self.directions): + # Scale spatial parameter so that a "pixel" is equivalent to an + # actual pixel at 1080p + sstd = self.sstd * dim.w / 1920. + + tref.set_address_2d(fb.d_front, dsc, sb) + + # Blur density two octaves along sampling vector, ultimately + # storing in the side buffer + launch('den_blur', self.mod, stream, bl, gr, + fb.d_back, i32(pattern), i32(0), texrefs=[tref]) + grad_tref.set_address_2d(fb.d_back, grad_dsc, sb / 4) + launch('den_blur_1c', self.mod, stream, bl, gr, + fb.d_side, i32(pattern), i32(1), texrefs=[grad_tref]) + grad_tref.set_address_2d(fb.d_side, grad_dsc, sb / 4) + + launch('bilateral', self.mod, stream, bl, gr, + fb.d_back, i32(pattern), i32(self.r), + f32(sstd), f32(self.cstd), f32(self.dstd), + f32(self.dpow), f32(self.gspeed), + texrefs=[tref, grad_tref]) + fb.flip() + +class Logscale(Filter, ClsMod): + lib = code.filters.logscalelib + def apply(self, fb, gnm, dim, tc, stream=None): + """Log-scale in place.""" + k1 = f32(gnm.color.brightness(tc) * 268 / 256) + # Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now + # s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w)) + area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w) + k2 = f32(1.0 / (area * gnm.spp(tc))) + nbins = dim.ah * dim.astride + launch('logscale', self.mod, stream, 256, nbins/256, + fb.d_front, fb.d_front, k1, k2) + +class ColorClip(Filter, ClsMod): + lib = code.filters.colorcliplib + def apply(self, fb, gnm, dim, tc, stream=None): + # TODO: implement integration over cubic splines? 
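+        # `lingam` joins the linear ramp below `gamma_threshold` onto the
+        # gamma curve continuously: at pix.w == lin, lin * lin**(gam - 1)
+        # equals lin**gam.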
+ gam = f32(1 / gnm.color.gamma(tc)) + vib = f32(gnm.color.vibrance(tc)) + hipow = f32(gnm.color.highlight_power(tc)) + lin = f32(gnm.color.gamma_threshold(tc)) + lingam = f32(lin ** (gam-1.0) if lin > 0 else 0) + bkgd = vec.make_float3( + gnm.color.background.r(tc), + gnm.color.background.g(tc), + gnm.color.background.b(tc)) + + nbins = dim.ah * dim.astride + blocks = int(np.ceil(np.sqrt(nbins / 256.))) + launch('colorclip', self.mod, stream, 256, (blocks, blocks), + fb.d_front, gam, vib, hipow, lin, lingam, bkgd, i32(nbins)) diff --git a/cuburn/output.py b/cuburn/output.py new file mode 100644 index 0000000..8436b54 --- /dev/null +++ b/cuburn/output.py @@ -0,0 +1,49 @@ +import numpy as np +from numpy import float32 as f32, int32 as i32 + +import pycuda.driver as cuda + +from code.util import ClsMod, launch +from code.output import f32tou8lib + +import scipy.misc + +if not hasattr(scipy.misc, 'toimage'): + raise ImportError("Could not find scipy.misc.toimage. " + "Are scipy and PIL installed?") + +class Output(object): + def convert(self, fb, gnm, dim, stream=None): + """ + Convert a filtered buffer to whatever output format is needed by the + writer. + """ + raise NotImplementedError() + + def copy(self, fb, dim, pool, stream=None): + """ + Schedule a copy from the device buffer to host memory, returning the + target buffer. + """ + raise NotImplementedError() + +class PILOutput(Output, ClsMod): + lib = f32tou8lib + + def convert(self, fb, gnm, dim, stream=None): + launch('f32_to_u8', self.mod, stream, + (32, 8, 1), (int(np.ceil(dim.w/32.)), int(np.ceil(dim.h/8.))), + fb.d_rb, fb.d_seeds, fb.d_back, fb.d_front, + i32(fb.gutter), i32(dim.w), i32(dim.astride), i32(dim.h)) + + def copy(self, fb, dim, pool, stream=None): + h_out = pool.allocate((dim.h, dim.w, 4), 'u1') + cuda.memcpy_dtoh_async(h_out, fb.d_back, stream) + return h_out + + @staticmethod + def save(buf, name, type=None, quality=98): + if type == 'jpeg' or (type is None and name.endswith('.jpg')): + buf = buf[:,:,:3] + img = scipy.misc.toimage(buf, cmin=0, cmax=1) + img.save(name, type, quality=quality) diff --git a/cuburn/render.py b/cuburn/render.py index 22300a6..b65ed2f 100644 --- a/cuburn/render.py +++ b/cuburn/render.py @@ -1,383 +1,372 @@ +""" +Resources and tools to perform rendering. +""" + import os import sys import re -import time as timemod +import time import tempfile from collections import namedtuple -from itertools import cycle, repeat, chain, izip, imap, ifilter -from ctypes import * -from cStringIO import StringIO import numpy as np from numpy import float32 as f32, int32 as i32, uint32 as u32, uint64 as u64 -from scipy import ndimage -import pycuda.compiler import pycuda.driver as cuda import pycuda.tools -import cuburn.genome -from cuburn import affine -from cuburn.code import util, mwc, iter, interp, filtering, sort +import filters +import output +from code import util, mwc, iter, interp, sort +from code.util import ClsMod, devlib, filldptrlib, assemble_code, launch RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time') Dimensions = namedtuple('Dimensions', 'w h aw ah astride') -def _sync_stream(dst, src): - dst.wait_for_event(cuda.Event(cuda.event_flags.DISABLE_TIMING).record(src)) +class Framebuffers(object): + """ + The largest memory allocations, and a stream to serialize their use. -class Renderer(object): + ``d_front`` and ``d_back`` are separate buffers, each large enough to hold + four float32 components per pixel (including any gutter pixels added for + alignment or padding). 
``d_side`` is another buffer large enough to hold
+    two float32 components per pixel.
+
+    Every user of this set of buffers may use and overwrite the buffers in
+    any way, as long as the output for the next stage winds up in the front
+    buffer. The front and back buffers can be exchanged by the ``flip()``
+    method (which simply exchanges the pointers); while no similar method
+    exists for the side buffer, you're free to do the same by taking local
+    copies of the references and exchanging them yourself.
+
+    There's one spot in the stream interleaving where the behavior is
+    different: the ``Output.convert`` call must store its output to the back
+    buffer, which will remain untouched until the dtoh copy of the converted
+    buffer is finished. This happens while the ``iter`` kernel of the next
+    frame writes to the front and side buffers, which means in practice that
+    there's essentially no waiting on any buffers.
+
+    If an output module decides to get all krazy and actually replaces the
+    references to the buffers on this object - to, say, implement a temporally
+    aware tone-mapping or denoising pass - that's probably okay, but just make
+    sure it appears like it's expected to.
     """
-    Control structure for rendering a series of frames.
+
+    # Minimum extension of accumulation buffer beyond output region. Used to
+    # alleviate edge effects during filtering. Actual gutter may be larger to
+    # accommodate alignment requirements; when it is, that extension will be
+    # applied to the lower-right corner of the buffer. This is asymmetrical,
+    # but simplifies trimming logic when it's time for that.
+    gutter = 10
+
+    @classmethod
+    def calc_dim(cls, width, height):
+        """
+        Given a width and height, return a valid set of dimensions which
+        include at least enough gutter to exceed the minimum, and where
+        (acc_width % 32) == 0 and (acc_height % 8) == 0.
+        """
+        awidth = width + 2 * cls.gutter
+        aheight = 8 * int(np.ceil((height + 2 * cls.gutter) / 8.))
+        astride = 32 * int(np.ceil(awidth / 32.))
+        return Dimensions(width, height, awidth, aheight, astride)
+
+    def __init__(self):
+        self.stream = cuda.Stream()
+        self._clear()
+
+        # These resources rely on the slots/ringbuffer mechanism for sharing,
+        # and so can be shared across any number of launches, genomes, and
+        # render kernels. Notably, seeds are self-synchronizing, so they're
+        # not attached to either stream object.
+        self.d_rb = cuda.to_device(np.array([0, 0], dtype=u32))
+        seeds = mwc.make_seeds(util.DEFAULT_RB_SIZE * 256)
+        self.d_seeds = cuda.to_device(seeds)
+        self._len_d_points = util.DEFAULT_RB_SIZE * 256 * 16
+        self.d_points = cuda.mem_alloc(self._len_d_points)
+
+    def _clear(self):
+        self.nbins = self.d_front = self.d_back = self.d_side = None
+
+    def free(self, stream=None):
+        if stream is not None:
+            stream.synchronize()
+        else:
+            cuda.Context.synchronize()
+        for p in (self.d_front, self.d_back, self.d_side):
+            if p is not None:
+                p.free()
+        self._clear()
+
+    def alloc(self, dim, stream=None):
+        """
+        Ensure that this object's framebuffers are large enough to handle the
+        given dimensions, allocating new ones if not.
+
+        If ``stream`` is not None and a reallocation is necessary, the stream
+        will be synchronized before the old buffers are deallocated.
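+
+        A minimal usage sketch (editor's illustration; the numbers follow
+        from ``calc_dim`` with the default gutter of 10):
+
+            dim = Framebuffers.calc_dim(1920, 1080)
+            # -> Dimensions(w=1920, h=1080, aw=1940, ah=1104, astride=1952)
+            fb.alloc(dim)      # no-op when the buffers are already as large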
+ """ + nbins = dim.ah * dim.astride + if self.nbins >= nbins: return + if self.nbins is not None: self.free() + try: + self.d_front = cuda.mem_alloc(16 * nbins) + self.d_back = cuda.mem_alloc(16 * nbins) + self.d_side = cuda.mem_alloc(8 * nbins) + self.nbins = nbins + except cuda.MemoryError, e: + # If a frame that's too large sneaks by the task distributor, we + # don't want to kill the server, but we also don't want to leave + # it stuck without any free memory to complete the next alloc. + # TODO: measure free mem and only take tasks that fit (but that + # should be done elsewhere) + self.free(stream) + raise e + + def set_dim(self, width, height, stream=None): + """ + Compute padded dimensions for given width and height, ensure that the + buffers are large enough (and reallocate if not), and return the + calculated dimensions. + + Note that the returned dimensions are always the same for a given + width, height, and minimum gutter, even if the underlying buffers are + larger due to a previous allocation. + """ + dim = self.calc_dim(width, height) + self.alloc(dim, stream) + return dim + + def flip(self): + """Flip the front and back buffers.""" + self.d_front, self.d_back = self.d_back, self.d_front + +class DevSrc(object): """ + The buffers which represent a genome on-device, in the formats needed to + serve as a source for interpolating temporal samples. + """ + + # Maximum number of knots per parameter. This also covers the maximum + # number of palettes allowed. + max_knots = 1 << util.DEFAULT_SEARCH_ROUNDS + + # Maximum number of parameters per genome. This number is exceedingly + # high, and should never even come close to being hit. + max_params = 1024 + + def __init__(self): + self.d_times = cuda.mem_alloc(4 * self.max_knots * self.max_params) + self.d_knots = cuda.mem_alloc(4 * self.max_knots * self.max_params) + self.d_ptimes = cuda.mem_alloc(4 * self.max_knots) + self.d_pals = cuda.mem_alloc(4 * 4 * 256 * self.max_knots) + +class DevInfo(object): + """ + The buffers which hold temporal samples on-device, as used by iter. + """ + + # The palette texture/surface covers the color coordinate from [0,1] with + # equidistant horizontal samples, and spans the temporal range of the + # frame linearly with this many rows. Increasing these values increases the + # number of uniquely-dithered samples when using pre-dithered surfaces, as + # is done in 'atomic' accumulation. + palette_width = 256 # TODO: make this setting be respected + palette_height = 64 + + # This used to be determined automagically, but simultaneous occupancy + # and a much smaller block size simplifies this. + ntemporal_samples = 1024 # Number of iterations to iterate without write after generating a new # point. This number is currently fixed pretty deeply in the set of magic # constants which govern buffer sizes; changing the value here won't # actually change the code on the device to do something different. + # It's here just for documentation purposes. fuse = 256 - # The palette texture/surface covers the color coordinate from [0,1] with - # (for now, a fixed 256) equidistant horizontal samples, and spans the - # temporal range of the frame linearly with this many rows. Increasing - # this value increases the number of uniquely-dithered samples when using - # pre-dithered surfaces. 
- palette_height = 64 + def __init__(self): + self.d_params = cuda.mem_alloc( + self.ntemporal_samples * DevSrc.max_params * 4) + self.palette_surf_dsc = util.argset(cuda.ArrayDescriptor3D(), + height=self.palette_height, width=self.palette_width, depth=0, + format=cuda.array_format.SIGNED_INT32, + num_channels=2, flags=cuda.array3d_flags.SURFACE_LDST) + self.d_pal_array = cuda.Array(self.palette_surf_dsc) - # Palette color interpolation mode (see code.interp.Palette) - palette_interp_mode = 'yuv' +class Renderer(object): + def __init__(self, gnm): + self.packer, self.lib = iter.mkiterlib(gnm) + cubin = util.compile('iter', assemble_code(self.lib)) + self.mod = cuda.module_from_buffer(cubin) - # Maximum width of DE and other spatial filters, and thus in turn the - # amount of padding applied. Note that, for now, this must not be changed! - # The filtering code makes deep assumptions about this value. - gutter = 10 + # TODO: make these customizable + self.filts = [ filters.Bilateral() + , filters.Logscale() + , filters.ColorClip() ] + self.out = output.PILOutput() - # Accumulation mode. Leave it at 'atomic' for now. - acc_mode = 'atomic' - - # At most this many separate buffers for xforms will be allocated, after - # which further xforms will wrap to the first when writing. Currently it - # is compiled in, so power-of-two and no runtime maximization. Current - # value of 16 fits into a 1GB card at 1080p. - max_nxf = 1 - - # TODO - chaos_used = False - - cmp_options = ('-use_fast_math', '-maxrregcount', '42') - keep = False +class RenderManager(ClsMod): + lib = devlib(deps=[interp.palintlib, filldptrlib, iter.flushatomlib]) def __init__(self): - self._iter = self.pal = self.src = self.cubin = self.mod = None + super(RenderManager, self).__init__() + self.pool = pycuda.tools.PageLockedMemoryPool() - # Ensure class options don't get contaminated on an instance - self.cmp_options = list(self.cmp_options) + self.fb = Framebuffers() + self.src_a, self.src_b = DevSrc(), DevSrc() + self.info_a, self.info_b = DevInfo(), DevInfo() + self.stream_a, self.stream_b = cuda.Stream(), cuda.Stream() + self.filt_evt = self.copy_evt = None - def compile(self, genome, keep=None, cmp_options=None): + def _copy(self, rdr, gnm): """ - Compile a kernel capable of rendering every frame in this animation. - The resulting compiled kernel is stored in the ``cubin`` property; - the source is available as ``src``, and is also returned for - inspection and display. + Queue a copy of a host genome into a set of device interpolation source + buffers. - This operation is idempotent, and has no side effects outside of - setting properties on this instance (unless there's a compiler error, - which is a bug); it should therefore be threadsafe as well. - It is, however, rather slow. + Note that for now, this is broken! It ignores ``gnm``, and only packs + the genome that was used when creating the renderer. 
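+
+        (Editor's note, inferred from ``queue_frame`` below: ``src_a`` and
+        ``src_b`` are swapped just before this method is called, so the
+        buffers that the frame still in flight is interpolating from are
+        never overwritten by these copies.)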
""" - keep = self.keep if keep is None else keep - cmp_options = self.cmp_options if cmp_options is None else cmp_options + times, knots = rdr.packer.pack(self.pool) + cuda.memcpy_htod_async(self.src_a.d_times, times, self.stream_a) + cuda.memcpy_htod_async(self.src_a.d_knots, knots, self.stream_a) - self._iter = iter.IterCode(self, genome) - self._iter.packer.finalize() - self.pal = interp.Palette(self.palette_interp_mode) - self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer, - self.pal, self._iter) - with open(os.path.join(tempfile.gettempdir(), 'kernel.cu'), 'w') as fp: - fp.write(self.src) - self.cubin = pycuda.compiler.compile( - self.src, keep=keep, options=cmp_options, - cache_dir=False if keep else None) + ptimes, pidxs = zip(*gnm.palette_times) + palettes = self.pool.allocate((len(ptimes), 256, 4), f32) + palettes[:] = [gnm.decoded_palettes[i] for i in pidxs] + palette_times = self.pool.allocate((self.src_a.max_knots,), f32) + palette_times.fill(1e9) + palette_times[:len(ptimes)] = ptimes + cuda.memcpy_htod_async(self.src_a.d_pals, palettes, self.stream_a) + cuda.memcpy_htod_async(self.src_a.d_ptimes, palette_times, + self.stream_a) - def load(self, genome, jit_options=[]): - if not self.cubin: - self.compile(genome) - self.mod = cuda.module_from_buffer(self.cubin, jit_options) - with open('/tmp/iter_kern.cubin', 'wb') as fp: - fp.write(self.cubin) - return self.src + # TODO: use bilerp tex as src for palette interp - def render(self, genome, times, width, height, blend=True): + def _interp(self, rdr, gnm, dim, ts, td): + d_acc_size = rdr.mod.get_global('acc_size')[0] + p_dim = self.pool.allocate((len(dim),), u32) + p_dim[:] = dim + cuda.memcpy_htod_async(d_acc_size, p_dim, self.stream_a) + + tref = self.mod.get_surfref('flatpal') + tref.set_array(self.info_a.d_pal_array, 0) + launch('interp_palette_flat', self.mod, self.stream_a, + 256, self.info_a.palette_height, + self.fb.d_rb, self.fb.d_seeds, + self.src_a.d_ptimes, self.src_a.d_pals, + f32(ts), f32(td / self.info_a.palette_height)) + + nts = self.info_a.ntemporal_samples + launch('interp_iter_params', rdr.mod, self.stream_a, + 256, np.ceil(nts / 256.), + self.info_a.d_params, self.src_a.d_times, self.src_a.d_knots, + f32(ts), f32(td / nts), i32(nts)) + + def _print_interp_knots(self, rdr, tsidx=5): + infos = cuda.from_device_like(self.info_a.d_params, + (tsidx + 1, self.info_a.max_params), f32) + for i, n in zip(infos[-1], rdr.packer.packed): + print '%60s %g' % ('_'.join(n), i) + + def _iter(self, rdr, gnm, dim, tc): + tref = rdr.mod.get_surfref('flatpal') + tref.set_array(self.info_a.d_pal_array, 0) + + nbins = dim.ah * dim.astride + fill = lambda b, s, v=i32(0): util.fill_dptr( + self.mod, b, s, stream=self.stream_a, value=v) + fill(self.fb.d_front, 4 * nbins) + fill(self.fb.d_side, 2 * nbins) + fill(self.fb.d_points, self.fb._len_d_points / 4, f32(np.nan)) + + nts = self.info_a.ntemporal_samples + nsamps = (gnm.spp(tc) * dim.w * dim.h) + nrounds = int(nsamps / (nts * 256. * 256)) + 1 + launch('iter', rdr.mod, self.stream_a, (32, 8, 1), (nts, nrounds), + self.fb.d_front, self.fb.d_side, + self.fb.d_rb, self.fb.d_seeds, self.fb.d_points, + self.info_a.d_params) + + nblocks = int(np.ceil(np.sqrt(dim.ah*dim.astride/256.))) + launch('flush_atom', self.mod, self.stream_a, + 256, (nblocks, nblocks), + u64(self.fb.d_front), u64(self.fb.d_side), i32(nbins)) + + def queue_frame(self, rdr, gnm, tc, w, h, copy=True): """ - Render a frame for each timestamp in the iterable value ``times``. 
-        function returns a generator that will yield a RenderedImage object
-        containing a shared reference to the output buffer for each specified
-        frame.
+        Queue one frame for rendering.

-        The returned buffer is page-locked host memory. Between the time a
-        buffer is yielded and the time the next frame's results are requested,
-        the buffer will not be modified. Thereafter, however, it will be
-        overwritten by an asynchronous DMA operation coming from the CUDA
-        device. If you hang on to it for longer than one frame, copy it.
+        ``rdr`` is a compiled Renderer module. The caller must ensure that
+        the module is compatible with the genome data provided.

-        ``genome`` is the genome to be rendered. Successive calls to the
-        `render()` method on one ``Renderer`` object must use genomes which
-        produce identical compiled code, and this will not be verified by the
-        renderer. In practice, this means you can alter genome parameter
-        values, but the full set of keys must remain identical between runs on
-        the same renderer.
+        ``gnm`` is the genome to be rendered.

-        ``times`` is a list of (idx, cen_time) tuples, where ``idx`` is passed
-        unmodified in the RenderedImage return value and ``cen_time`` is the
-        central time of the current frame in spline-time units. (Any
-        clock-time or frame-time units in the genome should be preconverted.)
+        ``tc`` is the center time at which to render.

-        If ``blend`` is False, the output buffer will contain unclipped,
-        premultiplied RGBA data, without vibrancy, highlight power, or the
-        alpha elbow applied.
+        ``w``, ``h`` are the width and height of the desired output in px.
+
+        If ``copy`` is False, the genome data already on the device is reused
+        instead of being recopied. This function must be called with
+        ``copy=True`` the first time a new genome is used, and may be called
+        that way on subsequent frames without harm. I suspect the performance
+        impact is low, so leave ``copy`` set to True every time for now.
+
+        The return value is a 2-tuple ``(evt, h_out)``, where ``evt`` is a
+        CUDA event and ``h_out`` is the return value of the output module's
+        ``copy`` function. In the typical case, ``h_out`` will be a host
+        allocation containing data in an appropriate format for the output
+        module's file writer, and ``evt`` indicates when the asynchronous
+        DMA copy which will populate ``h_out`` is complete. This can vary
+        depending on the output module in use, though.
         """
-        r = self.render_gen(genome, width, height, blend=blend)
-        next(r)
-        return ifilter(None, imap(r.send, chain(times, [None])))
+        # Note: we synchronize on the previous stream if buffers need to be
+        # reallocated, which implicitly also syncs the current stream.
+        dim = self.fb.set_dim(w, h, self.stream_b)

-    def render_gen(self, genome, width, height, blend=True):
+        td = gnm.adj_frame_width(tc)
+        ts, te = tc - 0.5 * td, tc + 0.5 * td
+
+        # The stream interleaving here is nontrivial.
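+        # Editor's sketch of one frame on the active stream, inferred from
+        # the calls below (a proper diagram is the TODO underneath):
+        #
+        #   copy src -> interp -> [wait filt_evt] -> iter -> flush
+        #     -> [wait copy_evt] -> filters -> convert (writes back buffer)
+        #     -> record filt_evt -> dtoh copy -> record copy_evt
+        #
+        # Waiting on filt_evt keeps iter from scribbling on the front and
+        # side buffers while the previous frame is still filtering; waiting
+        # on copy_evt keeps the filters off the back buffer until the
+        # previous frame's dtoh copy out of it has finished.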
+ # TODO: update diagram and link to it here + if copy: + self.src_a, self.src_b = self.src_b, self.src_a + self._copy(rdr, gnm) + self._interp(rdr, gnm, dim, ts, td) + if self.filt_evt: + self.stream_a.wait_for_event(self.filt_evt) + self._iter(rdr, gnm, dim, tc) + if self.copy_evt: + self.stream_a.wait_for_event(self.copy_evt) + for filt in rdr.filts: + filt.apply(self.fb, gnm, dim, tc, self.stream_a) + rdr.out.convert(self.fb, gnm, dim, self.stream_a) + self.filt_evt = cuda.Event().record(self.stream_a) + h_out = rdr.out.copy(self.fb, dim, self.pool, self.stream_a) + self.copy_evt = cuda.Event().record(self.stream_a) + + self.info_a, self.info_b = self.info_b, self.info_a + self.stream_a, self.stream_b = self.stream_b, self.stream_a + return self.copy_evt, h_out + + def render(self, gnm, times, w, h): """ - Render frames. This method is wrapped by the ``render()`` method; see - its docstring for warnings and details. - - Instead of passing frame times as an iterable, they are passed - individually via the ``generator.send()`` method. There is an - internal pipeline latency of one frame, so the first call to the - ``send()`` method will return None, the second call will return the - first frame's result, and so on. To retrieve the last frame in a - sequence, send ``None``. - - Direct use of this method is useful for implementing render servers. + A port of the old rendering function, retained for backwards + compatibility. Some of this will be pulled into as-yet-undecided + methods for more DRY. """ - + rdr = Renderer(gnm) + last_evt = cuda.Event().record(self.stream_a) last_idx = None - next_frame = yield - if next_frame is None: - return - - if not self.mod: - self.load(genome) - - filt = filtering.Filtering() - - reset_rb_fun = self.mod.get_function("reset_rb") - packer_fun = self.mod.get_function("interp_iter_params") - iter_fun = self.mod.get_function("iter") - - # The synchronization model is messy. See helpers/task_model.svg. - iter_stream = cuda.Stream() - filt_stream = cuda.Stream() - if self.acc_mode == 'deferred': - write_stream = cuda.Stream() - write_fun = self.mod.get_function("write_shmem") - else: - write_stream = iter_stream - - # These events fire when the corresponding buffer is available for - # reading on the host (i.e. the copy is done). On the first pass, 'a' - # will be ignored, and subsequently moved to 'b'. 
- event_a = cuda.Event().record(filt_stream) - event_b = None - - awidth = width + 2 * self.gutter - aheight = 32 * int(np.ceil((height + 2 * self.gutter) / 32.)) - astride = 32 * int(np.ceil(awidth / 32.)) - dim = Dimensions(width, height, awidth, aheight, astride) - d_acc_size = self.mod.get_global('acc_size')[0] - cuda.memcpy_htod_async(d_acc_size, u32(list(dim)), write_stream) - - nbins = astride * aheight - nxf = len(filter(lambda g: g != 'final', genome.xforms)) - nxf = min(nxf, self.max_nxf) - d_accum = cuda.mem_alloc(16 * nbins * nxf) - d_out = cuda.mem_alloc(16 * nbins) - if self.acc_mode == 'atomic': - d_atom = cuda.mem_alloc(8 * nbins * nxf) - flush_fun = self.mod.get_function("flush_atom") - else: - # d_atom is also used as a scratch buffer during filtering, so we - # need it at least this size - d_atom = cuda.mem_alloc(4 * nbins) - - obuf_copy = util.argset(cuda.Memcpy2D(), - src_y=self.gutter, src_x_in_bytes=16*self.gutter, - src_pitch=16*astride, dst_pitch=16*width, - width_in_bytes=16*width, height=height) - obuf_copy.set_src_device(d_out) - h_out_a = cuda.pagelocked_empty((height, width, 4), f32) - h_out_b = cuda.pagelocked_empty((height, width, 4), f32) - - if self.acc_mode == 'deferred': - # Having a fixed, power-of-two log size makes things much easier - log_size = 64 << 20 - d_log = cuda.mem_alloc(log_size * 4) - d_log_sorted = cuda.mem_alloc(log_size * 4) - sorter = sort.Sorter(log_size) - # We need to cover each unique tag - address bits 20-23 - with one - # write block per sort bin. Or somethinig like that. - nwriteblocks = int(np.ceil(nbins / float(1<<20))) * 256 - - # Calculate 'nslots', the number of simultaneous running threads that - # can be active on the GPU during iteration (and thus the number of - # slots for loading and storing RNG and point context that will be - # prepared on the device), and derive 'rb_size', the number of blocks in - # 'nslots'. - iter_threads_per_block = 256 - dev_data = pycuda.tools.DeviceData() - occupancy = pycuda.tools.OccupancyRecord( - dev_data, iter_threads_per_block, - iter_fun.shared_size_bytes, iter_fun.num_regs) - nsms = cuda.Context.get_device().multiprocessor_count - rb_size = occupancy.warps_per_mp * nsms / (iter_threads_per_block / 32) - nslots = iter_threads_per_block * rb_size - - # Reset the ringbuffer info for the slots - reset_rb_fun(np.int32(rb_size), block=(1,1,1)) - - d_points = cuda.mem_alloc(nslots * 16) - # This statement may add extra seeds to simplify palette dithering. - seeds = mwc.MWC.make_seeds(max(nslots, 256 * self.palette_height)) - d_seeds = cuda.to_device(seeds) - - # We used to auto-calculate this to a multiple of the number of SMs on - # the device, but since we now use shorter launches and, to a certain - # extent, allow simultaneous occupancy, that's not as important. The - # 1024 is a magic constant to ensure reasonable and power-of-two log - # size for deferred: 256MB / (4B * FUSE * NTHREADS). Enhancements to - # the sort engine are needed to make this more flexible. 
- ntemporal_samples = 1024 - genome_times, genome_knots = self._iter.packer.pack() - d_genome_times = cuda.to_device(genome_times) - d_genome_knots = cuda.to_device(genome_knots) - info_size = 4 * len(self._iter.packer) * ntemporal_samples - d_infos = cuda.mem_alloc(info_size) - - ptimes, pidxs = zip(*genome.palette_times) - palint_times = np.empty(len(genome_times[0]), f32) - palint_times.fill(1e10) - palint_times[:len(ptimes)] = ptimes - d_palint_times = cuda.to_device(palint_times) - pvals = self.pal.prepare([genome.decoded_palettes[i] for i in pidxs]) - d_palint_vals = cuda.to_device(np.concatenate(pvals)) - - if self.acc_mode in ('deferred', 'atomic'): - palette_fun = self.mod.get_function("interp_palette_flat") - dsc = util.argset(cuda.ArrayDescriptor3D(), - height=self.palette_height, width=256, depth=0, - format=cuda.array_format.SIGNED_INT32, - num_channels=2, flags=cuda.array3d_flags.SURFACE_LDST) - palarray = cuda.Array(dsc) - - tref = self.mod.get_surfref('flatpal') - tref.set_array(palarray, 0) - else: - palette_fun = self.mod.get_function("interp_palette") - dsc = util.argset(cuda.ArrayDescriptor(), - height=self.palette_height, width=256, - format=cuda.array_format.UNSIGNED_INT8, - num_channels=4) - d_palmem = cuda.mem_alloc(256 * self.palette_height * 4) - - tref = self.mod.get_texref('palTex') - tref.set_address_2d(d_palmem, dsc, 1024) - tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) - tref.set_filter_mode(cuda.filter_mode.LINEAR) - - while next_frame is not None: - # tc, td, ts, te: central, delta, start, end times - idx, tc = next_frame - td = genome.adj_frame_width(tc) - ts, te = tc - 0.5 * td, tc + 0.5 * td - - if self.acc_mode in ('deferred', 'atomic'): - # In this mode, the palette writes to a surface reference, but - # requires dithering, so we pass it the seeds instead - arg0 = d_seeds - else: - arg0 = d_palmem - palette_fun(arg0, d_palint_times, d_palint_vals, - f32(ts), f32(td / self.palette_height), - block=(256,1,1), grid=(self.palette_height,1), - stream=write_stream) - - packer_fun(d_infos, d_genome_times, d_genome_knots, - f32(ts), f32(td / ntemporal_samples), - i32(ntemporal_samples), block=(256,1,1), - grid=(int(np.ceil(ntemporal_samples/256.)),1), - stream=iter_stream) - - # Reset points so that they will be FUSEd - util.BaseCode.fill_dptr(self.mod, d_points, 4 * nslots, - iter_stream, f32(np.nan)) - - # Get interpolated control points for debugging - #iter_stream.synchronize() - #d_temp = cuda.from_device(d_infos, - #(ntemporal_samples, len(self._iter.packer)), f32) - #for i, n in zip(d_temp[5], self._iter.packer.packed): - #print '%60s %g' % ('_'.join(n), i) - - util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins * nxf, - write_stream) - if self.acc_mode == 'atomic': - util.BaseCode.fill_dptr(self.mod, d_atom, 2 * nbins * nxf, - write_stream) - nrounds = int( (genome.spp(tc) * width * height) - / (ntemporal_samples * 256 * 256) ) + 1 - if self.acc_mode == 'deferred': - for i in range(nrounds): - iter_fun(np.uint64(d_log), d_seeds, d_points, d_infos, - block=(32, self._iter.NTHREADS/32, 1), - grid=(ntemporal_samples, 1), stream=iter_stream) - _sync_stream(write_stream, iter_stream) - sorter.sort(d_log_sorted, d_log, log_size, 3, True, - stream=write_stream) - _sync_stream(iter_stream, write_stream) - write_fun(d_accum, d_log_sorted, sorter.dglobal, i32(nbins), - block=(1024, 1, 1), grid=(nwriteblocks, 1), - stream=write_stream) - else: - args = [u64(d_accum), d_seeds, d_points, d_infos] - if self.acc_mode == 'atomic': - args.append(u64(d_atom)) - 
iter_fun(*args, block=(32, self._iter.NTHREADS/32, 1),
-                         grid=(ntemporal_samples, nrounds), stream=iter_stream)
-                if self.acc_mode == 'atomic':
-                    nblocks = int(np.ceil(np.sqrt(nbins*nxf/float(512))))
-                    flush_fun(u64(d_accum), u64(d_atom), i32(nbins*nxf),
-                              block=(512, 1, 1), grid=(nblocks, nblocks),
-                              stream=iter_stream)
-
-            util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
-            _sync_stream(filt_stream, write_stream)
-            filt.de(d_out, d_accum, d_atom, genome, dim, tc, nxf,
-                    stream=filt_stream)
-            _sync_stream(write_stream, filt_stream)
-            filt.colorclip(d_out, genome, dim, tc, blend, stream=filt_stream)
-            obuf_copy.set_dst_host(h_out_a)
-            obuf_copy(filt_stream)
-
-            if event_b:
-                while not event_a.query():
-                    timemod.sleep(0.01)
-                gpu_time = event_a.time_since(event_b)
-                result = RenderedImage(h_out_b, last_idx, gpu_time)
-            else:
-                result = None
-            last_idx = idx
-
-            event_a, event_b = cuda.Event().record(filt_stream), event_a
-            h_out_a, h_out_b = h_out_b, h_out_a
-
-            # TODO: add ability to flush a frame without breaking the pipe
-            next_frame = yield result
-
-        while not event_a.query():
-            timemod.sleep(0.001)
-        gpu_time = event_a.time_since(event_b)
-        yield RenderedImage(h_out_b, last_idx, gpu_time)
-
+        def wait(): # Times like these where you wish for a macro
+            while not last_evt.query():
+                time.sleep(0.01)
+            gpu_time = last_evt.time_since(two_evts_ago)
+            # Return the frame whose event we just waited on: the previous
+            # frame during the loop, the final frame after it.
+            return RenderedImage(last_buf, last_idx, gpu_time)
+        for idx, tc in times:
+            evt, h_buf = self.queue_frame(rdr, gnm, tc, w, h, last_idx is None)
+            if last_idx is not None:
+                yield wait()
+            two_evts_ago, last_evt = last_evt, evt
+            last_buf, last_idx = h_buf, idx
+        if last_idx is not None:
+            yield wait()
diff --git a/main.py b/main.py
index e3e0b04..6b01f52 100644
--- a/main.py
+++ b/main.py
@@ -3,8 +3,8 @@
 # cuburn, one of a surprisingly large number of ports of the fractal flame
 # algorithm to NVIDIA GPUs.
 #
-# This one is copyright 2010-2011, Steven Robertson
-# and Eric Reckase .
+# This one is copyright 2010-2012, Steven Robertson
+# and Erik Reckase .
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 or later
@@ -20,11 +20,9 @@ from subprocess import Popen
 from itertools import ifilter
 
 import numpy as np
-import Image
-import scipy
 import pycuda.driver as cuda
 
-from cuburn import genome, render
+from cuburn import genome, render, filters, output
 
 profiles = {
     '1080p': dict(fps=24, width=1920, height=1080, quality=3000, skip=0),
@@ -33,11 +31,10 @@ profiles = {
     'preview': dict(fps=24, width=640, height=360, quality=800, skip=1)
 }
 
-def save(rframe):
-    noalpha = rframe.buf[:,:,:3]
-    img = scipy.misc.toimage(noalpha, cmin=0, cmax=1)
-    img.save(rframe.idx, quality=98)
-    print rframe.idx, rframe.gpu_time
+def save(out):
+    # Temporary! TODO: fix this
+    output.PILOutput.save(out.buf, out.idx)
+    print out.idx, out.gpu_time
 
 def main(args, prof):
     import pycuda.autoinit
@@ -53,9 +50,7 @@ def main(args, prof):
     gnm = genome.Genome(gnm)
     err, times = gnm.set_profile(prof)
 
-    anim = render.Renderer()
-    anim.compile(gnm, keep=args.keep)
-    anim.load(gnm)
+    rmgr = render.RenderManager()
 
     basename = os.path.basename(args.flame.name).rsplit('.', 1)[0] + '_'
     if args.flame.name == '-':
@@ -76,61 +71,61 @@ def main(args, prof):
            if not os.path.isfile(f[0]) or m > os.path.getmtime(f[0]))
     w, h = prof['width'], prof['height']
 
-    gen = anim.render(gnm, frames, w, h)
+    gen = rmgr.render(gnm, frames, w, h)
 
-    if args.gfx:
-        import pyglet
-        window = pyglet.window.Window(w, h, vsync=False)
-        image = pyglet.image.CheckerImagePattern().create_image(w, h)
-        label = pyglet.text.Label('Rendering first frame', x=5, y=h-5,
-                                  width=w, anchor_y='top', font_size=16,
-                                  bold=True, multiline=True)
-
-        @window.event
-        def on_draw():
-            window.clear()
-            image.texture.blit(0, 0)
-            label.draw()
-
-        @window.event
-        def on_key_press(sym, mod):
-            if sym == pyglet.window.key.Q:
-                pyglet.app.exit()
-
-        @window.event
-        def on_mouse_motion(x, y, dx, dy):
-            pass
-
-        last_time = [time.time()]
-
-        def poll(dt):
-            out = next(gen, False)
-            if out is False:
-                if args.pause:
-                    label.text = "Done. ('q' to quit)"
-                    #pyglet.clock.unschedule(poll)
-                else:
-                    pyglet.app.exit()
-            elif out is not None:
-                real_dt = time.time() - last_time[0]
-                last_time[0] = time.time()
-                save(out)
-                imgbuf = np.uint8(out.buf.flatten() * 255)
-                image.set_data('RGBA', -w*4, imgbuf.tostring())
-                label.text = '%s (%g fps)' % (out.idx, 1./real_dt)
-            else:
-                label.text += '.'
-            if args.sync:
-                cuda.Context.synchronize()
-
-        pyglet.clock.set_fps_limit(30)
-        pyglet.clock.schedule_interval(poll, 1/30.)
-        pyglet.app.run()
-    else:
+    if not args.gfx:
         for out in gen:
             save(out)
-            if args.sync:
-                cuda.Context.synchronize()
+        return
+
+    import pyglet
+    window = pyglet.window.Window(w, h, vsync=False)
+    image = pyglet.image.CheckerImagePattern().create_image(w, h)
+    label = pyglet.text.Label('Rendering first frame', x=5, y=h-5,
+                              width=w, anchor_y='top', font_size=16,
+                              bold=True, multiline=True)
+
+    @window.event
+    def on_draw():
+        window.clear()
+        image.texture.blit(0, 0)
+        label.draw()
+
+    @window.event
+    def on_key_press(sym, mod):
+        if sym == pyglet.window.key.Q:
+            pyglet.app.exit()
+
+    @window.event
+    def on_mouse_motion(x, y, dx, dy):
+        pass
+
+    last_time = [time.time()]
+
+    def poll(dt):
+        out = next(gen, False)
+        if out is False:
+            if args.pause:
+                label.text = "Done. ('q' to quit)"
+                #pyglet.clock.unschedule(poll)
+            else:
+                pyglet.app.exit()
+        elif out is not None:
+            real_dt = time.time() - last_time[0]
+            last_time[0] = time.time()
+            save(out)
+            # out.buf is already uint8 RGBA from PILOutput.copy, so hand it
+            # to pyglet directly; scaling by 255 here would wrap the values.
+            imgbuf = out.buf.flatten()
+            image.set_data('RGBA', -w*4, imgbuf.tostring())
+            label.text = '%s (%g fps)' % (out.idx, 1./real_dt)
+        else:
+            label.text += '.'
+        if args.sync:
+            cuda.Context.synchronize()
+
+    pyglet.clock.set_fps_limit(30)
+    pyglet.clock.schedule_interval(poll, 1/30.)
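+    # poll() runs on pyglet's main thread at most 30 times a second and
+    # pulls at most one finished frame from the render generator per tick,
+    # so the preview is capped at 30 fps regardless of render speed.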
+    pyglet.app.run()
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Render fractal flames.')
@@ -150,8 +145,6 @@ if __name__ == "__main__":
     parser.add_argument('--pause', action='store_true',
            help="Don't close the preview window after rendering is finished")
-    parser.add_argument('--keep', action='store_true', dest='keep',
-           help='Keep compilation directory (disables kernel caching)')
     parser.add_argument('--sync', action='store_true', dest='sync',
            help='Use synchronous launches whenever possible')
 
diff --git a/worker.py b/worker.py
index c04c97a..7344aa0 100644
--- a/worker.py
+++ b/worker.py
@@ -17,6 +17,8 @@ import redis
 
 from cuburn import render, genome
 
+import pycuda.driver as cuda
+
 pycuda = None
 
 # The default maximum number of waiting jobs. Also used to determine when a
@@ -50,20 +52,6 @@ def get_temperature():
             return out[idx+1:idx+3]
     return ''
 
-def push_frame(r, out):
-    if out is None:
-        return
-    sid, sidx, ftag = out.idx
-    # TODO: gotta put this in a module somewhere and make it adjustable
-    noalpha = out.buf[:,:,:3]
-    img = scipy.misc.toimage(noalpha, cmin=0, cmax=1)
-    buf = StringIO()
-    img.save(buf, 'jpeg', quality=98)
-    buf.seek(0)
-    head = ' '.join([sidx, str(out.gpu_time), ftag])
-    r.rpush(sid + ':queue', head + '\0' + buf.read())
-    print 'Pushed frame: %s' % head
-
 def work(server):
     global pycuda
     import pycuda.autoinit
@@ -80,7 +68,11 @@ def work(server):
             r.expire(wid, 180)
             last_ping = time.time()
 
-    last_pid, last_gid, riter = None, None, None
+    idx = evt = buf = None
+    last_idx = last_buf = last_evt = two_evts_ago = None
+    last_pid = last_gid = rdr = None
+
+    mgr = render.RenderManager()
 
     while True:
         task = r.blpop('renderpool:' + rev + ':queue', 10)
@@ -90,28 +82,46 @@ def work(server):
             r.expire(wid, 180)
             last_ping = now
 
+        # last_evt will be populated during normal queue operation (when evt
+        # contains the most recent event), as well as when the render queue is
+        # flushing due to not receiving a new task before the timeout.
+        if last_idx is not None:
+            while not last_evt.query():
+                # This delay could probably be a lot higher with zero impact
+                # on throughput for Fermi cards
+                time.sleep(0.05)
+
+            sid, sidx, ftag = last_idx
+            obuf = StringIO()
+            rdr.out.save(last_buf, obuf, 'jpeg')
+            obuf.seek(0)
+            gpu_time = last_evt.time_since(two_evts_ago)
+            head = ' '.join([sidx, str(gpu_time), ftag])
+            r.rpush(sid + ':queue', head + '\0' + obuf.read())
+            print 'Pushed frame: %s' % head
+
+        two_evts_ago, last_evt = last_evt, evt
+        last_idx, last_buf = idx, buf
+
         if not task:
-            if riter:
-                push_frame(r, riter.send(None))
-                riter = None
+            idx = evt = buf = None
             continue
 
         sid, sidx, pid, gid, ftime, ftag = task[1].split(' ', 5)
-        if pid != last_pid or gid != last_gid or not riter:
+        new_gnm = pid != last_pid or gid != last_gid or not rdr
+        if new_gnm:
             gnm = genome.Genome(json.loads(r.get(gid)))
             prof = json.loads(r.get(pid))
             gnm.set_profile(prof)
-            renderer = render.Renderer()
-            renderer.load(gnm)
+            rdr = render.Renderer(gnm)
+            last_pid, last_gid = pid, gid
 
-        if riter:
-            push_frame(r, riter.send(None))
+        if last_evt is None:
+            # Create a dummy event for timing
+            last_evt = cuda.Event().record(mgr.stream_a)
 
-            riter = renderer.render_gen(gnm, prof['width'], prof['height'])
-            next(riter)
-            last_pid, last_gid = pid, gid
-
-        push_frame(r, riter.send(((sid, sidx, ftag), float(ftime))))
+        # Recopy the genome data when the genome changes or after a queue
+        # flush; queue_frame requires copy=True the first time a genome is
+        # used.
+        copy = last_idx is None or new_gnm
+        w, h = prof['width'], prof['height']
+        evt, buf = mgr.queue_frame(rdr, gnm, float(ftime), w, h, copy)
+        idx = sid, sidx, ftag
 
 def iter_genomes(prof, gpaths, pname='540p'):
     """