mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 11:40:04 -05:00
New genome format to support flockutil
This commit is contained in:
parent
12655b8611
commit
b43481e374
@ -1,5 +1,7 @@
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
from numpy import float32 as f32, int32 as i32
|
||||||
|
|
||||||
import pycuda.compiler
|
import pycuda.compiler
|
||||||
from pycuda.gpuarray import vec
|
from pycuda.gpuarray import vec
|
||||||
|
|
||||||
@ -10,8 +12,8 @@ _CODE = '''
|
|||||||
|
|
||||||
__global__
|
__global__
|
||||||
void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
|
void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
|
||||||
float linrange, float lingam, float3 bkgd, int fbsize,
|
float linrange, float lingam, float3 bkgd,
|
||||||
int alpha_output_channel) {
|
int fbsize, int blend_background_color) {
|
||||||
int i = threadIdx.x + blockDim.x * (blockIdx.x + gridDim.x * blockIdx.y);
|
int i = threadIdx.x + blockDim.x * (blockIdx.x + gridDim.x * blockIdx.y);
|
||||||
if (i >= fbsize) return;
|
if (i >= fbsize) return;
|
||||||
|
|
||||||
@ -30,8 +32,17 @@ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
|
|||||||
alpha = (1.0f - frac) * pix.w * lingam + frac * alpha;
|
alpha = (1.0f - frac) * pix.w * lingam + frac * alpha;
|
||||||
}
|
}
|
||||||
|
|
||||||
float ls = vibrancy * alpha / pix.w;
|
if (!blend_background_color) {
|
||||||
|
float ls = alpha / pix.w;
|
||||||
|
pix.x *= ls;
|
||||||
|
pix.y *= ls;
|
||||||
|
pix.z *= ls;
|
||||||
|
pix.w = alpha;
|
||||||
|
pixbuf[i] = pix;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
float ls = vibrancy * alpha / pix.w;
|
||||||
alpha = fminf(1.0f, fmaxf(0.0f, alpha));
|
alpha = fminf(1.0f, fmaxf(0.0f, alpha));
|
||||||
|
|
||||||
float maxc = fmaxf(pix.x, fmaxf(pix.y, pix.z));
|
float maxc = fmaxf(pix.x, fmaxf(pix.y, pix.z));
|
||||||
@ -64,29 +75,21 @@ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
|
|||||||
pix.y += (1.0f - vibrancy) * powf(opix.y, gamma);
|
pix.y += (1.0f - vibrancy) * powf(opix.y, gamma);
|
||||||
pix.z += (1.0f - vibrancy) * powf(opix.z, gamma);
|
pix.z += (1.0f - vibrancy) * powf(opix.z, gamma);
|
||||||
|
|
||||||
if (alpha_output_channel) {
|
pix.x += (1.0f - alpha) * bkgd.x;
|
||||||
float one_alpha = 1.0f / alpha;
|
pix.y += (1.0f - alpha) * bkgd.y;
|
||||||
pix.x *= one_alpha;
|
pix.z += (1.0f - alpha) * bkgd.z;
|
||||||
pix.y *= one_alpha;
|
|
||||||
pix.z *= one_alpha;
|
|
||||||
} else {
|
|
||||||
pix.x += (1.0f - alpha) * bkgd.x;
|
|
||||||
pix.y += (1.0f - alpha) * bkgd.y;
|
|
||||||
pix.z += (1.0f - alpha) * bkgd.z;
|
|
||||||
}
|
|
||||||
pix.w = alpha;
|
|
||||||
|
|
||||||
// Clamp values. I think this is superfluous, but I'm not certain.
|
|
||||||
pix.x = fminf(1.0f, pix.x);
|
pix.x = fminf(1.0f, pix.x);
|
||||||
pix.y = fminf(1.0f, pix.y);
|
pix.y = fminf(1.0f, pix.y);
|
||||||
pix.z = fminf(1.0f, pix.z);
|
pix.z = fminf(1.0f, pix.z);
|
||||||
|
pix.w = alpha;
|
||||||
|
|
||||||
pixbuf[i] = pix;
|
pixbuf[i] = pix;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#define W 15 // Filter width (regardless of standard deviation chosen)
|
#define W 15 // Filter width (regardless of standard deviation chosen)
|
||||||
#define W2 7 // Half of filter width, rounded down
|
#define W2 7 // Half of filter width, rounded down
|
||||||
#define FW 46 // Width of local result storage (NW+W2+W2)
|
#define FW 46 // Width of local result storage (NW+W2+W2)
|
||||||
#define FW2 (FW*FW)
|
#define FW2 (FW*FW)
|
||||||
|
|
||||||
@ -288,52 +291,45 @@ class Filtering(object):
|
|||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.init_mod()
|
self.init_mod()
|
||||||
|
|
||||||
def de(self, ddst, dsrc, info, start, stop, stream=None):
|
def de(self, ddst, dsrc, gnm, dim, tc, stream=None):
|
||||||
# TODO: use integration to obtain parameter values
|
k1 = f32(gnm.color.brightness(tc) * 268 / 256)
|
||||||
t = (start + stop) / 2
|
|
||||||
cp = info.genome
|
|
||||||
|
|
||||||
k1 = np.float32(cp.color.brightness(t) * 268 / 256)
|
|
||||||
# Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
|
# Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
|
||||||
# s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
|
# s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
|
||||||
area = info.height / (cp.camera.scale(t) ** 2 * info.width)
|
area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w)
|
||||||
k2 = np.float32(1 / (area * info.density))
|
k2 = f32(1 / (area * gnm.spp(tc)))
|
||||||
|
|
||||||
if cp.de.radius == 0:
|
if gnm.de.radius == 0:
|
||||||
nbins = info.acc_height * info.acc_stride
|
nbins = dim.ah * dim.astride
|
||||||
fun = self.mod.get_function("logscale")
|
fun = self.mod.get_function("logscale")
|
||||||
t = fun(dsrc, ddst, k1, k2,
|
t = fun(dsrc, ddst, k1, k2,
|
||||||
block=(512, 1, 1), grid=(nbins/512, 1), stream=stream)
|
block=(512, 1, 1), grid=(nbins/512, 1), stream=stream)
|
||||||
else:
|
else:
|
||||||
scale_coeff = np.float32(-(1 + cp.de.radius(t)) ** -2.0)
|
scale_coeff = f32(-(1 + gnm.de.radius(tc)) ** -2.0)
|
||||||
est_curve = np.float32(2 * cp.de.curve(t))
|
est_curve = f32(2 * gnm.de.curve(tc))
|
||||||
# TODO: experiment with this
|
# TODO: experiment with this
|
||||||
edge_clamp = np.float32(2.0)
|
edge_clamp = f32(1.2)
|
||||||
fun = self.mod.get_function("density_est")
|
fun = self.mod.get_function("density_est")
|
||||||
fun(dsrc, ddst, scale_coeff, est_curve, edge_clamp, k1, k2,
|
fun(dsrc, ddst, scale_coeff, est_curve, edge_clamp, k1, k2,
|
||||||
np.int32(info.acc_height), np.int32(info.acc_stride),
|
i32(dim.ah), i32(dim.astride), block=(32, 32, 1),
|
||||||
block=(32, 32, 1), grid=(info.acc_width/32, 1), stream=stream)
|
grid=(dim.aw/32, 1), stream=stream)
|
||||||
|
|
||||||
def colorclip(self, dbuf, info, start, stop, stream=None):
|
def colorclip(self, dbuf, gnm, dim, tc, blend, stream=None):
|
||||||
f32 = np.float32
|
nbins = dim.ah * dim.astride
|
||||||
t = (start + stop) / 2
|
|
||||||
cp = info.genome
|
|
||||||
nbins = info.acc_height * info.acc_stride
|
|
||||||
|
|
||||||
# TODO: implement integration over cubic splines?
|
# TODO: implement integration over cubic splines?
|
||||||
gam = f32(1 / cp.color.gamma(t))
|
gam = f32(1 / gnm.color.gamma(tc))
|
||||||
vib = f32(cp.color.vibrancy(t))
|
vib = f32(gnm.color.vibrancy(tc))
|
||||||
hipow = f32(cp.color.highlight_power(t))
|
hipow = f32(gnm.color.highlight_power(tc))
|
||||||
lin = f32(cp.color.gamma_threshold(t))
|
lin = f32(gnm.color.gamma_threshold(tc))
|
||||||
lingam = f32(lin ** (gam-1.0) if lin > 0 else 0)
|
lingam = f32(lin ** (gam-1.0) if lin > 0 else 0)
|
||||||
bkgd = vec.make_float3(
|
bkgd = vec.make_float3(
|
||||||
cp.color.background.r(t),
|
gnm.color.background.r(tc),
|
||||||
cp.color.background.g(t),
|
gnm.color.background.g(tc),
|
||||||
cp.color.background.b(t))
|
gnm.color.background.b(tc))
|
||||||
|
|
||||||
color_fun = self.mod.get_function("colorclip")
|
color_fun = self.mod.get_function("colorclip")
|
||||||
blocks = int(np.ceil(np.sqrt(nbins / 256)))
|
blocks = int(np.ceil(np.sqrt(nbins / 256)))
|
||||||
color_fun(dbuf, gam, vib, hipow, lin, lingam, bkgd, np.int32(nbins),
|
color_fun(dbuf, gam, vib, hipow, lin, lingam, bkgd, i32(nbins),
|
||||||
np.int32(0),
|
i32(blend), block=(256, 1, 1), grid=(blocks, blocks),
|
||||||
block=(256, 1, 1), grid=(blocks, blocks), stream=stream)
|
stream=stream)
|
||||||
|
|
||||||
|
@ -226,8 +226,11 @@ class GenomePacker(HunkOCode):
|
|||||||
_defs = Template(r"""
|
_defs = Template(r"""
|
||||||
|
|
||||||
__global__
|
__global__
|
||||||
void interp_{{tname}}({{tname}}* out, float *times, float *knots,
|
void interp_{{tname}}(
|
||||||
float tstart, float tstep, mwc_st *rctxes, int maxid) {
|
{{tname}}* out, mwc_st *rctxes,
|
||||||
|
const float *times, const float *knots,
|
||||||
|
float tstart, float tstep, int maxid
|
||||||
|
) {
|
||||||
int id = gtid();
|
int id = gtid();
|
||||||
if (id >= maxid) return;
|
if (id >= maxid) return;
|
||||||
out = &out[id];
|
out = &out[id];
|
||||||
|
@ -53,7 +53,7 @@ def precalc_chaos(pcp, std_xforms):
|
|||||||
|
|
||||||
""").substitute(locals()))
|
""").substitute(locals()))
|
||||||
|
|
||||||
def precalc_camera(info, pcam):
|
def precalc_camera(pcam):
|
||||||
pre_cam = pcam._precalc()
|
pre_cam = pcam._precalc()
|
||||||
|
|
||||||
# Maxima code to check my logic:
|
# Maxima code to check my logic:
|
||||||
@ -68,7 +68,7 @@ def precalc_camera(info, pcam):
|
|||||||
float rot = {{pre_cam.rotation}} * M_PI / 180.0f;
|
float rot = {{pre_cam.rotation}} * M_PI / 180.0f;
|
||||||
float rotsin = sin(rot), rotcos = cos(rot);
|
float rotsin = sin(rot), rotcos = cos(rot);
|
||||||
float cenx = {{pre_cam.center.x}}, ceny = {{pre_cam.center.y}};
|
float cenx = {{pre_cam.center.x}}, ceny = {{pre_cam.center.y}};
|
||||||
float scale = {{pre_cam.scale}} * {{info.width}};
|
float scale = {{pre_cam.scale}} * acc_size.width;
|
||||||
|
|
||||||
float ditherwidth = {{pre_cam.dither_width}} * 0.33f;
|
float ditherwidth = {{pre_cam.dither_width}} * 0.33f;
|
||||||
float u0 = mwc_next_01(rctx);
|
float u0 = mwc_next_01(rctx);
|
||||||
@ -81,12 +81,12 @@ def precalc_camera(info, pcam):
|
|||||||
{{pre_cam._set('xx')}} = scale * rotcos;
|
{{pre_cam._set('xx')}} = scale * rotcos;
|
||||||
{{pre_cam._set('xy')}} = scale * -rotsin;
|
{{pre_cam._set('xy')}} = scale * -rotsin;
|
||||||
{{pre_cam._set('xo')}} = scale * (rotsin * ceny - rotcos * cenx)
|
{{pre_cam._set('xo')}} = scale * (rotsin * ceny - rotcos * cenx)
|
||||||
+ {{0.5 * (info.width + info.gutter + 1)}} + ditherx;
|
+ 0.5f * acc_size.awidth + ditherx;
|
||||||
|
|
||||||
{{pre_cam._set('yx')}} = scale * rotsin;
|
{{pre_cam._set('yx')}} = scale * rotsin;
|
||||||
{{pre_cam._set('yy')}} = scale * rotcos;
|
{{pre_cam._set('yy')}} = scale * rotcos;
|
||||||
{{pre_cam._set('yo')}} = scale * -(rotsin * cenx + rotcos * ceny)
|
{{pre_cam._set('yo')}} = scale * -(rotsin * cenx + rotcos * ceny)
|
||||||
+ {{0.5 * (info.height + info.gutter + 1)}} + dithery;
|
+ 0.5f * acc_size.aheight + dithery;
|
||||||
|
|
||||||
""").substitute(locals()))
|
""").substitute(locals()))
|
||||||
|
|
||||||
@ -113,13 +113,12 @@ class IterCode(HunkOCode):
|
|||||||
# The number of threads per block
|
# The number of threads per block
|
||||||
NTHREADS = 256
|
NTHREADS = 256
|
||||||
|
|
||||||
def __init__(self, info):
|
def __init__(self, info, genome):
|
||||||
self.info = info
|
|
||||||
self.packer = interp.GenomePacker('iter_params')
|
self.packer = interp.GenomePacker('iter_params')
|
||||||
self.pcp = self.packer.view('params', self.info.genome, 'cp')
|
self.pcp = self.packer.view('params', genome, 'cp')
|
||||||
|
|
||||||
iterbody = self._iterbody()
|
iterbody = self._iterbody(info, genome)
|
||||||
bodies = [self._xfbody(i,x) for i,x in sorted(info.genome.xforms.items())]
|
bodies = [self._xfbody(i,x) for i,x in sorted(genome.xforms.items())]
|
||||||
bodies.append(iterbody)
|
bodies.append(iterbody)
|
||||||
self.defs = '\n'.join(bodies)
|
self.defs = '\n'.join(bodies)
|
||||||
|
|
||||||
@ -132,7 +131,9 @@ __device__ int rb_head, rb_tail, rb_size;
|
|||||||
typedef struct {
|
typedef struct {
|
||||||
uint32_t width;
|
uint32_t width;
|
||||||
uint32_t height;
|
uint32_t height;
|
||||||
uint32_t stride;
|
uint32_t awidth;
|
||||||
|
uint32_t aheight;
|
||||||
|
uint32_t astride;
|
||||||
} acc_size_t;
|
} acc_size_t;
|
||||||
__constant__ acc_size_t acc_size;
|
__constant__ acc_size_t acc_size;
|
||||||
|
|
||||||
@ -174,7 +175,7 @@ void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
|
|||||||
g.update(locals())
|
g.update(locals())
|
||||||
return tmpl.substitute(g)
|
return tmpl.substitute(g)
|
||||||
|
|
||||||
def _iterbody(self):
|
def _iterbody(self, info, genome):
|
||||||
tmpl = Template(r'''
|
tmpl = Template(r'''
|
||||||
|
|
||||||
__global__ void reset_rb(int size) {
|
__global__ void reset_rb(int size) {
|
||||||
@ -331,7 +332,7 @@ void iter(
|
|||||||
|
|
||||||
float cx, cy, cc;
|
float cx, cy, cc;
|
||||||
|
|
||||||
{{precalc_camera(info, pcp.camera)}}
|
{{precalc_camera(pcp.camera)}}
|
||||||
|
|
||||||
{{if 'final' in cp.xforms}}
|
{{if 'final' in cp.xforms}}
|
||||||
{{apply_affine('fx', 'fy', 'cx', 'cy', pcp.camera)}}
|
{{apply_affine('fx', 'fy', 'cx', 'cy', pcp.camera)}}
|
||||||
@ -343,14 +344,14 @@ void iter(
|
|||||||
|
|
||||||
uint32_t ix = trunca(cx), iy = trunca(cy);
|
uint32_t ix = trunca(cx), iy = trunca(cy);
|
||||||
|
|
||||||
if (ix >= acc_size.width || iy >= acc_size.height) {
|
if (ix >= acc_size.awidth || iy >= acc_size.aheight) {
|
||||||
{{if info.acc_mode == 'deferred'}}
|
{{if info.acc_mode == 'deferred'}}
|
||||||
*log = 0xffffffff;
|
*log = 0xffffffff;
|
||||||
{{endif}}
|
{{endif}}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t i = iy * acc_size.stride + ix;
|
uint32_t i = iy * acc_size.astride + ix;
|
||||||
{{if info.acc_mode == 'atomic'}}
|
{{if info.acc_mode == 'atomic'}}
|
||||||
asm volatile ({{crep("""
|
asm volatile ({{crep("""
|
||||||
{
|
{
|
||||||
@ -626,12 +627,11 @@ oflow_write_end:
|
|||||||
{{endif}}
|
{{endif}}
|
||||||
''', 'iter_kern')
|
''', 'iter_kern')
|
||||||
return tmpl.substitute(
|
return tmpl.substitute(
|
||||||
info = self.info,
|
info = info,
|
||||||
cp = self.info.genome,
|
cp = genome,
|
||||||
pcp = self.pcp,
|
pcp = self.pcp,
|
||||||
NTHREADS = self.NTHREADS,
|
NTHREADS = self.NTHREADS,
|
||||||
NWARPS = self.NTHREADS / 32,
|
NWARPS = self.NTHREADS / 32,
|
||||||
std_xforms = [n for n in sorted(self.info.genome.xforms)
|
std_xforms = [n for n in sorted(genome.xforms) if n != 'final'],
|
||||||
if n != 'final'],
|
|
||||||
**globals())
|
**globals())
|
||||||
|
|
||||||
|
179
cuburn/genome.py
179
cuburn/genome.py
@ -79,115 +79,86 @@ class SplEval(object):
|
|||||||
return self.knots[1][0]
|
return self.knots[1][0]
|
||||||
return list(self.knots.T.flat)
|
return list(self.knots.T.flat)
|
||||||
|
|
||||||
@classmethod
|
class Palette(object):
|
||||||
def wrap(cls, obj):
|
"""Wafer-thin wrapper around palettes. For the future!"""
|
||||||
"""
|
def __init__(self, datastr, fmt='rgb8'):
|
||||||
Given a dict 'obj' representing, for instance, a Genome object, walk
|
if fmt != 'rgb8':
|
||||||
through the object recursively and in-place, turning any number or
|
raise NotImplementedError
|
||||||
list of numbers into an SplEval.
|
if len(datastr) != 768:
|
||||||
"""
|
raise ValueError("Unsupported palette width")
|
||||||
for k, v in obj.items():
|
self.width = 256
|
||||||
if (isinstance(v, (float, int)) or
|
pal = np.reshape(np.fromstring(datastr, np.uint8), (256, 3))
|
||||||
(isinstance(v, list) and isinstance(v[1], (float, int)))):
|
self.data = np.ones((256, 4), np.float32)
|
||||||
obj[k] = cls(v)
|
self.data[:,:3] = pal / 255.0
|
||||||
elif isinstance(v, dict):
|
|
||||||
cls.wrap(v)
|
|
||||||
|
|
||||||
class RenderInfo(object):
|
|
||||||
"""
|
|
||||||
Determine features and constants required to render a particular set of
|
|
||||||
genomes. The values of this class are fixed before compilation begins.
|
|
||||||
"""
|
|
||||||
# Number of iterations to iterate without write after generating a new
|
|
||||||
# point. This number is currently fixed pretty deeply in the set of magic
|
|
||||||
# constants which govern buffer sizes; changing the value here won't
|
|
||||||
# actually change the code on the device to do something different.
|
|
||||||
fuse = 256
|
|
||||||
|
|
||||||
# Height of the texture pallete which gets uploaded to the GPU (assuming
|
|
||||||
# that palette-from-texture is enabled). For most genomes, this doesn't
|
|
||||||
# need to be very large at all. However, since only an easily-cached
|
|
||||||
# fraction of this will be accessed per SM, larger values shouldn't hurt
|
|
||||||
# performance too much. When using deferred accumulation, increasing this
|
|
||||||
# value increases the number of uniquely-dithered samples, which is nice.
|
|
||||||
# Power-of-two, please.
|
|
||||||
palette_height = 64
|
|
||||||
|
|
||||||
# Maximum width of DE and other spatial filters, and thus in turn the
|
|
||||||
# amount of padding applied. Note that, for now, this must not be changed!
|
|
||||||
# The filtering code makes deep assumptions about this value.
|
|
||||||
gutter = 15
|
|
||||||
|
|
||||||
# TODO: for now, we always throw away the alpha channel before writing.
|
|
||||||
# All code is in place to not do this, we just need to find a way to expose
|
|
||||||
# this preference via the API (or push alpha blending entirely on the client,
|
|
||||||
# which I'm not opposed to)
|
|
||||||
alpha_output_channel = False
|
|
||||||
|
|
||||||
# There are three settings for this somewhat ersatz paramater. 'global'
|
|
||||||
# uses unsynchronized global writes to accumulate sample points, 'atomic'
|
|
||||||
# uses atomic global writes, and 'deferred' stores color and position in a
|
|
||||||
# sample log, sorts the log by position, and uses shared memory to
|
|
||||||
# perform the accumulation. Deferred has the accuracy of 'atomic' and
|
|
||||||
# the speed of 'global' (it's actually faster!), but packs color and
|
|
||||||
# position into a single 32-bit int for now, which limits resolution to
|
|
||||||
# 1080p when xform opacity is respected, so the other two modes will hang
|
|
||||||
# around until that can be extended to be memory-limited again.
|
|
||||||
acc_mode = 'atomic'
|
|
||||||
|
|
||||||
# TODO: fix this
|
|
||||||
chaos_used = False
|
|
||||||
|
|
||||||
def __init__(self, db, **kwargs):
|
|
||||||
self.db = db
|
|
||||||
# Copy all args into this object's namespace
|
|
||||||
self.__dict__.update(kwargs)
|
|
||||||
|
|
||||||
self.acc_width = self.width + 2 * self.gutter
|
|
||||||
self.acc_height = self.height + 2 * self.gutter
|
|
||||||
self.acc_stride = 32 * int(np.ceil(self.acc_width / 32.))
|
|
||||||
self.density = self.quality
|
|
||||||
|
|
||||||
# Deref genome
|
|
||||||
self.genome = self.db.genomes[self.genome]
|
|
||||||
|
|
||||||
for k, v in self.db.palettes.items():
|
|
||||||
pal = np.fromstring(base64.b64decode(v), np.uint8)
|
|
||||||
pal = np.reshape(pal, (256, 3))
|
|
||||||
pal_a = np.ones((256, 4), np.float32)
|
|
||||||
pal_a[:,:3] = pal / 255.0
|
|
||||||
self.db.palettes[k] = pal_a
|
|
||||||
|
|
||||||
class _AttrDict(dict):
|
class _AttrDict(dict):
|
||||||
def __getattr__(self, name):
|
def __getattr__(self, name):
|
||||||
return self[name]
|
return self[name]
|
||||||
|
|
||||||
def load_info(contents):
|
@classmethod
|
||||||
result = json.loads(contents, object_hook=_AttrDict)
|
def _wrap(cls, dct):
|
||||||
SplEval.wrap(result.genomes)
|
for k, v in dct.items():
|
||||||
|
if (isinstance(v, (float, int)) or
|
||||||
|
(isinstance(v, list) and isinstance(v[1], (float, int)))):
|
||||||
|
dct[k] = SplEval(v)
|
||||||
|
elif isinstance(v, dict):
|
||||||
|
dct[k] = cls._wrap(cls(v))
|
||||||
|
return dct
|
||||||
|
|
||||||
# A Job object will have more details or something
|
class Genome(_AttrDict):
|
||||||
result = RenderInfo(result, **result.renders.values()[0])
|
# For now, we base the Genome class on an _AttrDict, letting its structure
|
||||||
return result
|
# be defined implicitly by the way it is used in device code. More formal
|
||||||
|
# unpacking will happen soon.
|
||||||
|
def __init__(self, gnm, base_den):
|
||||||
|
super(Genome, self).__init__(gnm)
|
||||||
|
for k, v in self.items():
|
||||||
|
v = _AttrDict(v)
|
||||||
|
if k not in ('info', 'time'):
|
||||||
|
_AttrDict._wrap(v)
|
||||||
|
self[k] = v
|
||||||
|
# TODO: this is a hack, figure out how to solve it more elegantly
|
||||||
|
self.spp = SplEval(self.camera.density.knotlist)
|
||||||
|
self.spp.knots[1] *= base_den
|
||||||
|
# TODO: decide how to handle palettes. For now, it's the caller's
|
||||||
|
# responsibility to replace this list with actual palettes.
|
||||||
|
pal = self.color.palette
|
||||||
|
if isinstance(pal, basestring):
|
||||||
|
self.color.palette = [(0.0, pal), (1.0, pal)]
|
||||||
|
elif isinstance(pal, list):
|
||||||
|
self.color.palette = zip(pal[::2], pal[1::2])
|
||||||
|
|
||||||
class HacketyGenome(object):
|
# TODO: caller also needs to call set_timing()
|
||||||
"""
|
self.adj_frame_width = None
|
||||||
Holdover class to postpone a very deep refactoring as long as possible.
|
self.canonical_right = (not self.get('link') or not self.link == 'self'
|
||||||
Converts property accesses into interpolations over predetermined times.
|
or not self.link.get('right'))
|
||||||
"""
|
|
||||||
def __init__(self, referent, times):
|
|
||||||
# Times can be singular
|
|
||||||
self.referent, self.times = referent, times
|
|
||||||
def __getattr__(self, name):
|
|
||||||
r = getattr(self.referent, str(name))
|
|
||||||
if isinstance(r, _AttrDict):
|
|
||||||
return HacketyGenome(r, self.times)
|
|
||||||
elif isinstance(r, SplEval):
|
|
||||||
return r(self.times)
|
|
||||||
return r
|
|
||||||
__getitem__ = __getattr__
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
def set_timing(self, base_dur, fps, offset=0.0, err_spread=True):
|
||||||
import sys
|
"""
|
||||||
import pprint
|
Set frame timing. Must be called at least once prior to rendering.
|
||||||
pprint.pprint(read_genome(sys.stdin))
|
"""
|
||||||
|
# TODO: test!
|
||||||
|
dur = self.time.duration
|
||||||
|
if isinstance(dur, basestring):
|
||||||
|
clock = float(dur[:-1]) + offset
|
||||||
|
else:
|
||||||
|
clock = dur * base_dur + offset
|
||||||
|
if self.canonical_right:
|
||||||
|
nframes = int(np.floor(clock * fps))
|
||||||
|
else:
|
||||||
|
nframes = int(np.ceil(clock * fps))
|
||||||
|
err = (clock - nframes / fps) / clock
|
||||||
|
|
||||||
|
fw = self.time.frame_width
|
||||||
|
if not isinstance(fw, list):
|
||||||
|
fw = [0, fw, 1, fw]
|
||||||
|
fw = [float(f[:-1]) * fps if isinstance(f, basestring)
|
||||||
|
else float(f) / (clock * fps) for f in fw]
|
||||||
|
self.adj_frame_width = SplEval(fw)
|
||||||
|
|
||||||
|
times = np.linspace(offset, 1 - err, nframes + 1)
|
||||||
|
# Move each time to a center time, and discard the last value
|
||||||
|
times = times[:-1] + 0.5 * (times[1] - times[0])
|
||||||
|
if err_spread:
|
||||||
|
epts = np.linspace(-2*np.pi, 2*np.pi, nframes)
|
||||||
|
times = times + 0.5 * err * (np.tanh(epts) + 1)
|
||||||
|
return err, times
|
||||||
|
268
cuburn/render.py
268
cuburn/render.py
@ -4,11 +4,11 @@ import re
|
|||||||
import time as timemod
|
import time as timemod
|
||||||
import tempfile
|
import tempfile
|
||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
from itertools import cycle, repeat, chain, izip
|
from itertools import cycle, repeat, chain, izip, imap, ifilter
|
||||||
from ctypes import *
|
from ctypes import *
|
||||||
from cStringIO import StringIO
|
from cStringIO import StringIO
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from numpy import int32 as i32, uint64 as u64
|
from numpy import float32 as f32, int32 as i32, uint32 as u32, uint64 as u64
|
||||||
from scipy import ndimage
|
from scipy import ndimage
|
||||||
|
|
||||||
from fr0stlib import pyflam3
|
from fr0stlib import pyflam3
|
||||||
@ -24,41 +24,55 @@ from cuburn import affine
|
|||||||
from cuburn.code import util, mwc, iter, filtering, sort
|
from cuburn.code import util, mwc, iter, filtering, sort
|
||||||
|
|
||||||
RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time')
|
RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time')
|
||||||
|
Dimensions = namedtuple('Dimensions', 'w h aw ah astride')
|
||||||
|
|
||||||
def _sync_stream(dst, src):
|
def _sync_stream(dst, src):
|
||||||
dst.wait_for_event(cuda.Event(cuda.event_flags.DISABLE_TIMING).record(src))
|
dst.wait_for_event(cuda.Event(cuda.event_flags.DISABLE_TIMING).record(src))
|
||||||
|
|
||||||
|
def argset(obj, **kwargs):
|
||||||
|
for k, v in kwargs.items():
|
||||||
|
setattr(obj, k, v)
|
||||||
|
return obj
|
||||||
|
|
||||||
class Renderer(object):
|
class Renderer(object):
|
||||||
"""
|
"""
|
||||||
Control structure for rendering a series of frames.
|
Control structure for rendering a series of frames.
|
||||||
|
|
||||||
Each animation will dynamically generate a kernel that includes only the
|
|
||||||
code necessary to render the genomes provided. The process of generating
|
|
||||||
and uploading the kernel takes a small but finite amount of time. In
|
|
||||||
general, the kernel generated for all genomes resulting from interpolating
|
|
||||||
between two control points will have identical performance, so it is
|
|
||||||
wasteful to create more than one animation for any interpolated sequence.
|
|
||||||
|
|
||||||
However, genome sequences interpolated from three or more control points
|
|
||||||
with different features enabled will have the code needed to render all
|
|
||||||
genomes enabled for every frame. Doing this can hurt performance.
|
|
||||||
|
|
||||||
In other words, it's best to use exactly one Animation for each
|
|
||||||
interpolated sequence between one or two genomes.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Number of iterations to iterate without write after generating a new
|
||||||
|
# point. This number is currently fixed pretty deeply in the set of magic
|
||||||
|
# constants which govern buffer sizes; changing the value here won't
|
||||||
|
# actually change the code on the device to do something different.
|
||||||
|
fuse = 256
|
||||||
|
|
||||||
|
# The palette texture/surface covers the color coordinate from [0,1] with
|
||||||
|
# (for now, a fixed 256) equidistant horizontal samples, and spans the
|
||||||
|
# temporal range of the frame linearly with this many rows. Increasing
|
||||||
|
# this value increases the number of uniquely-dithered samples when using
|
||||||
|
# pre-dithered surfaces.
|
||||||
|
palette_height = 64
|
||||||
|
|
||||||
|
# Maximum width of DE and other spatial filters, and thus in turn the
|
||||||
|
# amount of padding applied. Note that, for now, this must not be changed!
|
||||||
|
# The filtering code makes deep assumptions about this value.
|
||||||
|
gutter = 15
|
||||||
|
|
||||||
|
# Accumulation mode. Leave it at 'atomic' for now.
|
||||||
|
acc_mode = 'atomic'
|
||||||
|
|
||||||
|
# TODO
|
||||||
|
chaos_used = False
|
||||||
|
|
||||||
cmp_options = ('-use_fast_math', '-maxrregcount', '42')
|
cmp_options = ('-use_fast_math', '-maxrregcount', '42')
|
||||||
keep = False
|
keep = False
|
||||||
|
|
||||||
def __init__(self, info):
|
def __init__(self):
|
||||||
self.info = info
|
|
||||||
self._iter = self.src = self.cubin = self.mod = None
|
self._iter = self.src = self.cubin = self.mod = None
|
||||||
self.packed_genome = None
|
|
||||||
|
|
||||||
# Ensure class options don't get contaminated on an instance
|
# Ensure class options don't get contaminated on an instance
|
||||||
self.cmp_options = list(self.cmp_options)
|
self.cmp_options = list(self.cmp_options)
|
||||||
|
|
||||||
def compile(self, keep=None, cmp_options=None, jit_options=[]):
|
def compile(self, genome, keep=None, cmp_options=None):
|
||||||
"""
|
"""
|
||||||
Compile a kernel capable of rendering every frame in this animation.
|
Compile a kernel capable of rendering every frame in this animation.
|
||||||
The resulting compiled kernel is stored in the ``cubin`` property;
|
The resulting compiled kernel is stored in the ``cubin`` property;
|
||||||
@ -73,7 +87,7 @@ class Renderer(object):
|
|||||||
keep = self.keep if keep is None else keep
|
keep = self.keep if keep is None else keep
|
||||||
cmp_options = self.cmp_options if cmp_options is None else cmp_options
|
cmp_options = self.cmp_options if cmp_options is None else cmp_options
|
||||||
|
|
||||||
self._iter = iter.IterCode(self.info)
|
self._iter = iter.IterCode(self, genome)
|
||||||
self._iter.packer.finalize()
|
self._iter.packer.finalize()
|
||||||
self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer,
|
self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer,
|
||||||
self._iter)
|
self._iter)
|
||||||
@ -82,41 +96,81 @@ class Renderer(object):
|
|||||||
self.cubin = pycuda.compiler.compile(
|
self.cubin = pycuda.compiler.compile(
|
||||||
self.src, keep=keep, options=cmp_options,
|
self.src, keep=keep, options=cmp_options,
|
||||||
cache_dir=False if keep else None)
|
cache_dir=False if keep else None)
|
||||||
|
|
||||||
|
def load(self, genome, jit_options=[]):
|
||||||
|
if not self.cubin:
|
||||||
|
self.compile(genome)
|
||||||
self.mod = cuda.module_from_buffer(self.cubin, jit_options)
|
self.mod = cuda.module_from_buffer(self.cubin, jit_options)
|
||||||
with open('/tmp/iter_kern.cubin', 'wb') as fp:
|
with open('/tmp/iter_kern.cubin', 'wb') as fp:
|
||||||
fp.write(self.cubin)
|
fp.write(self.cubin)
|
||||||
return self.src
|
return self.src
|
||||||
|
|
||||||
def render(self, times):
|
def render(self, genome, times, width, height, blend=True):
|
||||||
"""
|
"""
|
||||||
Render a flame for each genome in the iterable value 'genomes'.
|
Render a frame for each timestamp in the iterable value ``times``. This
|
||||||
Returns a RenderedImage object with the rendered buffer in the
|
function returns a generator that will yield a RenderedImage object
|
||||||
requested format (3D RGBA ndarray only for now).
|
containing a shared reference to the output buffer for each specified
|
||||||
|
frame.
|
||||||
|
|
||||||
This method produces a considerable amount of side effects, and should
|
The returned buffer is page-locked host memory. Between the time a
|
||||||
not be used lightly. Things may go poorly for you if this method is not
|
buffer is yielded and the time the next frame's results are requested,
|
||||||
allowed to run until completion (by exhausting all items in the
|
the buffer will not be modified. Thereafter, however, it will be
|
||||||
generator object).
|
overwritten by an asynchronous DMA operation coming from the CUDA
|
||||||
|
device. If you hang on to it for longer than one frame, copy it.
|
||||||
|
|
||||||
``times`` is a sequence of (idx, start, stop) times, where index is
|
``genome`` is the genome to be rendered. Successive calls to the
|
||||||
the logical frame number (though it can be any value) and 'start' and
|
`render()` method on one ``Renderer`` object must use genomes which
|
||||||
'stop' together define the time range to be rendered for each frame.
|
produce identical compiled code, and this will not be verified by the
|
||||||
|
renderer. In practice, this means you can alter genome parameter
|
||||||
|
values, but the full set of keys must remain identical between runs on
|
||||||
|
the same renderer.
|
||||||
|
|
||||||
|
``times`` is a list of (idx, cen_time) tuples, where ``idx`` is passed
|
||||||
|
unmodified in the RenderedImage return value and ``cen_time`` is the
|
||||||
|
central time of the current frame in spline-time units. (Any
|
||||||
|
clock-time or frame-time units in the genome should be preconverted.)
|
||||||
|
|
||||||
|
If ``blend`` is False, the output buffer will contain unclipped,
|
||||||
|
premultiplied RGBA data, without vibrancy, highlight power, or the
|
||||||
|
alpha elbow applied.
|
||||||
"""
|
"""
|
||||||
if times == []:
|
r = self.render_gen(genome, width, height, blend=blend)
|
||||||
|
next(r)
|
||||||
|
return ifilter(None, imap(r.send, chain(times, [None])))
|
||||||
|
|
||||||
|
def render_gen(self, genome, width, height, blend=True):
|
||||||
|
"""
|
||||||
|
Render frames. This method is wrapped by the ``render()`` method; see
|
||||||
|
its docstring for warnings and details.
|
||||||
|
|
||||||
|
Instead of passing frame times as an iterable, they are passed
|
||||||
|
individually via the ``generator.send()`` method. There is an
|
||||||
|
internal pipeline latency of one frame, so the first call to the
|
||||||
|
``send()`` method will return None, the second call will return the
|
||||||
|
first frame's result, and so on. To retrieve the last frame in a
|
||||||
|
sequence, send ``None``.
|
||||||
|
|
||||||
|
Direct use of this method is useful for implementing render servers.
|
||||||
|
"""
|
||||||
|
|
||||||
|
last_idx = None
|
||||||
|
next_frame = yield
|
||||||
|
if next_frame is None:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
if not self.mod:
|
||||||
|
self.load(genome)
|
||||||
|
|
||||||
filt = filtering.Filtering()
|
filt = filtering.Filtering()
|
||||||
|
|
||||||
reset_rb_fun = self.mod.get_function("reset_rb")
|
reset_rb_fun = self.mod.get_function("reset_rb")
|
||||||
packer_fun = self.mod.get_function("interp_iter_params")
|
packer_fun = self.mod.get_function("interp_iter_params")
|
||||||
iter_fun = self.mod.get_function("iter")
|
iter_fun = self.mod.get_function("iter")
|
||||||
|
|
||||||
info = self.info
|
|
||||||
|
|
||||||
# The synchronization model is messy. See helpers/task_model.svg.
|
# The synchronization model is messy. See helpers/task_model.svg.
|
||||||
iter_stream = cuda.Stream()
|
iter_stream = cuda.Stream()
|
||||||
filt_stream = cuda.Stream()
|
filt_stream = cuda.Stream()
|
||||||
if info.acc_mode == 'deferred':
|
if self.acc_mode == 'deferred':
|
||||||
write_stream = cuda.Stream()
|
write_stream = cuda.Stream()
|
||||||
write_fun = self.mod.get_function("write_shmem")
|
write_fun = self.mod.get_function("write_shmem")
|
||||||
else:
|
else:
|
||||||
@ -128,19 +182,30 @@ class Renderer(object):
|
|||||||
event_a = cuda.Event().record(filt_stream)
|
event_a = cuda.Event().record(filt_stream)
|
||||||
event_b = None
|
event_b = None
|
||||||
|
|
||||||
nbins = info.acc_height * info.acc_stride
|
awidth = width + 2 * self.gutter
|
||||||
|
aheight = height + 2 * self.gutter
|
||||||
|
astride = 32 * int(np.ceil(awidth / 32.))
|
||||||
|
dim = Dimensions(width, height, awidth, aheight, astride)
|
||||||
|
d_acc_size = self.mod.get_global('acc_size')[0]
|
||||||
|
cuda.memcpy_htod_async(d_acc_size, u32(list(dim)), write_stream)
|
||||||
|
|
||||||
|
nbins = awidth * aheight
|
||||||
# Extra padding in accum helps with write_shmem overruns
|
# Extra padding in accum helps with write_shmem overruns
|
||||||
d_accum = cuda.mem_alloc(16 * nbins + (1<<16))
|
d_accum = cuda.mem_alloc(16 * nbins + (1<<16))
|
||||||
d_out = cuda.mem_alloc(16 * nbins)
|
d_out = cuda.mem_alloc(16 * nbins)
|
||||||
if info.acc_mode == 'atomic':
|
if self.acc_mode == 'atomic':
|
||||||
d_atom = cuda.mem_alloc(8 * nbins)
|
d_atom = cuda.mem_alloc(8 * nbins)
|
||||||
flush_fun = self.mod.get_function("flush_atom")
|
flush_fun = self.mod.get_function("flush_atom")
|
||||||
|
|
||||||
acc_size = np.array([info.acc_width, info.acc_height, info.acc_stride])
|
obuf_copy = argset(cuda.Memcpy2D(),
|
||||||
d_acc_size = self.mod.get_global('acc_size')[0]
|
src_y=self.gutter, src_x_in_bytes=16*self.gutter,
|
||||||
cuda.memcpy_htod_async(d_acc_size, np.uint32(acc_size), write_stream)
|
src_pitch=16*astride, dst_pitch=16*width,
|
||||||
|
width_in_bytes=16*width, height=height)
|
||||||
|
obuf_copy.set_src_device(d_out)
|
||||||
|
h_out_a = cuda.pagelocked_empty((height, width, 4), f32)
|
||||||
|
h_out_b = cuda.pagelocked_empty((height, width, 4), f32)
|
||||||
|
|
||||||
if info.acc_mode == 'deferred':
|
if self.acc_mode == 'deferred':
|
||||||
# Having a fixed, power-of-two log size makes things much easier
|
# Having a fixed, power-of-two log size makes things much easier
|
||||||
log_size = 64 << 20
|
log_size = 64 << 20
|
||||||
d_log = cuda.mem_alloc(log_size * 4)
|
d_log = cuda.mem_alloc(log_size * 4)
|
||||||
@ -153,9 +218,8 @@ class Renderer(object):
|
|||||||
# Calculate 'nslots', the number of simultaneous running threads that
|
# Calculate 'nslots', the number of simultaneous running threads that
|
||||||
# can be active on the GPU during iteration (and thus the number of
|
# can be active on the GPU during iteration (and thus the number of
|
||||||
# slots for loading and storing RNG and point context that will be
|
# slots for loading and storing RNG and point context that will be
|
||||||
# prepared on the device), 'rb_size' (the number of blocks in
|
# prepared on the device), and derive 'rb_size', the number of blocks in
|
||||||
# 'nslots'), and determine a number of temporal samples
|
# 'nslots'.
|
||||||
# likely to load-balance effectively
|
|
||||||
iter_threads_per_block = 256
|
iter_threads_per_block = 256
|
||||||
dev_data = pycuda.tools.DeviceData()
|
dev_data = pycuda.tools.DeviceData()
|
||||||
occupancy = pycuda.tools.OccupancyRecord(
|
occupancy = pycuda.tools.OccupancyRecord(
|
||||||
@ -169,14 +233,16 @@ class Renderer(object):
|
|||||||
reset_rb_fun(np.int32(rb_size), block=(1,1,1))
|
reset_rb_fun(np.int32(rb_size), block=(1,1,1))
|
||||||
|
|
||||||
d_points = cuda.mem_alloc(nslots * 16)
|
d_points = cuda.mem_alloc(nslots * 16)
|
||||||
# We may add extra seeds to simplify palette dithering.
|
# This statement may add extra seeds to simplify palette dithering.
|
||||||
seeds = mwc.MWC.make_seeds(max(nslots, 256 * info.palette_height))
|
seeds = mwc.MWC.make_seeds(max(nslots, 256 * self.palette_height))
|
||||||
d_seeds = cuda.to_device(seeds)
|
d_seeds = cuda.to_device(seeds)
|
||||||
|
|
||||||
# We used to auto-calculate this to a multiple of the number of SMs on
|
# We used to auto-calculate this to a multiple of the number of SMs on
|
||||||
# the device, but since we now use shorter launches and, to a certain
|
# the device, but since we now use shorter launches and, to a certain
|
||||||
# extent, allow simultaneous occupancy, that's not as important. The
|
# extent, allow simultaneous occupancy, that's not as important. The
|
||||||
# 1024 is a magic constant, though: FUSE
|
# 1024 is a magic constant to ensure reasonable and power-of-two log
|
||||||
|
# size for deferred: 256MB / (4B * FUSE * NTHREADS). Enhancements to
|
||||||
|
# the sort engine are needed to make this more flexible.
|
||||||
ntemporal_samples = 1024
|
ntemporal_samples = 1024
|
||||||
genome_times, genome_knots = self._iter.packer.pack()
|
genome_times, genome_knots = self._iter.packer.pack()
|
||||||
d_genome_times = cuda.to_device(genome_times)
|
d_genome_times = cuda.to_device(genome_times)
|
||||||
@ -184,37 +250,31 @@ class Renderer(object):
|
|||||||
info_size = 4 * len(self._iter.packer) * ntemporal_samples
|
info_size = 4 * len(self._iter.packer) * ntemporal_samples
|
||||||
d_infos = cuda.mem_alloc(info_size)
|
d_infos = cuda.mem_alloc(info_size)
|
||||||
|
|
||||||
pals = info.genome.color.palette
|
pals = genome.color.palette
|
||||||
if isinstance(pals, basestring):
|
if isinstance(pals, basestring):
|
||||||
pals = [0.0, pals, 1.0, pals]
|
pals = [0.0, pals, 1.0, pals]
|
||||||
palint_times = np.empty(len(genome_times[0]), np.float32)
|
palint_times = np.empty(len(genome_times[0]), f32)
|
||||||
palint_times.fill(100.0)
|
palint_times.fill(100.0)
|
||||||
palint_times[:len(pals)/2] = pals[::2]
|
palint_times[:len(pals)] = [p[0] for p in pals]
|
||||||
d_palint_times = cuda.to_device(palint_times)
|
d_palint_times = cuda.to_device(palint_times)
|
||||||
d_palint_vals = cuda.to_device(
|
d_palint_vals = cuda.to_device(
|
||||||
np.concatenate(map(info.db.palettes.get, pals[1::2])))
|
np.concatenate([p[1].data for p in pals]))
|
||||||
|
|
||||||
if info.acc_mode in ('deferred', 'atomic'):
|
if self.acc_mode in ('deferred', 'atomic'):
|
||||||
palette_fun = self.mod.get_function("interp_palette_hsv_flat")
|
palette_fun = self.mod.get_function("interp_palette_hsv_flat")
|
||||||
dsc = cuda.ArrayDescriptor3D()
|
dsc = argset(cuda.ArrayDescriptor3D(), height=self.palette_height,
|
||||||
dsc.height = info.palette_height
|
width=256, depth=0, format=cuda.array_format.SIGNED_INT32,
|
||||||
dsc.width = 256
|
num_channels=2, flags=cuda.array3d_flags.SURFACE_LDST)
|
||||||
dsc.depth = 0
|
|
||||||
dsc.format = cuda.array_format.SIGNED_INT32
|
|
||||||
dsc.num_channels = 2
|
|
||||||
dsc.flags = cuda.array3d_flags.SURFACE_LDST
|
|
||||||
palarray = cuda.Array(dsc)
|
palarray = cuda.Array(dsc)
|
||||||
|
|
||||||
tref = self.mod.get_surfref('flatpal')
|
tref = self.mod.get_surfref('flatpal')
|
||||||
tref.set_array(palarray, 0)
|
tref.set_array(palarray, 0)
|
||||||
else:
|
else:
|
||||||
palette_fun = self.mod.get_function("interp_palette_hsv")
|
palette_fun = self.mod.get_function("interp_palette_hsv")
|
||||||
dsc = cuda.ArrayDescriptor()
|
dsc = argset(cuda.ArrayDescriptor(), height=self.palette_height,
|
||||||
dsc.height = info.palette_height
|
width=256, format=cuda.array_format.UNSIGNED_INT8,
|
||||||
dsc.width = 256
|
num_channels=4)
|
||||||
dsc.format = cuda.array_format.UNSIGNED_INT8
|
d_palmem = cuda.mem_alloc(256 * self.palette_height * 4)
|
||||||
dsc.num_channels = 4
|
|
||||||
d_palmem = cuda.mem_alloc(256 * info.palette_height * 4)
|
|
||||||
|
|
||||||
tref = self.mod.get_texref('palTex')
|
tref = self.mod.get_texref('palTex')
|
||||||
tref.set_address_2d(d_palmem, dsc, 1024)
|
tref.set_address_2d(d_palmem, dsc, 1024)
|
||||||
@ -222,49 +282,46 @@ class Renderer(object):
|
|||||||
tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
|
tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
|
||||||
tref.set_filter_mode(cuda.filter_mode.LINEAR)
|
tref.set_filter_mode(cuda.filter_mode.LINEAR)
|
||||||
|
|
||||||
h_out_a = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4),
|
while next_frame is not None:
|
||||||
np.float32)
|
# tc, td, ts, te: central, delta, start, end times
|
||||||
h_out_b = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4),
|
idx, tc = next_frame
|
||||||
np.float32)
|
td = genome.adj_frame_width(tc)
|
||||||
last_idx = None
|
ts, te = tc - 0.5 * td, tc + 0.5 * td
|
||||||
|
|
||||||
for idx, start, stop in times:
|
if self.acc_mode in ('deferred', 'atomic'):
|
||||||
twidth = np.float32((stop-start) / info.palette_height)
|
# In this mode, the palette writes to a surface reference, but
|
||||||
if info.acc_mode in ('deferred', 'atomic'):
|
# requires dithering, so we pass it the seeds instead
|
||||||
palette_fun(d_seeds, d_palint_times, d_palint_vals,
|
arg0 = d_seeds
|
||||||
np.float32(start), twidth,
|
|
||||||
block=(256,1,1), grid=(info.palette_height,1),
|
|
||||||
stream=write_stream)
|
|
||||||
else:
|
else:
|
||||||
palette_fun(d_palmem, d_palint_times, d_palint_vals,
|
arg0 = d_palmem
|
||||||
np.float32(start), twidth,
|
palette_fun(arg0, d_palint_times, d_palint_vals,
|
||||||
block=(256,1,1), grid=(info.palette_height,1),
|
f32(ts), f32(td / self.palette_height),
|
||||||
stream=write_stream)
|
block=(256,1,1), grid=(self.palette_height,1),
|
||||||
|
stream=write_stream)
|
||||||
|
|
||||||
width = np.float32((stop-start) / ntemporal_samples)
|
packer_fun(d_infos, d_seeds, d_genome_times, d_genome_knots,
|
||||||
packer_fun(d_infos, d_genome_times, d_genome_knots,
|
f32(ts), f32(td / ntemporal_samples),
|
||||||
np.float32(start), width, d_seeds,
|
i32(ntemporal_samples), block=(256,1,1),
|
||||||
np.int32(ntemporal_samples), block=(256,1,1),
|
|
||||||
grid=(int(np.ceil(ntemporal_samples/256.)),1),
|
grid=(int(np.ceil(ntemporal_samples/256.)),1),
|
||||||
stream=iter_stream)
|
stream=iter_stream)
|
||||||
|
|
||||||
# Reset points so that they will be FUSEd
|
# Reset points so that they will be FUSEd
|
||||||
util.BaseCode.fill_dptr(self.mod, d_points, 4 * nslots,
|
util.BaseCode.fill_dptr(self.mod, d_points, 4 * nslots,
|
||||||
iter_stream, np.float32(np.nan))
|
iter_stream, f32(np.nan))
|
||||||
|
|
||||||
# Get interpolated control points for debugging
|
# Get interpolated control points for debugging
|
||||||
#iter_stream.synchronize()
|
#iter_stream.synchronize()
|
||||||
#d_temp = cuda.from_device(d_infos,
|
#d_temp = cuda.from_device(d_infos,
|
||||||
#(ntemporal_samples, len(self._iter.packer)), np.float32)
|
#(ntemporal_samples, len(self._iter.packer)), f32)
|
||||||
#for i, n in zip(d_temp[5], self._iter.packer.packed):
|
#for i, n in zip(d_temp[5], self._iter.packer.packed):
|
||||||
#print '%60s %g' % ('_'.join(n), i)
|
#print '%60s %g' % ('_'.join(n), i)
|
||||||
|
|
||||||
util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, write_stream)
|
util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, write_stream)
|
||||||
if info.acc_mode == 'atomic':
|
if self.acc_mode == 'atomic':
|
||||||
util.BaseCode.fill_dptr(self.mod, d_atom, 2 * nbins, write_stream)
|
util.BaseCode.fill_dptr(self.mod, d_atom, 2 * nbins, write_stream)
|
||||||
nrounds = ( (info.density * info.width * info.height)
|
nrounds = int( (genome.spp(tc) * width * height)
|
||||||
/ (ntemporal_samples * 256 * 256) ) + 1
|
/ (ntemporal_samples * 256 * 256) ) + 1
|
||||||
if info.acc_mode == 'deferred':
|
if self.acc_mode == 'deferred':
|
||||||
for i in range(nrounds):
|
for i in range(nrounds):
|
||||||
iter_fun(np.uint64(d_log), d_seeds, d_points, d_infos,
|
iter_fun(np.uint64(d_log), d_seeds, d_points, d_infos,
|
||||||
block=(32, self._iter.NTHREADS/32, 1),
|
block=(32, self._iter.NTHREADS/32, 1),
|
||||||
@ -272,18 +329,17 @@ class Renderer(object):
|
|||||||
_sync_stream(write_stream, iter_stream)
|
_sync_stream(write_stream, iter_stream)
|
||||||
sorter.sort(d_log_sorted, d_log, log_size, 3, True,
|
sorter.sort(d_log_sorted, d_log, log_size, 3, True,
|
||||||
stream=write_stream)
|
stream=write_stream)
|
||||||
#print cuda.from_device(sorter.dglobal, (256,), np.uint32)
|
|
||||||
_sync_stream(iter_stream, write_stream)
|
_sync_stream(iter_stream, write_stream)
|
||||||
write_fun(d_accum, d_log_sorted, sorter.dglobal, i32(nbins),
|
write_fun(d_accum, d_log_sorted, sorter.dglobal, i32(nbins),
|
||||||
block=(1024, 1, 1), grid=(nwriteblocks, 1),
|
block=(1024, 1, 1), grid=(nwriteblocks, 1),
|
||||||
stream=write_stream)
|
stream=write_stream)
|
||||||
else:
|
else:
|
||||||
args = [u64(d_accum), d_seeds, d_points, d_infos]
|
args = [u64(d_accum), d_seeds, d_points, d_infos]
|
||||||
if info.acc_mode == 'atomic':
|
if self.acc_mode == 'atomic':
|
||||||
args.append(u64(d_atom))
|
args.append(u64(d_atom))
|
||||||
iter_fun(*args, block=(32, self._iter.NTHREADS/32, 1),
|
iter_fun(*args, block=(32, self._iter.NTHREADS/32, 1),
|
||||||
grid=(ntemporal_samples, nrounds), stream=iter_stream)
|
grid=(ntemporal_samples, nrounds), stream=iter_stream)
|
||||||
if info.acc_mode == 'atomic':
|
if self.acc_mode == 'atomic':
|
||||||
nblocks = int(np.ceil(np.sqrt(nbins/float(512))))
|
nblocks = int(np.ceil(np.sqrt(nbins/float(512))))
|
||||||
flush_fun(u64(d_accum), u64(d_atom), i32(nbins),
|
flush_fun(u64(d_accum), u64(d_atom), i32(nbins),
|
||||||
block=(512, 1, 1), grid=(nblocks, nblocks),
|
block=(512, 1, 1), grid=(nblocks, nblocks),
|
||||||
@ -291,27 +347,29 @@ class Renderer(object):
|
|||||||
|
|
||||||
util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
|
util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
|
||||||
_sync_stream(filt_stream, write_stream)
|
_sync_stream(filt_stream, write_stream)
|
||||||
filt.de(d_out, d_accum, info, start, stop, filt_stream)
|
filt.de(d_out, d_accum, genome, dim, tc, stream=filt_stream)
|
||||||
_sync_stream(write_stream, filt_stream)
|
_sync_stream(write_stream, filt_stream)
|
||||||
filt.colorclip(d_out, info, start, stop, filt_stream)
|
filt.colorclip(d_out, genome, dim, tc, blend, stream=filt_stream)
|
||||||
cuda.memcpy_dtoh_async(h_out_a, d_out, filt_stream)
|
obuf_copy.set_dst_host(h_out_a)
|
||||||
|
obuf_copy(filt_stream)
|
||||||
|
|
||||||
if event_b:
|
if event_b:
|
||||||
while not event_a.query():
|
while not event_a.query():
|
||||||
timemod.sleep(0.01)
|
timemod.sleep(0.01)
|
||||||
gpu_time = event_a.time_since(event_b)
|
gpu_time = event_a.time_since(event_b)
|
||||||
yield RenderedImage(self._trim(h_out_b), last_idx, gpu_time)
|
result = RenderedImage(h_out_b, last_idx, gpu_time)
|
||||||
|
else:
|
||||||
|
result = None
|
||||||
|
last_idx = idx
|
||||||
|
|
||||||
event_a, event_b = cuda.Event().record(filt_stream), event_a
|
event_a, event_b = cuda.Event().record(filt_stream), event_a
|
||||||
h_out_a, h_out_b = h_out_b, h_out_a
|
h_out_a, h_out_b = h_out_b, h_out_a
|
||||||
last_idx = idx
|
|
||||||
|
# TODO: add ability to flush a frame without breaking the pipe
|
||||||
|
next_frame = yield result
|
||||||
|
|
||||||
while not event_a.query():
|
while not event_a.query():
|
||||||
timemod.sleep(0.001)
|
timemod.sleep(0.001)
|
||||||
gpu_time = event_a.time_since(event_b)
|
gpu_time = event_a.time_since(event_b)
|
||||||
yield RenderedImage(self._trim(h_out_b), last_idx, gpu_time)
|
yield RenderedImage(h_out_b, last_idx, gpu_time)
|
||||||
|
|
||||||
def _trim(self, result):
|
|
||||||
g = self.info.gutter
|
|
||||||
return result[g:-g,g:g+self.info.width].copy()
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user