From b43481e374ee5b7a5937c91ed631d5ffa5bff17e Mon Sep 17 00:00:00 2001 From: Steven Robertson Date: Thu, 15 Dec 2011 11:11:05 -0500 Subject: [PATCH] New genome format to support flockutil --- cuburn/code/filtering.py | 88 ++++++------- cuburn/code/interp.py | 7 +- cuburn/code/iter.py | 36 +++--- cuburn/genome.py | 179 +++++++++++--------------- cuburn/render.py | 268 ++++++++++++++++++++++++--------------- 5 files changed, 303 insertions(+), 275 deletions(-) diff --git a/cuburn/code/filtering.py b/cuburn/code/filtering.py index fdc1f86..905bc59 100644 --- a/cuburn/code/filtering.py +++ b/cuburn/code/filtering.py @@ -1,5 +1,7 @@ import numpy as np +from numpy import float32 as f32, int32 as i32 + import pycuda.compiler from pycuda.gpuarray import vec @@ -10,8 +12,8 @@ _CODE = ''' __global__ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow, - float linrange, float lingam, float3 bkgd, int fbsize, - int alpha_output_channel) { + float linrange, float lingam, float3 bkgd, + int fbsize, int blend_background_color) { int i = threadIdx.x + blockDim.x * (blockIdx.x + gridDim.x * blockIdx.y); if (i >= fbsize) return; @@ -30,8 +32,17 @@ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow, alpha = (1.0f - frac) * pix.w * lingam + frac * alpha; } - float ls = vibrancy * alpha / pix.w; + if (!blend_background_color) { + float ls = alpha / pix.w; + pix.x *= ls; + pix.y *= ls; + pix.z *= ls; + pix.w = alpha; + pixbuf[i] = pix; + return; + } + float ls = vibrancy * alpha / pix.w; alpha = fminf(1.0f, fmaxf(0.0f, alpha)); float maxc = fmaxf(pix.x, fmaxf(pix.y, pix.z)); @@ -64,29 +75,21 @@ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow, pix.y += (1.0f - vibrancy) * powf(opix.y, gamma); pix.z += (1.0f - vibrancy) * powf(opix.z, gamma); - if (alpha_output_channel) { - float one_alpha = 1.0f / alpha; - pix.x *= one_alpha; - pix.y *= one_alpha; - pix.z *= one_alpha; - } else { - pix.x += (1.0f - alpha) * bkgd.x; - pix.y += (1.0f - alpha) * bkgd.y; - pix.z += (1.0f - alpha) * bkgd.z; - } - pix.w = alpha; + pix.x += (1.0f - alpha) * bkgd.x; + pix.y += (1.0f - alpha) * bkgd.y; + pix.z += (1.0f - alpha) * bkgd.z; - // Clamp values. I think this is superfluous, but I'm not certain. pix.x = fminf(1.0f, pix.x); pix.y = fminf(1.0f, pix.y); pix.z = fminf(1.0f, pix.z); + pix.w = alpha; pixbuf[i] = pix; } #define W 15 // Filter width (regardless of standard deviation chosen) -#define W2 7 // Half of filter width, rounded down +#define W2 7 // Half of filter width, rounded down #define FW 46 // Width of local result storage (NW+W2+W2) #define FW2 (FW*FW) @@ -288,52 +291,45 @@ class Filtering(object): def __init__(self): self.init_mod() - def de(self, ddst, dsrc, info, start, stop, stream=None): - # TODO: use integration to obtain parameter values - t = (start + stop) / 2 - cp = info.genome - - k1 = np.float32(cp.color.brightness(t) * 268 / 256) + def de(self, ddst, dsrc, gnm, dim, tc, stream=None): + k1 = f32(gnm.color.brightness(tc) * 268 / 256) # Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now # s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w)) - area = info.height / (cp.camera.scale(t) ** 2 * info.width) - k2 = np.float32(1 / (area * info.density)) + area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w) + k2 = f32(1 / (area * gnm.spp(tc))) - if cp.de.radius == 0: - nbins = info.acc_height * info.acc_stride + if gnm.de.radius == 0: + nbins = dim.ah * dim.astride fun = self.mod.get_function("logscale") t = fun(dsrc, ddst, k1, k2, block=(512, 1, 1), grid=(nbins/512, 1), stream=stream) else: - scale_coeff = np.float32(-(1 + cp.de.radius(t)) ** -2.0) - est_curve = np.float32(2 * cp.de.curve(t)) + scale_coeff = f32(-(1 + gnm.de.radius(tc)) ** -2.0) + est_curve = f32(2 * gnm.de.curve(tc)) # TODO: experiment with this - edge_clamp = np.float32(2.0) + edge_clamp = f32(1.2) fun = self.mod.get_function("density_est") fun(dsrc, ddst, scale_coeff, est_curve, edge_clamp, k1, k2, - np.int32(info.acc_height), np.int32(info.acc_stride), - block=(32, 32, 1), grid=(info.acc_width/32, 1), stream=stream) + i32(dim.ah), i32(dim.astride), block=(32, 32, 1), + grid=(dim.aw/32, 1), stream=stream) - def colorclip(self, dbuf, info, start, stop, stream=None): - f32 = np.float32 - t = (start + stop) / 2 - cp = info.genome - nbins = info.acc_height * info.acc_stride + def colorclip(self, dbuf, gnm, dim, tc, blend, stream=None): + nbins = dim.ah * dim.astride # TODO: implement integration over cubic splines? - gam = f32(1 / cp.color.gamma(t)) - vib = f32(cp.color.vibrancy(t)) - hipow = f32(cp.color.highlight_power(t)) - lin = f32(cp.color.gamma_threshold(t)) + gam = f32(1 / gnm.color.gamma(tc)) + vib = f32(gnm.color.vibrancy(tc)) + hipow = f32(gnm.color.highlight_power(tc)) + lin = f32(gnm.color.gamma_threshold(tc)) lingam = f32(lin ** (gam-1.0) if lin > 0 else 0) bkgd = vec.make_float3( - cp.color.background.r(t), - cp.color.background.g(t), - cp.color.background.b(t)) + gnm.color.background.r(tc), + gnm.color.background.g(tc), + gnm.color.background.b(tc)) color_fun = self.mod.get_function("colorclip") blocks = int(np.ceil(np.sqrt(nbins / 256))) - color_fun(dbuf, gam, vib, hipow, lin, lingam, bkgd, np.int32(nbins), - np.int32(0), - block=(256, 1, 1), grid=(blocks, blocks), stream=stream) + color_fun(dbuf, gam, vib, hipow, lin, lingam, bkgd, i32(nbins), + i32(blend), block=(256, 1, 1), grid=(blocks, blocks), + stream=stream) diff --git a/cuburn/code/interp.py b/cuburn/code/interp.py index 5b4961f..d363ac1 100644 --- a/cuburn/code/interp.py +++ b/cuburn/code/interp.py @@ -226,8 +226,11 @@ class GenomePacker(HunkOCode): _defs = Template(r""" __global__ -void interp_{{tname}}({{tname}}* out, float *times, float *knots, - float tstart, float tstep, mwc_st *rctxes, int maxid) { +void interp_{{tname}}( + {{tname}}* out, mwc_st *rctxes, + const float *times, const float *knots, + float tstart, float tstep, int maxid +) { int id = gtid(); if (id >= maxid) return; out = &out[id]; diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py index 30e0e02..549e835 100644 --- a/cuburn/code/iter.py +++ b/cuburn/code/iter.py @@ -53,7 +53,7 @@ def precalc_chaos(pcp, std_xforms): """).substitute(locals())) -def precalc_camera(info, pcam): +def precalc_camera(pcam): pre_cam = pcam._precalc() # Maxima code to check my logic: @@ -68,7 +68,7 @@ def precalc_camera(info, pcam): float rot = {{pre_cam.rotation}} * M_PI / 180.0f; float rotsin = sin(rot), rotcos = cos(rot); float cenx = {{pre_cam.center.x}}, ceny = {{pre_cam.center.y}}; - float scale = {{pre_cam.scale}} * {{info.width}}; + float scale = {{pre_cam.scale}} * acc_size.width; float ditherwidth = {{pre_cam.dither_width}} * 0.33f; float u0 = mwc_next_01(rctx); @@ -81,12 +81,12 @@ def precalc_camera(info, pcam): {{pre_cam._set('xx')}} = scale * rotcos; {{pre_cam._set('xy')}} = scale * -rotsin; {{pre_cam._set('xo')}} = scale * (rotsin * ceny - rotcos * cenx) - + {{0.5 * (info.width + info.gutter + 1)}} + ditherx; + + 0.5f * acc_size.awidth + ditherx; {{pre_cam._set('yx')}} = scale * rotsin; {{pre_cam._set('yy')}} = scale * rotcos; {{pre_cam._set('yo')}} = scale * -(rotsin * cenx + rotcos * ceny) - + {{0.5 * (info.height + info.gutter + 1)}} + dithery; + + 0.5f * acc_size.aheight + dithery; """).substitute(locals())) @@ -113,13 +113,12 @@ class IterCode(HunkOCode): # The number of threads per block NTHREADS = 256 - def __init__(self, info): - self.info = info + def __init__(self, info, genome): self.packer = interp.GenomePacker('iter_params') - self.pcp = self.packer.view('params', self.info.genome, 'cp') + self.pcp = self.packer.view('params', genome, 'cp') - iterbody = self._iterbody() - bodies = [self._xfbody(i,x) for i,x in sorted(info.genome.xforms.items())] + iterbody = self._iterbody(info, genome) + bodies = [self._xfbody(i,x) for i,x in sorted(genome.xforms.items())] bodies.append(iterbody) self.defs = '\n'.join(bodies) @@ -132,7 +131,9 @@ __device__ int rb_head, rb_tail, rb_size; typedef struct { uint32_t width; uint32_t height; - uint32_t stride; + uint32_t awidth; + uint32_t aheight; + uint32_t astride; } acc_size_t; __constant__ acc_size_t acc_size; @@ -174,7 +175,7 @@ void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) { g.update(locals()) return tmpl.substitute(g) - def _iterbody(self): + def _iterbody(self, info, genome): tmpl = Template(r''' __global__ void reset_rb(int size) { @@ -331,7 +332,7 @@ void iter( float cx, cy, cc; - {{precalc_camera(info, pcp.camera)}} + {{precalc_camera(pcp.camera)}} {{if 'final' in cp.xforms}} {{apply_affine('fx', 'fy', 'cx', 'cy', pcp.camera)}} @@ -343,14 +344,14 @@ void iter( uint32_t ix = trunca(cx), iy = trunca(cy); - if (ix >= acc_size.width || iy >= acc_size.height) { + if (ix >= acc_size.awidth || iy >= acc_size.aheight) { {{if info.acc_mode == 'deferred'}} *log = 0xffffffff; {{endif}} continue; } - uint32_t i = iy * acc_size.stride + ix; + uint32_t i = iy * acc_size.astride + ix; {{if info.acc_mode == 'atomic'}} asm volatile ({{crep(""" { @@ -626,12 +627,11 @@ oflow_write_end: {{endif}} ''', 'iter_kern') return tmpl.substitute( - info = self.info, - cp = self.info.genome, + info = info, + cp = genome, pcp = self.pcp, NTHREADS = self.NTHREADS, NWARPS = self.NTHREADS / 32, - std_xforms = [n for n in sorted(self.info.genome.xforms) - if n != 'final'], + std_xforms = [n for n in sorted(genome.xforms) if n != 'final'], **globals()) diff --git a/cuburn/genome.py b/cuburn/genome.py index 8386bf7..2d9e1de 100644 --- a/cuburn/genome.py +++ b/cuburn/genome.py @@ -79,115 +79,86 @@ class SplEval(object): return self.knots[1][0] return list(self.knots.T.flat) - @classmethod - def wrap(cls, obj): - """ - Given a dict 'obj' representing, for instance, a Genome object, walk - through the object recursively and in-place, turning any number or - list of numbers into an SplEval. - """ - for k, v in obj.items(): - if (isinstance(v, (float, int)) or - (isinstance(v, list) and isinstance(v[1], (float, int)))): - obj[k] = cls(v) - elif isinstance(v, dict): - cls.wrap(v) - -class RenderInfo(object): - """ - Determine features and constants required to render a particular set of - genomes. The values of this class are fixed before compilation begins. - """ - # Number of iterations to iterate without write after generating a new - # point. This number is currently fixed pretty deeply in the set of magic - # constants which govern buffer sizes; changing the value here won't - # actually change the code on the device to do something different. - fuse = 256 - - # Height of the texture pallete which gets uploaded to the GPU (assuming - # that palette-from-texture is enabled). For most genomes, this doesn't - # need to be very large at all. However, since only an easily-cached - # fraction of this will be accessed per SM, larger values shouldn't hurt - # performance too much. When using deferred accumulation, increasing this - # value increases the number of uniquely-dithered samples, which is nice. - # Power-of-two, please. - palette_height = 64 - - # Maximum width of DE and other spatial filters, and thus in turn the - # amount of padding applied. Note that, for now, this must not be changed! - # The filtering code makes deep assumptions about this value. - gutter = 15 - - # TODO: for now, we always throw away the alpha channel before writing. - # All code is in place to not do this, we just need to find a way to expose - # this preference via the API (or push alpha blending entirely on the client, - # which I'm not opposed to) - alpha_output_channel = False - - # There are three settings for this somewhat ersatz paramater. 'global' - # uses unsynchronized global writes to accumulate sample points, 'atomic' - # uses atomic global writes, and 'deferred' stores color and position in a - # sample log, sorts the log by position, and uses shared memory to - # perform the accumulation. Deferred has the accuracy of 'atomic' and - # the speed of 'global' (it's actually faster!), but packs color and - # position into a single 32-bit int for now, which limits resolution to - # 1080p when xform opacity is respected, so the other two modes will hang - # around until that can be extended to be memory-limited again. - acc_mode = 'atomic' - - # TODO: fix this - chaos_used = False - - def __init__(self, db, **kwargs): - self.db = db - # Copy all args into this object's namespace - self.__dict__.update(kwargs) - - self.acc_width = self.width + 2 * self.gutter - self.acc_height = self.height + 2 * self.gutter - self.acc_stride = 32 * int(np.ceil(self.acc_width / 32.)) - self.density = self.quality - - # Deref genome - self.genome = self.db.genomes[self.genome] - - for k, v in self.db.palettes.items(): - pal = np.fromstring(base64.b64decode(v), np.uint8) - pal = np.reshape(pal, (256, 3)) - pal_a = np.ones((256, 4), np.float32) - pal_a[:,:3] = pal / 255.0 - self.db.palettes[k] = pal_a +class Palette(object): + """Wafer-thin wrapper around palettes. For the future!""" + def __init__(self, datastr, fmt='rgb8'): + if fmt != 'rgb8': + raise NotImplementedError + if len(datastr) != 768: + raise ValueError("Unsupported palette width") + self.width = 256 + pal = np.reshape(np.fromstring(datastr, np.uint8), (256, 3)) + self.data = np.ones((256, 4), np.float32) + self.data[:,:3] = pal / 255.0 class _AttrDict(dict): def __getattr__(self, name): return self[name] -def load_info(contents): - result = json.loads(contents, object_hook=_AttrDict) - SplEval.wrap(result.genomes) + @classmethod + def _wrap(cls, dct): + for k, v in dct.items(): + if (isinstance(v, (float, int)) or + (isinstance(v, list) and isinstance(v[1], (float, int)))): + dct[k] = SplEval(v) + elif isinstance(v, dict): + dct[k] = cls._wrap(cls(v)) + return dct - # A Job object will have more details or something - result = RenderInfo(result, **result.renders.values()[0]) - return result +class Genome(_AttrDict): + # For now, we base the Genome class on an _AttrDict, letting its structure + # be defined implicitly by the way it is used in device code. More formal + # unpacking will happen soon. + def __init__(self, gnm, base_den): + super(Genome, self).__init__(gnm) + for k, v in self.items(): + v = _AttrDict(v) + if k not in ('info', 'time'): + _AttrDict._wrap(v) + self[k] = v + # TODO: this is a hack, figure out how to solve it more elegantly + self.spp = SplEval(self.camera.density.knotlist) + self.spp.knots[1] *= base_den + # TODO: decide how to handle palettes. For now, it's the caller's + # responsibility to replace this list with actual palettes. + pal = self.color.palette + if isinstance(pal, basestring): + self.color.palette = [(0.0, pal), (1.0, pal)] + elif isinstance(pal, list): + self.color.palette = zip(pal[::2], pal[1::2]) -class HacketyGenome(object): - """ - Holdover class to postpone a very deep refactoring as long as possible. - Converts property accesses into interpolations over predetermined times. - """ - def __init__(self, referent, times): - # Times can be singular - self.referent, self.times = referent, times - def __getattr__(self, name): - r = getattr(self.referent, str(name)) - if isinstance(r, _AttrDict): - return HacketyGenome(r, self.times) - elif isinstance(r, SplEval): - return r(self.times) - return r - __getitem__ = __getattr__ + # TODO: caller also needs to call set_timing() + self.adj_frame_width = None + self.canonical_right = (not self.get('link') or not self.link == 'self' + or not self.link.get('right')) -if __name__ == "__main__": - import sys - import pprint - pprint.pprint(read_genome(sys.stdin)) + def set_timing(self, base_dur, fps, offset=0.0, err_spread=True): + """ + Set frame timing. Must be called at least once prior to rendering. + """ + # TODO: test! + dur = self.time.duration + if isinstance(dur, basestring): + clock = float(dur[:-1]) + offset + else: + clock = dur * base_dur + offset + if self.canonical_right: + nframes = int(np.floor(clock * fps)) + else: + nframes = int(np.ceil(clock * fps)) + err = (clock - nframes / fps) / clock + + fw = self.time.frame_width + if not isinstance(fw, list): + fw = [0, fw, 1, fw] + fw = [float(f[:-1]) * fps if isinstance(f, basestring) + else float(f) / (clock * fps) for f in fw] + self.adj_frame_width = SplEval(fw) + + times = np.linspace(offset, 1 - err, nframes + 1) + # Move each time to a center time, and discard the last value + times = times[:-1] + 0.5 * (times[1] - times[0]) + if err_spread: + epts = np.linspace(-2*np.pi, 2*np.pi, nframes) + times = times + 0.5 * err * (np.tanh(epts) + 1) + return err, times diff --git a/cuburn/render.py b/cuburn/render.py index 977a73a..8b4f2bb 100644 --- a/cuburn/render.py +++ b/cuburn/render.py @@ -4,11 +4,11 @@ import re import time as timemod import tempfile from collections import namedtuple -from itertools import cycle, repeat, chain, izip +from itertools import cycle, repeat, chain, izip, imap, ifilter from ctypes import * from cStringIO import StringIO import numpy as np -from numpy import int32 as i32, uint64 as u64 +from numpy import float32 as f32, int32 as i32, uint32 as u32, uint64 as u64 from scipy import ndimage from fr0stlib import pyflam3 @@ -24,41 +24,55 @@ from cuburn import affine from cuburn.code import util, mwc, iter, filtering, sort RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time') +Dimensions = namedtuple('Dimensions', 'w h aw ah astride') def _sync_stream(dst, src): dst.wait_for_event(cuda.Event(cuda.event_flags.DISABLE_TIMING).record(src)) +def argset(obj, **kwargs): + for k, v in kwargs.items(): + setattr(obj, k, v) + return obj + class Renderer(object): """ Control structure for rendering a series of frames. - - Each animation will dynamically generate a kernel that includes only the - code necessary to render the genomes provided. The process of generating - and uploading the kernel takes a small but finite amount of time. In - general, the kernel generated for all genomes resulting from interpolating - between two control points will have identical performance, so it is - wasteful to create more than one animation for any interpolated sequence. - - However, genome sequences interpolated from three or more control points - with different features enabled will have the code needed to render all - genomes enabled for every frame. Doing this can hurt performance. - - In other words, it's best to use exactly one Animation for each - interpolated sequence between one or two genomes. """ + # Number of iterations to iterate without write after generating a new + # point. This number is currently fixed pretty deeply in the set of magic + # constants which govern buffer sizes; changing the value here won't + # actually change the code on the device to do something different. + fuse = 256 + + # The palette texture/surface covers the color coordinate from [0,1] with + # (for now, a fixed 256) equidistant horizontal samples, and spans the + # temporal range of the frame linearly with this many rows. Increasing + # this value increases the number of uniquely-dithered samples when using + # pre-dithered surfaces. + palette_height = 64 + + # Maximum width of DE and other spatial filters, and thus in turn the + # amount of padding applied. Note that, for now, this must not be changed! + # The filtering code makes deep assumptions about this value. + gutter = 15 + + # Accumulation mode. Leave it at 'atomic' for now. + acc_mode = 'atomic' + + # TODO + chaos_used = False + cmp_options = ('-use_fast_math', '-maxrregcount', '42') keep = False - def __init__(self, info): - self.info = info + def __init__(self): self._iter = self.src = self.cubin = self.mod = None - self.packed_genome = None # Ensure class options don't get contaminated on an instance self.cmp_options = list(self.cmp_options) - def compile(self, keep=None, cmp_options=None, jit_options=[]): + def compile(self, genome, keep=None, cmp_options=None): """ Compile a kernel capable of rendering every frame in this animation. The resulting compiled kernel is stored in the ``cubin`` property; @@ -73,7 +87,7 @@ class Renderer(object): keep = self.keep if keep is None else keep cmp_options = self.cmp_options if cmp_options is None else cmp_options - self._iter = iter.IterCode(self.info) + self._iter = iter.IterCode(self, genome) self._iter.packer.finalize() self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer, self._iter) @@ -82,41 +96,81 @@ class Renderer(object): self.cubin = pycuda.compiler.compile( self.src, keep=keep, options=cmp_options, cache_dir=False if keep else None) + + def load(self, genome, jit_options=[]): + if not self.cubin: + self.compile(genome) self.mod = cuda.module_from_buffer(self.cubin, jit_options) with open('/tmp/iter_kern.cubin', 'wb') as fp: fp.write(self.cubin) return self.src - def render(self, times): + def render(self, genome, times, width, height, blend=True): """ - Render a flame for each genome in the iterable value 'genomes'. - Returns a RenderedImage object with the rendered buffer in the - requested format (3D RGBA ndarray only for now). + Render a frame for each timestamp in the iterable value ``times``. This + function returns a generator that will yield a RenderedImage object + containing a shared reference to the output buffer for each specified + frame. - This method produces a considerable amount of side effects, and should - not be used lightly. Things may go poorly for you if this method is not - allowed to run until completion (by exhausting all items in the - generator object). + The returned buffer is page-locked host memory. Between the time a + buffer is yielded and the time the next frame's results are requested, + the buffer will not be modified. Thereafter, however, it will be + overwritten by an asynchronous DMA operation coming from the CUDA + device. If you hang on to it for longer than one frame, copy it. - ``times`` is a sequence of (idx, start, stop) times, where index is - the logical frame number (though it can be any value) and 'start' and - 'stop' together define the time range to be rendered for each frame. + ``genome`` is the genome to be rendered. Successive calls to the + `render()` method on one ``Renderer`` object must use genomes which + produce identical compiled code, and this will not be verified by the + renderer. In practice, this means you can alter genome parameter + values, but the full set of keys must remain identical between runs on + the same renderer. + + ``times`` is a list of (idx, cen_time) tuples, where ``idx`` is passed + unmodified in the RenderedImage return value and ``cen_time`` is the + central time of the current frame in spline-time units. (Any + clock-time or frame-time units in the genome should be preconverted.) + + If ``blend`` is False, the output buffer will contain unclipped, + premultiplied RGBA data, without vibrancy, highlight power, or the + alpha elbow applied. """ - if times == []: + r = self.render_gen(genome, width, height, blend=blend) + next(r) + return ifilter(None, imap(r.send, chain(times, [None]))) + + def render_gen(self, genome, width, height, blend=True): + """ + Render frames. This method is wrapped by the ``render()`` method; see + its docstring for warnings and details. + + Instead of passing frame times as an iterable, they are passed + individually via the ``generator.send()`` method. There is an + internal pipeline latency of one frame, so the first call to the + ``send()`` method will return None, the second call will return the + first frame's result, and so on. To retrieve the last frame in a + sequence, send ``None``. + + Direct use of this method is useful for implementing render servers. + """ + + last_idx = None + next_frame = yield + if next_frame is None: return + if not self.mod: + self.load(genome) + filt = filtering.Filtering() reset_rb_fun = self.mod.get_function("reset_rb") packer_fun = self.mod.get_function("interp_iter_params") iter_fun = self.mod.get_function("iter") - info = self.info - # The synchronization model is messy. See helpers/task_model.svg. iter_stream = cuda.Stream() filt_stream = cuda.Stream() - if info.acc_mode == 'deferred': + if self.acc_mode == 'deferred': write_stream = cuda.Stream() write_fun = self.mod.get_function("write_shmem") else: @@ -128,19 +182,30 @@ class Renderer(object): event_a = cuda.Event().record(filt_stream) event_b = None - nbins = info.acc_height * info.acc_stride + awidth = width + 2 * self.gutter + aheight = height + 2 * self.gutter + astride = 32 * int(np.ceil(awidth / 32.)) + dim = Dimensions(width, height, awidth, aheight, astride) + d_acc_size = self.mod.get_global('acc_size')[0] + cuda.memcpy_htod_async(d_acc_size, u32(list(dim)), write_stream) + + nbins = awidth * aheight # Extra padding in accum helps with write_shmem overruns d_accum = cuda.mem_alloc(16 * nbins + (1<<16)) d_out = cuda.mem_alloc(16 * nbins) - if info.acc_mode == 'atomic': + if self.acc_mode == 'atomic': d_atom = cuda.mem_alloc(8 * nbins) flush_fun = self.mod.get_function("flush_atom") - acc_size = np.array([info.acc_width, info.acc_height, info.acc_stride]) - d_acc_size = self.mod.get_global('acc_size')[0] - cuda.memcpy_htod_async(d_acc_size, np.uint32(acc_size), write_stream) + obuf_copy = argset(cuda.Memcpy2D(), + src_y=self.gutter, src_x_in_bytes=16*self.gutter, + src_pitch=16*astride, dst_pitch=16*width, + width_in_bytes=16*width, height=height) + obuf_copy.set_src_device(d_out) + h_out_a = cuda.pagelocked_empty((height, width, 4), f32) + h_out_b = cuda.pagelocked_empty((height, width, 4), f32) - if info.acc_mode == 'deferred': + if self.acc_mode == 'deferred': # Having a fixed, power-of-two log size makes things much easier log_size = 64 << 20 d_log = cuda.mem_alloc(log_size * 4) @@ -153,9 +218,8 @@ class Renderer(object): # Calculate 'nslots', the number of simultaneous running threads that # can be active on the GPU during iteration (and thus the number of # slots for loading and storing RNG and point context that will be - # prepared on the device), 'rb_size' (the number of blocks in - # 'nslots'), and determine a number of temporal samples - # likely to load-balance effectively + # prepared on the device), and derive 'rb_size', the number of blocks in + # 'nslots'. iter_threads_per_block = 256 dev_data = pycuda.tools.DeviceData() occupancy = pycuda.tools.OccupancyRecord( @@ -169,14 +233,16 @@ class Renderer(object): reset_rb_fun(np.int32(rb_size), block=(1,1,1)) d_points = cuda.mem_alloc(nslots * 16) - # We may add extra seeds to simplify palette dithering. - seeds = mwc.MWC.make_seeds(max(nslots, 256 * info.palette_height)) + # This statement may add extra seeds to simplify palette dithering. + seeds = mwc.MWC.make_seeds(max(nslots, 256 * self.palette_height)) d_seeds = cuda.to_device(seeds) # We used to auto-calculate this to a multiple of the number of SMs on # the device, but since we now use shorter launches and, to a certain # extent, allow simultaneous occupancy, that's not as important. The - # 1024 is a magic constant, though: FUSE + # 1024 is a magic constant to ensure reasonable and power-of-two log + # size for deferred: 256MB / (4B * FUSE * NTHREADS). Enhancements to + # the sort engine are needed to make this more flexible. ntemporal_samples = 1024 genome_times, genome_knots = self._iter.packer.pack() d_genome_times = cuda.to_device(genome_times) @@ -184,37 +250,31 @@ class Renderer(object): info_size = 4 * len(self._iter.packer) * ntemporal_samples d_infos = cuda.mem_alloc(info_size) - pals = info.genome.color.palette + pals = genome.color.palette if isinstance(pals, basestring): pals = [0.0, pals, 1.0, pals] - palint_times = np.empty(len(genome_times[0]), np.float32) + palint_times = np.empty(len(genome_times[0]), f32) palint_times.fill(100.0) - palint_times[:len(pals)/2] = pals[::2] + palint_times[:len(pals)] = [p[0] for p in pals] d_palint_times = cuda.to_device(palint_times) d_palint_vals = cuda.to_device( - np.concatenate(map(info.db.palettes.get, pals[1::2]))) + np.concatenate([p[1].data for p in pals])) - if info.acc_mode in ('deferred', 'atomic'): + if self.acc_mode in ('deferred', 'atomic'): palette_fun = self.mod.get_function("interp_palette_hsv_flat") - dsc = cuda.ArrayDescriptor3D() - dsc.height = info.palette_height - dsc.width = 256 - dsc.depth = 0 - dsc.format = cuda.array_format.SIGNED_INT32 - dsc.num_channels = 2 - dsc.flags = cuda.array3d_flags.SURFACE_LDST + dsc = argset(cuda.ArrayDescriptor3D(), height=self.palette_height, + width=256, depth=0, format=cuda.array_format.SIGNED_INT32, + num_channels=2, flags=cuda.array3d_flags.SURFACE_LDST) palarray = cuda.Array(dsc) tref = self.mod.get_surfref('flatpal') tref.set_array(palarray, 0) else: palette_fun = self.mod.get_function("interp_palette_hsv") - dsc = cuda.ArrayDescriptor() - dsc.height = info.palette_height - dsc.width = 256 - dsc.format = cuda.array_format.UNSIGNED_INT8 - dsc.num_channels = 4 - d_palmem = cuda.mem_alloc(256 * info.palette_height * 4) + dsc = argset(cuda.ArrayDescriptor(), height=self.palette_height, + width=256, format=cuda.array_format.UNSIGNED_INT8, + num_channels=4) + d_palmem = cuda.mem_alloc(256 * self.palette_height * 4) tref = self.mod.get_texref('palTex') tref.set_address_2d(d_palmem, dsc, 1024) @@ -222,49 +282,46 @@ class Renderer(object): tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) tref.set_filter_mode(cuda.filter_mode.LINEAR) - h_out_a = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4), - np.float32) - h_out_b = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4), - np.float32) - last_idx = None + while next_frame is not None: + # tc, td, ts, te: central, delta, start, end times + idx, tc = next_frame + td = genome.adj_frame_width(tc) + ts, te = tc - 0.5 * td, tc + 0.5 * td - for idx, start, stop in times: - twidth = np.float32((stop-start) / info.palette_height) - if info.acc_mode in ('deferred', 'atomic'): - palette_fun(d_seeds, d_palint_times, d_palint_vals, - np.float32(start), twidth, - block=(256,1,1), grid=(info.palette_height,1), - stream=write_stream) + if self.acc_mode in ('deferred', 'atomic'): + # In this mode, the palette writes to a surface reference, but + # requires dithering, so we pass it the seeds instead + arg0 = d_seeds else: - palette_fun(d_palmem, d_palint_times, d_palint_vals, - np.float32(start), twidth, - block=(256,1,1), grid=(info.palette_height,1), - stream=write_stream) + arg0 = d_palmem + palette_fun(arg0, d_palint_times, d_palint_vals, + f32(ts), f32(td / self.palette_height), + block=(256,1,1), grid=(self.palette_height,1), + stream=write_stream) - width = np.float32((stop-start) / ntemporal_samples) - packer_fun(d_infos, d_genome_times, d_genome_knots, - np.float32(start), width, d_seeds, - np.int32(ntemporal_samples), block=(256,1,1), + packer_fun(d_infos, d_seeds, d_genome_times, d_genome_knots, + f32(ts), f32(td / ntemporal_samples), + i32(ntemporal_samples), block=(256,1,1), grid=(int(np.ceil(ntemporal_samples/256.)),1), stream=iter_stream) # Reset points so that they will be FUSEd util.BaseCode.fill_dptr(self.mod, d_points, 4 * nslots, - iter_stream, np.float32(np.nan)) + iter_stream, f32(np.nan)) # Get interpolated control points for debugging #iter_stream.synchronize() #d_temp = cuda.from_device(d_infos, - #(ntemporal_samples, len(self._iter.packer)), np.float32) + #(ntemporal_samples, len(self._iter.packer)), f32) #for i, n in zip(d_temp[5], self._iter.packer.packed): #print '%60s %g' % ('_'.join(n), i) util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, write_stream) - if info.acc_mode == 'atomic': + if self.acc_mode == 'atomic': util.BaseCode.fill_dptr(self.mod, d_atom, 2 * nbins, write_stream) - nrounds = ( (info.density * info.width * info.height) - / (ntemporal_samples * 256 * 256) ) + 1 - if info.acc_mode == 'deferred': + nrounds = int( (genome.spp(tc) * width * height) + / (ntemporal_samples * 256 * 256) ) + 1 + if self.acc_mode == 'deferred': for i in range(nrounds): iter_fun(np.uint64(d_log), d_seeds, d_points, d_infos, block=(32, self._iter.NTHREADS/32, 1), @@ -272,18 +329,17 @@ class Renderer(object): _sync_stream(write_stream, iter_stream) sorter.sort(d_log_sorted, d_log, log_size, 3, True, stream=write_stream) - #print cuda.from_device(sorter.dglobal, (256,), np.uint32) _sync_stream(iter_stream, write_stream) write_fun(d_accum, d_log_sorted, sorter.dglobal, i32(nbins), block=(1024, 1, 1), grid=(nwriteblocks, 1), stream=write_stream) else: args = [u64(d_accum), d_seeds, d_points, d_infos] - if info.acc_mode == 'atomic': + if self.acc_mode == 'atomic': args.append(u64(d_atom)) iter_fun(*args, block=(32, self._iter.NTHREADS/32, 1), grid=(ntemporal_samples, nrounds), stream=iter_stream) - if info.acc_mode == 'atomic': + if self.acc_mode == 'atomic': nblocks = int(np.ceil(np.sqrt(nbins/float(512)))) flush_fun(u64(d_accum), u64(d_atom), i32(nbins), block=(512, 1, 1), grid=(nblocks, nblocks), @@ -291,27 +347,29 @@ class Renderer(object): util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream) _sync_stream(filt_stream, write_stream) - filt.de(d_out, d_accum, info, start, stop, filt_stream) + filt.de(d_out, d_accum, genome, dim, tc, stream=filt_stream) _sync_stream(write_stream, filt_stream) - filt.colorclip(d_out, info, start, stop, filt_stream) - cuda.memcpy_dtoh_async(h_out_a, d_out, filt_stream) + filt.colorclip(d_out, genome, dim, tc, blend, stream=filt_stream) + obuf_copy.set_dst_host(h_out_a) + obuf_copy(filt_stream) if event_b: while not event_a.query(): timemod.sleep(0.01) gpu_time = event_a.time_since(event_b) - yield RenderedImage(self._trim(h_out_b), last_idx, gpu_time) + result = RenderedImage(h_out_b, last_idx, gpu_time) + else: + result = None + last_idx = idx event_a, event_b = cuda.Event().record(filt_stream), event_a h_out_a, h_out_b = h_out_b, h_out_a - last_idx = idx + + # TODO: add ability to flush a frame without breaking the pipe + next_frame = yield result while not event_a.query(): timemod.sleep(0.001) gpu_time = event_a.time_since(event_b) - yield RenderedImage(self._trim(h_out_b), last_idx, gpu_time) - - def _trim(self, result): - g = self.info.gutter - return result[g:-g,g:g+self.info.width].copy() + yield RenderedImage(h_out_b, last_idx, gpu_time)