mirror of
synced 2025-03-13 15:01:28 -04:00
Support x264 10-bit output format.
This commit is contained in:
@ -1,15 +1,16 @@
from util import devlib, ringbuflib
from mwc import mwclib
f32tou8lib = devlib(deps=[ringbuflib, mwclib], defs=r'''
rgba8lib = devlib(deps=[ringbuflib, mwclib], defs=r'''
// Perform a conversion from float32 values to uint8 ones, applying
// pixel- and channel-independent dithering to reduce suprathreshold banding
// artifacts. Clamps values larger than 1.0f.
// TODO: move to a separate module?
// TODO: less inefficient mwc_st handling?
__global__ void f32_to_u8(
ringbuf *rb, mwc_st *rctxs, uchar4 *dst, const float4 *src,
int gutter, int dstride, int sstride, int height)
__global__ void f32_to_rgba_u8(
uchar4 *dst, const float4 *src,
int gutter, int dstride, int sstride, int height,
ringbuf *rb, mwc_st *rctxs)
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -32,3 +33,34 @@ __global__ void f32_to_u8(
rctxs[rb_incr(rb->tail, tid)] = rctx;
rgba16lib = devlib(deps=[ringbuflib, mwclib], defs=r'''
// Perform a conversion from float32 values to uint16 ones, as above.
// Convert one float4 pixel per thread to ushort4, adding a small random
// per-channel offset before the float-to-integer conversion. Channel values
// above 1.0f are clamped; values below 0.0f are not.
//
//   dst      destination buffer, `dstride` pixels per row
//   src      source buffer, `sstride` pixels per row, with a `gutter`-pixel
//            border on the top and left that is skipped when reading
//   rb       ring buffer whose head/tail select this thread's RNG state
//   rctxs    RNG state storage, indexed through `rb`
__global__ void f32_to_rgba_u16(
        ushort4 *dst, const float4 *src,
        int gutter, int dstride, int sstride, int height,
        ringbuf *rb, mwc_st *rctxs)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    // Valid destination coordinates are [0, dstride) x [0, height). The
    // grid is rounded up to whole blocks, so threads AT the limit must also
    // bail out: x == dstride would clobber the first pixel of the next row,
    // and y == height would write past the end of `dst`.
    if (x >= dstride || y >= height) return;
    int isrc = sstride * (y + gutter) + x + gutter;

    // RNG state is indexed by the thread's position within its block.
    int tid = blockDim.x * threadIdx.y + threadIdx.x;
    mwc_st rctx = rctxs[rb_incr(rb->head, tid)];

    float4 in = src[isrc];
    ushort4 out = make_ushort4(
        fminf(1.0f, in.x) * 65535.0f + 0.49f * mwc_next_11(rctx),
        fminf(1.0f, in.y) * 65535.0f + 0.49f * mwc_next_11(rctx),
        fminf(1.0f, in.z) * 65535.0f + 0.49f * mwc_next_11(rctx),
        fminf(1.0f, in.w) * 65535.0f + 0.49f * mwc_next_11(rctx)
    );

    int idst = dstride * y + x;
    dst[idst] = out;
    // Store the advanced RNG state at the tail slot.
    rctxs[rb_incr(rb->tail, tid)] = rctx;
}
pixfmtlib = devlib(deps=[rgba8lib, rgba16lib])
@ -115,6 +115,8 @@ profile = (
, 'end': Scalar(None, 'Last frame to render (1-indexed, exclusive; '
'negative indexes from the end)')
, 'skip': Scalar(0, 'Skip this many frames between each rendered frame')
, 'shard': Scalar(0, 'Pack this many frames in each output file '
'(causing start, end, and skip to be ignored)')
, 'height': Scalar(1920, 'Output height in pixels')
, 'width': Scalar(1080, 'Output width in pixels')
@ -123,7 +125,9 @@ profile = (
, 'filter_order': list_(enum(filters.keys()), default_filters)
, 'filters': prof_filters
, 'output_format': enum('jpg png tif', 'jpg')
# The other keys in the 'output' dictionary are format-specific and not
# documented here.
, 'output': {'type': enum('jpeg png tiff x264', 'jpeg')}
# Types recognized as independent units with a 'type' key
@ -1,10 +1,14 @@
import os
import tempfile
from cStringIO import StringIO
from subprocess import Popen, PIPE
import numpy as np
from numpy import float32 as f32, int32 as i32
import pycuda.driver as cuda
from code.util import ClsMod, launch
from code.output import f32tou8lib
from code.output import pixfmtlib
import scipy.misc
@ -12,39 +16,209 @@ if not hasattr(scipy.misc, 'toimage'):
raise ImportError("Could not find scipy.misc.toimage. "
"Are scipy and PIL installed?")
def launchC(name, mod, stream, dim, fb, *args):
    """
    Launch a pixel-format conversion kernel over a whole frame.

    The kernel `name` from module `mod` is launched on `stream` with
    32x8 thread blocks and a grid rounded up to cover `dim.w` x `dim.h`
    pixels. The fixed leading kernel arguments are the framebuffer's back
    (destination) and front (source) device pointers, followed by the
    gutter, destination stride, source stride and height; any extra `args`
    are appended after those.
    """
    block = (32, 8, 1)
    grid = (int(np.ceil(dim.w / 32.)), int(np.ceil(dim.h / 8.)))
    launch(name, mod, stream, block, grid,
           fb.d_back, fb.d_front,
           i32(fb.gutter), i32(dim.w), i32(dim.astride), i32(dim.h),
           # Forward the caller's trailing kernel arguments (e.g. the RNG
           # ring buffer and state pointers).
           *args)
class Output(object):
    """
    Abstract interface for output modules, which convert a filtered
    framebuffer on the device, copy it to the host, and encode it.

    Every method raises NotImplementedError here; subclasses override them.
    """

    def convert(self, fb, gnm, dim, stream=None):
        """
        Convert a filtered buffer to whatever output format is needed by the
        encoder.

        This function is intended for use by the Renderer, and should not be
        called by clients. It does not modify its instance.
        """
        raise NotImplementedError()

    def copy(self, fb, dim, pool, stream=None):
        """
        Schedule a copy from the device buffer to host memory, returning the
        target buffer(s).

        This function is intended for use by the Renderer, and should not be
        called by clients. It does not modify its instance.
        """
        raise NotImplementedError()

    def encode(self, host_frame):
        """
        Push `host_frame` (as returned from `Output.copy`) into the encoding
        pipeline, and return any completed media segments. If `host_frame` is
        None, flush the encoding pipeline.

        The return value is a 2-tuple `(media, logs)`. `media` is a dictionary
        mapping channel names (appropriate for use as file suffixes) to
        file-like objects containing the encoded media segments. `logs` is a
        list of `(key, text)` log entries. Either or both can be empty at
        any time (and will typically be either populated on each frame
        except the flush, for non-temporal codecs, or will be empty on all
        frames except the flush, for temporal codecs.)

        Media segments are discretely decodeable chunks of content. The
        mapping of media segments to individual frames is not specified.
        """
        raise NotImplementedError()

    def suffix(self):
        """
        Return the file suffix that will be used. If more than one suffix will
        be used, the value returned is the one considered to be "primary".
        """
        raise NotImplementedError()
class PILOutput(Output, ClsMod):
    """
    Output module producing one still image per frame, encoded through
    `scipy.misc.toimage` (PIL) as JPEG, PNG or TIFF.

    `codec` is the PIL format name, `quality` is passed to the image
    writer, and `alpha` (JPEG only) emits the alpha channel as a second
    greyscale file.
    """
    lib = pixfmtlib

    def __init__(self, codec='jpeg', quality=100, alpha=False):
        super(PILOutput, self).__init__()
        self.type, self.quality, self.alpha = codec, quality, alpha

    def convert(self, fb, gnm, dim, stream=None):
        """Convert the float32 framebuffer to 8-bit RGBA on the device."""
        launchC('f32_to_rgba_u8', self.mod, stream, dim, fb,
                fb.d_rb, fb.d_seeds)

    def copy(self, fb, dim, pool, stream=None):
        """Schedule an async copy of the 8-bit frame into a pooled buffer."""
        h_out = pool.allocate((dim.h, dim.w, 4), 'u1')
        cuda.memcpy_dtoh_async(h_out, fb.d_back, stream)
        return h_out

    def _convert_buf(self, buf):
        """Encode `buf` with the configured codec into an in-memory file."""
        out = StringIO()
        img = scipy.misc.toimage(buf, cmin=0, cmax=1)
        img.save(out, self.type, quality=self.quality)
        # Rewind so consumers calling .read() get the encoded bytes instead
        # of an empty string from the end-of-stream position.
        out.seek(0)
        return out

    def encode(self, buf):
        """
        Encode one host frame; see `Output.encode`.

        Returns `(media, logs)`; `logs` is always an empty list, since this
        encoder produces no log output. A `buf` of None (flush) yields no
        media, as every frame is emitted immediately.
        """
        if buf is None:
            return {}, []
        if self.type == 'jpeg':
            # JPEG has no alpha channel: write colour from the first three
            # channels and, if requested, alpha as a separate image.
            out = self._convert_buf(buf[:,:,:3])
            if self.alpha:
                alpha = self._convert_buf(buf[:,:,3])
                return {'_color.jpg': out, '_alpha.jpg': alpha}, []
            return {'.jpg': out}, []
        return {'.'+self.type: self._convert_buf(buf)}, []

    def suffix(self):
        """Return the primary file suffix produced by `encode`."""
        if self.type == 'jpeg':
            if self.alpha:
                return '_color.jpg'
            return '.jpg'
        return '.'+self.type
class X264Output(Output, ClsMod):
    """
    Output module which pipes 16-bit frames into `x264` subprocesses and
    returns the raw H.264 stream(s) when flushed.

    Colour is encoded by one x264 process fed RGB; when `alpha` is set, a
    second process encodes the alpha channel as the luma plane of a yv12
    stream. Encoders are spawned lazily on the first frame, and restarted
    if the frame size changes.
    """
    lib = pixfmtlib

    profiles = (
        { 'normal': '--profile high444 --level 4.2'
        , '': ''
        })
    base = ('x264 --no-progress --input-depth 16 --sync-lookahead 0 '
            '--rc-lookahead 5 --muxer raw -o - - --log-level debug ')

    def __init__(self, profile='normal', csp='i444', crf=15,
                 x264opts='', alpha=False):
        super(X264Output, self).__init__()
        self.args = ' '.join([self.base, self.profiles[profile],
                              '--crf', str(crf), x264opts]).split()
        self.alpha = alpha
        self.csp = csp
        self.framesize = None
        self.zeros = None
        self.subp = None
        self.outf = None
        self.asubp = None
        self.aoutf = None

    def convert(self, fb, gnm, dim, stream=None):
        """Convert the float32 framebuffer to 16-bit RGBA on the device."""
        launchC('f32_to_rgba_u16', self.mod, stream, dim, fb,
                fb.d_rb, fb.d_seeds)

    def copy(self, fb, dim, pool, stream=None):
        """Schedule an async copy of the 16-bit frame into a pooled buffer."""
        h_out = pool.allocate((dim.h, dim.w, 4), 'u2')
        cuda.memcpy_dtoh_async(h_out, fb.d_back, stream)
        return h_out

    def _spawn_sub(self, framesize, alpha):
        """
        Start one x264 process for `framesize` (rows, cols), returning
        `(outf, subp)` where `outf` is the temporary file receiving the
        encoder's stdout.
        """
        res = '%dx%d' % (framesize[1], framesize[0])
        csp = 'yv12' if alpha else 'rgb'
        extras = ['--input-csp', csp, '--demuxer', 'raw', '--input-res', res]
        outf = tempfile.TemporaryFile(bufsize=0)
        if alpha:
            extras += ['--output-csp', 'i420', '--chroma-qp-offset', '24']
        else:
            extras += ['--output-csp', self.csp]
        # The command line says '-o -', so the encoded stream arrives on
        # stdout; capture it in the temporary file.
        subp = Popen(self.args + extras, stdin=PIPE, stderr=PIPE,
                     stdout=outf)
        return outf, subp

    def _spawn(self, framesize):
        """Start the encoder process(es) for frames of `framesize`."""
        self.framesize = framesize
        self.outf, self.subp = self._spawn_sub(framesize, False)
        if self.alpha:
            self.aoutf, self.asubp = self._spawn_sub(framesize, True)
            # Filler for the two quarter-size chroma planes that follow the
            # alpha (luma) plane of each yv12 frame. It must be initialised:
            # np.empty would feed arbitrary memory to the encoder.
            bufsz = framesize[0] * framesize[1] // 2
            self.zeros = np.zeros(bufsz, dtype='u2')

    def _flush_sub(self, subp):
        """Close `subp`'s input, wait for it to exit, and return its log."""
        (stdout, stderr) = subp.communicate()
        if subp.returncode:
            raise IOError("x264 exited with an error")
        return stderr

    def _flush(self):
        """
        Finish the running encoder(s), if any, and return `(media, logs)`
        as described in `Output.encode`.
        """
        if self.subp is None:
            return {}, []
        log = self._flush_sub(self.subp)
        self.subp = None
        # The child wrote through the shared descriptor, leaving the file
        # offset at the end; rewind so consumers can read the stream.
        self.outf.seek(0)
        if self.alpha:
            alog = self._flush_sub(self.asubp)
            self.asubp = None
            self.aoutf.seek(0)
            return ({'_color.h264': self.outf, '_alpha.h264': self.aoutf},
                    [('x264_color', log), ('x264_alpha', alog)])
        return {'.h264': self.outf}, [('x264_color', log)]

    def _write(self, buf, subp):
        """Write `buf` to `subp`'s stdin, dumping the encoder log on error."""
        try:
            subp.stdin.write(buf)
        except IOError:
            print('Exception while writing. Log:')
            print(subp.stderr.read())
            raise

    def encode(self, buf):
        """
        Feed one host frame to the encoder(s); see `Output.encode`.

        Media is returned only when the pipeline is flushed, which happens
        when `buf` is None or when the frame size differs from that of the
        running encoder.
        """
        out = ({}, [])
        if buf is None or self.framesize != buf.shape[:2]:
            out = self._flush()
        if buf is None:
            return out
        if self.subp is None:
            self._spawn(buf.shape[:2])
        # Colour encoder takes packed RGB: drop the alpha channel.
        self._write(np.delete(buf, 3, axis=2), self.subp)
        if self.alpha:
            self._write(buf[:,:,3].tostring(), self.asubp)
            self._write(buffer(self.zeros), self.asubp)
        return out

    def suffix(self):
        """Return the primary file suffix produced on flush."""
        if self.alpha:
            return '_color.h264'
        return '.h264'
def get_output_for_profile(gprof):
    """
    Build the output module selected by `gprof.output`.

    The 'type' key picks the module ('jpeg', 'png', 'tiff' or 'x264',
    defaulting to 'jpeg'); the short aliases 'jpg' and 'tif' are accepted
    too, since the command line offers 'jpg' as a codec name. All remaining
    keys are passed to the module's constructor as keyword arguments.

    Raises ValueError for an unrecognized type.
    """
    opts = dict(gprof.output._val)
    handler = opts.pop('type', 'jpeg')
    handler = {'jpg': 'jpeg', 'tif': 'tiff'}.get(handler, handler)
    if handler in ('jpeg', 'png', 'tiff'):
        return PILOutput(codec=handler, **opts)
    if handler == 'x264':
        return X264Output(**opts)
    raise ValueError('Invalid output type "%s".' % handler)
@ -37,6 +37,13 @@ def add_args(parser=None):
help="Last frame to render (1-indexed, exclusive, negative from end)")
tmp.add_argument('--skip', dest='skip', metavar='N', type=int,
help="Skip N frames between each rendered frame")
# TODO: eliminate the 'silently overwritten' bit.
tmp.add_argument('--shard', dest='shard', metavar='SECS', type=float,
help="Write SECS of output into each file, instead of one frame per "
"file. If set, causes 'start', 'end', and 'skip' to be ignored. "
"If output codecs don't support multi-file writing, files will "
"be silently overwritten.")
tmp.add_argument('--still', action='store_true',
help='Override start, end, and temporal frame width to render one '
'frame without motion blur.')
@ -48,7 +55,7 @@ def add_args(parser=None):
spa.add_argument('--height', type=int, metavar='PX')
out = parser.add_argument_group('Output options')
out.add_argument('--codec', choices=['jpg', 'png', 'tiff'])
out.add_argument('--codec', choices=['jpg', 'png', 'tiff', 'x264'])
return parser
def get_from_args(args):
@ -64,9 +71,11 @@ def get_from_args(args):
if args.still:
base.update(frame_width=0, start=1, end=2)
for arg in 'duration fps start end skip spp width height'.split():
for arg in 'duration fps start end skip shard spp width height'.split():
if getattr(args, arg, None) is not None:
base[arg] = getattr(args, arg)
if args.codec is not None:
base.setdefault('output', {})['type'] = args.codec
return name, base
@ -82,13 +91,20 @@ def wrap(prof, gnm):
def enumerate_times(gprof):
Given a profile, return a list of `(frame_no, center_time)` pairs. Note
Given a profile, return a list of `(frame_no, center_times)` pairs. Note
that the enumeration is applied before `start`, `end`, and `skip`, and so
`frame_no` may be non-contiguous.
nframes = round(gprof.fps * gprof.duration)
times = np.linspace(0, 1, nframes + 1)
times = list(enumerate(times[:-1] + 0.5 * (times[1] - times[0]), 1))
times = times[:-1] + 0.5 * (times[1] - times[0])
if gprof.shard:
s = max(1, int(round(gprof.fps * gprof.shard)))
return [(i, times[t:t+s])
for i, t in enumerate(range(0, len(times), s), 1)]
times = [[t] for t in times]
times = list(enumerate(times, 1))
if gprof.end is not None:
times = times[:gprof.end]
if gprof.start is not None:
@ -235,7 +235,7 @@ class Renderer(object):
self.packer, self.lib, self.cubin = self.compile(gnm)
self.mod = self.load(self.cubin)
self.filts = filters.create(gprof)
self.out = output.PILOutput()
self.out = output.get_output_for_profile(gprof)
class RenderManager(ClsMod):
lib = devlib(deps=[interp.palintlib, filldptrlib, iter.flushatomlib])
@ -395,26 +395,3 @@ class RenderManager(ClsMod):
self.info_a, self.info_b = self.info_b, self.info_a
self.stream_a, self.stream_b = self.stream_b, self.stream_a
return self.copy_evt, h_out
def render(self, gnm, gprof, times):
A port of the old rendering function, retained for backwards
compatibility. Some of this will be pulled into as-yet-undecided
methods for more DRY.
rdr = Renderer(gnm, gprof)
last_evt = cuda.Event().record(self.stream_a)
last_idx = None
def wait(): # Times like these where you wish for a macro
while not last_evt.query():
gpu_time = last_evt.time_since(two_evts_ago)
return RenderedImage(last_buf, last_idx, gpu_time)
for idx, tc in times:
evt, h_buf = self.queue_frame(rdr, gnm, gprof, tc, last_idx is None)
if last_idx:
yield wait()
two_evts_ago, last_evt = last_evt, evt
last_buf, last_idx = h_buf, idx
if last_idx:
yield wait()
@ -10,11 +10,15 @@ from gevent import spawn, queue, coros
import zmq.green as zmq
import _importhack
from cuburn import profile
from cuburn import profile, output
from cuburn.genome import db, util
from messages import *
# TODO: remove this dependency (loading the output module to get the suffix
# requires a compiler / default instance)
import pycuda.autoinit
class RenderClient(object):
def __init__(self, task_addr, rsp_addr, ctx=None, start=True):
ctx = zmq.Context() if ctx is None else ctx
@ -56,12 +60,12 @@ class RenderClient(object):
def _deal_rsps(self):
    """
    Receive responses forever, handing each to the queue registered in
    `self.taskmap` for its task.

    Each queued item is a `(log, names, bufs)` tuple: the log frame's
    bytes, the list of names from splitting the third frame on NUL, and
    the remaining frames (one per name, presumably -- confirm against
    the sender).
    """
    while True:
        rsp = self.rsock.recv_multipart(copy=False)
        # Frames 0..2 (task key, log, names) are indexed below, so a
        # message needs at least three frames; data frames follow.
        assert len(rsp) >= 3
        rq = self.taskmap.get(rsp[0].bytes, None)
        if rq:
            rq.put((rsp[1].bytes, rsp[2].bytes.split('\0'), rsp[3:]))
# Time (in seconds) before a job times out
# TODO: replace timeout mechanism with polling?
# Max. queue length before request considered lost, as a multiple of the
# number of in-flight requests
@ -92,21 +96,27 @@ def iter_genomes(prof, outpath, gpaths):
with open(os.path.join(odir, 'NFRAMES'), 'w') as fp:
outmod = output.get_output_for_profile(gprof)
for i, t in times:
opath = os.path.join(odir, '%05d.%s' % (i, gprof.output_format))
if not os.path.isfile(opath):
opath = os.path.join(odir, '%05d' % i)
if not os.path.isfile(opath + outmod.suffix):
yield Task(opath, ghash, prof, gnm, t)
def get_result(cli, task, rq):
rsp = rq.get(timeout=TIMEOUT)
log, names, bufs = rq.get(timeout=TIMEOUT)
except queue.Empty:
cli.put(task, rq)
print '>>', task.id
rsp = rq.get()
log, names, bufs = rq.get()
with open(task.id, 'wb') as fp:
with open(task.id + '.log', 'wb') as fp:
for name in reversed(names):
buf = bufs.pop()
with open(task.id + name, 'wb') as fp:
print '< ', task.id
def main(addrs):
@ -128,6 +138,8 @@ def main(addrs):
while cli.taskmap:
print 'Still waiting on %d tasks...' % len(cli.taskmap)
for i in cli.taskmap.items():
print i
if __name__ == "__main__":
@ -1,5 +1,5 @@
from collections import namedtuple
Task = namedtuple('Task', 'id hash profile anim time')
Task = namedtuple('Task', 'id hash profile anim times')
AddressedTask = namedtuple('AddressedTask', 'addr task')
FullTask = namedtuple('FullTask', 'addr task cubin packer')
@ -1,4 +1,5 @@
#!/usr/bin/env python2
from itertools import takewhile
import gevent
from gevent import spawn, queue, event
@ -29,7 +30,6 @@ def setup_task_listeners(addrs, tq, rq):
# losock to be added to the queue.
task = hisock.recv_pyobj()
print 'OOOOOH! Got a hiprio evt'
loevt.clear() # Got message; pause listen_lo().
@ -77,7 +77,7 @@ def setup_worker_listener(addrs, tq, rq):
while True:
rsp = wsock.recv_multipart(copy=False)
if rsp[2].bytes != '':
print '< ', ' '.join([r.bytes for r in rsp[2:-1]])
print '< ', rsp[2].bytes, rsp[3].bytes
@ -1,5 +1,6 @@
#!/usr/bin/env python2
import sys
import socket
from cStringIO import StringIO
import gevent
@ -37,20 +38,29 @@ def main(worker_addr):
hash = None
while True:
log = [('worker', socket.gethostname() + ':' +
addr, task, cubin, packer = sock.recv_pyobj()
gprof = profile.wrap(task.profile, task.anim)
if hash != task.hash:
rdr = PrecompiledRenderer(task.anim, gprof, packer, cubin)
evt, buf = rmgr.queue_frame(rdr, task.anim, gprof, task.time)
while not evt.query():
ofile = StringIO()
output.PILOutput.save(buf, ofile, task.id[-3:])
sock.send_multipart(addr + [ofile.read()])
hash = task.hash
for t in task.times:
evt, buf = rmgr.queue_frame(rdr, task.anim, gprof, t)
while not evt.query():
out, frame_log = rdr.out.encode(buf)
log += frame_log
print 'Rendered', task.id, 'in', int(evt.time()), 'ms'
final_out, final_log = rdr.out.encode(None)
assert not (out and final_out), 'Got output from two sources!'
out = out or final_out
log += final_log
log = '\0'.join([k + ' ' + v for k, v in log])
print 'Rendered', task.id, 'in', int(evt.time()), 'ms'
suffixes, files = zip(*[(k, v.read())
for k, v in sorted(out.items())])
# TODO: reduce copies, generally spruce up the memory usage here
sock.send_multipart(addr + [log, '\0'.join(suffixes)] + list(files))
# Spawn two request loops to take advantage of CUDA pipelining.
@ -26,44 +26,16 @@ sys.path.insert(0, os.path.dirname(__file__))
from cuburn import render, filters, output, profile
from cuburn.genome import convert, use, db
def save(out):
# Temporary! TODO: fix this
output.PILOutput.save(out.buf, out.idx)
print out.idx, out.gpu_time
def main(args, prof):
gdb = db.connect(args.genomedb)
gnm, basename = gdb.get_anim(args.flame, args.half)
if getattr(args, 'print'):
print convert.to_json(gnm)
gprof = profile.wrap(prof, gnm)
if args.name is not None:
basename = args.name
prefix = os.path.join(args.dir, basename)
if args.subdir:
if not os.path.isdir(prefix):
prefix += '/'
prefix += '_'
frames = [('%s%05d%s.jpg' % (prefix, (i+1), args.suffix), t)
for i, t in profile.enumerate_times(gprof)]
if args.resume:
m = os.path.getmtime(args.flame)
frames = (f for f in frames
if not os.path.isfile(f[0]) or m > os.path.getmtime(f[0]))
import pycuda.autoinit
rmgr = render.RenderManager()
gen = rmgr.render(gnm, gprof, frames)
if not args.gfx:
for out in gen:
def save(output_module, name, rendered_frame):
    """
    Encode `rendered_frame` with `output_module` and write the results.

    Every media segment returned by the encoder is written to
    `name + suffix`, and every log entry is printed under a header. Pass
    None as `rendered_frame` to flush the encoder.
    """
    out, log = output_module.encode(rendered_frame)
    for suffix, file_like in out.items():
        # Encoded media is binary data.
        with open(name + suffix, 'wb') as fp:
            fp.write(file_like.read())
    for key, val in log:
        print('\n=== %s ===' % key)
        print(val)
def pyglet_preview(args, gprof, itr):
import pyglet
import pyglet.gl as gl
w, h = gprof.width, gprof.height
@ -92,39 +64,89 @@ def main(args, prof):
last_time = [time.time()]
def poll(dt):
out = next(gen, False)
out = next(itr, False)
if out is False:
if args.pause:
label.text = "Done. ('q' to quit)"
elif out is not None:
name, buf = out
real_dt = time.time() - last_time[0]
last_time[0] = time.time()
if out.buf.dtype == np.uint8:
if buf.dtype == np.uint8:
elif out.buf.dtype == np.uint16:
elif buf.dtype == np.uint16:
label.text = 'Unsupported format: ' + out.buf.dtype
label.text = 'Unsupported format: ' + buf.dtype
h, w, ch = out.buf.shape
h, w, ch = buf.shape
gl.glBindTexture(tex.target, tex.id)
gl.glTexImage2D(tex.target, 0, gl.GL_RGB8, w, h, 0, gl.GL_RGBA,
fmt, out.buf.tostring())
fmt, buf.tostring())
label.text = '%s (%g fps)' % (out.idx, 1./real_dt)
label.text = '%s (%g fps)' % (name, 1./real_dt)
label.text += '.'
pyglet.clock.schedule_interval(poll, 1/30.)
pyglet.clock.schedule_interval(poll, 1/20.)
def main(args, prof):
    """
    Render the animation selected by `args` using profile `prof`.

    Output files are named `<prefix><frame number><args.suffix>` plus the
    output module's own suffix. With `args.resume`, outputs newer than the
    flame file are skipped. With `args.gfx`, frames are shown in a pyglet
    preview as they are rendered.
    """
    gdb = db.connect(args.genomedb)
    gnm, basename = gdb.get_anim(args.flame, args.half)
    if getattr(args, 'print'):
        print(convert.to_json(gnm))
        # NOTE(review): returning here keeps --print from initializing
        # CUDA, per the comment below -- confirm this matches intent.
        return
    gprof = profile.wrap(prof, gnm)

    if args.name is not None:
        basename = args.name
    prefix = os.path.join(args.dir, basename)
    if args.subdir:
        if not os.path.isdir(prefix):
            os.mkdir(prefix)
        prefix_plus = prefix + '/'
    else:
        prefix_plus = prefix + '_'

    frames = [('%s%05d%s' % (prefix_plus, i, args.suffix), t)
              for i, t in profile.enumerate_times(gprof)]

    # We don't initialize a CUDA context until here. This keeps other
    # functions like --help and --print snappy.
    import pycuda.autoinit
    rmgr = render.RenderManager()
    rdr = render.Renderer(gnm, gprof)

    def render_iter():
        # Yields None while waiting on the GPU (so a preview loop can keep
        # polling) and `(name, buf)` for each completed frame.
        m = os.path.getmtime(args.flame)
        first = True
        for name, times in frames:
            if args.resume:
                # suffix() is a method; the primary output file is what
                # marks this entry as already rendered.
                fp = name + rdr.out.suffix()
                if os.path.isfile(fp) and m < os.path.getmtime(fp):
                    continue

            for t in times:
                evt, buf = rmgr.queue_frame(rdr, gnm, gprof, t, first)
                first = False
                while not evt.query():
                    yield None
                save(rdr.out, name, buf)
                print('%s %s' % (name, evt.time()))
                yield name, buf
            # Flush the encoder so temporal codecs emit this file's stream.
            save(rdr.out, name, None)

    if args.gfx:
        pyglet_preview(args, gprof, render_iter())
    else:
        for i in render_iter():
            pass
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Render fractal flames.')
Reference in New Issue
Block a user