Support x264 10-bit output format.

This commit is contained in:
Steven Robertson
2012-07-22 15:53:38 -07:00
parent 21f783730a
commit 3294ba10d6
10 changed files with 363 additions and 116 deletions

View File

@ -1,15 +1,16 @@
from util import devlib, ringbuflib
from mwc import mwclib
f32tou8lib = devlib(deps=[ringbuflib, mwclib], defs=r'''
rgba8lib = devlib(deps=[ringbuflib, mwclib], defs=r'''
// Perform a conversion from float32 values to uint8 ones, applying
// pixel- and channel-independent dithering to reduce suprathreshold banding
// artifacts. Clamps values larger than 1.0f.
// TODO: move to a separate module?
// TODO: less inefficient mwc_st handling?
__global__ void f32_to_u8(
ringbuf *rb, mwc_st *rctxs, uchar4 *dst, const float4 *src,
int gutter, int dstride, int sstride, int height)
__global__ void f32_to_rgba_u8(
uchar4 *dst, const float4 *src,
int gutter, int dstride, int sstride, int height,
ringbuf *rb, mwc_st *rctxs)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
@ -32,3 +33,34 @@ __global__ void f32_to_u8(
rctxs[rb_incr(rb->tail, tid)] = rctx;
}
''')
rgba16lib = devlib(deps=[ringbuflib, mwclib], defs=r'''
// Perform a conversion from float32 values to uint16 ones, as above.
//
// `dst` is indexed as `dstride * y + x`; `src` is indexed with `sstride`
// after skipping `gutter` pixels on each axis. Each thread loads its RNG
// state from the ring buffer at `rb->head` and stores the advanced state
// back at `rb->tail`.
__global__ void f32_to_rgba_u16(
        ushort4 *dst, const float4 *src,
        int gutter, int dstride, int sstride, int height,
        ringbuf *rb, mwc_st *rctxs)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    // Valid pixels are x in [0, dstride) and y in [0, height). The guard
    // must be inclusive: a thread at x == dstride would otherwise write the
    // first pixel of the next row, and y == height would write past the end
    // of `dst`.
    if (x >= dstride || y >= height) return;
    int isrc = sstride * (y + gutter) + x + gutter;

    int tid = blockDim.x * threadIdx.y + threadIdx.x;
    mwc_st rctx = rctxs[rb_incr(rb->head, tid)];

    float4 in = src[isrc];
    // Clamp each channel to at most 1.0, scale to the 16-bit range, and add
    // an independent dither term per channel.
    ushort4 out = make_ushort4(
        fminf(1.0f, in.x) * 65535.0f + 0.49f * mwc_next_11(rctx),
        fminf(1.0f, in.y) * 65535.0f + 0.49f * mwc_next_11(rctx),
        fminf(1.0f, in.z) * 65535.0f + 0.49f * mwc_next_11(rctx),
        fminf(1.0f, in.w) * 65535.0f + 0.49f * mwc_next_11(rctx)
    );

    int idst = dstride * y + x;
    dst[idst] = out;
    rctxs[rb_incr(rb->tail, tid)] = rctx;
}
''')

# Aggregate library pulling in both the 8-bit and 16-bit conversion kernels.
pixfmtlib = devlib(deps=[rgba8lib, rgba16lib])

View File

@ -115,6 +115,8 @@ profile = (
, 'end': Scalar(None, 'Last frame to render (1-indexed, exclusive; '
'negative indexes from the end)')
, 'skip': Scalar(0, 'Skip this many frames between each rendered frame')
, 'shard': Scalar(0, 'Pack this many frames in each output file '
'(causing start, end, and skip to be ignored)')
, 'height': Scalar(1920, 'Output height in pixels')
, 'width': Scalar(1080, 'Output width in pixels')
@ -123,7 +125,9 @@ profile = (
, 'filter_order': list_(enum(filters.keys()), default_filters)
, 'filters': prof_filters
, 'output_format': enum('jpg png tif', 'jpg')
# The other keys in the 'output' dictionary are format-specific and not
# documented here.
, 'output': {'type': enum('jpeg png tiff x264', 'jpeg')}
})
# Types recognized as independent units with a 'type' key

View File

@ -1,10 +1,14 @@
import os
import tempfile
from cStringIO import StringIO
from subprocess import Popen, PIPE
import numpy as np
from numpy import float32 as f32, int32 as i32
import pycuda.driver as cuda
from code.util import ClsMod, launch
from code.output import f32tou8lib
from code.output import pixfmtlib
import scipy.misc
@ -12,39 +16,209 @@ if not hasattr(scipy.misc, 'toimage'):
raise ImportError("Could not find scipy.misc.toimage. "
"Are scipy and PIL installed?")
def launchC(name, mod, stream, dim, fb, *args):
    """
    Launch the pixel-format conversion kernel `name` from `mod` on `stream`.

    The kernel runs with 32x8 thread blocks on a grid large enough to cover
    `dim.w` by `dim.h` pixels. It receives `fb.d_back` and `fb.d_front`
    first, then the gutter, width, `dim.astride` and height as int32 values,
    and finally any extra `args` unchanged.
    """
    block = (32, 8, 1)
    # Round up so partially-filled blocks still cover the image edges.
    grid = (int(np.ceil(dim.w / 32.)), int(np.ceil(dim.h / 8.)))
    geometry = (i32(fb.gutter), i32(dim.w), i32(dim.astride), i32(dim.h))
    launch(name, mod, stream, block, grid,
           fb.d_back, fb.d_front,
           *(geometry + args))
class Output(object):
    """
    Abstract interface for output handlers.

    Every method here raises `NotImplementedError`; subclasses provide the
    device-side conversion, the device-to-host copy, and the encoding step.
    """

    def convert(self, fb, gnm, dim, stream=None):
        """
        Convert a filtered buffer to whatever output format is needed by the
        writer.

        This function is intended for use by the Renderer, and should not be
        called by clients. It does not modify its instance.
        """
        raise NotImplementedError()

    def copy(self, fb, dim, pool, stream=None):
        """
        Schedule a copy from the device buffer to host memory, returning the
        target buffer(s).

        This function is intended for use by the Renderer, and should not be
        called by clients. It does not modify its instance.
        """
        raise NotImplementedError()

    def encode(self, host_frame):
        """
        Push `host_frame` (as returned from `Output.copy`) into the encoding
        pipeline, and return any completed media segments. If `host_frame` is
        None, flush the encoding pipeline.

        The return value is a 2-tuple `(media, logs)`. `media` is a dictionary
        mapping channel names (appropriate for use as file suffixes) to
        file-like objects containing the encoded media segments. `logs` is a
        dictionary containing log entries. Either or both entries can be empty
        at any time (and will typically be either populated on each frame
        except the flush, for non-temporal codecs, or will be empty on all
        frames except the flush, for temporal codecs.)

        NOTE(review): `logs` is described as a dictionary above, but
        `X264Output` returns a list of `(name, text)` pairs; confirm which
        type is intended.

        Media segments are discretely decodeable chunks of content. The
        mapping of media segments to individual frames is not specified.
        """
        raise NotImplementedError()

    @property
    def suffix(self):
        """
        Return the file suffix that will be used. If more than one suffix will
        be used, the value returned is the one considered to be "primary".
        """
        raise NotImplementedError()
class PILOutput(Output, ClsMod):
    """
    Output handler which encodes single frames as 8-bit images through
    `scipy.misc.toimage` (PIL).

    `codec` is the image format name handed to the image's `save` call (and,
    for anything other than 'jpeg', also the file suffix). `quality` is
    forwarded to `save`. When `alpha` is true and the codec is 'jpeg', the
    alpha channel is written as a second, separate image.
    """
    lib = pixfmtlib

    def __init__(self, codec='jpeg', quality=100, alpha=False):
        super(PILOutput, self).__init__()
        self.type, self.quality, self.alpha = codec, quality, alpha

    def convert(self, fb, gnm, dim, stream=None):
        """Run the float32 -> RGBA uint8 conversion kernel on `fb`."""
        launchC('f32_to_rgba_u8', self.mod, stream, dim, fb,
                fb.d_rb, fb.d_seeds)

    def copy(self, fb, dim, pool, stream=None):
        """
        Schedule an asynchronous device-to-host copy of `fb.d_back` into a
        `(dim.h, dim.w, 4)` uint8 buffer taken from `pool`, and return that
        buffer.
        """
        h_out = pool.allocate((dim.h, dim.w, 4), 'u1')
        cuda.memcpy_dtoh_async(h_out, fb.d_back, stream)
        return h_out

    def _convert_buf(self, buf):
        """Encode `buf` as an image; return a StringIO rewound to its start."""
        out = StringIO()
        img = scipy.misc.toimage(buf, cmin=0, cmax=1)
        img.save(out, self.type, quality=self.quality)
        out.seek(0)
        return out

    def encode(self, buf):
        """
        Encode one host frame and return `(media, logs)`.

        `media` maps file suffixes to file-like objects holding the encoded
        image(s). `logs` is always an empty list; every path now returns the
        same type for it. A `buf` of None (the flush request) yields no media.
        """
        if buf is None:
            return {}, []
        if self.type == 'jpeg':
            # JPEG gets only the three colour channels; alpha, when
            # requested, goes into its own image.
            out = self._convert_buf(buf[:,:,:3])
            if self.alpha:
                alpha = self._convert_buf(buf[:,:,3])
                return {'_color.jpg': out, '_alpha.jpg': alpha}, []
            return {'.jpg': out}, []
        return {'.'+self.type: self._convert_buf(buf)}, []

    @property
    def suffix(self):
        """Return the primary file suffix produced by `encode`."""
        if self.type == 'jpeg':
            if self.alpha:
                return '_color.jpg'
            return '.jpg'
        return '.'+self.type
class X264Output(Output, ClsMod):
    """
    Output handler which streams raw 16-bit frames into external `x264`
    processes and collects the encoded bitstream in temporary files.

    The colour channels go to one x264 process. When `alpha` is true, a
    second process receives the alpha channel as the first plane of a
    'yv12' input stream. Encoders are spawned lazily on the first frame and
    torn down by `encode(None)` or when the frame size changes.
    """
    lib = pixfmtlib

    # Named groups of x264 options, selected by the `profile` argument.
    profiles = (
        { 'normal': '--profile high444 --level 4.2'
        , '': ''
        })

    # Options shared by every x264 invocation: 16-bit raw input on stdin,
    # raw output on stdout.
    base = ('x264 --no-progress --input-depth 16 --sync-lookahead 0 '
            '--rc-lookahead 5 --muxer raw -o - - --log-level debug ')

    def __init__(self, profile='normal', csp='i444', crf=15,
                 x264opts='', alpha=False):
        """
        Build the x264 command line.

        `profile` must be a key of `profiles` (KeyError otherwise). `csp` is
        the output colourspace of the colour stream, `crf` the constant rate
        factor, and `x264opts` a string of extra options appended verbatim.
        """
        super(X264Output, self).__init__()
        self.args = ' '.join([self.base, self.profiles[profile],
                              '--crf', str(crf), x264opts]).split()
        self.alpha = alpha
        self.csp = csp

        # Encoder state, populated by `_spawn` and cleared by `_flush`.
        self.framesize = None
        self.zeros = None

        self.subp = None
        self.outf = None
        self.asubp = None
        self.aoutf = None

    def convert(self, fb, gnm, dim, stream=None):
        """Run the float32 -> RGBA uint16 conversion kernel on `fb`."""
        launchC('f32_to_rgba_u16', self.mod, stream, dim, fb,
                fb.d_rb, fb.d_seeds)

    def copy(self, fb, dim, pool, stream=None):
        """
        Schedule an asynchronous device-to-host copy of `fb.d_back` into a
        `(dim.h, dim.w, 4)` uint16 buffer taken from `pool`, and return that
        buffer.
        """
        h_out = pool.allocate((dim.h, dim.w, 4), 'u2')
        cuda.memcpy_dtoh_async(h_out, fb.d_back, stream)
        return h_out

    def _spawn_sub(self, framesize, alpha):
        """
        Start one x264 process for `framesize` (a `(height, width)` pair).

        Returns `(outf, subp)`: the temporary file receiving x264's stdout
        and the `Popen` handle.
        """
        res = '%dx%d' % (framesize[1], framesize[0])
        csp = 'yv12' if alpha else 'rgb'
        extras = ['--input-csp', csp, '--demuxer', 'raw', '--input-res', res]
        if alpha:
            extras += ['--output-csp', 'i420', '--chroma-qp-offset', '24']
        else:
            extras += ['--output-csp', self.csp]

        outf = tempfile.TemporaryFile(bufsize=0)
        # Popen does not close a caller-supplied integer descriptor, so the
        # duplicate must be closed here once the child has inherited it;
        # otherwise one descriptor leaks per spawned encoder.
        fd = os.dup(outf.fileno())
        try:
            # NOTE(review): stderr is a pipe that is only drained at flush
            # time; with `--log-level debug` a long encode could fill it and
            # stall x264. Confirm whether this needs a reader thread.
            subp = Popen(self.args + extras, stdin=PIPE, stderr=PIPE,
                         stdout=fd)
        finally:
            os.close(fd)
        return outf, subp

    def _spawn(self, framesize):
        """Start the encoder process(es) for frames of shape `framesize`."""
        self.framesize = framesize
        self.outf, self.subp = self._spawn_sub(framesize, False)
        if self.alpha:
            self.aoutf, self.asubp = self._spawn_sub(framesize, True)
            # Constant filler written after each alpha plane; height*width/2
            # samples (presumably the two quarter-size chroma planes of the
            # yv12 input, at a mid-range value; TODO confirm).
            bufsz = framesize[0] * framesize[1] // 2
            self.zeros = np.empty(bufsz, dtype='u2')
            self.zeros.fill(32767)

    def _flush_sub(self, subp):
        """
        Close `subp`'s input, wait for it to exit, and return its stderr.

        Raises IOError if x264 exits with a non-zero status.
        """
        _, stderr = subp.communicate()
        if subp.returncode:
            raise IOError("x264 exited with an error")
        return stderr

    def _flush(self):
        """
        Finish any running encoders and return `(media, logs)`.

        Returns empty containers when no encoder is running. The returned
        files are rewound to the start.
        """
        if self.subp is None:
            return {}, []
        log = self._flush_sub(self.subp)
        self.outf.seek(0)
        self.subp = None
        if self.alpha:
            alog = self._flush_sub(self.asubp)
            self.aoutf.seek(0)
            self.asubp = None
            return ({'_color.h264': self.outf, '_alpha.h264': self.aoutf},
                    [('x264_color', log), ('x264_alpha', alog)])
        # Was `stderr`, which is not defined in this scope (NameError).
        return {'.h264': self.outf}, [('x264_color', log)]

    def _write(self, buf, subp):
        """Write `buf` to `subp`'s stdin, dumping x264's log on failure."""
        try:
            subp.stdin.write(buffer(buf))
        except IOError:
            print('Exception while writing. Log:')
            print(subp.stderr.read())
            # Bare raise keeps the original traceback.
            raise

    def encode(self, buf):
        """
        Feed one host frame to the encoder(s) and return `(media, logs)`.

        `buf` is a `(height, width, 4)` array as returned by `copy`, or None
        to flush. Media and logs are only non-empty when an encoder was
        flushed, i.e. on `buf is None` or when the frame size differs from
        the running encoder's.
        """
        out = ({}, [])
        if buf is None or self.framesize != buf.shape[:2]:
            out = self._flush()
        if buf is None:
            return out
        if self.subp is None:
            self._spawn(buf.shape[:2])
        # np.delete drops the alpha channel and yields a fresh contiguous
        # array, which `buffer()` in `_write` requires.
        self._write(np.delete(buf, 3, axis=2), self.subp)
        if self.alpha:
            self._write(buf[:,:,3].tostring(), self.asubp)
            self._write(buffer(self.zeros), self.asubp)
        return out

    @property
    def suffix(self):
        """Return the primary file suffix produced by `encode`."""
        if self.alpha:
            return '_color.h264'
        return '.h264'
def get_output_for_profile(gprof):
    """
    Build the `Output` handler described by `gprof.output`.

    The 'type' key (default 'jpeg') selects the handler; every other key is
    passed to the handler's constructor as a keyword argument. The profile's
    own mapping is copied first and so is left untouched.

    Raises ValueError for an unrecognised type.
    """
    options = dict(gprof.output._val)
    kind = options.pop('type', 'jpeg')
    if kind == 'x264':
        return X264Output(**options)
    if kind in ('jpeg', 'png', 'tiff'):
        return PILOutput(codec=kind, **options)
    raise ValueError('Invalid output type "%s".' % kind)

View File

@ -37,6 +37,13 @@ def add_args(parser=None):
help="Last frame to render (1-indexed, exclusive, negative from end)")
tmp.add_argument('--skip', dest='skip', metavar='N', type=int,
help="Skip N frames between each rendered frame")
# TODO: eliminate the 'silently overwritten' bit.
tmp.add_argument('--shard', dest='shard', metavar='SECS', type=float,
help="Write SECS of output into each file, instead of one frame per "
"file. If set, causes 'start', 'end', and 'skip' to be ignored. "
"If output codecs don't support multi-file writing, files will "
"be silently overwritten.")
tmp.add_argument('--still', action='store_true',
help='Override start, end, and temporal frame width to render one '
'frame without motion blur.')
@ -48,7 +55,7 @@ def add_args(parser=None):
spa.add_argument('--height', type=int, metavar='PX')
out = parser.add_argument_group('Output options')
out.add_argument('--codec', choices=['jpg', 'png', 'tiff'])
out.add_argument('--codec', choices=['jpg', 'png', 'tiff', 'x264'])
return parser
def get_from_args(args):
@ -64,9 +71,11 @@ def get_from_args(args):
if args.still:
base.update(frame_width=0, start=1, end=2)
for arg in 'duration fps start end skip spp width height'.split():
for arg in 'duration fps start end skip shard spp width height'.split():
if getattr(args, arg, None) is not None:
base[arg] = getattr(args, arg)
if args.codec is not None:
base.setdefault('output', {})['type'] = args.codec
return name, base
@ -82,13 +91,20 @@ def wrap(prof, gnm):
def enumerate_times(gprof):
"""
Given a profile, return a list of `(frame_no, center_time)` pairs. Note
Given a profile, return a list of `(frame_no, center_times)` pairs. Note
that the enumeration is applied before `start`, `end`, and `skip`, and so
`frame_no` may be non-contiguous.
"""
nframes = round(gprof.fps * gprof.duration)
times = np.linspace(0, 1, nframes + 1)
times = list(enumerate(times[:-1] + 0.5 * (times[1] - times[0]), 1))
times = times[:-1] + 0.5 * (times[1] - times[0])
if gprof.shard:
s = max(1, int(round(gprof.fps * gprof.shard)))
return [(i, times[t:t+s])
for i, t in enumerate(range(0, len(times), s), 1)]
else:
times = [[t] for t in times]
times = list(enumerate(times, 1))
if gprof.end is not None:
times = times[:gprof.end]
if gprof.start is not None:

View File

@ -235,7 +235,7 @@ class Renderer(object):
self.packer, self.lib, self.cubin = self.compile(gnm)
self.mod = self.load(self.cubin)
self.filts = filters.create(gprof)
self.out = output.PILOutput()
self.out = output.get_output_for_profile(gprof)
class RenderManager(ClsMod):
lib = devlib(deps=[interp.palintlib, filldptrlib, iter.flushatomlib])
@ -395,26 +395,3 @@ class RenderManager(ClsMod):
self.info_a, self.info_b = self.info_b, self.info_a
self.stream_a, self.stream_b = self.stream_b, self.stream_a
return self.copy_evt, h_out
def render(self, gnm, gprof, times):
"""
A port of the old rendering function, retained for backwards
compatibility. Some of this will be pulled into as-yet-undecided
methods for more DRY.
"""
rdr = Renderer(gnm, gprof)
last_evt = cuda.Event().record(self.stream_a)
last_idx = None
def wait(): # Times like these where you wish for a macro
while not last_evt.query():
time.sleep(0.01)
gpu_time = last_evt.time_since(two_evts_ago)
return RenderedImage(last_buf, last_idx, gpu_time)
for idx, tc in times:
evt, h_buf = self.queue_frame(rdr, gnm, gprof, tc, last_idx is None)
if last_idx:
yield wait()
two_evts_ago, last_evt = last_evt, evt
last_buf, last_idx = h_buf, idx
if last_idx:
yield wait()