From 3294ba10d6515eadc284f6603be6deda3d4ebd39 Mon Sep 17 00:00:00 2001
From: Steven Robertson <steven@strobe.cc>
Date: Sun, 22 Jul 2012 15:53:38 -0700
Subject: [PATCH] Support x264 10-bit output format.

---
 cuburn/code/output.py  |  40 ++++++++-
 cuburn/genome/specs.py |   6 +-
 cuburn/output.py       | 200 ++++++++++++++++++++++++++++++++++++++---
 cuburn/profile.py      |  24 ++++-
 cuburn/render.py       |  25 +-----
 dist/client.py         |  32 ++++---
 dist/messages.py       |   2 +-
 dist/server.py         |   4 +-
 dist/worker.py         |  28 ++++--
 main.py                | 118 ++++++++++++++----------
 10 files changed, 363 insertions(+), 116 deletions(-)

diff --git a/cuburn/code/output.py b/cuburn/code/output.py
index 9126e30..808698b 100644
--- a/cuburn/code/output.py
+++ b/cuburn/code/output.py
@@ -1,15 +1,16 @@
 from util import devlib, ringbuflib
 from mwc import mwclib
 
-f32tou8lib = devlib(deps=[ringbuflib, mwclib], defs=r'''
+rgba8lib = devlib(deps=[ringbuflib, mwclib], defs=r'''
 // Perform a conversion from float32 values to uint8 ones, applying
 // pixel- and channel-independent dithering to reduce suprathreshold banding
 // artifacts. Clamps values larger than 1.0f.
 // TODO: move to a separate module?
 // TODO: less ineffecient mwc_st handling?
-__global__ void f32_to_u8(
-    ringbuf *rb, mwc_st *rctxs, uchar4 *dst, const float4 *src,
-    int gutter, int dstride, int sstride, int height)
+__global__ void f32_to_rgba_u8(
+    uchar4 *dst, const float4 *src,
+    int gutter, int dstride, int sstride, int height,
+    ringbuf *rb, mwc_st *rctxs)
 {
     int x = blockIdx.x * blockDim.x + threadIdx.x;
     int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -32,3 +33,34 @@ __global__ void f32_to_u8(
     rctxs[rb_incr(rb->tail, tid)] = rctx;
 }
 ''')
+
+rgba16lib = devlib(deps=[ringbuflib, mwclib], defs=r'''
+// Convert float32 RGBA to dithered uint16 RGBA; inputs above 1.0f are clamped.
+__global__ void f32_to_rgba_u16(
+    ushort4 *dst, const float4 *src,
+    int gutter, int dstride, int sstride, int height,
+    ringbuf *rb, mwc_st *rctxs)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= dstride || y >= height) return;  // grid is rounded up past the image
+    int isrc = sstride * (y + gutter) + x + gutter;
+
+    int tid = blockDim.x * threadIdx.y + threadIdx.x;
+    mwc_st rctx = rctxs[rb_incr(rb->head, tid)];
+
+    float4 in = src[isrc];
+    ushort4 out = make_ushort4(
+        fminf(1.0f, in.x) * 65535.0f + 0.49f * mwc_next_11(rctx),
+        fminf(1.0f, in.y) * 65535.0f + 0.49f * mwc_next_11(rctx),
+        fminf(1.0f, in.z) * 65535.0f + 0.49f * mwc_next_11(rctx),
+        fminf(1.0f, in.w) * 65535.0f + 0.49f * mwc_next_11(rctx)
+    );
+
+    int idst = dstride * y + x;  // dst is tightly packed: no gutter, stride == width
+    dst[idst] = out;
+    rctxs[rb_incr(rb->tail, tid)] = rctx;
+}
+''')
+
+pixfmtlib = devlib(deps=[rgba8lib, rgba16lib])
diff --git a/cuburn/genome/specs.py b/cuburn/genome/specs.py
index 7f8f9e3..88e3a16 100644
--- a/cuburn/genome/specs.py
+++ b/cuburn/genome/specs.py
@@ -115,6 +115,8 @@ profile = (
   , 'end': Scalar(None, 'Last frame to render (1-indexed, exclusive; '
                   'negative indexes from the end)')
   , 'skip': Scalar(0, 'Skip this many frames between each rendered frame')
+  , 'shard': Scalar(0, 'Pack this many frames in each output file '
+                    '(causing start, end, and skip to be ignored)')
 
   , 'height': Scalar(1920, 'Output height in pixels')
   , 'width': Scalar(1080, 'Output width in pixels')
@@ -123,7 +125,9 @@ profile = (
   , 'filter_order': list_(enum(filters.keys()), default_filters)
   , 'filters': prof_filters
 
-  , 'output_format': enum('jpg png tif', 'jpg')
+  # The other keys in the 'output' dictionary are format-specific and not
+  # documented here.
+  , 'output': {'type': enum('jpeg png tiff x264', 'jpeg')}
   })
 
 # Types recognized as independent units with a 'type' key
diff --git a/cuburn/output.py b/cuburn/output.py
index b8882aa..5f8f39d 100644
--- a/cuburn/output.py
+++ b/cuburn/output.py
@@ -1,10 +1,14 @@
+import os
+import tempfile
+from cStringIO import StringIO
+from subprocess import Popen, PIPE
 import numpy as np
 from numpy import float32 as f32, int32 as i32
 
 import pycuda.driver as cuda
 
 from code.util import ClsMod, launch
-from code.output import f32tou8lib
+from code.output import pixfmtlib
 
 import scipy.misc
 
@@ -12,39 +16,209 @@ if not hasattr(scipy.misc, 'toimage'):
     raise ImportError("Could not find scipy.misc.toimage. "
                       "Are scipy and PIL installed?")
 
+def launchC(name, mod, stream, dim, fb, *args):
+    launch(name, mod, stream,
+            (32, 8, 1), (int(np.ceil(dim.w/32.)), int(np.ceil(dim.h/8.))),
+            fb.d_back, fb.d_front,
+            i32(fb.gutter), i32(dim.w), i32(dim.astride), i32(dim.h),
+            *args)
+
 class Output(object):
     def convert(self, fb, gnm, dim, stream=None):
         """
         Convert a filtered buffer to whatever output format is needed by the
         writer.
+
+        This function is intended for use by the Renderer, and should not be
+        called by clients. It does not modify its instance.
         """
         raise NotImplementedError()
 
     def copy(self, fb, dim, pool, stream=None):
         """
         Schedule a copy from the device buffer to host memory, returning the
-        target buffer.
+        target buffer(s).
+
+        This function is intended for use by the Renderer, and should not be
+        called by clients. It does not modify its instance.
         """
         raise NotImplementedError()
 
+    def encode(self, host_frame):
+        """
+        Push `host_frame` (as returned from `Output.copy`) into the encoding
+        pipeline, and return any completed media segments. If `host_frame` is
+        None, flush the encoding pipeline.
+
+        The return value is a 2-tuple `(media, logs)`. `media` is a dictionary
+        mapping channel names (appropriate for use as file suffixes) to
+        file-like objects containing the encoded media segments. `logs` is an
+        iterable of `(key, text)` log entries. Either or both can be empty
+        at any time (and will typically be either populated on each frame
+        except the flush, for non-temporal codecs, or will be empty on all
+        frames except the flush, for temporal codecs.)
+
+        Media segments are discretely decodeable chunks of content. The
+        mapping of media segments to individual frames is not specified.
+        """
+        raise NotImplementedError()
+
+    @property
+    def suffix(self):
+        """
+        Return the file suffix that will be used. If more than one suffix will
+        be used, the value returned is the one considered to be "primary".
+        """
+        raise NotImplementedError()
+
+
 class PILOutput(Output, ClsMod):
-    lib = f32tou8lib
+    lib = pixfmtlib
+
+    def __init__(self, codec='jpeg', quality=100, alpha=False):
+        super(PILOutput, self).__init__()
+        self.type, self.quality, self.alpha = codec, quality, alpha
 
     def convert(self, fb, gnm, dim, stream=None):
-        launch('f32_to_u8', self.mod, stream,
-                (32, 8, 1), (int(np.ceil(dim.w/32.)), int(np.ceil(dim.h/8.))),
-                fb.d_rb, fb.d_seeds, fb.d_back, fb.d_front,
-                i32(fb.gutter), i32(dim.w), i32(dim.astride), i32(dim.h))
+        launchC('f32_to_rgba_u8', self.mod, stream, dim, fb,
+                fb.d_rb, fb.d_seeds)
 
     def copy(self, fb, dim, pool, stream=None):
         h_out = pool.allocate((dim.h, dim.w, 4), 'u1')
         cuda.memcpy_dtoh_async(h_out, fb.d_back, stream)
         return h_out
 
-    @staticmethod
-    def save(buf, name, type=None, quality=98):
-        type = dict(jpg='jpeg', tif='tiff').get(type, type)
-        if type == 'jpeg' or (type is None and name.endswith('.jpg')):
-            buf = buf[:,:,:3]
+    def _convert_buf(self, buf):
+        out = StringIO()
         img = scipy.misc.toimage(buf, cmin=0, cmax=1)
-        img.save(name, type, quality=quality)
+        img.save(out, self.type, quality=self.quality)
+        out.seek(0)
+        return out
+
+    def encode(self, buf):
+        if buf is None: return {}, []  # flush request: nothing is buffered
+        if self.type == 'jpeg':
+            out = self._convert_buf(buf[:,:,:3])  # color planes only
+            if self.alpha:
+                alpha = self._convert_buf(buf[:,:,3])  # alpha as its own image
+                return {'_color.jpg': out, '_alpha.jpg': alpha}, []
+            return {'.jpg': out}, []  # log is a list on every path
+        return {'.'+self.type: self._convert_buf(buf)}, []
+
+    @property
+    def suffix(self):
+        if self.type == 'jpeg':
+            if self.alpha: return '_color.jpg'
+            return '.jpg'
+        return '.'+self.type
+
+class X264Output(Output, ClsMod):
+    lib = pixfmtlib
+
+    profiles = (
+      { 'normal': '--profile high444 --level 4.2'
+      , '': ''
+      })
+    base = ('x264 --no-progress --input-depth 16 --sync-lookahead 0 '
+            '--rc-lookahead 5 --muxer raw -o - - --log-level debug ')
+
+    def __init__(self, profile='normal', csp='i444', crf=15,
+                 x264opts='', alpha=False):
+        super(X264Output, self).__init__()
+        self.args = ' '.join([self.base, self.profiles[profile],
+                              '--crf', str(crf), x264opts]).split()
+        self.alpha = alpha
+        self.csp = csp
+        self.framesize = None
+        self.zeros = None
+        self.subp = None
+        self.outf = None
+        self.asubp = None
+        self.aoutf = None
+
+    def convert(self, fb, gnm, dim, stream=None):
+        launchC('f32_to_rgba_u16', self.mod, stream, dim, fb,
+                fb.d_rb, fb.d_seeds)
+
+    def copy(self, fb, dim, pool, stream=None):
+        h_out = pool.allocate((dim.h, dim.w, 4), 'u2')
+        cuda.memcpy_dtoh_async(h_out, fb.d_back, stream)
+        return h_out
+
+    def _spawn_sub(self, framesize, alpha):
+        res = '%dx%d' % (framesize[1], framesize[0])
+        csp = 'yv12' if alpha else 'rgb'
+        extras = ['--input-csp', csp, '--demuxer', 'raw', '--input-res', res]
+        outf = tempfile.TemporaryFile(bufsize=0)
+        if alpha:
+            extras += ['--output-csp', 'i420', '--chroma-qp-offset', '24']
+        else:
+            extras += ['--output-csp', self.csp]
+        subp = Popen(self.args + extras, stdin=PIPE, stderr=PIPE,
+                     stdout=os.dup(outf.fileno()))
+        return outf, subp
+
+    def _spawn(self, framesize):
+        self.framesize = framesize
+        self.outf, self.subp = self._spawn_sub(framesize, False)
+        if self.alpha:
+            self.aoutf, self.asubp = self._spawn_sub(framesize, True)
+            bufsz = framesize[0] * framesize[1] / 2
+            self.zeros = np.empty(bufsz, dtype='u2')
+            self.zeros.fill(32767)
+
+    def _flush_sub(self, subp):
+        (stdout, stderr) = subp.communicate()
+        if subp.returncode:
+            raise IOError("x264 exited with an error")
+        return stderr
+
+    def _flush(self):
+        if self.subp is None:  # no encoder was ever started: nothing to return
+            return {}, []
+        log = self._flush_sub(self.subp)  # waits for x264, returns its stderr
+        self.outf.seek(0)  # rewind so the caller can read the stream
+        self.subp = None
+        if self.alpha:
+            alog = self._flush_sub(self.asubp)
+            self.aoutf.seek(0)
+            self.asubp = None
+            return ({'_color.h264': self.outf, '_alpha.h264': self.aoutf},
+                    [('x264_color', log), ('x264_alpha', alog)])
+        return {'.h264': self.outf}, [('x264_color', log)]
+
+    def _write(self, buf, subp):
+        try:
+            subp.stdin.write(buffer(buf))
+        except IOError, e:
+            print 'Exception while writing. Log:'
+            print subp.stderr.read()
+            raise e
+
+    def encode(self, buf):
+        out = ({}, [])
+        if buf is None or self.framesize != buf.shape[:2]:
+            out = self._flush()
+        if buf is None:
+            return out
+        if self.subp is None:
+            self._spawn(buf.shape[:2])
+        self._write(np.delete(buf, 3, axis=2), self.subp)
+        if self.alpha:
+            self._write(buf[:,:,3].tostring(), self.asubp)
+            self._write(buffer(self.zeros), self.asubp)
+        return out
+
+    @property
+    def suffix(self):
+        if self.alpha: return '_color.h264'
+        return '.h264'
+
+def get_output_for_profile(gprof):
+    opts = dict(gprof.output._val)
+    handler = opts.pop('type', 'jpeg')
+    if handler in ('jpeg', 'png', 'tiff'):
+        return PILOutput(codec=handler, **opts)
+    elif handler == 'x264':
+        return X264Output(**opts)
+    raise ValueError('Invalid output type "%s".' % handler)
diff --git a/cuburn/profile.py b/cuburn/profile.py
index b67e5e3..d191c02 100644
--- a/cuburn/profile.py
+++ b/cuburn/profile.py
@@ -37,6 +37,13 @@ def add_args(parser=None):
         help="Last frame to render (1-indexed, exclusive, negative from end)")
     tmp.add_argument('--skip', dest='skip', metavar='N', type=int,
         help="Skip N frames between each rendered frame")
+    # TODO: eliminate the 'silently overwritten' bit.
+    tmp.add_argument('--shard', dest='shard', metavar='SECS', type=float,
+        help="Write SECS of output into each file, instead of one frame per "
+             "file. If set, causes 'start', 'end', and 'skip' to be ignored. "
+             "If output codecs don't support multi-file writing, files will "
+             "be silently overwritten.")
+
     tmp.add_argument('--still', action='store_true',
         help='Override start, end, and temporal frame width to render one '
              'frame without motion blur.')
@@ -48,7 +55,7 @@ def add_args(parser=None):
     spa.add_argument('--height', type=int, metavar='PX')
 
     out = parser.add_argument_group('Output options')
-    out.add_argument('--codec', choices=['jpg', 'png', 'tiff'])
+    out.add_argument('--codec', choices=['jpg', 'png', 'tiff', 'x264'])
     return parser
 
 def get_from_args(args):
@@ -64,9 +71,11 @@ def get_from_args(args):
 
     if args.still:
         base.update(frame_width=0, start=1, end=2)
-    for arg in 'duration fps start end skip spp width height'.split():
+    for arg in 'duration fps start end skip shard spp width height'.split():
         if getattr(args, arg, None) is not None:
             base[arg] = getattr(args, arg)
+    if args.codec is not None:
+        base.setdefault('output', {})['type'] = args.codec
 
     return name, base
 
@@ -82,13 +91,20 @@ def wrap(prof, gnm):
 
 def enumerate_times(gprof):
     """
-    Given a profile, return a list of `(frame_no, center_time)` pairs. Note
+    Given a profile, return a list of `(frame_no, center_times)` pairs. Note
     that the enumeration is applied before `start`, `end`, and `skip`, and so
     `frame_no` may be non-contiguous.
     """
     nframes = round(gprof.fps * gprof.duration)
     times = np.linspace(0, 1, nframes + 1)
-    times = list(enumerate(times[:-1] + 0.5 * (times[1] - times[0]), 1))
+    times = times[:-1] + 0.5 * (times[1] - times[0])
+    if gprof.shard:
+        s = max(1, int(round(gprof.fps * gprof.shard)))
+        return [(i, times[t:t+s])
+                for i, t in enumerate(range(0, len(times), s), 1)]
+    else:
+        times = [[t] for t in times]
+    times = list(enumerate(times, 1))
     if gprof.end is not None:
         times = times[:gprof.end]
     if gprof.start is not None:
diff --git a/cuburn/render.py b/cuburn/render.py
index e25df5a..a7c754a 100644
--- a/cuburn/render.py
+++ b/cuburn/render.py
@@ -235,7 +235,7 @@ class Renderer(object):
         self.packer, self.lib, self.cubin = self.compile(gnm)
         self.mod = self.load(self.cubin)
         self.filts = filters.create(gprof)
-        self.out = output.PILOutput()
+        self.out = output.get_output_for_profile(gprof)
 
 class RenderManager(ClsMod):
     lib = devlib(deps=[interp.palintlib, filldptrlib, iter.flushatomlib])
@@ -395,26 +395,3 @@ class RenderManager(ClsMod):
         self.info_a, self.info_b = self.info_b, self.info_a
         self.stream_a, self.stream_b = self.stream_b, self.stream_a
         return self.copy_evt, h_out
-
-    def render(self, gnm, gprof, times):
-        """
-        A port of the old rendering function, retained for backwards
-        compatibility. Some of this will be pulled into as-yet-undecided
-        methods for more DRY.
-        """
-        rdr = Renderer(gnm, gprof)
-        last_evt = cuda.Event().record(self.stream_a)
-        last_idx = None
-        def wait(): # Times like these where you wish for a macro
-            while not last_evt.query():
-                time.sleep(0.01)
-            gpu_time = last_evt.time_since(two_evts_ago)
-            return RenderedImage(last_buf, last_idx, gpu_time)
-        for idx, tc in times:
-            evt, h_buf = self.queue_frame(rdr, gnm, gprof, tc, last_idx is None)
-            if last_idx:
-                yield wait()
-            two_evts_ago, last_evt = last_evt, evt
-            last_buf, last_idx = h_buf, idx
-        if last_idx:
-            yield wait()
diff --git a/dist/client.py b/dist/client.py
index 9b07909..fcf6ca6 100644
--- a/dist/client.py
+++ b/dist/client.py
@@ -10,11 +10,15 @@ from gevent import spawn, queue, coros
 import zmq.green as zmq
 
 import _importhack
-from cuburn import profile
+from cuburn import profile, output
 from cuburn.genome import db, util
 
 from messages import *
 
+# TODO: remove this dependency (loading the output module to get the suffix
+# requires a compiler / default instance)
+import pycuda.autoinit
+
 class RenderClient(object):
     def __init__(self, task_addr, rsp_addr, ctx=None, start=True):
         ctx = zmq.Context() if ctx is None else ctx
@@ -56,12 +60,12 @@ class RenderClient(object):
     def _deal_rsps(self):
         while True:
             rsp = self.rsock.recv_multipart(copy=False)
-            assert len(rsp) == 2
             rq = self.taskmap.get(rsp[0].bytes, None)
-            if rq: rq.put(rsp[1])
+            if rq: rq.put((rsp[1].bytes, rsp[2].bytes.split('\0'), rsp[3:]))
 
 # Time (in seconds) before a job times out
-TIMEOUT=240
+# TODO: replace timeout mechanism with polling?
+TIMEOUT=2400
 
 # Max. queue length before request considered lost, as a multiple of the
 # number of in-flight requests
@@ -92,21 +96,27 @@ def iter_genomes(prof, outpath, gpaths):
             os.makedirs(odir)
         with open(os.path.join(odir, 'NFRAMES'), 'w') as fp:
             fp.write(str(len(times)))
+        outmod = output.get_output_for_profile(gprof)
         for i, t in times:
-            opath = os.path.join(odir, '%05d.%s' % (i, gprof.output_format))
-            if not os.path.isfile(opath):
+            opath = os.path.join(odir, '%05d' % i)
+            if not os.path.isfile(opath + outmod.suffix):
                 yield Task(opath, ghash, prof, gnm, t)
 
 def get_result(cli, task, rq):
     try:
-        rsp = rq.get(timeout=TIMEOUT)
+        log, names, bufs = rq.get(timeout=TIMEOUT)
     except queue.Empty:
         cli.put(task, rq)
         print '>>', task.id
-        rsp = rq.get()
+        log, names, bufs = rq.get()
 
-    with open(task.id, 'wb') as fp:
-        fp.write(buffer(rsp))
+    with open(task.id + '.log', 'wb') as fp:
+        fp.write(log)
+
+    for name in reversed(names):
+        buf = bufs.pop()
+        with open(task.id + name, 'wb') as fp:
+            fp.write(buffer(buf))
     print '< ', task.id
 
 def main(addrs):
@@ -128,6 +138,8 @@ def main(addrs):
 
     while cli.taskmap:
         print 'Still waiting on %d tasks...' % len(cli.taskmap)
+        for i in cli.taskmap.items():
+            print i
         gevent.sleep(3)
 
 if __name__ == "__main__":
diff --git a/dist/messages.py b/dist/messages.py
index 647de2d..792537c 100644
--- a/dist/messages.py
+++ b/dist/messages.py
@@ -1,5 +1,5 @@
 from collections import namedtuple
 
-Task = namedtuple('Task', 'id hash profile anim time')
+Task = namedtuple('Task', 'id hash profile anim times')
 AddressedTask = namedtuple('AddressedTask', 'addr task')
 FullTask = namedtuple('FullTask', 'addr task cubin packer')
diff --git a/dist/server.py b/dist/server.py
index b634e8e..a3a9eee 100644
--- a/dist/server.py
+++ b/dist/server.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python2
+from itertools import takewhile
 
 import gevent
 from gevent import spawn, queue, event
@@ -29,7 +30,6 @@ def setup_task_listeners(addrs, tq, rq):
                 # losock to be added to the queue.
                 loevt.set()
             task = hisock.recv_pyobj()
-            print 'OOOOOH! Got a hiprio evt'
             loevt.clear() # Got message; pause listen_lo().
             tq.put(task)
             hisock.send('')
@@ -77,7 +77,7 @@ def setup_worker_listener(addrs, tq, rq):
         while True:
             rsp = wsock.recv_multipart(copy=False)
             if rsp[2].bytes != '':
-                print '< ', ' '.join([r.bytes for r in rsp[2:-1]])
+                print '< ', rsp[2].bytes, rsp[3].bytes
                 rq.put(rsp[2:])
             readyq.put(rsp[0])
 
diff --git a/dist/worker.py b/dist/worker.py
index 347def2..3847587 100644
--- a/dist/worker.py
+++ b/dist/worker.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python2
 import sys
+import socket
 from cStringIO import StringIO
 
 import gevent
@@ -37,20 +38,29 @@ def main(worker_addr):
 
         hash = None
         while True:
+            log = [('worker', socket.gethostname() + ':' +
+                    cuda.Context.get_current().get_device().pci_bus_id())]
             addr, task, cubin, packer = sock.recv_pyobj()
             gprof = profile.wrap(task.profile, task.anim)
             if hash != task.hash:
                 rdr = PrecompiledRenderer(task.anim, gprof, packer, cubin)
-            evt, buf = rmgr.queue_frame(rdr, task.anim, gprof, task.time)
-            while not evt.query():
-                gevent.sleep(0.01)
-            ofile = StringIO()
-            output.PILOutput.save(buf, ofile, task.id[-3:])
-            ofile.seek(0)
-            sock.send_multipart(addr + [ofile.read()])
-            hash = task.hash
+            for t in task.times:
+                evt, buf = rmgr.queue_frame(rdr, task.anim, gprof, t)
+                while not evt.query():
+                    gevent.sleep(0.01)
+                out, frame_log = rdr.out.encode(buf)
+                log += frame_log
+                print 'Rendered', task.id, 'in', int(evt.time()), 'ms'
+            final_out, final_log = rdr.out.encode(None)
+            assert not (out and final_out), 'Got output from two sources!'
+            out = out or final_out
+            log += final_log
+            log = '\0'.join([k + ' ' + v for k, v in log])
 
-            print 'Rendered', task.id, 'in', int(evt.time()), 'ms'
+            suffixes, files = zip(*[(k, v.read())
+                                    for k, v in sorted(out.items())])
+            # TODO: reduce copies, generally spruce up the memory usage here
+            sock.send_multipart(addr + [log, '\0'.join(suffixes)] + list(files))
 
     # Spawn two request loops to take advantage of CUDA pipelining.
     spawn(request_loop)
diff --git a/main.py b/main.py
index 04ab6c1..3df259a 100755
--- a/main.py
+++ b/main.py
@@ -26,44 +26,16 @@ sys.path.insert(0, os.path.dirname(__file__))
 from cuburn import render, filters, output, profile
 from cuburn.genome import convert, use, db
 
-def save(out):
-    # Temporary! TODO: fix this
-    output.PILOutput.save(out.buf, out.idx)
-    print out.idx, out.gpu_time
-
-def main(args, prof):
-    gdb = db.connect(args.genomedb)
-    gnm, basename = gdb.get_anim(args.flame, args.half)
-    if getattr(args, 'print'):
-        print convert.to_json(gnm)
-        return
-    gprof = profile.wrap(prof, gnm)
-
-    if args.name is not None:
-        basename = args.name
-    prefix = os.path.join(args.dir, basename)
-    if args.subdir:
-        if not os.path.isdir(prefix):
-            os.mkdir(prefix)
-        prefix += '/'
-    else:
-        prefix += '_'
-    frames = [('%s%05d%s.jpg' % (prefix, (i+1), args.suffix), t)
-              for i, t in profile.enumerate_times(gprof)]
-    if args.resume:
-        m = os.path.getmtime(args.flame)
-        frames = (f for f in frames
-                  if not os.path.isfile(f[0]) or m > os.path.getmtime(f[0]))
-
-    import pycuda.autoinit
-    rmgr = render.RenderManager()
-    gen = rmgr.render(gnm, gprof, frames)
-
-    if not args.gfx:
-        for out in gen:
-            save(out)
-        return
+def save(output_module, name, rendered_frame):
+    out, log = output_module.encode(rendered_frame)  # None flushes the encoder
+    for suffix, file_like in out.items():
+        with open(name + suffix, 'wb') as fp:  # encoded media is binary
+            fp.write(file_like.read())
+    for key, val in log:
+        print '\n=== %s ===' % key
+        print val
 
+def pyglet_preview(args, gprof, itr):
     import pyglet
     import pyglet.gl as gl
     w, h = gprof.width, gprof.height
@@ -92,39 +64,89 @@ def main(args, prof):
     last_time = [time.time()]
 
     def poll(dt):
-        out = next(gen, False)
+        out = next(itr, False)
         if out is False:
             if args.pause:
                 label.text = "Done. ('q' to quit)"
-                #pyglet.clock.unschedule(poll)
             else:
                 pyglet.app.exit()
         elif out is not None:
+            name, buf = out
             real_dt = time.time() - last_time[0]
             last_time[0] = time.time()
-            save(out)
-            if out.buf.dtype == np.uint8:
+            if buf.dtype == np.uint8:
                 fmt = gl.GL_UNSIGNED_BYTE
-            elif out.buf.dtype == np.uint16:
+            elif buf.dtype == np.uint16:
                 fmt = gl.GL_UNSIGNED_SHORT
             else:
-                label.text = 'Unsupported format: ' + out.buf.dtype
+                label.text = 'Unsupported format: ' + str(buf.dtype)
                 return
 
-            h, w, ch = out.buf.shape
+            h, w, ch = buf.shape
             gl.glEnable(tex.target)
             gl.glBindTexture(tex.target, tex.id)
             gl.glTexImage2D(tex.target, 0, gl.GL_RGB8, w, h, 0, gl.GL_RGBA,
-                            fmt, out.buf.tostring())
+                            fmt, buf.tostring())
             gl.glDisable(tex.target)
-            label.text = '%s (%g fps)' % (out.idx, 1./real_dt)
+            label.text = '%s (%g fps)' % (name, 1./real_dt)
         else:
             label.text += '.'
 
-    pyglet.clock.set_fps_limit(30)
-    pyglet.clock.schedule_interval(poll, 1/30.)
+    pyglet.clock.set_fps_limit(20)
+    pyglet.clock.schedule_interval(poll, 1/20.)
     pyglet.app.run()
 
+def main(args, prof):
+    gdb = db.connect(args.genomedb)
+    gnm, basename = gdb.get_anim(args.flame, args.half)
+    if getattr(args, 'print'):
+        print convert.to_json(gnm)
+        return
+    gprof = profile.wrap(prof, gnm)
+
+    if args.name is not None:
+        basename = args.name
+    prefix = os.path.join(args.dir, basename)
+    if args.subdir:
+        if not os.path.isdir(prefix):
+            os.mkdir(prefix)
+        prefix_plus = prefix + '/'
+    else:
+        prefix_plus = prefix + '_'
+
+    frames = [('%s%05d%s' % (prefix_plus, i, args.suffix), t)
+              for i, t in profile.enumerate_times(gprof)]
+
+    # We don't initialize a CUDA context until here. This keeps other
+    # functions like --help and --print snappy.
+    import pycuda.autoinit
+    rmgr = render.RenderManager()
+    rdr = render.Renderer(gnm, gprof)
+
+    def render_iter():
+        m = os.path.getmtime(args.flame)
+        first = True
+        for name, times in frames:
+            if args.resume:
+                fp = name + rdr.out.suffix  # primary output file for this entry
+                if os.path.isfile(fp) and m < os.path.getmtime(fp):
+                    continue  # output is newer than the flame: already rendered
+
+            for t in times:
+                evt, buf = rmgr.queue_frame(rdr, gnm, gprof, t, first)
+                first = False
+                while not evt.query():
+                    time.sleep(0.01)
+                    yield None  # lets the preview loop tick while the GPU works
+                save(rdr.out, name, buf)
+                print name, evt.time()
+                yield name, buf
+            save(rdr.out, name, None)  # flush the encoder for this output file
+
+    if args.gfx:
+        pyglet_preview(args, gprof, render_iter())
+    else:
+        for i in render_iter(): pass
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Render fractal flames.')