Refactor API

--HG-- rename : cuburn/code/filter.py => cuburn/code/filtering.py
2025-07-11 18:55:26 -04:00 · 2011-06-11 15:59:10 -04:00
parent 6f3c27007a
commit e79df46c66
6 changed files with 362 additions and 142 deletions
--- a/cuburn/_pyflam3_hacks.py
+++ b/cuburn/_pyflam3_hacks.py
@ -14,6 +14,8 @@ from ctypes import *
 from fr0stlib.pyflam3 import constants
 from fr0stlib.pyflam3._flam3 import *
 from cuburn import render
 flam3_nvariations = constants.flam3_nvariations = 99
 BaseXForm._fields_ = [('var', c_double * flam3_nvariations)
--- a/cuburn/code/filtering.py
+++ b/cuburn/code/filtering.py
@ -223,24 +223,22 @@ void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf,
 """)
-    def invoke(self, mod, abufd, obufd, dbufd):
+    def invoke(self, mod, abufd, obufd, dbufd, stream=None):
        # TODO: add no-est version
        # TODO: come up with a general way to average these parameters
        k1 = self.cp.brightness * 268 / 256
        area = self.features.acc_width * self.features.acc_height / self.cp.ppu ** 2
        k2 = 1 / (area * self.cp.adj_density)
        print k1, k2, area
        if self.cp.estimator == 0:
            fun = mod.get_function("logscale")
            t = fun(abufd, obufd, np.float32(k1), np.float32(k2),
                    block=(self.features.acc_width, 1, 1),
-                    grid=(self.features.acc_height, 1), time_kernel=True)
+                    grid=(self.features.acc_height, 1), stream=stream)
        else:
            fun = mod.get_function("density_est")
-            t = fun(abufd, obufd, dbufd, np.float32(k1), np.float32(k2),
+            fun(abufd, obufd, dbufd, np.float32(k1), np.float32(k2),
-                    block=(32, 32, 1), grid=(self.features.acc_width/32, 1),
+                block=(32, 32, 1), grid=(self.features.acc_width/32, 1),
-                    time_kernel=True)
+                stream=stream)
            print "Density estimation: %g" % t
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@ -2,20 +2,13 @@
 The main iteration loop.
 """
-from ctypes import byref, memset, sizeof
+from cuburn.code import mwc, variations
 import pycuda.driver as cuda
 from pycuda.driver import In, Out, InOut
 from pycuda.compiler import SourceModule
 import numpy as np
 from scipy import ndimage
 from fr0stlib.pyflam3 import flam3_interpolate
 from cuburn.code import mwc, variations, filter
 from cuburn.code.util import *
 from cuburn.render import Genome
 class IterCode(HunkOCode):
    # The number of threads per block
    NTHREADS = 512
    def __init__(self, features):
        self.features = features
        self.packer = DataPacker('iter_info')
@ -69,14 +62,14 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
    iter_info *info_glob = &(infos[blockIdx.x]);
    // load info to shared memory cooperatively
-    for (int i = threadIdx.y * 32 + threadIdx.x;
+    for (int i = threadIdx.y * blockDim.x + threadIdx.x;
         i * 4 < sizeof(iter_info); i += blockDim.x * blockDim.y)
        reinterpret_cast<float*>(&info)[i] =
            reinterpret_cast<float*>(info_glob)[i];
    int consec_bad = -{{features.fuse}};
-    // TODO: make nsteps adjustable via genome
+    // TODO: remove '512' constant
-    int nsamps = {{packer.get('cp.width * cp.height / 512000. * cp.adj_density')}};
+    int nsamps = {{packer.get('cp.width * cp.height / (cp.ntemporal_samples * 512.) * cp.adj_density')}};
    float x, y, color;
    x = mwc_next_11(&rctx);
@ -157,86 +150,3 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
                packer = self.packer.view('info'),
                **globals())
 def render(features, cps):
    """
    Render a single frame from the control points in ``cps`` and return the
    tuple ``(abuf, dbuf)``: the colour-filtered output buffer and the
    host-filtered density buffer, both as NumPy float32 arrays.

    The whole pipeline runs synchronously: the kernel source is assembled,
    printed and compiled, the per-step info and palette are packed, then the
    "iter", density-estimation and "colorclip" kernels run in that order.
    """
    # TODO: make this adjustable via genome
    nsteps = 1000
    # Host-side accumulation (4 floats per bin) and density (1 float per bin)
    # buffers, sized by the padded accumulator dimensions.
    abuf = np.zeros((features.acc_height, features.acc_stride, 4), dtype=np.float32)
    dbuf = np.zeros((features.acc_height, features.acc_stride), dtype=np.float32)
    # 512 seeds per step, matching the (32, 16, 1) block used for "iter" below
    seeds = mwc.MWC.make_seeds(512 * nsteps)
    iter = IterCode(features)
    de = filter.DensityEst(features, cps[0])
    code = assemble_code(BaseCode, mwc.MWC, iter.packer, iter,
                         filter.ColorClip, de)
    # Dump the generated source with line numbers before compiling it
    for lno, line in enumerate(code.split('\n')):
        print '%3d %s' % (lno, line)
    mod = SourceModule(code,
            options=['-use_fast_math', '-maxrregcount', '32'])
    # Copy the genomes into a contiguous ctypes array for flam3_interpolate
    cps_as_array = (Genome * len(cps))()
    for i, cp in enumerate(cps):
        cps_as_array[i] = cp
    infos = []
    # 16 palette rows of 256 RGBA entries
    pal = np.empty((16, 256, 4), dtype=np.uint8)
    # TODO: move this into a common function
    if len(cps) > 1:
        cp = Genome()
        memset(byref(cp), 0, sizeof(cp))
        # Steps at which one of the 16 palette rows is sampled
        sampAt = [int(i/15.*(nsteps-1)) for i in range(16)]
        for n in range(nsteps):
            # Interpolation times run over [-0.5, 0.5)
            flam3_interpolate(cps_as_array, 2, float(n)/nsteps - 0.5,
                              0, byref(cp))
            cp._init()
            if n in sampAt:
                pidx = sampAt.index(n)
                for i, e in enumerate(cp.palette.entries):
                    pal[pidx][i] = np.uint8(np.array(e.color) * 255.0)
            infos.append(iter.packer.pack(cp=cp, cp_step_frac=float(n)/nsteps))
    else:
        # Single genome: one palette row copied to all rows, one packed info
        # repeated for every step
        for i, e in enumerate(cps[0].palette.entries):
            pal[0][i] = np.uint8(np.array(e.color) * 255.0)
        pal[1:] = pal[0]
        infos.append(iter.packer.pack(cp=cps[0], cp_step_frac=0))
        infos *= nsteps
    infos = np.concatenate(infos)
    # Bind the palette to the 'palTex' texture reference of the module
    dpal = cuda.make_multichannel_2d_array(pal, 'C')
    tref = mod.get_texref('palTex')
    tref.set_array(dpal)
    tref.set_format(cuda.array_format.UNSIGNED_INT8, 4)
    tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
    tref.set_filter_mode(cuda.filter_mode.LINEAR)
    abufd = cuda.to_device(abuf)
    dbufd = cuda.to_device(dbuf)
    fun = mod.get_function("iter")
    fun.set_cache_config(cuda.func_cache.PREFER_L1)
    # One block of 32x16 threads per step
    t = fun(InOut(seeds), InOut(infos), abufd, dbufd,
        block=(32,16,1), grid=(nsteps,1), time_kernel=True)
    print "Completed render in %g seconds" % t
    f = np.float32
    npix = features.acc_width * features.acc_height
    # TODO: just allocate
    obufd = cuda.to_device(abuf)
    # The density buffer is round-tripped through the host for a Gaussian
    # prefilter (sigma 0.6) before density estimation
    dbuf = cuda.from_device_like(dbufd, dbuf)
    dbuf = ndimage.filters.gaussian_filter(dbuf, 0.6)
    dbufd = cuda.to_device(dbuf)
    de.invoke(mod, abufd, obufd, dbufd)
    fun = mod.get_function("colorclip")
    # NOTE(review): ``cp`` here is whatever the code above last bound it to
    # (the interpolated genome, or the last entry of ``cps``) - confirm that
    # this is the intended source of gamma/vibrancy/highlight_power.
    # NOTE(review): npix/256 truncates under integer division, and npix uses
    # acc_width while the buffers use acc_stride - confirm coverage.
    t = fun(obufd, f(1 / cp.gamma), f(cp.vibrancy), f(cp.highlight_power),
        block=(256,1,1), grid=(npix/256,1), time_kernel=True)
    print "Completed color filtering in %g seconds" % t
    abuf = cuda.from_device_like(obufd, abuf)
    return abuf, dbuf
--- a/cuburn/code/util.py
+++ b/cuburn/code/util.py
@ -66,8 +66,26 @@ int trunca(float f) {
    asm("cvt.rni.s32.f32    %0,     %1;" : "=r"(ret) : "f"(f));
    return ret;
 }
 __global__
 void zero_dptr(float* dptr, int size) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < size) {
        dptr[i] = 0.0f;
    }
 }
 """
    @staticmethod
    def zero_dptr(mod, dptr, size, stream=None):
        """
        A memory zeroer which can be embedded in a stream. Size is the
        number of 4-byte words in the pointer.

        ``mod`` is the loaded module providing the ``zero_dptr`` kernel,
        ``dptr`` the device pointer to clear, and ``stream`` the optional
        stream on which the launch is queued.
        """
        zero = mod.get_function("zero_dptr")
        # Explicit ceiling division: exactly enough 1024-thread blocks to
        # cover ``size`` words, independent of how ``/`` behaves on ints.
        # The kernel bounds-checks its index, so a partial last block is
        # safe. At least one block is always launched, as before for size 0.
        nblocks = max(1, (int(size) + 1023) // 1024)
        zero(dptr, np.int32(size), stream=stream,
             block=(1024, 1, 1), grid=(nblocks, 1))
 class DataPackerView(object):
    """
    View of a data packer. Intended to be initialized using DataPacker.view().
--- a/cuburn/render.py
+++ b/cuburn/render.py
@ -1,44 +1,57 @@
 import sys
 import math
 import re
 from itertools import cycle, repeat, chain, izip
 from ctypes import *
 from cStringIO import StringIO
 import numpy as np
 from scipy import ndimage
 from fr0stlib import pyflam3
 from fr0stlib.pyflam3._flam3 import *
 from fr0stlib.pyflam3.constants import *
 import pycuda.compiler
 import pycuda.driver as cuda
 from cuburn import affine
-from cuburn.variations import Variations
+from cuburn.code import util, mwc, iter, filtering
-class Genome(pyflam3.Genome):
+def _chunk(l, cs):
-    @classmethod
+    """
-    def from_string(cls, *args, **kwargs):
+    Yield the contents of list ``l`` in chunks of size no more than ``cs``.
-        gnms = super(Genome, cls).from_string(*args, **kwargs)
+    """
-        for g in gnms: g._init()
+    for i in range(0, len(l), cs):
-        return gnms
+        yield l[i:i+cs]
-    def _init(self):
+class Genome(object):
    """
    Normalizes and precalculates some properties of a Genome. Assumes that
    Genome argument passed in will not change.
    """
    # Fix the ctypes ugliness since switching to __getattribute__ in 2.7.
    # There are more elegant ways to do this, but I can't be bothered.
    def __getattr__(self, name):
        return getattr(self.cp, name)
    def __init__(self, ctypes_genome):
        self.cp = ctypes_genome
        self.xforms = [self.xform[i] for i in range(self.num_xforms)]
        dens = np.array([x.density for i, x in enumerate(self.xforms)
                         if i != self.final_xform_index])
        dens /= np.sum(dens)
        self.norm_density = [np.sum(dens[:i+1]) for i in range(len(dens))]
        self.camera_transform = self.calc_camera_transform()
    scale = property(lambda cp: 2.0 ** cp.zoom)
    adj_density = property(lambda cp: cp.sample_density * (cp.scale ** 2))
    ppu = property(lambda cp: cp.pixels_per_unit * cp.scale)
-    @property
+    def calc_camera_transform(cp):
    def camera_transform(cp):
        """
        An affine matrix which will transform IFS coordinates to image width
        and height. Assumes that width and height are constant.
        """
        # TODO: when reading as a property during packing, this may be
        # calculated 6 times instead of 1
        # TODO: also requires knowing gutter width
        g = Features.gutter
        return ( affine.translate(0.5 * cp.width + g, 0.5 * cp.height + g)
               * affine.scale(cp.ppu, cp.ppu)
@ -65,13 +78,294 @@ class Animation(object):
    In other words, it's best to use exactly one Animation for each
    interpolated sequence between one or two genomes.
    """
-    def __init__(self, genomes, ngenomes = None):
+    def __init__(self, ctypes_genome_array):
-        self.features = Features(genomes)
+        self._g_arr = ctypes_genome_array
        self.genomes = map(Genome, ctypes_genome_array)
        self.features = Features(self.genomes)
        self._iter = self._de = self.src = self.cubin = self.mod = None
-    def compile(self):
+    def compile(self, keep=False,
-        pass
+                cmp_options=('-use_fast_math', '-maxrregcount', '32')):
-    def render_frame(self, time=0):
+        """
-        pass
+        Compile a kernel capable of rendering every frame in this animation.
        The resulting compiled kernel is stored in the ``cubin`` property;
        the source is available as ``src``, and is also returned for
        inspection and display.
        This operation is idempotent, and has no side effects outside of
        setting properties on this instance (unless there's a compiler error,
        which is a bug); it should therefore be threadsafe as well.
        It is, however, rather slow.
        """
        self._iter = iter.IterCode(self.features)
        self._de = filtering.DensityEst(self.features, self.genomes[0])
        # TODO: make choice of filtering explicit
        # TODO: autoload dependent modules?
        self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer,
                                      self._iter, filtering.ColorClip, self._de)
        self.cubin = pycuda.compiler.compile(self.src, keep=False,
                                             options=list(cmp_options))
        return self.src
    def copy(self):
        """
        Produce a shallow duplicate of this animation whose ``mod`` reference
        is cleared, so the duplicate holds no handle on the module loaded in
        the current CUDA context. The compiled code is shared with the
        original, which lets the same animation be loaded in several CUDA
        contexts (and so on several devices) without recompiling; coordinating
        those contexts is left to the caller.
        """
        from copy import copy as shallow_copy
        clone = shallow_copy(self)
        clone.mod = None
        return clone
    def load(self, jit_options=None):
        """
        Replace the currently loaded CUDA module in the active CUDA context
        with the compiled code's module. A reference is kept to the module,
        meaning that rendering should henceforth only be called from the
        thread and context in which this function was called.

        If the animation has not been compiled yet, ``compile()`` is called
        first. ``jit_options`` is passed to ``cuda.module_from_buffer``;
        ``None`` (the default) means no options.
        """
        if self.cubin is None:
            self.compile()
        # A fresh list per call instead of a shared mutable default argument
        options = [] if jit_options is None else jit_options
        self.mod = cuda.module_from_buffer(self.cubin, options)
    def render_frames(self, times=None):
        """
        Render one frame of this animation for each center time in ``times``.
        Returns a Python generator object which will yield one NumPy array
        for each rendered image.

        This method produces a considerable amount of side effects, and should
        not be used lightly. Things may go poorly for you if this method is not
        allowed to run until completion (by exhausting all items in the
        generator object).

        A performance note: while any ready tasks will be scheduled on the GPU
        before yielding a result, spending a lot of time before returning
        control to this function can allow the GPU to become idle. It's best
        to hand the resulting array to another thread after grabbing it from
        the renderer for handling.

        ``times`` is a sequence of center times at which to render, or ``None``
        to render one frame for each genome used to create the animation. An
        explicitly empty sequence renders nothing.
        """
        # Don't see this changing, but empirical tests could prove me wrong
        NRENDERERS = 2

        # TODO: under a slightly modified sequencing, certain buffers can be
        # shared (though this may be unimportant if a good AA technique which
        # doesn't require full SS can be found)
        rdrs = [_AnimRenderer(self) for i in range(NRENDERERS)]

        # Only substitute the genome times when the caller passed nothing.
        # A truthiness test would also discard an explicit empty sequence and
        # raises on NumPy arrays.
        if times is None:
            times = [cp.time for cp in self.genomes]

        # Zip up each time with an alternating renderer, plus enough empty
        # slots at the end to flush all pending tasks
        exttimes = chain(times, repeat(None, NRENDERERS))
        for rdr, time in izip(cycle(rdrs), exttimes):
            # Collect the renderer's previous frame before reusing it
            if rdr.wait():
                yield rdr.get_result()
            if time is not None:
                rdr.render(time)
    def _interp(self, time, cp):
        """
        Call ``flam3_interpolate`` over the animation's ctypes genome array
        at ``time``, passing ``cp`` by reference (presumably the destination
        genome which flam3 fills in - confirm against the flam3 API).
        """
        # NOTE(review): the literal 0 is the fourth flam3_interpolate
        # argument; its meaning is not visible here - confirm.
        flam3_interpolate(self._g_arr, len(self._g_arr), time, 0, byref(cp))
 class _AnimRenderer(object):
    # Large launches lock the display for a considerable period and may be
    # killed due to a device timeout; small launches are harder to load-balance
    # on the GPU and incur overhead. This empirical value is multiplied by the
    # number of SMs on the device to determine how many blocks should be in
    # each launch. Extremely high quality, high resolution renders may still
    # encounter a device timeout, and no workaround is in place for that yet.
    SM_FACTOR = 8
    # Currently, palette interpolation is done independently of animation
    # interpolation, so that the process is not biased and so we only need to
    # mess about with one texture per renderer. This many steps will always be
    # used, no matter the number of time steps.
    PAL_HEIGHT = 16
    def __init__(self, anim):
        """
        Prepare a renderer for ``anim``: create its CUDA stream, work out how
        many control points go in each launch, and allocate the scratch
        genomes and device buffers used by ``render``.
        """
        features = anim.features

        self.anim = anim
        self.pending = False
        self.stream = cuda.Stream()

        # Launch sizing: SM_FACTOR control points per multiprocessor go in
        # each launch; nblocks is the number of launches needed for them all.
        device = cuda.Context.get_device()
        self._nsms = device.multiprocessor_count
        self.cps_per_block = self.SM_FACTOR * self._nsms
        self.ncps = features.max_cps
        launches = math.ceil(float(self.ncps) / self.cps_per_block)
        self.nblocks = int(launches)

        # These are stored to avoid leaks, not to be stateful in method calls
        # TODO: ensure proper cleanup is done
        self._dst_cp = pyflam3.Genome()
        self._cen_cp = pyflam3.Genome()
        for scratch in (self._dst_cp, self._cen_cp):
            memset(byref(scratch), 0, sizeof(scratch))

        # Device buffers: 4 bytes per bin for density, 16 bytes per bin for
        # the accumulator and the output, one aligned info slot per CP.
        self.nbins = features.acc_height * features.acc_stride
        self.d_den = cuda.mem_alloc(4 * self.nbins)
        self.d_accum = cuda.mem_alloc(16 * self.nbins)
        self.d_out = cuda.mem_alloc(16 * self.nbins)
        self.d_infos = cuda.mem_alloc(anim._iter.packer.align * self.ncps)

        # Defer allocation until first needed
        self.d_seeds = [None for _ in range(self.nblocks)]
    def render(self, cen_time):
        """
        Queue the rendering of one frame centered on ``cen_time``.

        Sets ``pending``; the result is left in ``self.d_out`` and is meant
        to be collected with ``wait()`` followed by ``get_result()``. Must
        not be called again while a result is still pending.
        """
        assert not self.pending, "Tried to render with results pending!"
        self.pending = True
        a = self.anim

        # The genome at the center time supplies ntemporal_samples and the
        # temporal filter width used below.
        # NOTE(review): this interpolation runs even with a single genome,
        # although comments elsewhere in this class say interp cannot be
        # called in that case - confirm.
        cen_cp = self._cen_cp
        a._interp(cen_time, cen_cp)
        palette = self._interp_colors(cen_time, cen_cp)

        # Clear the density (1 word per bin) and accumulation (4 words per
        # bin) buffers on this renderer's stream
        util.BaseCode.zero_dptr(a.mod, self.d_den, self.nbins,
                                self.stream)
        util.BaseCode.zero_dptr(a.mod, self.d_accum, 4 * self.nbins,
                                self.stream)

        # ------------------------------------------------------------
        # TODO WARNING TODO WARNING TODO WARNING TODO WARNING TODO
        # This will replace the palette while it's in use by the other
        # rendering function. Need to pass palette texref in function
        # invocation.
        # ------------------------------------------------------------
        dpal = cuda.make_multichannel_2d_array(palette, 'C')
        tref = a.mod.get_texref('palTex')
        tref.set_array(dpal)
        tref.set_format(cuda.array_format.UNSIGNED_INT8, 4)
        tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
        tref.set_filter_mode(cuda.filter_mode.LINEAR)

        cp = self._dst_cp
        packer = a._iter.packer

        iter_fun = a.mod.get_function("iter")
        iter_fun.set_cache_config(cuda.func_cache.PREFER_L1)

        # Must be accumulated over all CPs
        gam, vib, hipow = 0, 0, 0

        # This is gross, but there are a lot of fiddly corner cases with any
        # index-based iteration scheme.
        times = list(enumerate(self._mk_dts(cen_time, cen_cp, self.ncps)))
        # Each chunk of at most cps_per_block control points is one launch
        for b, block_times in enumerate(_chunk(times, self.cps_per_block)):
            infos = []
            if len(a.genomes) > 1:
                for n, t in block_times:
                    a._interp(t, cp)
                    frac = float(n) / cen_cp.ntemporal_samples
                    info = packer.pack(cp=Genome(cp), cp_step_frac=frac)
                    infos.append(info)
                    gam += cp.gamma
                    vib += cp.vibrancy
                    hipow += cp.highlight_power
            else:
                # Can't interpolate normally; just pack copies
                # TODO: this still packs the genome 20 times or so instead of
                # once
                packed = packer.pack(cp=a.genomes[0], cp_step_frac=0)
                infos = [packed] * len(block_times)
                gam += a.genomes[0].gamma * len(block_times)
                vib += a.genomes[0].vibrancy * len(block_times)
                hipow += a.genomes[0].highlight_power * len(block_times)

            infos = np.concatenate(infos)
            # Byte offset of this launch's info slots within d_infos
            offset = b * packer.align * self.cps_per_block
            # TODO: portable across 32/64-bit arches?
            d_info_off = int(self.d_infos) + offset
            cuda.memcpy_htod(d_info_off, infos)

            # Seeds are created on first use and then kept for later frames
            if not self.d_seeds[b]:
                seeds = mwc.MWC.make_seeds(iter.IterCode.NTHREADS *
                                           self.cps_per_block)
                self.d_seeds[b] = cuda.to_device(seeds)

            # TODO: get block config from IterCode
            # TODO: print timing information
            iter_fun(self.d_seeds[b], np.uint64(d_info_off),
                     self.d_accum, self.d_den,
                     block=(32, 16, 1), grid=(len(block_times), 1),
                     stream=self.stream)

        # MAJOR TODO: for now, we kill almost all parallelism by forcing the
        # stream here. Later, once we've decided on a density-buffer prefilter,
        # we will move it to the GPU, allowing it to be embedded in the stream
        # and letting the remaining code be asynchronous.
        self.stream.synchronize()
        # Round-trip the density buffer through the host for a Gaussian
        # prefilter (sigma 0.6)
        dbuf_dim = (a.features.acc_height, a.features.acc_stride)
        dbuf = cuda.from_device(self.d_den, dbuf_dim, np.float32)
        dbuf = ndimage.filters.gaussian_filter(dbuf, 0.6)
        cuda.memcpy_htod(self.d_den, dbuf)

        util.BaseCode.zero_dptr(a.mod, self.d_out, 4 * self.nbins,
                                self.stream)
        self.stream.synchronize()
        a._de.invoke(a.mod, self.d_accum, self.d_out, self.d_den,
                     self.stream)
        self.stream.synchronize()

        # Average the per-CP sums; gamma is passed on as the reciprocal of
        # its mean (n / sum)
        n = np.float32(self.ncps)
        gam = np.float32(n / gam)
        vib = np.float32(vib / n)
        hipow = np.float32(hipow / n)

        # TODO: get block size from colorclip class? It actually does not
        # depend on that being the case
        # NOTE(review): nbins / 256 truncates for integer nbins; if nbins is
        # not a multiple of 256 the last bins fall outside this launch -
        # confirm against the colorclip kernel.
        color_fun = a.mod.get_function("colorclip")
        color_fun(self.d_out, gam, vib, hipow,
                  block=(256, 1, 1), grid=(self.nbins / 256, 1),
                  stream=self.stream)
    def _interp_colors(self, cen_time, cen_cp):
        # TODO: any visible difference between uint8 and richer formats?
        pal = np.empty((self.PAL_HEIGHT, 256, 4), dtype=np.uint8)
        a = self.anim
        if len(a.genomes) > 1:
            # The typical case; applying real motion blur
            cp = self._dst_cp
            times = self._mk_dts(cen_time, cen_cp, self.PAL_HEIGHT)
            for n, t in enumerate(times):
                a._interp(t, cp)
                for i, e in enumerate(cp.palette.entries):
                    pal[n][i] = np.uint8(np.array(e.color) * 255.0)
        else:
            # Cannot call any interp functions on a single genome; rather than
            # have alternate code-paths, just copy the same colors everywhere
            for i, e in enumerate(a.genomes[0].palette.entries):
                # TODO: This triggers a RuntimeWarning
                pal[0][i] = np.uint8(np.array(e.color) * 255.0)
            pal[1:] = pal[0]
        return pal
    def wait(self):
        if self.pending:
            self.stream.synchronize()
            self.pending = False
            return True
        return False
    def get_result(self):
        """
        Copy the output buffer back from the device and return the image
        region as a ``(height, width, 4)`` float32 NumPy array.

        The device buffer has ``acc_height`` rows of ``acc_stride`` bins; the
        image sits inside a gutter of ``features.gutter`` bins on every side,
        and anything beyond ``gutter + width`` in a row is gutter or stride
        padding, which is dropped here.
        """
        feats = self.anim.features
        g = feats.gutter
        obuf_dim = (feats.acc_height, feats.acc_stride, 4)
        out = cuda.from_device(self.d_out, obuf_dim, np.float32)
        # Crop by the real image size rather than by a fixed margin from the
        # end of the row: acc_stride may be wider than width + 2 * gutter.
        # A single slice-and-copy also replaces four intermediate copies.
        image = out[g:g + feats.height, g:g + feats.width]
        return np.ascontiguousarray(image)
    @staticmethod
    def _mk_dts(cen_time, cen_cp, ncps):
        """
        Return ``ncps`` sample times spread evenly over a window of width
        ``cen_cp.temporal_filter_width`` centered on ``cen_time``, endpoints
        included. A single sample is placed at ``cen_time`` itself.
        """
        if ncps == 1:
            # The general formula would divide by zero for one sample
            return [cen_time]
        w = cen_cp.temporal_filter_width
        # Offsets run from -w/2 to +w/2. They must be added to the center
        # time, otherwise every frame samples the same times around zero.
        return [cen_time + w * (t / (ncps - 1.0) - 0.5) for t in range(ncps)]
 class Features(object):
    """
@ -93,7 +387,8 @@ class Features(object):
    palette_height = 16
    # Maximum width of DE and other spatial filters, and thus in turn the
-    # amount of padding applied
+    # amount of padding applied. Note that, for now, this must not be changed!
    # The filtering code makes deep assumptions about this value.
    gutter = 16
    def __init__(self, genomes):
@ -116,11 +411,13 @@ class Features(object):
        else:
            self.final_xform_index = None
        self.max_cps = max([cp.ntemporal_samples for cp in genomes])
        self.width = genomes[0].width
        self.height = genomes[0].height
        self.acc_width = genomes[0].width + 2 * self.gutter
        self.acc_height = genomes[0].height + 2 * self.gutter
-        self.acc_stride = genomes[0].width + 2 * self.gutter
+        self.acc_stride = 32 * int(math.ceil(self.acc_width / 32.))
 class XFormFeatures(object):
    def __init__(self, xforms, xform_id):
--- a/main.py
+++ b/main.py
@ -22,13 +22,10 @@ import scipy
 import pyglet
 import pycuda.autoinit
 from fr0stlib.pyflam3 import *
 from fr0stlib.pyflam3._flam3 import *
 import cuburn._pyflam3_hacks
 from fr0stlib import pyflam3
 from cuburn.render import *
 from cuburn.code.mwc import MWCTest
 from cuburn.code.iter import render, membench
 # Required on my system; CUDA doesn't yet work with GCC 4.5
 os.environ['PATH'] = ('/usr/x86_64-pc-linux-gnu/gcc-bin/4.4.5:'
@ -37,24 +34,22 @@ os.environ['PATH'] = ('/usr/x86_64-pc-linux-gnu/gcc-bin/4.4.5:'
 def main(args):
    if '-t' in args:
        MWCTest.test_mwc()
        membench()
    with open(args[1]) as fp:
-        genomes = Genome.from_string(fp.read())
+        genome_ptr, ngenomes = pyflam3.Genome.from_string(fp.read())
        genomes = cast(genome_ptr, POINTER(pyflam3.Genome*ngenomes)).contents
    anim = Animation(genomes)
-    accum, den = render(anim.features, genomes)
+    anim.compile()
-    accum = np.delete(accum, np.s_[:16], axis=0)
+    anim.load()
-    accum = np.delete(accum, np.s_[:16], axis=1)
+    for n, out in enumerate(anim.render_frames()):
-    accum = np.delete(accum, np.s_[-16:], axis=0)
+        noalpha = np.delete(out, 3, axis=2)
-    accum = np.delete(accum, np.s_[-16:], axis=1)
+        scipy.misc.imsave('rendered_%03d.png' % n, noalpha)
        scipy.misc.imsave('rendered_%03d.jpg' % n, noalpha)
-    noalpha = np.delete(accum, 3, axis=2)
+    return
    scipy.misc.imsave('rendered.png', noalpha)
    scipy.misc.imsave('rendered.jpg', noalpha)
-    if '-g' not in args:
+    #if '-g' not in args:
-        return
+    #    return
    window = pyglet.window.Window(anim.features.width, anim.features.height)
    imgbuf = (np.minimum(accum * 255, 255)).astype(np.uint8)