cuburn/cuburn/code/interp.py

from collections import OrderedDict
from itertools import cycle
import numpy as np

import util
from util import Template, assemble_code, devlib, binsearchlib, ringbuflib
from color import yuvlib
from mwc import mwclib

class GenomePackerName(str):
    """Class to indicate that a property is precalculated on the device"""
    pass

class GenomePackerView(object):
    """
    Obtain accessors in generated code.

    Call ``GenomePacker.view(ptr_name, wrapped_obj, prefix)`` to generate a
    view, where ``ptr_name`` is the name of the CUDA pointer which holds the
    interpolated values, ``wrapped_obj`` is the base Genome instance which
    will be uploaded to the device for interpolating from, and ``prefix`` is
    the prefix that will be assigned to property accessors from this object.

    Accessing a property on the object synthesizes an accessor for use in your
    code and an interpolator for use in generating that code. This conversion
    is done when the property is coerced into a string by the templating
    mechanism, so you can easily nest objects by saying, for instance,
    {{pcp.camera.rotation}} from within templated code. The accessed property
    must be a SplEval object, or a precalculated value (see
    ``GenomePackerPrecalc``).

    Index operations are converted to property accesses as well, so that you
    don't have to make a mess with 'getattr' in your code: {{pcp.xforms[x]}}
    works just fine. This means, however, that no arrays can be packed
    directly; they must be converted to have string-based keys first, and
    any loops must be unrolled in your code.
    """
    def __init__(self, packer, ptr_name, wrapped, prefix=()):
        self.packer = packer
        self.ptr_name = ptr_name
        self.wrapped = wrapped
        self.prefix = prefix
    def __getattr__(self, name):
        w = getattr(self.wrapped, name)
        return type(self)(self.packer, self.ptr_name, w, self.prefix+(name,))
    # As with the Genome class, we're all-dict, no-array here
    __getitem__ = lambda s, n: getattr(s, str(n))

    def __str__(self):
        """
        Returns the packed name in a format suitable for embedding directly
        into device code.
        """
        # So evil. When the template calls __str__ to format the output, we
        # allocate things. This makes for neater embedded code, which is where
        # the real complexity lies, but it also means printf() debugging when
        # templating will screw with the allocation tables!
        if not isinstance(self.wrapped, GenomePackerName):
            self.packer._require(self.prefix)
        # TODO: verify namespace stomping, etc
        return '%s.%s' % (self.ptr_name, '_'.join(self.prefix))

    def _precalc(self):
        """Create a GenomePackerPrecalc object. See that class for details."""
        return GenomePackerPrecalc(self.packer, self.ptr_name,
                                   self.wrapped, self.prefix)

class GenomePackerPrecalc(GenomePackerView):
    """
    Insert precalculated values into the packed genome.

    Create a Precalc object by calling a view object's _precalc() method. The
    returned object shares the same referent dict as the view object, but
    instead of returning view accessors for use in the body of your code,
    accessing a property synthesizes a call to an interpolation function for
    use within the precalculating function. Use this in your precalculated
    code to obtain values from a genome.

    Once you've obtained the needed parameters and performed the
    precalculation, write them to the genome object with a call to the '_set'
    method in your precalc template. This method generates a new accessor to
    which you can assign a value in your precalculation function. It also
    makes this accessor available for use on the original packer object from
    within your main function.

    Finally, call the '_code' method on the precalc object with your generated
    precalculation code to add it to the precalculation function. The code
    will be wrapped in a C block to prevent namespace leakage, so that
    multiple invocations of the precalculation code on different code blocks
    can be performed.

    Example:

        def do_precalc(px):
            pcam = px._precalc()
            pcam._code(Template('''
                {{pcam._set('prop_sin')}} = sin({{pcam.prop}});
                ''').substitute(pcam=pcam))

        def gen_code(px):
            return Template('''
                {{do_precalc(px)}}
                printf("The sin of %g is %g.", {{px.prop}}, {{px.prop_sin}});
                ''').substitute(px=px)
    """
    def __init__(self, packer, ptr_name, wrapped, prefix):
        super(GenomePackerPrecalc, self).__init__(packer, 'out', wrapped, prefix)
    def __str__(self):
        return self.packer._require_pre(self.prefix)
    def _set(self, name):
        fullname = self.prefix + (name,)
        self.packer._pre_alloc(fullname)
        # This just modifies the underlying object, because I'm too lazy right
        # now to ghost the namespace
        self.wrapped[name] = GenomePackerName('_'.join(fullname))
        return '%s->%s' % (self.ptr_name, self.wrapped[name])
    def _code(self, code):
        self.packer.precalc_code.append(code)

class GenomePacker(object):
    """
    Packs a genome for use in iteration.
    """
    def __init__(self, tname):
        """
        Create a new DataPacker.

        ``tname`` is the name of the structure typedef that will be emitted
        via this object's ``decls`` property.
        """
        self.tname = tname
        # We could do this in the order that things are requested, but we want
        # to be able to treat the direct stuff as a list so this function
        # doesn't unroll any more than it has to. So we separate things into
        # direct requests, and those that need precalculation.
        # Values of OrderedDict are unused; basically, it's just OrderedSet.
        self.packed_direct = OrderedDict()
        self.genome_precalc = OrderedDict()
        self.packed_precalc = OrderedDict()
        self.precalc_code = []

        self.ns = {}

        self._len = None
        self.decls = None
        self.defs = None

        self.packed = None
        self.genome = None
        self.search_rounds = util.DEFAULT_SEARCH_ROUNDS

    def __len__(self):
        assert self._len is not None, 'len() called before finalize()'
        return self._len

    def view(self, ptr_name, wrapped_obj, prefix):
        """Create a DataPacker view. See DataPackerView class for details."""
        self.ns[prefix] = wrapped_obj
        return GenomePackerView(self, ptr_name, wrapped_obj, (prefix,))

    def _require(self, name):
        """
        Called to indicate that the named parameter from the original genome
        must be available during interpolation.
        """
        self.packed_direct[name] = None

    def _require_pre(self, name):
        i = len(self.genome_precalc) << self.search_rounds
        self.genome_precalc[name] = None
        return 'catmull_rom(&times[%d], &knots[%d], time)' % (i, i)

    def _pre_alloc(self, name):
        self.packed_precalc[name] = None

    def finalize(self):
        """
        Create the code to render this genome.
        """
        # At the risk of packing a few things more than once, we don't
        # uniquify the overall precalc order, sparing us the need to implement
        # recursive code generation
        self.packed = self.packed_direct.keys() + self.packed_precalc.keys()
        self.genome = self.packed_direct.keys() + self.genome_precalc.keys()

        self._len = len(self.packed)

        decls = self._decls.substitute(packed=self.packed, tname=self.tname)
        defs = self._defs.substitute(
                packed_direct=self.packed_direct, tname=self.tname,
                precalc_code=self.precalc_code,
                search_rounds=self.search_rounds)

        return devlib(deps=[catmullromlib], decls=decls, defs=defs)

    def pack(self, pool=None):
        """
        Return a packed copy of the genome ready for uploading to the GPU,
        as two float32 NDArrays for the knot times and values.
        """
        width = 1 << self.search_rounds
        if pool:
            times = pool.allocate((len(self.genome), width), 'f4')
            knots = pool.allocate((len(self.genome), width), 'f4')
        else:
            times, knots = np.empty((2, len(self.genome), width), 'f4')
        times.fill(1e9)

        for idx, gname in enumerate(self.genome):
            attr = self.ns[gname[0]]
            for g in gname[1:]:
                attr = getattr(attr, g)
            times[idx,:len(attr.knots[0])] = attr.knots[0]
            knots[idx,:len(attr.knots[1])] = attr.knots[1]
        return times, knots

    _defs = Template(r"""
__global__ void interp_{{tname}}(
        {{tname}}* out,
        const float *times, const float *knots,
        float tstart, float tstep, int maxid)
{
    int id = gtid();
    if (id >= maxid) return;
    out = &out[id];
    float time = tstart + id * tstep;

    float *outf = reinterpret_cast<float*>(out);

    // TODO: unroll pragma?
    for (int i = 0; i < {{len(packed_direct)}}; i++) {
        int j = i << {{search_rounds}};
        outf[i] = catmull_rom(&times[j], &knots[j], time);
    }

    // Advance 'times' and 'knots' to the purely generated sections, so that
    // the pregenerated statements emitted by _require_pre are correct.
    times = &times[{{len(packed_direct)<<search_rounds}}];
    knots = &knots[{{len(packed_direct)<<search_rounds}}];

    {{for hunk in precalc_code}}
    if (1) {
    {{hunk}}
    }
    {{endfor}}
}
""")

    _decls = Template(r"""
typedef struct {
{{for name in packed}}
    float   {{'_'.join(name)}};
{{endfor}}
} {{tname}};


""")

catmullromlib = devlib(deps=[binsearchlib], decls=r'''
__device__ __noinline__
float catmull_rom(const float *times, const float *knots, float t);
''', defs=r'''
__device__ __noinline__
float catmull_rom(const float *times, const float *knots, float t) {
    int idx = bitwise_binsearch(times, t);

    // The left bias of the search means that we never have to worry about
    // overshooting unless the genome is corrupted
    idx = max(idx, 1);

    float t1 = times[idx], t2 = times[idx+1] - t1;
    float rt2 = 1.0f / t2;
    float t0 = (times[idx-1] - t1) * rt2, t3 = (times[idx+2] - t1) * rt2;
    t = (t - t1) * rt2;

    // Now t1 is effectively 0 and t2 is 1

    float k0 = knots[idx-1], k1 = knots[idx],
          k2 = knots[idx+1], k3 = knots[idx+2];

    float m1 = (k2 - k0) / (1.0f - t0),
          m2 = (k3 - k1) / (t3);

    float tt = t * t, ttt = tt * t;

    return m1 * (      ttt - 2.0f*tt + t)
         + k1 * ( 2.0f*ttt - 3.0f*tt + 1)
         + m2 * (      ttt -      tt)
         + k2 * (-2.0f*ttt + 3.0f*tt);
}
''')

palintlib = devlib(deps=[binsearchlib, ringbuflib, yuvlib, mwclib], decls='''
surface<void, cudaSurfaceType2D> flatpal;
''', defs=r'''
__device__ float4
interp_color(const float *times, const float4 *sources, float time)
{
    int idx = fmaxf(bitwise_binsearch(times, time) + 1, 1);
    float lf = (times[idx] - time) / (times[idx] - times[idx-1]);
    float rf = 1.0f - lf;

    float4 left  = sources[blockDim.x * (idx - 1) + threadIdx.x];
    float4 right = sources[blockDim.x * (idx)     + threadIdx.x];
    float3 yuv;

    float3 l3 = make_float3(left.x, left.y, left.z);
    float3 r3 = make_float3(right.x, right.y, right.z);

    float3 lyuv = rgb2yuv(l3);
    float3 ryuv = rgb2yuv(r3);
    yuv.x = lyuv.x * lf + ryuv.x * rf;
    yuv.y = lyuv.y * lf + ryuv.y * rf;
    yuv.z = lyuv.z * lf + ryuv.z * rf;

    yuv.y += 0.5f;
    yuv.z += 0.5f;

    return make_float4(yuv.x, yuv.y, yuv.z, left.w * lf + right.w * rf);
}

__global__ void interp_palette_flat(
        ringbuf *rb, mwc_st *rctxs,
        const float *times, const float4 *sources,
        float tstart, float tstep)
{
    mwc_st rctx = rctxs[rb_incr(rb->head, threadIdx.x)];
    int gid = blockIdx.x * blockDim.x + threadIdx.x;

    float time = tstart + blockIdx.x * tstep;
    float4 yuva = interp_color(times, sources, time);

    // TODO: pack Y at full precision, UV at quarter
    uint2 out;

    uint32_t y = yuva.x * 255.0f + 0.49f * mwc_next_11(rctx);
    uint32_t u = yuva.y * 255.0f + 0.49f * mwc_next_11(rctx);
    uint32_t v = yuva.z * 255.0f + 0.49f * mwc_next_11(rctx);
    y = min(255, y);
    u = min(255, u);
    v = min(255, v);
    out.y = (1 << 22) | (y << 4);
    out.x = (u << 18) | v;

    surf2Dwrite(out, flatpal, 8 * threadIdx.x, blockIdx.x);
    rctxs[rb_incr(rb->tail, threadIdx.x)] = rctx;
}
''')

testcrlib = devlib(defs=r'''
__global__ void
test_cr(const float *times, const float *knots, const float *t, float *r) {
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    r[i] = catmull_rom(times, knots, t[i]);
}
''')

if __name__ == "__main__":
    # Test spline evaluation. This code will probably drift pretty often.
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule
    import pycuda.autoinit
    from cuburn.genome import SplEval

    gp = GenomePacker("unused")
    gp.finalize()
    mod = SourceModule(assemble_code(BaseCode, gp))
    times = np.sort(np.concatenate(([-2.0, 0.0, 1.0, 3.0], np.random.rand(12))))
    knots = np.random.randn(16)

    print times
    print knots

    evaltimes = np.float32(np.linspace(0, 1, 1024))
    sp = SplEval([x for k in zip(times, knots) for x in k])
    vals = np.array([sp(t) for t in evaltimes], dtype=np.float32)

    dtimes = np.empty((32,), dtype=np.float32)
    dtimes.fill(1e9)
    dtimes[:16] = times
    dknots = np.zeros_like(dtimes)
    dknots[:16] = knots

    dvals = np.empty_like(vals)
    mod.get_function("test_cr")(cuda.In(dtimes), cuda.In(dknots),
            cuda.In(evaltimes), cuda.Out(dvals), block=(1024, 1, 1))
    for t, v, d in zip(evaltimes, vals, dvals):
        print '%6f %8g %8g' % (t, v, d)