New genome representation, and device interp.

2025-07-12 03:05:14 -04:00 · 2011-10-25 15:44:39 -04:00
parent be31708c09
commit 8939a6343a
8 changed files with 1030 additions and 729 deletions
--- a/cuburn/code/interp.py
+++ b/cuburn/code/interp.py
@ -0,0 +1,318 @@
+from collections import OrderedDict
+from itertools import cycle
+import numpy as np
+
+import tempita
+from cuburn.code.util import HunkOCode, Template
+from cuburn.genome import SplEval
+
+class GenomePackerName(str):
+    """Class to indicate that a property is precalculated on the device"""
+    pass
+
+class GenomePackerView(object):
+    """
+    Obtain accessors in generated code.
+
+    Call ``GenomePacker.view(ptr_name, wrapped_obj, prefix)`` to generate a
+    view, where ``ptr_name`` is the name of the CUDA pointer which holds the
+    interpolated values, ``wrapped_obj`` is the base Genome instance which
+    will be uploaded to the device for interpolating from, and ``prefix`` is
+    the prefix that will be assigned to property accessors from this object.
+
+    Accessing a property on the object synthesizes an accessor for use in your
+    code and an interpolator for use in generating that code. This conversion
+    is done when the property is coerced into a string by the templating
+    mechanism, so you can easily nest objects by saying, for instance,
+    {{pcp.camera.rotation}} from within templated code. The accessed property
+    must be a SplEval object, or a precalculated value (see
+    ``GenomePackerPrecalc``).
+
+    Index operations are converted to property accesses as well, so that you
+    don't have to make a mess with 'getattr' in your code: {{pcp.xforms[x]}}
+    works just fine. This means, however, that no arrays can be packed
+    directly; they must be converted to have string-based keys first, and
+    any loops must be unrolled in your code.
+    """
+    def __init__(self, packer, ptr_name, wrapped, prefix=()):
+        self.packer = packer
+        self.ptr_name = ptr_name
+        self.wrapped = wrapped
+        self.prefix = prefix
+    def __getattr__(self, name):
+        w = getattr(self.wrapped, name)
+        return type(self)(self.packer, self.ptr_name, w, self.prefix+(name,))
+    # As with the Genome class, we're all-dict, no-array here
+    __getitem__ = lambda s, n: getattr(s, str(n))
+
+    def __str__(self):
+        """
+        Returns the packed name in a format suitable for embedding directly
+        into device code.
+        """
+        # So evil. When the template calls __str__ to format the output, we
+        # allocate things. This makes for neater embedded code, which is where
+        # the real complexity lies, but it also means printf() debugging when
+        # templating will screw with the allocation tables!
+        if isinstance(self.wrapped, SplEval):
+            self.packer._require(self.prefix)
+        elif not isinstance(self.wrapped, GenomePackerName):
+            raise TypeError("Tried to pack something that wasn't a spline or "
+                            "a precalculated value")
+        # TODO: verify namespace stomping, etc
+        return '%s.%s' % (self.ptr_name, '_'.join(self.prefix))
+
+    def _precalc(self):
+        """Create a GenomePackerPrecalc object. See that class for details."""
+        return GenomePackerPrecalc(self.packer, self.ptr_name,
+                                   self.wrapped, self.prefix)
+
+class GenomePackerPrecalc(GenomePackerView):
+    """
+    Insert precalculated values into the packed genome.
+
+    Create a Precalc object by calling a view object's _precalc() method. The
+    returned object shares the same referent dict as the view object, but
+    instead of returning view accessors for use in the body of your code,
+    accessing a property synthesizes a call to an interpolation function for
+    use within the precalculating function. Use this in your precalculated
+    code to obtain values from a genome.
+
+    Once you've obtained the needed parameters and performed the
+    precalculation, write them to the genome object with a call to the '_set'
+    method in your precalc template. This method generates a new accessor to
+    which you can assign a value in your precalculation function. It also
+    makes this accessor available for use on the original packer object from
+    within your main function.
+
+    Finally, call the '_code' method on the precalc object with your generated
+    precalculation code to add it to the precalculation function. The code
+    will be wrapped in a C block to prevent namespace leakage, so that
+    multiple invocations of the precalculation code on different code blocks
+    can be performed.
+
+    Example:
+
+        def do_precalc(px):
+            pcam = px._precalc()
+            pcam._code(Template('''
+                {{pcam._set('prop_sin')}} = sin({{pcam.prop}});
+                ''').substitute(pcam=pcam))
+
+        def gen_code(px):
+            return Template('''
+                {{do_precalc(px)}}
+                printf("The sin of %g is %g.", {{px.prop}}, {{px.prop_sin}});
+                ''').substitute(px=px)
+    """
+    def __init__(self, packer, ptr_name, wrapped, prefix):
+        super(GenomePackerPrecalc, self).__init__(packer, 'out', wrapped, prefix)
+    def __str__(self):
+        return self.packer._require_pre(self.prefix)
+    def _set(self, name):
+        fullname = self.prefix + (name,)
+        self.packer._pre_alloc(fullname)
+        # This just modifies the underlying object, because I'm too lazy right
+        # now to ghost the namespace
+        self.wrapped[name] = GenomePackerName('_'.join(fullname))
+        return '%s->%s' % (self.ptr_name, self.wrapped[name])
+    def _code(self, code):
+        self.packer.precalc_code.append(code)
+
+class GenomePacker(HunkOCode):
+    """
+    Packs a genome for use in iteration.
+    """
+
+    # 2^search_rounds is the maximum number of control points, including
+    # endpoints, that can be used in a single genome. It should be okay to
+    # change this number without touching other code, but 32 samples fits
+    # nicely on a single cache line.
+    search_rounds = 5
+
+    def __init__(self, tname):
+        """
+        Create a new DataPacker.
+
+        ``tname`` is the name of the structure typedef that will be emitted
+        via this object's ``decls`` property.
+        """
+        self.tname = tname
+        # We could do this in the order that things are requested, but we want
+        # to be able to treat the direct stuff as a list so this function
+        # doesn't unroll any more than it has to. So we separate things into
+        # direct requests, and those that need precalculation.
+        # Values of OrderedDict are unused; basically, it's just OrderedSet.
+        self.packed_direct = OrderedDict()
+        self.genome_precalc = OrderedDict()
+        self.packed_precalc = OrderedDict()
+        self.precalc_code = []
+
+        self.ns = {}
+
+        self._len = None
+        self.decls = None
+        self.defs = None
+
+        self.packed = None
+        self.genome = None
+
+    def __len__(self):
+        assert self._len is not None, 'len() called before finalize()'
+        return self._len
+
+    def view(self, ptr_name, wrapped_obj, prefix):
+        """Create a DataPacker view. See DataPackerView class for details."""
+        self.ns[prefix] = wrapped_obj
+        return GenomePackerView(self, ptr_name, wrapped_obj, (prefix,))
+
+    def _require(self, name):
+        """
+        Called to indicate that the named parameter from the original genome
+        must be available during interpolation.
+        """
+        self.packed_direct[name] = None
+
+    def _require_pre(self, name):
+        i = len(self.genome_precalc) << self.search_rounds
+        self.genome_precalc[name] = None
+        return 'catmull_rom(&times[%d], &knots[%d], time)' % (i, i)
+
+    def _pre_alloc(self, name):
+        self.packed_precalc[name] = None
+
+    def finalize(self):
+        """
+        Create the code to render this genome.
+        """
+        # At the risk of packing a few things more than once, we don't
+        # uniquify the overall precalc order, sparing us the need to implement
+        # recursive code generation
+        self.packed = self.packed_direct.keys() + self.packed_precalc.keys()
+        self.genome = self.packed_direct.keys() + self.genome_precalc.keys()
+
+        self._len = len(self.packed)
+
+        self.decls = self._decls.substitute(
+                packed=self.packed, tname=self.tname,
+                search_rounds=self.search_rounds)
+        self.defs = self._defs.substitute(
+                packed_direct=self.packed_direct, tname=self.tname,
+                precalc_code=self.precalc_code,
+                search_rounds=self.search_rounds)
+
+
+    def pack(self):
+        """
+        Return a packed copy of the genome ready for uploading to the GPU as a
+        3D NDArray, with the first element being the times and the second
+        being the values.
+        """
+        width = 1 << self.search_rounds
+        out = np.empty((2, len(self.genome), width), dtype=np.float32)
+        # Ensure that unused values at the top are always big (must be >2.0)
+        out[0].fill(1e9)
+
+        for idx, gname in enumerate(self.genome):
+            attr = self.ns[gname[0]]
+            for g in gname[1:]:
+                attr = getattr(attr, g)
+            if not isinstance(attr, SplEval):
+                raise TypeError("%s isn't a spline" % '.'.join(gname))
+            out[0][idx][:len(attr.knots[0])] = attr.knots[0]
+            out[1][idx][:len(attr.knots[1])] = attr.knots[1]
+        return out
+
+    _defs = Template(r"""
+
+__global__
+void interp_{{tname}}({{tname}}* out, float *times, float *knots,
+        float tstart, float tstep, mwc_st *rctxes) {
+    int id = gtid();
+    out = &out[id];
+    mwc_st rctx = rctxes[id];
+    float time = tstart + id * tstep;
+
+    float *outf = reinterpret_cast<float*>(out);
+
+    // TODO: unroll pragma?
+    for (int i = 0; i < {{len(packed_direct)}}; i++) {
+        int j = i << {{search_rounds}};
+        outf[i] = catmull_rom(&times[j], &knots[j], time);
+    }
+
+    // Advance 'times' and 'knots' to the purely generated sections, so that
+    // the pregenerated statements emitted by _require_pre are correct.
+    times = &times[{{len(packed_direct)<<search_rounds}}];
+    knots = &knots[{{len(packed_direct)<<search_rounds}}];
+
+    {{for hunk in precalc_code}}
+    if (1) {
+    {{hunk}}
+    }
+    {{endfor}}
+}
+
+""")
+
+
+    _decls = Template(r"""
+typedef struct {
+{{for name in packed}}
+    float   {{'_'.join(name)}};
+{{endfor}}
+} {{tname}};
+
+/* Search through the fixed-size list 'hay' to find the rightmost index which
+ * contains a value strictly smaller than the input 'needle'. The crazy
+ * bitwise search is just for my own amusement.
+ */
+__device__
+int bitwise_binsearch(float *hay, float needle) {
+    int lo = 0;
+
+    {{for i in range(search_rounds-1, -1, -1)}}
+    if (needle > hay[lo | {{1 << i}}])
+        lo |= {{1 << i}};
+    {{endfor}}
+    return lo;
+}
+
+__device__
+float catmull_rom(float *times, float *knots, float t) {
+    int idx = bitwise_binsearch(times, t);
+
+    // The left bias of the search means that we never have to worry about
+    // overshooting unless the genome is corrupted
+    idx = max(idx, 1);
+
+    float t1 = times[idx], t2 = times[idx+1] - t1;
+    float rt2 = 1.0f / t2;
+    float t0 = (times[idx-1] - t1) * rt2, t3 = (times[idx+2] - t1) * rt2;
+    t = (t - t1) * rt2;
+
+    // Now t1 is effectively 0 and t2 is 1
+
+    float k0 = knots[idx-1], k1 = knots[idx],
+          k2 = knots[idx+1], k3 = knots[idx+2];
+
+    float m1 = (k2 - k0) / (1.0f - t0),
+          m2 = (k3 - k1) / (t3);
+
+    float tt = t * t, ttt = tt * t;
+
+    return m1 * (      ttt - 2.0f*tt + t)
+         + k1 * ( 2.0f*ttt - 3.0f*tt + 1)
+         + m2 * (      ttt -      tt)
+         + k2 * (-2.0f*ttt + 3.0f*tt);
+}
+
+__global__
+void test_cr(float *times, float *knots, float *t, float *r) {
+    int i = threadIdx.x + blockDim.x * blockIdx.x;
+    r[i] = catmull_rom(times, knots, t[i]);
+}
+
+
+""")
+