mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-07-12 03:05:14 -04:00
New genome representation, and device interp.
This commit is contained in:
318
cuburn/code/interp.py
Normal file
318
cuburn/code/interp.py
Normal file
@ -0,0 +1,318 @@
|
||||
from collections import OrderedDict
|
||||
from itertools import cycle
|
||||
import numpy as np
|
||||
|
||||
import tempita
|
||||
from cuburn.code.util import HunkOCode, Template
|
||||
from cuburn.genome import SplEval
|
||||
|
||||
class GenomePackerName(str):
    """
    Marker string type for properties that are precalculated on the device.

    The value of the string is the packed (underscore-joined) field name.
    ``GenomePackerPrecalc._set`` stores instances of this class on the wrapped
    object, and ``GenomePackerView.__str__`` accepts them as packable values
    alongside splines.
    """
|
||||
|
||||
class GenomePackerView(object):
    """
    Obtain accessors in generated code.

    Views are created with ``GenomePacker.view(ptr_name, wrapped_obj,
    prefix)``:

    - ``ptr_name`` is the name of the CUDA pointer which holds the
      interpolated values,
    - ``wrapped_obj`` is the base Genome instance which will be uploaded to
      the device for interpolating from,
    - ``prefix`` is the prefix assigned to property accessors derived from
      this object.

    Attribute access on a view returns a nested view of the corresponding
    attribute of the wrapped object, so templated code can simply say
    {{pcp.camera.rotation}}. Nothing is allocated until the templating
    mechanism coerces the view to a string; at that point the accessor text is
    produced and, for splines, the packer is told that the property is
    required. The property being formatted must be a SplEval object or a
    precalculated value (see ``GenomePackerPrecalc``).

    Index operations are turned into attribute accesses (the key is passed
    through ``str``), so {{pcp.xforms[x]}} works without resorting to
    'getattr'. The flip side is that arrays cannot be packed directly: they
    must be given string-based keys first, and loops must be unrolled in the
    templated code.
    """

    def __init__(self, packer, ptr_name, wrapped, prefix=()):
        self.packer, self.ptr_name = packer, ptr_name
        self.wrapped, self.prefix = wrapped, prefix

    def __getattr__(self, name):
        # Only reached for names that are not real attributes of the view;
        # those are looked up on the wrapped object and re-wrapped with an
        # extended prefix.
        child = getattr(self.wrapped, name)
        path = self.prefix + (name,)
        return type(self)(self.packer, self.ptr_name, child, path)

    def __getitem__(self, key):
        # As with the Genome class, we're all-dict, no-array here
        return getattr(self, str(key))

    def __str__(self):
        """
        Return the packed accessor, ready to be embedded in device code.

        Raises TypeError when the wrapped value is neither a SplEval nor a
        GenomePackerName.
        """
        # Beware: formatting a view has side effects. For splines, this call
        # registers the property with the packer, so stray debugging output
        # of a view during templating changes the allocation tables.
        is_spline = isinstance(self.wrapped, SplEval)
        if not is_spline and not isinstance(self.wrapped, GenomePackerName):
            raise TypeError("Tried to pack something that wasn't a spline or "
                            "a precalculated value")
        if is_spline:
            self.packer._require(self.prefix)
        # TODO: verify namespace stomping, etc
        field = '_'.join(self.prefix)
        return '%s.%s' % (self.ptr_name, field)

    def _precalc(self):
        """Create a GenomePackerPrecalc object. See that class for details."""
        return GenomePackerPrecalc(
            self.packer, self.ptr_name, self.wrapped, self.prefix)
|
||||
|
||||
class GenomePackerPrecalc(GenomePackerView):
    """
    Insert precalculated values into the packed genome.

    A Precalc object is obtained from a view object's ``_precalc()`` method.
    It wraps the same referent as that view, but formatting one of its
    properties yields a call to the interpolation function (for use inside
    the precalculating function) instead of an accessor for the main body of
    your code. Use it in precalculation code to read values from a genome.

    After computing a value, store it by calling ``_set`` from your precalc
    template. ``_set`` produces a new accessor that can be assigned to within
    the precalculation function, and also makes that name available on the
    original packer object from within your main function.

    Finally, hand the rendered precalculation code to ``_code``. Each piece
    of code is wrapped in its own C block when emitted, to prevent namespace
    leakage, so several precalculation snippets can coexist.

    Example:

        def do_precalc(px):
            pcam = px._precalc()
            pcam._code(Template('''
                {{pcam._set('prop_sin')}} = sin({{pcam.prop}});
                ''').substitute(pcam=pcam))

        def gen_code(px):
            return Template('''
                {{do_precalc(px)}}
                printf("The sin of %g is %g.", {{px.prop}}, {{px.prop_sin}});
                ''').substitute(px=px)
    """

    def __init__(self, packer, ptr_name, wrapped, prefix):
        # ``ptr_name`` is accepted for signature compatibility with the view
        # class but ignored: precalc accessors always go through 'out'.
        super(GenomePackerPrecalc, self).__init__(
            packer, 'out', wrapped, prefix)

    def __str__(self):
        """Return the interpolation call for this property (allocates it)."""
        return self.packer._require_pre(self.prefix)

    def _set(self, name):
        """
        Allocate precalculated field ``name`` under this object's prefix and
        return an assignable ``ptr->field`` accessor for it.
        """
        path = self.prefix + (name,)
        self.packer._pre_alloc(path)
        # This just modifies the underlying object, because I'm too lazy right
        # now to ghost the namespace
        self.wrapped[name] = GenomePackerName('_'.join(path))
        return '%s->%s' % (self.ptr_name, self.wrapped[name])

    def _code(self, code):
        """Queue a rendered snippet for the precalculation function."""
        self.packer.precalc_code.append(code)
|
||||
|
||||
class GenomePacker(HunkOCode):
    """
    Packs a genome for use in iteration.

    Templated device code requests genome properties through
    ``GenomePackerView`` / ``GenomePackerPrecalc`` objects; those requests are
    recorded here. ``finalize()`` then renders the struct typedef and the
    interpolation kernel, and ``pack()`` builds the host-side array of spline
    knots to upload.
    """

    # 2^search_rounds is the maximum number of control points, including
    # endpoints, that can be used in a single genome. It should be okay to
    # change this number without touching other code, but 32 samples fits
    # nicely on a single cache line.
    search_rounds = 5

    def __init__(self, tname):
        """
        Create a new GenomePacker.

        ``tname`` is the name of the structure typedef that will be emitted
        via this object's ``decls`` property.
        """
        self.tname = tname
        # We could do this in the order that things are requested, but we want
        # to be able to treat the direct stuff as a list so this function
        # doesn't unroll any more than it has to. So we separate things into
        # direct requests, and those that need precalculation.
        # These are used as ordered sets keyed by name tuple. The values of
        # packed_direct and packed_precalc are unused; genome_precalc maps
        # each name to its slot index within the precalc section.
        self.packed_direct = OrderedDict()
        self.genome_precalc = OrderedDict()
        self.packed_precalc = OrderedDict()
        self.precalc_code = []

        # Maps the prefix passed to view() to the wrapped root object.
        self.ns = {}

        # Everything below is populated by finalize().
        self._len = None
        self.decls = None
        self.defs = None

        self.packed = None
        self.genome = None

    def __len__(self):
        """Return the number of packed struct fields (after finalize())."""
        assert self._len is not None, 'len() called before finalize()'
        return self._len

    def view(self, ptr_name, wrapped_obj, prefix):
        """
        Create a GenomePacker view. See GenomePackerView class for details.
        """
        self.ns[prefix] = wrapped_obj
        return GenomePackerView(self, ptr_name, wrapped_obj, (prefix,))

    def _require(self, name):
        """
        Called to indicate that the named parameter from the original genome
        must be available during interpolation.
        """
        self.packed_direct[name] = None

    def _require_pre(self, name):
        """
        Register genome property ``name`` as an input of the precalculation
        code and return the device expression that interpolates it.

        Asking for the same name more than once returns the slot that was
        assigned the first time, so every reference reads the same spline.
        """
        # The default is evaluated before insertion, so a new name receives
        # the next free slot while a known name keeps its existing one.
        slot = self.genome_precalc.setdefault(name, len(self.genome_precalc))
        offset = slot << self.search_rounds
        return 'catmull_rom(&times[%d], &knots[%d], time)' % (offset, offset)

    def _pre_alloc(self, name):
        """Reserve a struct field for the precalculated value ``name``."""
        self.packed_precalc[name] = None

    def finalize(self):
        """
        Create the code to render this genome.

        Freezes the field order into ``self.packed`` (struct fields) and
        ``self.genome`` (splines to upload), and renders ``self.decls`` and
        ``self.defs``. Must run after every template that requests properties
        has been rendered, since requests are recorded at formatting time.
        """
        # At the risk of packing a few things more than once, we don't
        # uniquify the overall precalc order, sparing us the need to implement
        # recursive code generation
        direct = list(self.packed_direct)
        self.packed = direct + list(self.packed_precalc)
        self.genome = direct + list(self.genome_precalc)

        self._len = len(self.packed)

        self.decls = self._decls.substitute(
                packed=self.packed, tname=self.tname,
                search_rounds=self.search_rounds)
        self.defs = self._defs.substitute(
                packed_direct=self.packed_direct, tname=self.tname,
                precalc_code=self.precalc_code,
                search_rounds=self.search_rounds)

    def pack(self):
        """
        Return a packed copy of the genome ready for uploading to the GPU as a
        3D NDArray, with the first element being the times and the second
        being the values.

        The result has shape ``(2, len(self.genome), 1 << search_rounds)`` and
        dtype float32. Raises TypeError if a requested property does not
        resolve to a SplEval.
        """
        width = 1 << self.search_rounds
        # Start from zeros so that value slots past the end of a spline hold a
        # well-defined number rather than uninitialised memory.
        out = np.zeros((2, len(self.genome), width), dtype=np.float32)
        # Ensure that unused values at the top are always big (must be >2.0)
        out[0].fill(1e9)

        for idx, gname in enumerate(self.genome):
            # Walk from the root registered in view() down the name tuple.
            attr = self.ns[gname[0]]
            for g in gname[1:]:
                attr = getattr(attr, g)
            if not isinstance(attr, SplEval):
                raise TypeError("%s isn't a spline" % '.'.join(gname))
            out[0][idx][:len(attr.knots[0])] = attr.knots[0]
            out[1][idx][:len(attr.knots[1])] = attr.knots[1]
        return out

    _defs = Template(r"""

__global__
void interp_{{tname}}({{tname}}* out, float *times, float *knots,
                      float tstart, float tstep, mwc_st *rctxes) {
    int id = gtid();
    out = &out[id];
    mwc_st rctx = rctxes[id];
    float time = tstart + id * tstep;

    float *outf = reinterpret_cast<float*>(out);

    // TODO: unroll pragma?
    for (int i = 0; i < {{len(packed_direct)}}; i++) {
        int j = i << {{search_rounds}};
        outf[i] = catmull_rom(&times[j], &knots[j], time);
    }

    // Advance 'times' and 'knots' to the purely generated sections, so that
    // the pregenerated statements emitted by _require_pre are correct.
    times = &times[{{len(packed_direct)<<search_rounds}}];
    knots = &knots[{{len(packed_direct)<<search_rounds}}];

    {{for hunk in precalc_code}}
    if (1) {
    {{hunk}}
    }
    {{endfor}}
}

""")

    _decls = Template(r"""
typedef struct {
{{for name in packed}}
    float   {{'_'.join(name)}};
{{endfor}}
} {{tname}};

/* Search through the fixed-size list 'hay' to find the rightmost index which
 * contains a value strictly smaller than the input 'needle'. The crazy
 * bitwise search is just for my own amusement.
 */
__device__
int bitwise_binsearch(float *hay, float needle) {
    int lo = 0;

    {{for i in range(search_rounds-1, -1, -1)}}
    if (needle > hay[lo | {{1 << i}}])
        lo |= {{1 << i}};
    {{endfor}}
    return lo;
}

__device__
float catmull_rom(float *times, float *knots, float t) {
    int idx = bitwise_binsearch(times, t);

    // The left bias of the search means that we never have to worry about
    // overshooting unless the genome is corrupted
    idx = max(idx, 1);

    float t1 = times[idx], t2 = times[idx+1] - t1;
    float rt2 = 1.0f / t2;
    float t0 = (times[idx-1] - t1) * rt2, t3 = (times[idx+2] - t1) * rt2;
    t = (t - t1) * rt2;

    // Now t1 is effectively 0 and t2 is 1

    float k0 = knots[idx-1], k1 = knots[idx],
          k2 = knots[idx+1], k3 = knots[idx+2];

    float m1 = (k2 - k0) / (1.0f - t0),
          m2 = (k3 - k1) / (t3);

    float tt = t * t, ttt = tt * t;

    return m1 * (      ttt - 2.0f*tt + t)
         + k1 * ( 2.0f*ttt - 3.0f*tt + 1)
         + m2 * (      ttt -      tt)
         + k2 * (-2.0f*ttt + 3.0f*tt);
}

__global__
void test_cr(float *times, float *knots, float *t, float *r) {
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    r[i] = catmull_rom(times, knots, t[i]);
}

""")
|
||||
|
Reference in New Issue
Block a user