mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-04-21 09:01:31 -04:00

the 'short way' around the hue circle, and the correct palette is now chosen when > 2 palettes are present in the knots.
359 lines
13 KiB
Python
359 lines
13 KiB
Python
from collections import OrderedDict
|
|
from itertools import cycle
|
|
import numpy as np
|
|
|
|
import tempita
|
|
from cuburn.code.util import HunkOCode, Template
|
|
from cuburn.genome import SplEval
|
|
|
|
class GenomePackerName(str):
|
|
"""Class to indicate that a property is precalculated on the device"""
|
|
pass
|
|
|
|
class GenomePackerView(object):
|
|
"""
|
|
Obtain accessors in generated code.
|
|
|
|
Call ``GenomePacker.view(ptr_name, wrapped_obj, prefix)`` to generate a
|
|
view, where ``ptr_name`` is the name of the CUDA pointer which holds the
|
|
interpolated values, ``wrapped_obj`` is the base Genome instance which
|
|
will be uploaded to the device for interpolating from, and ``prefix`` is
|
|
the prefix that will be assigned to property accessors from this object.
|
|
|
|
Accessing a property on the object synthesizes an accessor for use in your
|
|
code and an interpolator for use in generating that code. This conversion
|
|
is done when the property is coerced into a string by the templating
|
|
mechanism, so you can easily nest objects by saying, for instance,
|
|
{{pcp.camera.rotation}} from within templated code. The accessed property
|
|
must be a SplEval object, or a precalculated value (see
|
|
``GenomePackerPrecalc``).
|
|
|
|
Index operations are converted to property accesses as well, so that you
|
|
don't have to make a mess with 'getattr' in your code: {{pcp.xforms[x]}}
|
|
works just fine. This means, however, that no arrays can be packed
|
|
directly; they must be converted to have string-based keys first, and
|
|
any loops must be unrolled in your code.
|
|
"""
|
|
def __init__(self, packer, ptr_name, wrapped, prefix=()):
|
|
self.packer = packer
|
|
self.ptr_name = ptr_name
|
|
self.wrapped = wrapped
|
|
self.prefix = prefix
|
|
def __getattr__(self, name):
|
|
w = getattr(self.wrapped, name)
|
|
return type(self)(self.packer, self.ptr_name, w, self.prefix+(name,))
|
|
# As with the Genome class, we're all-dict, no-array here
|
|
__getitem__ = lambda s, n: getattr(s, str(n))
|
|
|
|
def __str__(self):
|
|
"""
|
|
Returns the packed name in a format suitable for embedding directly
|
|
into device code.
|
|
"""
|
|
# So evil. When the template calls __str__ to format the output, we
|
|
# allocate things. This makes for neater embedded code, which is where
|
|
# the real complexity lies, but it also means printf() debugging when
|
|
# templating will screw with the allocation tables!
|
|
if isinstance(self.wrapped, SplEval):
|
|
self.packer._require(self.prefix)
|
|
elif not isinstance(self.wrapped, GenomePackerName):
|
|
raise TypeError("Tried to pack something that wasn't a spline or "
|
|
"a precalculated value")
|
|
# TODO: verify namespace stomping, etc
|
|
return '%s.%s' % (self.ptr_name, '_'.join(self.prefix))
|
|
|
|
def _precalc(self):
|
|
"""Create a GenomePackerPrecalc object. See that class for details."""
|
|
return GenomePackerPrecalc(self.packer, self.ptr_name,
|
|
self.wrapped, self.prefix)
|
|
|
|
class GenomePackerPrecalc(GenomePackerView):
|
|
"""
|
|
Insert precalculated values into the packed genome.
|
|
|
|
Create a Precalc object by calling a view object's _precalc() method. The
|
|
returned object shares the same referent dict as the view object, but
|
|
instead of returning view accessors for use in the body of your code,
|
|
accessing a property synthesizes a call to an interpolation function for
|
|
use within the precalculating function. Use this in your precalculated
|
|
code to obtain values from a genome.
|
|
|
|
Once you've obtained the needed parameters and performed the
|
|
precalculation, write them to the genome object with a call to the '_set'
|
|
method in your precalc template. This method generates a new accessor to
|
|
which you can assign a value in your precalculation function. It also
|
|
makes this accessor available for use on the original packer object from
|
|
within your main function.
|
|
|
|
Finally, call the '_code' method on the precalc object with your generated
|
|
precalculation code to add it to the precalculation function. The code
|
|
will be wrapped in a C block to prevent namespace leakage, so that
|
|
multiple invocations of the precalculation code on different code blocks
|
|
can be performed.
|
|
|
|
Example:
|
|
|
|
def do_precalc(px):
|
|
pcam = px._precalc()
|
|
pcam._code(Template('''
|
|
{{pcam._set('prop_sin')}} = sin({{pcam.prop}});
|
|
''').substitute(pcam=pcam))
|
|
|
|
def gen_code(px):
|
|
return Template('''
|
|
{{do_precalc(px)}}
|
|
printf("The sin of %g is %g.", {{px.prop}}, {{px.prop_sin}});
|
|
''').substitute(px=px)
|
|
"""
|
|
def __init__(self, packer, ptr_name, wrapped, prefix):
|
|
super(GenomePackerPrecalc, self).__init__(packer, 'out', wrapped, prefix)
|
|
def __str__(self):
|
|
return self.packer._require_pre(self.prefix)
|
|
def _set(self, name):
|
|
fullname = self.prefix + (name,)
|
|
self.packer._pre_alloc(fullname)
|
|
# This just modifies the underlying object, because I'm too lazy right
|
|
# now to ghost the namespace
|
|
self.wrapped[name] = GenomePackerName('_'.join(fullname))
|
|
return '%s->%s' % (self.ptr_name, self.wrapped[name])
|
|
def _code(self, code):
|
|
self.packer.precalc_code.append(code)
|
|
|
|
class GenomePacker(HunkOCode):
|
|
"""
|
|
Packs a genome for use in iteration.
|
|
"""
|
|
|
|
# 2^search_rounds is the maximum number of control points, including
|
|
# endpoints, that can be used in a single genome. It should be okay to
|
|
# change this number without touching other code, but 32 samples fits
|
|
# nicely on a single cache line.
|
|
search_rounds = 5
|
|
|
|
def __init__(self, tname):
|
|
"""
|
|
Create a new DataPacker.
|
|
|
|
``tname`` is the name of the structure typedef that will be emitted
|
|
via this object's ``decls`` property.
|
|
"""
|
|
self.tname = tname
|
|
# We could do this in the order that things are requested, but we want
|
|
# to be able to treat the direct stuff as a list so this function
|
|
# doesn't unroll any more than it has to. So we separate things into
|
|
# direct requests, and those that need precalculation.
|
|
# Values of OrderedDict are unused; basically, it's just OrderedSet.
|
|
self.packed_direct = OrderedDict()
|
|
self.genome_precalc = OrderedDict()
|
|
self.packed_precalc = OrderedDict()
|
|
self.precalc_code = []
|
|
|
|
self.ns = {}
|
|
|
|
self._len = None
|
|
self.decls = None
|
|
self.defs = None
|
|
|
|
self.packed = None
|
|
self.genome = None
|
|
|
|
def __len__(self):
|
|
assert self._len is not None, 'len() called before finalize()'
|
|
return self._len
|
|
|
|
def view(self, ptr_name, wrapped_obj, prefix):
|
|
"""Create a DataPacker view. See DataPackerView class for details."""
|
|
self.ns[prefix] = wrapped_obj
|
|
return GenomePackerView(self, ptr_name, wrapped_obj, (prefix,))
|
|
|
|
def _require(self, name):
|
|
"""
|
|
Called to indicate that the named parameter from the original genome
|
|
must be available during interpolation.
|
|
"""
|
|
self.packed_direct[name] = None
|
|
|
|
def _require_pre(self, name):
|
|
i = len(self.genome_precalc) << self.search_rounds
|
|
self.genome_precalc[name] = None
|
|
return 'catmull_rom(×[%d], &knots[%d], time)' % (i, i)
|
|
|
|
def _pre_alloc(self, name):
|
|
self.packed_precalc[name] = None
|
|
|
|
def finalize(self):
|
|
"""
|
|
Create the code to render this genome.
|
|
"""
|
|
# At the risk of packing a few things more than once, we don't
|
|
# uniquify the overall precalc order, sparing us the need to implement
|
|
# recursive code generation
|
|
self.packed = self.packed_direct.keys() + self.packed_precalc.keys()
|
|
self.genome = self.packed_direct.keys() + self.genome_precalc.keys()
|
|
|
|
self._len = len(self.packed)
|
|
|
|
self.decls = self._decls.substitute(
|
|
packed=self.packed, tname=self.tname,
|
|
search_rounds=self.search_rounds)
|
|
self.defs = self._defs.substitute(
|
|
packed_direct=self.packed_direct, tname=self.tname,
|
|
precalc_code=self.precalc_code,
|
|
search_rounds=self.search_rounds)
|
|
|
|
|
|
def pack(self):
|
|
"""
|
|
Return a packed copy of the genome ready for uploading to the GPU as a
|
|
3D NDArray, with the first element being the times and the second
|
|
being the values.
|
|
"""
|
|
width = 1 << self.search_rounds
|
|
out = np.empty((2, len(self.genome), width), dtype=np.float32)
|
|
# Ensure that unused values at the top are always big (must be >2.0)
|
|
out[0].fill(1e9)
|
|
|
|
for idx, gname in enumerate(self.genome):
|
|
attr = self.ns[gname[0]]
|
|
for g in gname[1:]:
|
|
attr = getattr(attr, g)
|
|
if not isinstance(attr, SplEval):
|
|
raise TypeError("%s isn't a spline" % '.'.join(gname))
|
|
out[0][idx][:len(attr.knots[0])] = attr.knots[0]
|
|
out[1][idx][:len(attr.knots[1])] = attr.knots[1]
|
|
return out
|
|
|
|
_defs = Template(r"""
|
|
|
|
__global__
|
|
void interp_{{tname}}({{tname}}* out, float *times, float *knots,
|
|
float tstart, float tstep, mwc_st *rctxes, int maxid) {
|
|
int id = gtid();
|
|
if (id >= maxid) return;
|
|
out = &out[id];
|
|
mwc_st rctx = rctxes[id];
|
|
float time = tstart + id * tstep;
|
|
|
|
float *outf = reinterpret_cast<float*>(out);
|
|
|
|
// TODO: unroll pragma?
|
|
for (int i = 0; i < {{len(packed_direct)}}; i++) {
|
|
int j = i << {{search_rounds}};
|
|
outf[i] = catmull_rom(×[j], &knots[j], time);
|
|
}
|
|
|
|
// Advance 'times' and 'knots' to the purely generated sections, so that
|
|
// the pregenerated statements emitted by _require_pre are correct.
|
|
times = ×[{{len(packed_direct)<<search_rounds}}];
|
|
knots = &knots[{{len(packed_direct)<<search_rounds}}];
|
|
|
|
{{for hunk in precalc_code}}
|
|
if (1) {
|
|
{{hunk}}
|
|
}
|
|
{{endfor}}
|
|
}
|
|
|
|
""")
|
|
|
|
|
|
_decls = Template(r"""
|
|
typedef struct {
|
|
{{for name in packed}}
|
|
float {{'_'.join(name)}};
|
|
{{endfor}}
|
|
} {{tname}};
|
|
|
|
/* Search through the fixed-size list 'hay' to find the rightmost index which
|
|
* contains a value strictly smaller than the input 'needle'. The crazy
|
|
* bitwise search is just for my own amusement.
|
|
*/
|
|
__device__
|
|
int bitwise_binsearch(const float *hay, float needle) {
|
|
int lo = 0;
|
|
|
|
{{for i in range(search_rounds-1, -1, -1)}}
|
|
if (needle > hay[lo | {{1 << i}}])
|
|
lo |= {{1 << i}};
|
|
{{endfor}}
|
|
return lo;
|
|
}
|
|
|
|
__device__ __noinline__
|
|
float catmull_rom(const float *times, const float *knots, float t) {
|
|
int idx = bitwise_binsearch(times, t);
|
|
|
|
// The left bias of the search means that we never have to worry about
|
|
// overshooting unless the genome is corrupted
|
|
idx = max(idx, 1);
|
|
|
|
float t1 = times[idx], t2 = times[idx+1] - t1;
|
|
float rt2 = 1.0f / t2;
|
|
float t0 = (times[idx-1] - t1) * rt2, t3 = (times[idx+2] - t1) * rt2;
|
|
t = (t - t1) * rt2;
|
|
|
|
// Now t1 is effectively 0 and t2 is 1
|
|
|
|
float k0 = knots[idx-1], k1 = knots[idx],
|
|
k2 = knots[idx+1], k3 = knots[idx+2];
|
|
|
|
float m1 = (k2 - k0) / (1.0f - t0),
|
|
m2 = (k3 - k1) / (t3);
|
|
|
|
float tt = t * t, ttt = tt * t;
|
|
|
|
return m1 * ( ttt - 2.0f*tt + t)
|
|
+ k1 * ( 2.0f*ttt - 3.0f*tt + 1)
|
|
+ m2 * ( ttt - tt)
|
|
+ k2 * (-2.0f*ttt + 3.0f*tt);
|
|
}
|
|
|
|
__global__
|
|
void test_cr(const float *times, const float *knots, const float *t, float *r) {
|
|
int i = threadIdx.x + blockDim.x * blockIdx.x;
|
|
r[i] = catmull_rom(times, knots, t[i]);
|
|
}
|
|
|
|
__global__
|
|
void interp_palette_hsv(uchar4 *outs, const float *times, const float4 *sources,
|
|
float tstart, float tstep) {
|
|
float time = tstart + blockIdx.x * tstep;
|
|
int idx = fmaxf(bitwise_binsearch(times, time) + 1, 1);
|
|
|
|
float4 left = sources[blockDim.x * (idx - 1) + threadIdx.x];
|
|
float4 right = sources[blockDim.x * (idx) + threadIdx.x];
|
|
|
|
float lf = (times[idx] - time) / (times[idx] - times[idx-1]);
|
|
float rf = 1.0f - lf;
|
|
|
|
float3 lhsv = rgb2hsv(make_float3(left.x, left.y, left.z));
|
|
float3 rhsv = rgb2hsv(make_float3(right.x, right.y, right.z));
|
|
|
|
if (fabs(lhsv.x - rhsv.x) > 3.0f)
|
|
if (lhsv.x < rhsv.x)
|
|
lhsv.x += 6.0f;
|
|
else
|
|
rhsv.x += 6.0f;
|
|
|
|
float3 hsv;
|
|
hsv.x = lhsv.x * lf + rhsv.x * rf;
|
|
hsv.y = lhsv.y * lf + rhsv.y * rf;
|
|
hsv.z = lhsv.z * lf + rhsv.z * rf;
|
|
|
|
if (hsv.x > 6.0f)
|
|
hsv.x -= 6.0f;
|
|
if (hsv.x < 0.0f)
|
|
hsv.x += 6.0f;
|
|
|
|
float3 rgb = hsv2rgb(hsv);
|
|
|
|
uchar4 out;
|
|
out.x = rgb.x * 255.0f;
|
|
out.y = rgb.y * 255.0f;
|
|
out.z = rgb.z * 255.0f;
|
|
out.w = 255.0f * (left.z * lf + right.z * rf);
|
|
outs[blockDim.x * blockIdx.x + threadIdx.x] = out;
|
|
}
|
|
""")
|
|
|