mirror of
				https://github.com/stevenrobertson/cuburn.git
				synced 2025-11-03 18:00:55 -05:00 
			
		
		
		
	
		
			
				
	
	
		
			347 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			347 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
from collections import OrderedDict
 | 
						|
from itertools import cycle
 | 
						|
import numpy as np
 | 
						|
 | 
						|
import tempita
 | 
						|
from cuburn.code.util import HunkOCode, Template
 | 
						|
from cuburn.genome import SplEval
 | 
						|
 | 
						|
class GenomePackerName(str):
 | 
						|
    """Class to indicate that a property is precalculated on the device"""
 | 
						|
    pass
 | 
						|
 | 
						|
class GenomePackerView(object):
 | 
						|
    """
 | 
						|
    Obtain accessors in generated code.
 | 
						|
 | 
						|
    Call ``GenomePacker.view(ptr_name, wrapped_obj, prefix)`` to generate a
 | 
						|
    view, where ``ptr_name`` is the name of the CUDA pointer which holds the
 | 
						|
    interpolated values, ``wrapped_obj`` is the base Genome instance which
 | 
						|
    will be uploaded to the device for interpolating from, and ``prefix`` is
 | 
						|
    the prefix that will be assigned to property accessors from this object.
 | 
						|
 | 
						|
    Accessing a property on the object synthesizes an accessor for use in your
 | 
						|
    code and an interpolator for use in generating that code. This conversion
 | 
						|
    is done when the property is coerced into a string by the templating
 | 
						|
    mechanism, so you can easily nest objects by saying, for instance,
 | 
						|
    {{pcp.camera.rotation}} from within templated code. The accessed property
 | 
						|
    must be a SplEval object, or a precalculated value (see
 | 
						|
    ``GenomePackerPrecalc``).
 | 
						|
 | 
						|
    Index operations are converted to property accesses as well, so that you
 | 
						|
    don't have to make a mess with 'getattr' in your code: {{pcp.xforms[x]}}
 | 
						|
    works just fine. This means, however, that no arrays can be packed
 | 
						|
    directly; they must be converted to have string-based keys first, and
 | 
						|
    any loops must be unrolled in your code.
 | 
						|
    """
 | 
						|
    def __init__(self, packer, ptr_name, wrapped, prefix=()):
 | 
						|
        self.packer = packer
 | 
						|
        self.ptr_name = ptr_name
 | 
						|
        self.wrapped = wrapped
 | 
						|
        self.prefix = prefix
 | 
						|
    def __getattr__(self, name):
 | 
						|
        w = getattr(self.wrapped, name)
 | 
						|
        return type(self)(self.packer, self.ptr_name, w, self.prefix+(name,))
 | 
						|
    # As with the Genome class, we're all-dict, no-array here
 | 
						|
    __getitem__ = lambda s, n: getattr(s, str(n))
 | 
						|
 | 
						|
    def __str__(self):
 | 
						|
        """
 | 
						|
        Returns the packed name in a format suitable for embedding directly
 | 
						|
        into device code.
 | 
						|
        """
 | 
						|
        # So evil. When the template calls __str__ to format the output, we
 | 
						|
        # allocate things. This makes for neater embedded code, which is where
 | 
						|
        # the real complexity lies, but it also means printf() debugging when
 | 
						|
        # templating will screw with the allocation tables!
 | 
						|
        if isinstance(self.wrapped, SplEval):
 | 
						|
            self.packer._require(self.prefix)
 | 
						|
        elif not isinstance(self.wrapped, GenomePackerName):
 | 
						|
            raise TypeError("Tried to pack something that wasn't a spline or "
 | 
						|
                            "a precalculated value")
 | 
						|
        # TODO: verify namespace stomping, etc
 | 
						|
        return '%s.%s' % (self.ptr_name, '_'.join(self.prefix))
 | 
						|
 | 
						|
    def _precalc(self):
 | 
						|
        """Create a GenomePackerPrecalc object. See that class for details."""
 | 
						|
        return GenomePackerPrecalc(self.packer, self.ptr_name,
 | 
						|
                                   self.wrapped, self.prefix)
 | 
						|
 | 
						|
class GenomePackerPrecalc(GenomePackerView):
 | 
						|
    """
 | 
						|
    Insert precalculated values into the packed genome.
 | 
						|
 | 
						|
    Create a Precalc object by calling a view object's _precalc() method. The
 | 
						|
    returned object shares the same referent dict as the view object, but
 | 
						|
    instead of returning view accessors for use in the body of your code,
 | 
						|
    accessing a property synthesizes a call to an interpolation function for
 | 
						|
    use within the precalculating function. Use this in your precalculated
 | 
						|
    code to obtain values from a genome.
 | 
						|
 | 
						|
    Once you've obtained the needed parameters and performed the
 | 
						|
    precalculation, write them to the genome object with a call to the '_set'
 | 
						|
    method in your precalc template. This method generates a new accessor to
 | 
						|
    which you can assign a value in your precalculation function. It also
 | 
						|
    makes this accessor available for use on the original packer object from
 | 
						|
    within your main function.
 | 
						|
 | 
						|
    Finally, call the '_code' method on the precalc object with your generated
 | 
						|
    precalculation code to add it to the precalculation function. The code
 | 
						|
    will be wrapped in a C block to prevent namespace leakage, so that
 | 
						|
    multiple invocations of the precalculation code on different code blocks
 | 
						|
    can be performed.
 | 
						|
 | 
						|
    Example:
 | 
						|
 | 
						|
        def do_precalc(px):
 | 
						|
            pcam = px._precalc()
 | 
						|
            pcam._code(Template('''
 | 
						|
                {{pcam._set('prop_sin')}} = sin({{pcam.prop}});
 | 
						|
                ''').substitute(pcam=pcam))
 | 
						|
 | 
						|
        def gen_code(px):
 | 
						|
            return Template('''
 | 
						|
                {{do_precalc(px)}}
 | 
						|
                printf("The sin of %g is %g.", {{px.prop}}, {{px.prop_sin}});
 | 
						|
                ''').substitute(px=px)
 | 
						|
    """
 | 
						|
    def __init__(self, packer, ptr_name, wrapped, prefix):
 | 
						|
        super(GenomePackerPrecalc, self).__init__(packer, 'out', wrapped, prefix)
 | 
						|
    def __str__(self):
 | 
						|
        return self.packer._require_pre(self.prefix)
 | 
						|
    def _set(self, name):
 | 
						|
        fullname = self.prefix + (name,)
 | 
						|
        self.packer._pre_alloc(fullname)
 | 
						|
        # This just modifies the underlying object, because I'm too lazy right
 | 
						|
        # now to ghost the namespace
 | 
						|
        self.wrapped[name] = GenomePackerName('_'.join(fullname))
 | 
						|
        return '%s->%s' % (self.ptr_name, self.wrapped[name])
 | 
						|
    def _code(self, code):
 | 
						|
        self.packer.precalc_code.append(code)
 | 
						|
 | 
						|
class GenomePacker(HunkOCode):
 | 
						|
    """
 | 
						|
    Packs a genome for use in iteration.
 | 
						|
    """
 | 
						|
 | 
						|
    # 2^search_rounds is the maximum number of control points, including
 | 
						|
    # endpoints, that can be used in a single genome. It should be okay to
 | 
						|
    # change this number without touching other code, but 32 samples fits
 | 
						|
    # nicely on a single cache line.
 | 
						|
    search_rounds = 5
 | 
						|
 | 
						|
    def __init__(self, tname):
 | 
						|
        """
 | 
						|
        Create a new DataPacker.
 | 
						|
 | 
						|
        ``tname`` is the name of the structure typedef that will be emitted
 | 
						|
        via this object's ``decls`` property.
 | 
						|
        """
 | 
						|
        self.tname = tname
 | 
						|
        # We could do this in the order that things are requested, but we want
 | 
						|
        # to be able to treat the direct stuff as a list so this function
 | 
						|
        # doesn't unroll any more than it has to. So we separate things into
 | 
						|
        # direct requests, and those that need precalculation.
 | 
						|
        # Values of OrderedDict are unused; basically, it's just OrderedSet.
 | 
						|
        self.packed_direct = OrderedDict()
 | 
						|
        self.genome_precalc = OrderedDict()
 | 
						|
        self.packed_precalc = OrderedDict()
 | 
						|
        self.precalc_code = []
 | 
						|
 | 
						|
        self.ns = {}
 | 
						|
 | 
						|
        self._len = None
 | 
						|
        self.decls = None
 | 
						|
        self.defs = None
 | 
						|
 | 
						|
        self.packed = None
 | 
						|
        self.genome = None
 | 
						|
 | 
						|
    def __len__(self):
 | 
						|
        assert self._len is not None, 'len() called before finalize()'
 | 
						|
        return self._len
 | 
						|
 | 
						|
    def view(self, ptr_name, wrapped_obj, prefix):
 | 
						|
        """Create a DataPacker view. See DataPackerView class for details."""
 | 
						|
        self.ns[prefix] = wrapped_obj
 | 
						|
        return GenomePackerView(self, ptr_name, wrapped_obj, (prefix,))
 | 
						|
 | 
						|
    def _require(self, name):
 | 
						|
        """
 | 
						|
        Called to indicate that the named parameter from the original genome
 | 
						|
        must be available during interpolation.
 | 
						|
        """
 | 
						|
        self.packed_direct[name] = None
 | 
						|
 | 
						|
    def _require_pre(self, name):
 | 
						|
        i = len(self.genome_precalc) << self.search_rounds
 | 
						|
        self.genome_precalc[name] = None
 | 
						|
        return 'catmull_rom(×[%d], &knots[%d], time)' % (i, i)
 | 
						|
 | 
						|
    def _pre_alloc(self, name):
 | 
						|
        self.packed_precalc[name] = None
 | 
						|
 | 
						|
    def finalize(self):
 | 
						|
        """
 | 
						|
        Create the code to render this genome.
 | 
						|
        """
 | 
						|
        # At the risk of packing a few things more than once, we don't
 | 
						|
        # uniquify the overall precalc order, sparing us the need to implement
 | 
						|
        # recursive code generation
 | 
						|
        self.packed = self.packed_direct.keys() + self.packed_precalc.keys()
 | 
						|
        self.genome = self.packed_direct.keys() + self.genome_precalc.keys()
 | 
						|
 | 
						|
        self._len = len(self.packed)
 | 
						|
 | 
						|
        self.decls = self._decls.substitute(
 | 
						|
                packed=self.packed, tname=self.tname,
 | 
						|
                search_rounds=self.search_rounds)
 | 
						|
        self.defs = self._defs.substitute(
 | 
						|
                packed_direct=self.packed_direct, tname=self.tname,
 | 
						|
                precalc_code=self.precalc_code,
 | 
						|
                search_rounds=self.search_rounds)
 | 
						|
 | 
						|
 | 
						|
    def pack(self):
 | 
						|
        """
 | 
						|
        Return a packed copy of the genome ready for uploading to the GPU as a
 | 
						|
        3D NDArray, with the first element being the times and the second
 | 
						|
        being the values.
 | 
						|
        """
 | 
						|
        width = 1 << self.search_rounds
 | 
						|
        out = np.empty((2, len(self.genome), width), dtype=np.float32)
 | 
						|
        # Ensure that unused values at the top are always big (must be >2.0)
 | 
						|
        out[0].fill(1e9)
 | 
						|
 | 
						|
        for idx, gname in enumerate(self.genome):
 | 
						|
            attr = self.ns[gname[0]]
 | 
						|
            for g in gname[1:]:
 | 
						|
                attr = getattr(attr, g)
 | 
						|
            if not isinstance(attr, SplEval):
 | 
						|
                raise TypeError("%s isn't a spline" % '.'.join(gname))
 | 
						|
            out[0][idx][:len(attr.knots[0])] = attr.knots[0]
 | 
						|
            out[1][idx][:len(attr.knots[1])] = attr.knots[1]
 | 
						|
        return out
 | 
						|
 | 
						|
    _defs = Template(r"""
 | 
						|
 | 
						|
__global__
 | 
						|
void interp_{{tname}}({{tname}}* out, float *times, float *knots,
 | 
						|
        float tstart, float tstep, mwc_st *rctxes, int maxid) {
 | 
						|
    int id = gtid();
 | 
						|
    if (id >= maxid) return;
 | 
						|
    out = &out[id];
 | 
						|
    mwc_st rctx = rctxes[id];
 | 
						|
    float time = tstart + id * tstep;
 | 
						|
 | 
						|
    float *outf = reinterpret_cast<float*>(out);
 | 
						|
 | 
						|
    // TODO: unroll pragma?
 | 
						|
    for (int i = 0; i < {{len(packed_direct)}}; i++) {
 | 
						|
        int j = i << {{search_rounds}};
 | 
						|
        outf[i] = catmull_rom(×[j], &knots[j], time);
 | 
						|
    }
 | 
						|
 | 
						|
    // Advance 'times' and 'knots' to the purely generated sections, so that
 | 
						|
    // the pregenerated statements emitted by _require_pre are correct.
 | 
						|
    times = ×[{{len(packed_direct)<<search_rounds}}];
 | 
						|
    knots = &knots[{{len(packed_direct)<<search_rounds}}];
 | 
						|
 | 
						|
    {{for hunk in precalc_code}}
 | 
						|
    if (1) {
 | 
						|
    {{hunk}}
 | 
						|
    }
 | 
						|
    {{endfor}}
 | 
						|
}
 | 
						|
 | 
						|
""")
 | 
						|
 | 
						|
 | 
						|
    _decls = Template(r"""
 | 
						|
typedef struct {
 | 
						|
{{for name in packed}}
 | 
						|
    float   {{'_'.join(name)}};
 | 
						|
{{endfor}}
 | 
						|
} {{tname}};
 | 
						|
 | 
						|
/* Search through the fixed-size list 'hay' to find the rightmost index which
 | 
						|
 * contains a value strictly smaller than the input 'needle'. The crazy
 | 
						|
 * bitwise search is just for my own amusement.
 | 
						|
 */
 | 
						|
__device__
 | 
						|
int bitwise_binsearch(const float *hay, float needle) {
 | 
						|
    int lo = 0;
 | 
						|
 | 
						|
    {{for i in range(search_rounds-1, -1, -1)}}
 | 
						|
    if (needle > hay[lo | {{1 << i}}])
 | 
						|
        lo |= {{1 << i}};
 | 
						|
    {{endfor}}
 | 
						|
    return lo;
 | 
						|
}
 | 
						|
 | 
						|
__device__ __noinline__
 | 
						|
float catmull_rom(const float *times, const float *knots, float t) {
 | 
						|
    int idx = bitwise_binsearch(times, t);
 | 
						|
 | 
						|
    // The left bias of the search means that we never have to worry about
 | 
						|
    // overshooting unless the genome is corrupted
 | 
						|
    idx = max(idx, 1);
 | 
						|
 | 
						|
    float t1 = times[idx], t2 = times[idx+1] - t1;
 | 
						|
    float rt2 = 1.0f / t2;
 | 
						|
    float t0 = (times[idx-1] - t1) * rt2, t3 = (times[idx+2] - t1) * rt2;
 | 
						|
    t = (t - t1) * rt2;
 | 
						|
 | 
						|
    // Now t1 is effectively 0 and t2 is 1
 | 
						|
 | 
						|
    float k0 = knots[idx-1], k1 = knots[idx],
 | 
						|
          k2 = knots[idx+1], k3 = knots[idx+2];
 | 
						|
 | 
						|
    float m1 = (k2 - k0) / (1.0f - t0),
 | 
						|
          m2 = (k3 - k1) / (t3);
 | 
						|
 | 
						|
    float tt = t * t, ttt = tt * t;
 | 
						|
 | 
						|
    return m1 * (      ttt - 2.0f*tt + t)
 | 
						|
         + k1 * ( 2.0f*ttt - 3.0f*tt + 1)
 | 
						|
         + m2 * (      ttt -      tt)
 | 
						|
         + k2 * (-2.0f*ttt + 3.0f*tt);
 | 
						|
}
 | 
						|
 | 
						|
__global__
 | 
						|
void test_cr(const float *times, const float *knots, const float *t, float *r) {
 | 
						|
    int i = threadIdx.x + blockDim.x * blockIdx.x;
 | 
						|
    r[i] = catmull_rom(times, knots, t[i]);
 | 
						|
}
 | 
						|
 | 
						|
__global__
 | 
						|
void interp_palette_hsv(uchar4 *outs, const float *times, const float4 *sources,
 | 
						|
        float tstart, float tstep) {
 | 
						|
    float time = tstart + blockIdx.x * tstep;
 | 
						|
    int idx = fmaxf(bitwise_binsearch(times, time), 1);
 | 
						|
 | 
						|
    float4 left  = sources[blockDim.x * (idx - 1) + threadIdx.x];
 | 
						|
    float4 right = sources[blockDim.x * (idx)     + threadIdx.x];
 | 
						|
 | 
						|
    float lf = (times[idx] - time) / (times[idx] - times[idx-1]);
 | 
						|
    float rf = 1.0f - lf;
 | 
						|
 | 
						|
    float3 lhsv = rgb2hsv(make_float3(left.x, left.y, left.z));
 | 
						|
    float3 rhsv = rgb2hsv(make_float3(right.x, right.y, right.z));
 | 
						|
 | 
						|
    float3 hsv;
 | 
						|
    hsv.x = lhsv.x * lf + rhsv.x * rf;
 | 
						|
    hsv.y = lhsv.y * lf + rhsv.y * rf;
 | 
						|
    hsv.z = lhsv.z * lf + rhsv.z * rf;
 | 
						|
    float3 rgb = hsv2rgb(hsv);
 | 
						|
 | 
						|
    uchar4 out;
 | 
						|
    out.x = rgb.x * 255.0f;
 | 
						|
    out.y = rgb.y * 255.0f;
 | 
						|
    out.z = rgb.z * 255.0f;
 | 
						|
    out.w = 255.0f * (left.z * lf + right.z * rf);
 | 
						|
    outs[blockDim.x * blockIdx.x + threadIdx.x] = out;
 | 
						|
}
 | 
						|
""")
 | 
						|
 |