diff --git a/cuburn/code/filtering.py b/cuburn/code/filtering.py index 702c15a..2da5cab 100644 --- a/cuburn/code/filtering.py +++ b/cuburn/code/filtering.py @@ -2,8 +2,8 @@ from cuburn.code.util import * class ColorClip(HunkOCode): - def __init__(self, features): - self.defs = self.defs_tmpl.substitute(features=features) + def __init__(self, info): + self.defs = self.defs_tmpl.substitute(info=info) defs_tmpl = Template(''' __global__ @@ -63,7 +63,7 @@ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow, pix.y += (1.0f - vibrancy) * powf(opix.y, gamma); pix.z += (1.0f - vibrancy) * powf(opix.z, gamma); - {{if features.alpha_output_channel}} + {{if info.alpha_output_channel}} float 1_alpha = 1 / alpha; pix.x *= 1_alpha; pix.y *= 1_alpha; @@ -94,13 +94,13 @@ class DensityEst(HunkOCode): # Note, changing this does not yet have any effect, it's just informational MAX_WIDTH=15 - def __init__(self, features, cp): - self.features, self.cp = features, cp + def __init__(self, info): + self.info = info headers = "#include\n" @property def defs(self): - return self.defs_tmpl.substitute(features=self.features, cp=self.cp) + return self.defs_tmpl.substitute(info=self.info) defs_tmpl = Template(''' #define W 15 // Filter width (regardless of standard deviation chosen) @@ -147,9 +147,9 @@ void density_est(float4 *pixbuf, float4 *outbuf, de_r[i] = de_g[i] = de_b[i] = de_a[i] = 0.0f; __syncthreads(); - for (int imrow = threadIdx.y + W2; imrow < {{features.acc_height}}; imrow += 32) + for (int imrow = threadIdx.y + W2; imrow < {{info.acc_height}}; imrow += 32) { - int idx = {{features.acc_stride}} * imrow + + int idx = {{info.acc_stride}} * imrow + + blockIdx.x * 32 + threadIdx.x + W2; float4 in = pixbuf[idx]; @@ -249,7 +249,7 @@ void density_est(float4 *pixbuf, float4 *outbuf, __syncthreads(); // TODO: could coalesce this, but what a pain for (int i = threadIdx.x; i < FW; i += 32) { - idx = {{features.acc_stride}} * imrow + blockIdx.x * 32 + i + W2; + idx = {{info.acc_stride}} * imrow + blockIdx.x * 32 + i + W2; int si = threadIdx.y * FW + i; float *out = reinterpret_cast(&outbuf[idx]); atomicAdd(out, de_r[si]); @@ -285,12 +285,14 @@ void density_est(float4 *pixbuf, float4 *outbuf, # TODO: add no-est version # TODO: come up with a general way to average these parameters - k1 = np.float32(cp.brightness * 268 / 256) - area = self.features.width * self.features.height / cp.ppu ** 2 - k2 = np.float32(1 / (area * cp.adj_density )) + k1 = np.float32(cp.color.brightness * 268 / 256) + # Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now + # s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w)) + area = self.info.height / (cp.camera.scale ** 2 * self.info.width) + k2 = np.float32(1 / (area * self.info.density )) - if self.cp.estimator == 0: - nbins = self.features.acc_height * self.features.acc_stride + if cp.de.radius == 0: + nbins = self.info.acc_height * self.info.acc_stride fun = mod.get_function("logscale") t = fun(abufd, obufd, k1, k2, block=(512, 1, 1), grid=(nbins/512, 1), stream=stream) @@ -299,11 +301,11 @@ void density_est(float4 *pixbuf, float4 *outbuf, # 0.5, but the DE filters scale filter distance by the default # spatial support factor of 1.5, so the effective base SD is # (0.5/1.5)=1/3. - est_sd = np.float32(cp.estimator / 3.) - neg_est_curve = np.float32(-cp.estimator_curve) - est_min = np.float32(cp.estimator_minimum / 3.) + est_sd = np.float32(cp.de.radius / 3.) + neg_est_curve = np.float32(-cp.de.curve) + est_min = np.float32(cp.de.minimum / 3.) fun = mod.get_function("density_est") fun(abufd, obufd, est_sd, neg_est_curve, est_min, k1, k2, - block=(32, 32, 1), grid=(self.features.acc_width/32, 1), + block=(32, 32, 1), grid=(self.info.acc_width/32, 1), stream=stream) diff --git a/cuburn/code/interp.py b/cuburn/code/interp.py new file mode 100644 index 0000000..c3267e2 --- /dev/null +++ b/cuburn/code/interp.py @@ -0,0 +1,318 @@ +from collections import OrderedDict +from itertools import cycle +import numpy as np + +import tempita +from cuburn.code.util import HunkOCode, Template +from cuburn.genome import SplEval + +class GenomePackerName(str): + """Class to indicate that a property is precalculated on the device""" + pass + +class GenomePackerView(object): + """ + Obtain accessors in generated code. + + Call ``GenomePacker.view(ptr_name, wrapped_obj, prefix)`` to generate a + view, where ``ptr_name`` is the name of the CUDA pointer which holds the + interpolated values, ``wrapped_obj`` is the base Genome instance which + will be uploaded to the device for interpolating from, and ``prefix`` is + the prefix that will be assigned to property accessors from this object. + + Accessing a property on the object synthesizes an accessor for use in your + code and an interpolator for use in generating that code. This conversion + is done when the property is coerced into a string by the templating + mechanism, so you can easily nest objects by saying, for instance, + {{pcp.camera.rotation}} from within templated code. The accessed property + must be a SplEval object, or a precalculated value (see + ``GenomePackerPrecalc``). + + Index operations are converted to property accesses as well, so that you + don't have to make a mess with 'getattr' in your code: {{pcp.xforms[x]}} + works just fine. This means, however, that no arrays can be packed + directly; they must be converted to have string-based keys first, and + any loops must be unrolled in your code. + """ + def __init__(self, packer, ptr_name, wrapped, prefix=()): + self.packer = packer + self.ptr_name = ptr_name + self.wrapped = wrapped + self.prefix = prefix + def __getattr__(self, name): + w = getattr(self.wrapped, name) + return type(self)(self.packer, self.ptr_name, w, self.prefix+(name,)) + # As with the Genome class, we're all-dict, no-array here + __getitem__ = lambda s, n: getattr(s, str(n)) + + def __str__(self): + """ + Returns the packed name in a format suitable for embedding directly + into device code. + """ + # So evil. When the template calls __str__ to format the output, we + # allocate things. This makes for neater embedded code, which is where + # the real complexity lies, but it also means printf() debugging when + # templating will screw with the allocation tables! + if isinstance(self.wrapped, SplEval): + self.packer._require(self.prefix) + elif not isinstance(self.wrapped, GenomePackerName): + raise TypeError("Tried to pack something that wasn't a spline or " + "a precalculated value") + # TODO: verify namespace stomping, etc + return '%s.%s' % (self.ptr_name, '_'.join(self.prefix)) + + def _precalc(self): + """Create a GenomePackerPrecalc object. See that class for details.""" + return GenomePackerPrecalc(self.packer, self.ptr_name, + self.wrapped, self.prefix) + +class GenomePackerPrecalc(GenomePackerView): + """ + Insert precalculated values into the packed genome. + + Create a Precalc object by calling a view object's _precalc() method. The + returned object shares the same referent dict as the view object, but + instead of returning view accessors for use in the body of your code, + accessing a property synthesizes a call to an interpolation function for + use within the precalculating function. Use this in your precalculated + code to obtain values from a genome. + + Once you've obtained the needed parameters and performed the + precalculation, write them to the genome object with a call to the '_set' + method in your precalc template. This method generates a new accessor to + which you can assign a value in your precalculation function. It also + makes this accessor available for use on the original packer object from + within your main function. + + Finally, call the '_code' method on the precalc object with your generated + precalculation code to add it to the precalculation function. The code + will be wrapped in a C block to prevent namespace leakage, so that + multiple invocations of the precalculation code on different code blocks + can be performed. + + Example: + + def do_precalc(px): + pcam = px._precalc() + pcam._code(Template(''' + {{pcam._set('prop_sin')}} = sin({{pcam.prop}}); + ''').substitute(pcam=pcam)) + + def gen_code(px): + return Template(''' + {{do_precalc(px)}} + printf("The sin of %g is %g.", {{px.prop}}, {{px.prop_sin}}); + ''').substitute(px=px) + """ + def __init__(self, packer, ptr_name, wrapped, prefix): + super(GenomePackerPrecalc, self).__init__(packer, 'out', wrapped, prefix) + def __str__(self): + return self.packer._require_pre(self.prefix) + def _set(self, name): + fullname = self.prefix + (name,) + self.packer._pre_alloc(fullname) + # This just modifies the underlying object, because I'm too lazy right + # now to ghost the namespace + self.wrapped[name] = GenomePackerName('_'.join(fullname)) + return '%s->%s' % (self.ptr_name, self.wrapped[name]) + def _code(self, code): + self.packer.precalc_code.append(code) + +class GenomePacker(HunkOCode): + """ + Packs a genome for use in iteration. + """ + + # 2^search_rounds is the maximum number of control points, including + # endpoints, that can be used in a single genome. It should be okay to + # change this number without touching other code, but 32 samples fits + # nicely on a single cache line. + search_rounds = 5 + + def __init__(self, tname): + """ + Create a new DataPacker. + + ``tname`` is the name of the structure typedef that will be emitted + via this object's ``decls`` property. + """ + self.tname = tname + # We could do this in the order that things are requested, but we want + # to be able to treat the direct stuff as a list so this function + # doesn't unroll any more than it has to. So we separate things into + # direct requests, and those that need precalculation. + # Values of OrderedDict are unused; basically, it's just OrderedSet. + self.packed_direct = OrderedDict() + self.genome_precalc = OrderedDict() + self.packed_precalc = OrderedDict() + self.precalc_code = [] + + self.ns = {} + + self._len = None + self.decls = None + self.defs = None + + self.packed = None + self.genome = None + + def __len__(self): + assert self._len is not None, 'len() called before finalize()' + return self._len + + def view(self, ptr_name, wrapped_obj, prefix): + """Create a DataPacker view. See DataPackerView class for details.""" + self.ns[prefix] = wrapped_obj + return GenomePackerView(self, ptr_name, wrapped_obj, (prefix,)) + + def _require(self, name): + """ + Called to indicate that the named parameter from the original genome + must be available during interpolation. + """ + self.packed_direct[name] = None + + def _require_pre(self, name): + i = len(self.genome_precalc) << self.search_rounds + self.genome_precalc[name] = None + return 'catmull_rom(×[%d], &knots[%d], time)' % (i, i) + + def _pre_alloc(self, name): + self.packed_precalc[name] = None + + def finalize(self): + """ + Create the code to render this genome. + """ + # At the risk of packing a few things more than once, we don't + # uniquify the overall precalc order, sparing us the need to implement + # recursive code generation + self.packed = self.packed_direct.keys() + self.packed_precalc.keys() + self.genome = self.packed_direct.keys() + self.genome_precalc.keys() + + self._len = len(self.packed) + + self.decls = self._decls.substitute( + packed=self.packed, tname=self.tname, + search_rounds=self.search_rounds) + self.defs = self._defs.substitute( + packed_direct=self.packed_direct, tname=self.tname, + precalc_code=self.precalc_code, + search_rounds=self.search_rounds) + + + def pack(self): + """ + Return a packed copy of the genome ready for uploading to the GPU as a + 3D NDArray, with the first element being the times and the second + being the values. + """ + width = 1 << self.search_rounds + out = np.empty((2, len(self.genome), width), dtype=np.float32) + # Ensure that unused values at the top are always big (must be >2.0) + out[0].fill(1e9) + + for idx, gname in enumerate(self.genome): + attr = self.ns[gname[0]] + for g in gname[1:]: + attr = getattr(attr, g) + if not isinstance(attr, SplEval): + raise TypeError("%s isn't a spline" % '.'.join(gname)) + out[0][idx][:len(attr.knots[0])] = attr.knots[0] + out[1][idx][:len(attr.knots[1])] = attr.knots[1] + return out + + _defs = Template(r""" + +__global__ +void interp_{{tname}}({{tname}}* out, float *times, float *knots, + float tstart, float tstep, mwc_st *rctxes) { + int id = gtid(); + out = &out[id]; + mwc_st rctx = rctxes[id]; + float time = tstart + id * tstep; + + float *outf = reinterpret_cast(out); + + // TODO: unroll pragma? + for (int i = 0; i < {{len(packed_direct)}}; i++) { + int j = i << {{search_rounds}}; + outf[i] = catmull_rom(×[j], &knots[j], time); + } + + // Advance 'times' and 'knots' to the purely generated sections, so that + // the pregenerated statements emitted by _require_pre are correct. + times = ×[{{len(packed_direct)< hay[lo | {{1 << i}}]) + lo |= {{1 << i}}; + {{endfor}} + return lo; +} + +__device__ +float catmull_rom(float *times, float *knots, float t) { + int idx = bitwise_binsearch(times, t); + + // The left bias of the search means that we never have to worry about + // overshooting unless the genome is corrupted + idx = max(idx, 1); + + float t1 = times[idx], t2 = times[idx+1] - t1; + float rt2 = 1.0f / t2; + float t0 = (times[idx-1] - t1) * rt2, t3 = (times[idx+2] - t1) * rt2; + t = (t - t1) * rt2; + + // Now t1 is effectively 0 and t2 is 1 + + float k0 = knots[idx-1], k1 = knots[idx], + k2 = knots[idx+1], k3 = knots[idx+2]; + + float m1 = (k2 - k0) / (1.0f - t0), + m2 = (k3 - k1) / (t3); + + float tt = t * t, ttt = tt * t; + + return m1 * ( ttt - 2.0f*tt + t) + + k1 * ( 2.0f*ttt - 3.0f*tt + 1) + + m2 * ( ttt - tt) + + k2 * (-2.0f*ttt + 3.0f*tt); +} + +__global__ +void test_cr(float *times, float *knots, float *t, float *r) { + int i = threadIdx.x + blockDim.x * blockIdx.x; + r[i] = catmull_rom(times, knots, t[i]); +} + + +""") + diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py index c46fcfb..1dd89bf 100644 --- a/cuburn/code/iter.py +++ b/cuburn/code/iter.py @@ -2,26 +2,132 @@ The main iteration loop. """ -from cuburn.code import mwc, variations +from cuburn.code import mwc, variations, interp from cuburn.code.util import * +def precalc_densities(pcp, std_xforms): + # This pattern recurs a few times for precalc segments. Unfortunately, + # namespace stuff means it's not easy to functionalize this boilerplate + pre_cp = pcp._precalc() + pre_cp._code(Template(r""" + + float sum = 0.0f; + + {{for n in std_xforms}} + float den_{{n}} = {{pre_cp.xforms[n].density}}; + sum += den_{{n}}; + {{endfor}} + + float rsum = 1.0f / sum; + sum = 0.0f; + + {{for n in std_xforms[:-1]}} + sum += den_{{n}} * rsum; + {{pre_cp._set('den_' + n)}} = sum; + {{endfor}} + """).substitute(locals())) + +def precalc_chaos(pcp, std_xforms): + pre_cp = pcp._precalc() + pre_cp._code(Template(""" + + float sum, rsum; + + {{for p in std_xforms}} + sum = 0.0f; + + {{for n in std_xforms}} + float den_{{p}}_{{n}} = {{pre_cp.xforms[p].chaos[n]}}; + sum += den_{{p}}_{{n}}; + {{endfor}} + + rsum = 1.0f / sum; + sum = 0.0f; + + {{for n in std_xforms[:-1]}} + sum += den_{{p}}_{{n}} * rsum; + {{pre_cp._set('chaos_%s_%s' % (p, n))}} = sum; + {{endfor}} + + {{endfor}} + + """).substitute(locals())) + +def precalc_camera(info, pcam): + pre_cam = pcam._precalc() + + # Maxima code to check my logic: + # matrix([1,0,0.5*width + g],[0,1,0.5*height+g],[0,0,1]) + # . matrix([width * scale,0,0], [0,width * scale,0], [0,0,1]) + # . matrix([cosr,-sinr,0], [sinr,cosr,0], [0,0,1]) + # . matrix([1,0,-cenx],[0,1,-ceny],[0,0,1]) + # . matrix([X],[Y],[1]); + + pre_cam._code(Template(r""" + + float rot = {{pre_cam.rotation}} * M_PI / 180.0f; + float rotsin = sin(rot), rotcos = cos(rot); + float cenx = {{pre_cam.center.x}}, ceny = {{pre_cam.center.y}}; + float scale = {{pre_cam.scale}} * {{info.width}}; + + float ditherwidth = {{pre_cam.dither_width}} * 0.33f; + float u0 = mwc_next_01(rctx); + float r = ditherwidth * sqrtf(-2.0f * log2f(u0) / M_LOG2E); + + float u1 = 2.0f * M_PI * mwc_next_01(rctx); + float ditherx = r * cos(u1); + float dithery = r * sin(u1); + + {{pre_cam._set('xx')}} = scale * rotcos; + {{pre_cam._set('xy')}} = scale * -rotsin; + {{pre_cam._set('xo')}} = scale * (rotsin * ceny - rotcos * cenx) + + {{0.5 * info.width + info.gutter}} + ditherx; + + {{pre_cam._set('yx')}} = scale * rotsin; + {{pre_cam._set('yy')}} = scale * rotcos; + {{pre_cam._set('yo')}} = scale * -(rotsin * cenx + rotcos * ceny) + + {{0.5 * info.height + info.gutter}} + dithery; + + """).substitute(locals())) + +def precalc_xf_affine(px): + pre = px._precalc() + pre._code(Template(r""" + + float pri = {{pre.angle}} * M_PI / 180.0f; + float spr = {{pre.spread}} * M_PI / 180.0f; + + float magx = {{pre.magnitude.x}}; + float magy = {{pre.magnitude.y}}; + + {{pre._set('xx')}} = magx * cos(pri-spr); + {{pre._set('xy')}} = magx * sin(pri-spr); + {{pre._set('yx')}} = magy * cos(pri+spr); + {{pre._set('yy')}} = magy * sin(pri+spr); + {{pre._set('xo')}} = {{pre.offset.x}}; + {{pre._set('yo')}} = {{pre.offset.y}}; + + """).substitute(locals())) + class IterCode(HunkOCode): # The number of threads per block NTHREADS = 256 - def __init__(self, features): - self.features = features - self.packer = DataPacker('iter_info') + def __init__(self, info): + self.info = info + self.packer = interp.GenomePacker('iter_params') + self.pcp = self.packer.view('params', self.info.genome, 'cp') + iterbody = self._iterbody() - bodies = [self._xfbody(i,x) for i,x in enumerate(self.features.xforms)] + bodies = [self._xfbody(i,x) for i,x in sorted(info.genome.xforms.items())] bodies.append(iterbody) self.defs = '\n'.join(bodies) - self.decls += self.pix_helpers.substitute(features=features) + self.decls += self.pix_helpers.substitute(info=info) decls = """ // Note: for normalized lookups, uchar4 actually returns floats texture palTex; -__shared__ iter_info info; +__shared__ iter_params params; """ @@ -29,14 +135,14 @@ __shared__ iter_info info; __device__ void read_pix(float4 &pix, float &den) { den = pix.w; - {{if features.pal_has_alpha}} + {{if info.pal_has_alpha}} read_half(pix.z, pix.w, pix.z, den); {{endif}} } __device__ void write_pix(float4 &pix, float den) { - {{if features.pal_has_alpha}} + {{if info.pal_has_alpha}} write_half(pix.z, pix.z, pix.w, den); {{endif}} pix.w = den; @@ -44,7 +150,7 @@ void write_pix(float4 &pix, float den) { __device__ void update_pix(uint64_t ptr, uint32_t i, float4 c) { - {{if features.pal_has_alpha}} + {{if info.pal_has_alpha}} asm volatile ({{crep(''' { .reg .u16 sz, sw; @@ -98,34 +204,37 @@ void update_pix(uint64_t ptr, uint32_t i, float4 c) { """) def _xfbody(self, xfid, xform): - px = self.packer.view('info', 'xf%d_' % xfid) - px.sub('xf', 'cp.xforms[%d]' % xfid) - - tmpl = Template(""" + px = self.pcp.xforms[xfid] + tmpl = Template(r""" __device__ -void apply_xf{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) { +void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) { float tx, ty; - {{apply_affine_flam3('ox', 'oy', 'tx', 'ty', px, 'xf.c', 'pre')}} + {{precalc_xf_affine(px.affine)}} + {{apply_affine('ox', 'oy', 'tx', 'ty', px.affine)}} ox = 0; oy = 0; - {{for v in xform.vars}} + {{for name in xform.variations}} if (1) { - float w = {{px.get('xf.var[%d]' % v)}}; - {{variations.var_code[variations.var_nos[v]].substitute(locals())}} + {{py:pv = px.variations[name]}} + float w = {{pv.weight}}; + {{variations.var_code[name].substitute(locals())}} } {{endfor}} - {{if xform.has_post}} + {{if 'post' in xform}} tx = ox; ty = oy; - {{apply_affine_flam3('tx', 'ty', 'ox', 'oy', px, 'xf.post', 'post')}} + {{precalc_xf_affine(px.post)}} + {{apply_affine('tx', 'ty', 'ox', 'oy', px.post)}} {{endif}} - float csp = {{px.get('xf.color_speed')}}; - color = color * (1.0f - csp) + {{px.get('xf.color')}} * csp; + {{if 'color' in xform}} + float csp = {{px.color_speed}}; + color = color * (1.0f - csp) + {{px.color}} * csp; + {{endif}} }; """) g = dict(globals()) @@ -135,42 +244,38 @@ void apply_xf{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) { def _iterbody(self): tmpl = Template(r''' __global__ -void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) { - __shared__ int nsamps; +void iter(uint64_t accbuf_ptr, mwc_st *msts, iter_params *all_params, + int nsamps_to_generate) { mwc_st rctx = msts[gtid()]; - iter_info *info_glob = &(infos[blockIdx.x]); + iter_params *global_params = &(all_params[blockIdx.x]); - // load info to shared memory cooperatively + __shared__ int nsamps; + nsamps = nsamps_to_generate; + + __shared__ float time_frac; + time_frac = blockIdx.x / (float) gridDim.x; + + // load params to shared memory cooperatively for (int i = threadIdx.y * blockDim.x + threadIdx.x; - i * 4 < sizeof(iter_info); i += blockDim.x * blockDim.y) - reinterpret_cast(&info)[i] = - reinterpret_cast(info_glob)[i]; + i * 4 < sizeof(iter_params); i += blockDim.x * blockDim.y) + reinterpret_cast(¶ms)[i] = + reinterpret_cast(global_params)[i]; - if (threadIdx.y == 0 && threadIdx.x == 0) - nsamps = {{packer.get("cp.width * cp.height / cp.ntemporal_samples * cp.adj_density")}}; - - {{if features.chaos_used}} +{{if info.chaos_used}} int last_xf_used = 0; - {{else}} - // Size can be reduced by a factor of four using a slower 4-stage reduce +{{else}} + // Shared memory size can be reduced by a factor of four using a slower + // 4-stage reduce, but on Fermi hardware shmem use isn't a bottleneck __shared__ float swap[{{4*NTHREADS}}]; __shared__ float cosel[{{NWARPS}}]; + + // This is normally done after the swap-sync in the main loop if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}}) cosel[threadIdx.x] = mwc_next_01(rctx); - {{endif}} +{{endif}} __syncthreads(); - int consec_bad = -{{features.fuse}}; - - if (threadIdx.y == 1 && threadIdx.x == 0) { - float ditherwidth = {{packer.get("0.33 * cp.spatial_filter_radius")}}; - float u0 = mwc_next_01(rctx); - float r = ditherwidth * sqrtf(-2.0f * log2f(u0) / M_LOG2E); - - float u1 = 2.0f * M_PI * mwc_next_01(rctx); - info.cam_xo += r * cos(u1); - info.cam_yo += r * sin(u1); - } + int consec_bad = -{{info.fuse}}; float x, y, color; x = mwc_next_11(rctx); @@ -178,37 +283,44 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) { color = mwc_next_01(rctx); while (1) { - {{if features.chaos_used}} - // For now, we can't use the swap buffer with chaos enabled - float xfsel = mwc_next_01(rctx); - {{else}} - float xfsel = cosel[threadIdx.y]; - {{endif}} - {{if features.chaos_used}} - {{for density_row_idx, prior_xform_idx in enumerate(features.std_xforms)}} - {{for density_col_idx, this_xform_idx in enumerate(features.std_xforms)}} - if (last_xf_used == {{prior_xform_idx}} && - xfsel <= {{packer.get("cp.chaos_densities[%d][%d]" % (density_row_idx, density_col_idx))}}) { - apply_xf{{this_xform_idx}}(x, y, color, rctx); - last_xf_used = {{this_xform_idx}}; +{{if info.chaos_used}} + + {{precalc_chaos(pcp, std_xforms)}} + + // For now, we don't attempt to use the swap buffer when chaos is used + float xfsel = mwc_next_01(rctx); + + {{for prior_xform_idx, prior_xform_name in enumerate(std_xforms)}} + if (last_xf_used == {{prior_xform_idx}}) { + {{for xform_idx, xform_name in enumerate(std_xforms[:-1])}} + if (xfsel <= {{pcp['chaos_'+prior_xform_name+'_'+xform_name]}}) { + apply_xf_{{xform_name}}(x, y, color, rctx); + last_xf_used = {{xform_idx}}; + } else + {{endfor}} + { + apply_xf_{{std_xforms[-1]}}(x, y, color, rctx); + last_xf_used = {{len(std_xforms)-1}}; + } } else {{endfor}} - {{endfor}} - {{else}} - {{for density_col_idx, this_xform_idx in enumerate(features.std_xforms)}} - if (xfsel <= {{packer.get("cp.norm_density[%d]" % (density_col_idx))}}) { - apply_xf{{this_xform_idx}}(x, y, color, rctx); - } else - {{endfor}} - {{endif}} { - printf("Reached trap, aborting execution! %g (%d,%d,%d)\n", - xfsel, blockIdx.x, threadIdx.y, threadIdx.x); - asm volatile ("trap;"); + printf("Something went *very* wrong.\n"); + asm("trap;"); } - {{if not features.chaos_used}} +{{else}} + {{precalc_densities(pcp, std_xforms)}} + float xfsel = cosel[threadIdx.y]; + + {{for xform_name in std_xforms[:-1]}} + if (xfsel <= {{pcp['den_'+xform_name]}}) { + apply_xf_{{xform_name}}(x, y, color, rctx); + } else + {{endfor}} + apply_xf_{{std_xforms[-1]}}(x, y, color, rctx); + // Swap thread states here so that writeback skipping logic doesn't die int sw = (threadIdx.y * 32 + threadIdx.x * 33) & {{NTHREADS-1}}; int sr = threadIdx.y * 32 + threadIdx.x; @@ -222,17 +334,16 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) { // required per loop. if (nsamps < 0) break; - {{if not features.chaos_used}} // Similarly, we select the next xforms here. if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}}) cosel[threadIdx.x] = mwc_next_01(rctx); - {{endif}} consec_bad = swap[sr]; x = swap[sr+{{NTHREADS}}]; y = swap[sr+{{2*NTHREADS}}]; color = swap[sr+{{3*NTHREADS}}]; - {{endif}} + +{{endif}} if (consec_bad < 0) { consec_bad++; @@ -242,45 +353,50 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) { int remain = __popc(__ballot(1)); if (threadIdx.x == 0) atomicSub(&nsamps, remain); - {{if features.final_xform_index}} +{{if 'final' in cp.xforms}} float fx = x, fy = y, fcolor; - apply_xf{{features.final_xform_index}}(fx, fy, fcolor, rctx); - {{endif}} + apply_xf_final(fx, fy, fcolor, rctx); +{{endif}} float cx, cy; - {{if features.final_xform_index}} - {{apply_affine('fx', 'fy', 'cx', 'cy', packer, - 'cp.camera_transform', 'cam')}} - {{else}} - {{apply_affine('x', 'y', 'cx', 'cy', packer, - 'cp.camera_transform', 'cam')}} - {{endif}} + + {{precalc_camera(info, pcp.camera)}} + +{{if 'final' in cp.xforms}} + {{apply_affine('fx', 'fy', 'cx', 'cy', pcp.camera)}} +{{else}} + {{apply_affine('x', 'y', 'cx', 'cy', pcp.camera)}} +{{endif}} + uint32_t ix = trunca(cx), iy = trunca(cy); - if (ix >= {{features.acc_width}} || iy >= {{features.acc_height}} ) { + if (ix >= {{info.acc_width}} || iy >= {{info.acc_height}} ) { consec_bad++; - if (consec_bad > {{features.max_oob}}) { + if (consec_bad > {{info.max_oob}}) { x = mwc_next_11(rctx); y = mwc_next_11(rctx); color = mwc_next_01(rctx); - consec_bad = -{{features.fuse}}; + consec_bad = -{{info.fuse}}; } continue; } - uint32_t i = iy * {{features.acc_stride}} + ix; + uint32_t i = iy * {{info.acc_stride}} + ix; - float4 outcol = tex2D(palTex, color, {{packer.get("cp_step_frac")}}); + + float4 outcol = tex2D(palTex, color, time_frac); update_pix(accbuf_ptr, i, outcol); - } msts[gtid()] = rctx; } ''') return tmpl.substitute( - features = self.features, - packer = self.packer.view('info'), + info = self.info, + cp = self.info.genome, + pcp = self.pcp, NTHREADS = self.NTHREADS, NWARPS = self.NTHREADS / 32, + std_xforms = [n for n in sorted(self.info.genome.xforms) + if n != 'final'], **globals()) diff --git a/cuburn/code/util.py b/cuburn/code/util.py index d6676d7..6b71bcf 100644 --- a/cuburn/code/util.py +++ b/cuburn/code/util.py @@ -24,28 +24,11 @@ def assemble_code(*sections): return ''.join([''.join([getattr(sect, kind) for sect in sections]) for kind in ['headers', 'decls', 'defs']]) -def apply_affine(x, y, xo, yo, packer, base_accessor, base_name): +def apply_affine(x, y, xo, yo, packer): return Template(""" - {{xo}} = {{packer.get(ba + '[0,0]', bn + '_xx')}} * {{x}} - + {{packer.get(ba + '[0,1]', bn + '_xy')}} * {{y}} - + {{packer.get(ba + '[0,2]', bn + '_xo')}}; - {{yo}} = {{packer.get(ba + '[1,0]', bn + '_yx')}} * {{x}} - + {{packer.get(ba + '[1,1]', bn + '_yy')}} * {{y}} - + {{packer.get(ba + '[1,2]', bn + '_yo')}}; - """).substitute(x=x, y=y, xo=xo, yo=yo, packer=packer, - ba=base_accessor, bn=base_name) - -def apply_affine_flam3(x, y, xo, yo, packer, base_accessor, base_name): - """Read an affine transformation in *flam3 order* and apply it.""" - return tempita.Template(""" - {{xo}} = {{packer.get(ba + '[0][0]', bn + '_xx')}} * {{x}} - + {{packer.get(ba + '[1][0]', bn + '_xy')}} * {{y}} - + {{packer.get(ba + '[2][0]', bn + '_xo')}}; - {{yo}} = {{packer.get(ba + '[0][1]', bn + '_yx')}} * {{x}} - + {{packer.get(ba + '[1][1]', bn + '_yy')}} * {{y}} - + {{packer.get(ba + '[2][1]', bn + '_yo')}}; - """).substitute(x=x, y=y, xo=xo, yo=yo, packer=packer, - ba=base_accessor, bn=base_name) + {{xo}} = {{packer.xx}} * {{x}} + {{packer.xy}} * {{y}} + {{packer.xo}}; + {{yo}} = {{packer.yx}} * {{x}} + {{packer.yy}} * {{y}} + {{packer.yo}}; + """).substitute(locals()) class BaseCode(HunkOCode): headers = """ @@ -165,126 +148,4 @@ void write_half(float &xy, float x, float y, float den) { zero(dptr, np.int32(size), stream=stream, block=(1024, 1, 1), grid=(blocks, blocks, 1)) -class DataPackerView(object): - """ - View of a data packer. Intended to be initialized using DataPacker.view(). - - All views of a data packer share the same stream parameters, such as - position and total size, but do not share other parameters, such as the - pointer name used in emitted code lookups or the lookup context. - """ - def __init__(self, packer, ptr, prefix, ns): - self.packer, self.ptr, self.prefix, self.ns = packer, ptr, prefix, ns - - def get(self, accessor, name=None): - """ - Add an access to the stream, returning the formatted load expression - for device use. If 'name' is missing, the name components after the - final dot in the accessor will be used. Little effort is made to - ensure that this is valid C. - """ - if name is None: - name = accessor.rsplit('.', 1)[-1] - name = name.replace('[', '_').replace(']', '') - name = self.prefix + name - self.packer._access(self, accessor, name) - return '%s.%s' % (self.ptr, name) - - def sub(self, dst, src): - """Add a substitution to the namespace.""" - self.ns.append((src, dst)) - - def view(self, ptr_name, prefix=''): - """ - As DataPacker.view(), but preserving the current set of namespace - substitutions. - """ - return DataPackerView(self.packer, ptr_name, prefix, list(self.ns)) - - def _apply_subs(self, ns): - for s, d in self.ns: - ns[d] = eval(s, ns) - return ns - -class DataPacker(HunkOCode): - """ - Packs 32-bit float values into a dynamic data structure, and emits - accessors to those data values from device code. Might get fancier in the - future, but for now it's incredibly barebones. - """ - - default_namespace = {'np': np} - - def __init__(self, tname, clsize=4): - """ - Create a new DataPacker. - - ``tname`` is the name of the structure typedef that will be emitted - via this object's ``decls`` property. - - ``clsize`` is the size of a cache line, in bytes. The resulting - data structure will be padded to that size. - """ - self.tname = tname - self.clsize = clsize - self.packed = {} - self.packed_order = [] - - def view(self, ptr_name, prefix=''): - """Create a DataPacker view. See DataPackerView class for details.""" - return DataPackerView(self, ptr_name, prefix, list()) - - def _access(self, view, accessor, name): - if name in self.packed: - pview, paccessor, pcomp = self.packed[name] - if pview == view and (accessor is None or paccessor == accessor): - return - raise ValueError("Same name, different accessor or view: %s" % name) - comp_accessor = compile(accessor, '{{template}}', 'eval') - self.packed[name] = (view, accessor, comp_accessor) - self.packed_order.append(name) - - def __len__(self): - return len(self.packed_order) - - @property - def align(self): - return (4 * len(self) + self.clsize - 1) / self.clsize * self.clsize - - def pack(self, **kwargs): - base_ns = self.default_namespace.copy() - base_ns.update(kwargs) - out = np.zeros(self.align/4, dtype=np.float32) - subbed_nses = {} - - for i, name in enumerate(self.packed_order): - view, accessor, comp = self.packed[name] - if view not in subbed_nses: - subbed_nses[view] = view._apply_subs(dict(base_ns)) - try: - val = eval(comp, subbed_nses[view]) - except Exception, e: - print 'Error while evaluating accessor "%s"' % accessor - raise e - out[i] = val - return out - - @property - def decls(self): - tmpl = Template(""" -typedef struct { -{{for name, accessor in values}} - float {{'%-20s' % name}}; // {{accessor}} -{{endfor}} -{{if padding > 0}} - // Align to fill whole cache lines - float padding[{{padding}}]; -{{endif}} -} {{tname}}; -""") - return tmpl.substitute( - values = [(n, self.packed[n][1]) for n in self.packed_order], - padding = self.align / 4 - len(self), - tname = self.tname - ) diff --git a/cuburn/code/variations.py b/cuburn/code/variations.py index 0ad211e..d2eaa57 100644 --- a/cuburn/code/variations.py +++ b/cuburn/code/variations.py @@ -1,11 +1,32 @@ -import tempita +import numpy as np + +from cuburn.code.util import Template -var_nos = {} var_code = {} +var_params = {} -def var(num, name, code): - var_nos[num] = name - var_code[name] = tempita.Template(code) +def var(num, name, code, params=None): + var_code[name] = Template(code) + if params is not None: + r = {} + for p in params.split(): + if '=' in p: + p, default = p.split('=') + if default == 'M_PI': + default = np.pi + else: + default = float(default) + else: + default = 0.0 + r[p] = default + var_params[name] = r + +# TODO: This is a shitty hack +def precalc(name, code): + def precalc_fun(pv): + pre = pv._precalc() + pre._code(Template(code).substitute(pre=pre)) + Template.default_namespace[name+'_precalc'] = precalc_fun # Variables note: all functions will have their weights as 'w', # input variables 'tx' and 'ty', and output 'ox' and 'oy' available @@ -119,10 +140,10 @@ var(14, 'bent', """ """) var(15, 'waves', """ - float c10 = {{px.get(None, 'pre_xy')}}; - float c11 = {{px.get(None, 'pre_yy')}}; - float dx = {{px.get(None, 'pre_xo')}}; - float dy = {{px.get(None, 'pre_yo')}}; + float c10 = {{px.affine.xy}}; + float c11 = {{px.affine.yy}}; + float dx = {{px.affine.xo}}; + float dy = {{px.affine.yo}}; float dx2 = 1.0f / (dx * dx); float dy2 = 1.0f / (dy * dy); @@ -140,8 +161,8 @@ var(16, 'fisheye', """ var(17, 'popcorn', """ float dx = tanf(3.0f*ty); float dy = tanf(3.0f*tx); - ox += w * (tx + {{px.get(None, 'pre_xo')}} * sinf(dx)); - oy += w * (ty + {{px.get(None, 'pre_yo')}} * sinf(dy)); + ox += w * (tx + {{px.affine.xo}} * sinf(dx)); + oy += w * (ty + {{px.affine.yo}} * sinf(dy)); """) var(18, 'exponential', """ @@ -166,7 +187,7 @@ var(20, 'cosine', """ """) var(21, 'rings', """ - float dx = {{px.get(None, 'pre_xo')}} * {{px.get(None, 'pre_xo')}}; + float dx = {{px.affine.xo}} * {{px.affine.xo}}; float r = sqrtf(tx*tx + ty*ty); float a = atan2f(tx, ty); r = w * (fmodf(r+dx, 2.0f*dx) - dx + r * (1.0f - dx)); @@ -175,9 +196,9 @@ var(21, 'rings', """ """) var(22, 'fan', """ - float dx = M_PI * ({{px.get(None, 'pre_xo')}} * {{px.get(None, 'pre_xo')}}); + float dx = M_PI * ({{px.affine.xo}} * {{px.affine.xo}}); float dx2 = 0.5f * dx; - float dy = {{px.get(None, 'pre_yo')}}; + float dy = {{px.affine.yo}}; float a = atan2f(tx, ty); a += (fmodf(a+dy, dx) > dx2) ? -dx2 : dx2; float r = w * sqrtf(tx*tx + ty*ty); @@ -188,24 +209,24 @@ var(22, 'fan', """ var(23, 'blob', """ float r = sqrtf(tx*tx + ty*ty); float a = atan2f(tx, ty); - float bdiff = 0.5f * ({{px.get('xf.blob_high - xf.blob_low','blob_diff')}}); - r *= w * ({{px.get('xf.blob_low')}} + bdiff * (1.0f + sinf({{px.get('xf.blob_waves')}} * a))); + float bdiff = 0.5f * ({{pv.high}} - {{pv.low}}); + r *= w * ({{pv.low}} + bdiff * (1.0f + sinf({{pv.waves}} * a))); ox += sinf(a) * r; oy += cosf(a) * r; - """) + """, 'low high=1 waves=1') var(24, 'pdj', """ - float nx1 = cosf({{px.get('xf.pdj_b')}} * tx); - float nx2 = sinf({{px.get('xf.pdj_c')}} * tx); - float ny1 = sinf({{px.get('xf.pdj_a')}} * ty); - float ny2 = cosf({{px.get('xf.pdj_d')}} * ty); + float nx1 = cosf({{pv.b}} * tx); + float nx2 = sinf({{pv.c}} * tx); + float ny1 = sinf({{pv.a}} * ty); + float ny2 = cosf({{pv.d}} * ty); ox += w * (ny1 - nx1); oy += w * (nx2 - ny2); - """) + """, 'a b c d') var(25, 'fan2', """ - float dy = {{px.get('xf.fan2_y')}}; - float dx = M_PI * {{px.get('xf.fan2_x')}} * {{px.get('xf.fan2_x')}}; + float dy = {{pv.y}}; + float dx = M_PI * {{pv.x}} * {{pv.x}}; float dx2 = 0.5f * dx; float a = atan2f(tx, ty); float r = w * sqrtf(tx*tx + ty*ty); @@ -217,16 +238,16 @@ var(25, 'fan2', """ ox += r * sinf(a); oy += r * cosf(a); - """) + """, 'x y') var(26, 'rings2', """ - float dx = {{px.get('xf.rings2_val')}} * {{px.get('xf.rings2_val')}}; + float dx = {{pv.val}} * {{pv.val}}; float r = sqrtf(tx*tx + ty*ty); float a = atan2f(tx, ty); r += -2.0f * dx * (int)((r+dx)/(2.0f*dx)) + r * (1.0f - dx); ox += w * sinf(a) * r; oy += w * cosf(a) * r; - """) + """, 'val') var(27, 'eyefish', """ float r = 2.0f * w / (sqrtf(tx*tx + ty*ty) + 1.0f); @@ -245,16 +266,21 @@ var(29, 'cylinder', """ oy += w * ty; """) -var(30, 'perspective', """ - float pdist = {{px.get('xf.perspective_dist')}}; - float pvsin = {{px.get('np.sin(xf.perspective_angle*np.pi/2)', 'pvsin')}}; - float pvfcos = {{px.get( - 'xf.perspective_dist*np.cos(xf.perspective_angle*np.pi/2)', 'pvfcos')}}; +precalc('perspective', """ + float pang = {{pre.angle}} * M_PI_2; + float pdist = fmaxf(1e-9, {{pre.dist}}); + {{pre._set('mdist')}} = pdist; + {{pre._set('sin')}} = sin(pang); + {{pre._set('cos')}} = pdist * cos(pang); + """) - float t = 1.0f / (pdist - ty * pvsin); - ox += w * pdist * tx * t; - oy += w * pvfcos * ty * t; - """) +var(30, 'perspective', """ + {{perspective_precalc(pv)}} + + float t = 1.0f / ({{pv.mdist}} - ty * {{pv.sin}}); + ox += w * {{pv.mdist}} * tx * t; + oy += w * {{pv.cos}} * ty * t; + """, 'angle dist') var(31, 'noise', """ float tmpr = mwc_next_01(rctx) * 2.0f * M_PI; @@ -263,33 +289,39 @@ var(31, 'noise', """ oy += ty * r * sinf(tmpr); """) +precalc('julian', + "{{pre._set('cn')}} = {{pre.dist}} / (2.0f * {{pre.power}});\n") + var(32, 'julian', """ - float power = {{px.get('xf.julian_power')}}; + {{julina_precalc(pv)}} + float power = {{pv.power}}; float t_rnd = truncf(mwc_next_01(rctx) * fabsf(power)); float a = atan2f(ty, tx); float tmpr = (a + 2.0f * M_PI * t_rnd) / power; - float cn = {{px.get('xf.julian_dist / xf.julian_power / 2', 'julian_cn')}}; + float cn = {{pv.cn}}; float r = w * powf(tx * tx + ty * ty, cn); ox += r * cosf(tmpr); oy += r * sinf(tmpr); - """) + """, 'power=1 dist=1') + +precalc('juliascope', + "{{pre._set('cn')}} = {{pre.dist}} / (2.0f * {{pre.power}});\n") var(33, 'juliascope', """ + {{juliascope_precalc(pv)}} + float ang = atan2f(ty, tx); - float power = {{px.get('xf.juliascope_power', 'juscope_power')}}; + float power = {{pv.power}}; float t_rnd = truncf(mwc_next_01(rctx) * fabsf(power)); // TODO: don't draw the extra random number if (mwc_next(rctx) & 1) ang = -ang; float tmpr = (2.0f * M_PI * t_rnd + ang) / power; - - float cn = {{px.get('xf.juliascope_dist / xf.juliascope_power / 2', - 'juscope_cn')}}; - float r = w * powf(tx * tx + ty * ty, cn); + float r = w * powf(tx * tx + ty * ty, {{pv.cn}}); ox += r * cosf(tmpr); oy += r * sinf(tmpr); - """) + """, 'power=1 dist=1') var(34, 'blur', """ float tmpr = mwc_next_01(rctx) * 2.0f * M_PI; @@ -300,40 +332,39 @@ var(34, 'blur', """ var(35, 'gaussian', """ float ang = mwc_next_01(rctx) * 2.0f * M_PI; - float r = w * ( mwc_next_01(rctx) + mwc_next_01(rctx) - + mwc_next_01(rctx) + mwc_next_01(rctx) - 2.0f ); + float r = w * sqrtf(-2.0f * log2f(mwc_next_01(rctx)) / M_LOG2E); ox += r * cosf(ang); oy += r * sinf(ang); """) var(36, 'radial_blur', """ - float blur_angle = {{px.get('xf.radial_blur_angle')}} * M_PI * 0.5f; + float blur_angle = {{pv.angle}} * M_PI * 0.5f; float spinvar = sinf(blur_angle); float zoomvar = cosf(blur_angle); - float r = w * ( mwc_next_01(rctx) + mwc_next_01(rctx) - + mwc_next_01(rctx) + mwc_next_01(rctx) - 2.0f ); + + float r = w * sqrtf(-2.0f * log2f(mwc_next_01(rctx)) / M_LOG2E); float ra = sqrtf(tx*tx + ty*ty); float tmpa = atan2f(ty, tx) + spinvar * r; float rz = zoomvar * r - 1.0f; ox += ra*cosf(tmpa) + rz*tx; oy += ra*sinf(tmpa) + rz*ty; - """) + """, 'angle') var(37, 'pie', """ - float slices = {{px.get('xf.pie_slices')}}; + float slices = {{pv.slices}}; float sl = truncf(mwc_next_01(rctx) * slices + 0.5f); - float a = {{px.get('xf.pie_rotation')}} + - 2.0f * M_PI * (sl + mwc_next_01(rctx) * {{px.get('xf.pie_thickness')}}) / slices; + float a = {{pv.rotation}} + + 2.0f * M_PI * (sl + mwc_next_01(rctx) * {{pv.thickness}}) / slices; float r = w * mwc_next_01(rctx); ox += r * cosf(a); oy += r * sinf(a); - """) + """, 'slices=6 rotation thickness=0.5') var(38, 'ngon', """ - float power = {{px.get('xf.ngon_power')}} * 0.5f; - float b = 2.0f * M_PI / {{px.get('xf.ngon_sides')}}; - float corners = {{px.get('xf.ngon_corners')}}; - float circle = {{px.get('xf.ngon_circle')}}; + float power = {{pv.power}} * 0.5f; + float b = 2.0f * M_PI / {{pv.sides}}; + float corners = {{pv.corners}}; + float circle = {{pv.circle}}; float r_factor = powf(tx*tx + ty*ty, power); float theta = atan2f(ty, tx); @@ -343,11 +374,11 @@ var(38, 'ngon', """ ox += w * tx * amp; oy += w * ty * amp; - """) + """, 'sides=5 power=3 circle=1 corners=2') var(39, 'curl', """ - float c1 = {{px.get('xf.curl_c1')}}; - float c2 = {{px.get('xf.curl_c2')}}; + float c1 = {{pv.c1}}; + float c2 = {{pv.c2}}; float re = 1.0f + c1*tx + c2*(tx*tx - ty*ty); float im = c1*ty + 2.0f*c2*tx*ty; @@ -355,15 +386,15 @@ var(39, 'curl', """ ox += r * (tx*re + ty*im); oy += r * (ty*re - tx*im); - """) + """, 'c1=1 c2') var(40, 'rectangles', """ - float rx = {{px.get('xf.rectangles_x')}}; - float ry = {{px.get('xf.rectangles_y')}}; + float rx = {{pv.x}}; + float ry = {{pv.y}}; ox += w * ( (rx==0.0f) ? tx : rx * (2.0f * floorf(tx/rx) + 1.0f) - tx); oy += w * ( (ry==0.0f) ? ty : ry * (2.0f * floorf(ty/ry) + 1.0f) - ty); - """) + """, 'x y') var(41, 'arch', """ float ang = mwc_next_01(rctx) * w * M_PI; @@ -417,9 +448,8 @@ var(48, 'cross', """ """) var(49, 'disc2', """ - float twist = {{px.get('xf.disc2_twist')}}; - float rotpi = {{px.get('xf.disc2_rot', 'disc2_rotpi')}}; - rotpi *= M_PI; + float twist = {{pv.twist}} + float rotpi = {{pv.rot}} * M_PI; float sintwist = sinf(twist); float costwist = cosf(twist) - 1.0f; @@ -441,70 +471,71 @@ var(49, 'disc2', """ ox += r * (sinf(t) + costwist); oy += r * (cosf(t) + sintwist); - """) + """, 'rot twist') var(50, 'super_shape', """ float ang = atan2f(ty, tx); - float theta = 0.25f * ({{px.get('xf.super_shape_m')}} * ang + M_PI); + float theta = 0.25f * ({{pv.m}} * ang + M_PI); float t1 = fabsf(cosf(theta)); float t2 = fabsf(sinf(theta)); - t1 = powf(t1,{{px.get('xf.super_shape_n2')}}); - t2 = powf(t2,{{px.get('xf.super_shape_n3')}}); - float myrnd = {{px.get('xf.super_shape_rnd')}}; + t1 = powf(t1, {{pv.n2}}); + t2 = powf(t2, {{pv.n3}}); + float myrnd = {{pv.rnd}}; float d = sqrtf(tx*tx+ty*ty); - float r = w * ((myrnd*mwc_next_01(rctx) + (1.0f-myrnd)*d) - {{px.get('xf.super_shape_holes')}}) - * powf(t1+t2, {{px.get('-1.0 / xf.super_shape_n1', 'super_shape_pneg')}}) / d; + float r = w * ((myrnd*mwc_next_01(rctx) + (1.0f-myrnd)*d) - {{pv.holes}}) + * powf(t1+t2, -1.0f / {{pv.n1}}) / d; ox += r * tx; oy += r * ty; - """) + """, 'rnd m n1=1 n2=1 n3=1 holes') var(51, 'flower', """ - float holes = {{px.get('xf.flower_holes')}}; - float petals = {{px.get('xf.flower_petals')}}; + float holes = {{pv.holes}}; + float petals = {{pv.petals}}; - float r = w * (mwc_next_01(rctx) - holes) * cosf(petals*atan2f(ty, tx)) / sqrtf(tx*tx + ty*ty); + float r = w * (mwc_next_01(rctx) - holes) + * cosf(petals*atan2f(ty, tx)) / sqrtf(tx*tx + ty*ty); ox += r * tx; oy += r * ty; - """) + """, 'holes petals') var(52, 'conic', """ float d = sqrtf(tx*tx + ty*ty); float ct = tx / d; - float holes = {{px.get('xf.conic_holes')}}; - float eccen = {{px.get('xf.conic_eccentricity')}}; + float holes = {{pv.holes}}; + float eccen = {{pv.eccentricity}}; float r = w * (mwc_next_01(rctx) - holes) * eccen / (1.0f + eccen*ct) / d; ox += r * tx; oy += r * ty; - """) + """, 'holes eccentricity=1') var(53, 'parabola', """ float r = sqrtf(tx*tx + ty*ty); float sr = sinf(r); float cr = cosf(r); - ox += {{px.get('xf.parabola_height')}} * w * sr * sr * mwc_next_01(rctx); - oy += {{px.get('xf.parabola_width')}} * w * cr * mwc_next_01(rctx); - """) + ox += {{pv.height}} * w * sr * sr * mwc_next_01(rctx); + oy += {{pv.width}} * w * cr * mwc_next_01(rctx); + """, 'height width') var(54, 'bent2', """ float nx = 1.0f; - if (tx < 0.0f) nx = {{px.get('xf.bent2_x')}}; + if (tx < 0.0f) nx = {{pv.x}}; float ny = 1.0f; - if (ty < 0.0f) ny = {{px.get('xf.bent2_y')}}; + if (ty < 0.0f) ny = {{pv.y}}; ox += w * nx * tx; oy += w * ny * ty; - """) + """, 'x=1 y=1') var(55, 'bipolar', """ float x2y2 = tx*tx + ty*ty; float t = x2y2 + 1.0f; float x2 = tx * 2.0f; - float ps = -M_PI_2 * {{px.get('xf.bipolar_shift')}}; + float ps = -M_PI_2 * {{pv.shift}}; float y = 0.5f * atan2f(2.0f * ty, x2y2 - 1.0f) + ps; if (y > M_PI_2) @@ -514,7 +545,7 @@ var(55, 'bipolar', """ ox += w * 0.25f * M_2_PI * logf( (t+x2) / (t-x2) ); oy += w * M_2_PI * y; - """) + """, 'shift') var(56, 'boarders', """ float roundX = rintf(tx); @@ -556,7 +587,7 @@ var(57, 'butterfly', """ """) var(58, 'cell', """ - float cell_size = {{px.get('xf.cell_size')}}; + float cell_size = {{pv.size}}; float inv_cell_size = 1.0f/cell_size; /* calculate input cell */ @@ -588,31 +619,35 @@ var(58, 'cell', """ ox += w * (dx + x*cell_size); oy -= w * (dy + y*cell_size); - """) + """, 'size=1') var(59, 'cpow', """ float a = atan2f(ty, tx); float lnr = 0.5f * logf(tx*tx+ty*ty); - float power = {{px.get('xf.cpow_power')}}; + float power = {{pv.power}}; float va = 2.0f * M_PI / power; - float vc = {{px.get('xf.cpow_r')}} / power; - float vd = {{px.get('xf.cpow_i')}} / power; + float vc = {{pv.cpow_r}} / power; + float vd = {{pv.cpow_i}} / power; float ang = vc*a + vd*lnr + va*floorf(power*mwc_next_01(rctx)); float m = w * expf(vc * lnr - vd * a); ox += m * cosf(ang); oy += m * sinf(ang); - """) + """, 'r=1 i power=1') + + +precalc('curve', ''' + float xl = {{pv.xlength}}, yl = {{pv.ylength}}; + {{pre._set('x2')}} = 1.0f / max(1e-20f, xl * xl); + {{pre._set('y2')}} = 1.0f / max(1e-20f, yl * yl); + ''') var(60, 'curve', """ - float pc_xlen = {{px.get('xf.curve_xlength * xf.curve_xlength','pc_xlen')}}; - float pc_ylen = {{px.get('xf.curve_ylength * xf.curve_ylength','pc_ylen')}}; + {{curve_precalc()}} + float pc_xlen = {{pv.x2}}, pc_ylen = {{pv.y2}}; - if (pc_xlen<1E-20f) pc_xlen = 1E-20f; - if (pc_ylen<1E-20f) pc_ylen = 1E-20f; - - ox += w * (tx + {{px.get('xf.curve_xamp')}} * expf(-ty*ty/pc_xlen)); - oy += w * (ty + {{px.get('xf.curve_yamp')}} * expf(-tx*tx/pc_ylen)); - """) + ox += w * (tx + {{pv.xamp}} * expf(-ty*ty*pc_xlen)); + oy += w * (ty + {{pv.yamp}} * expf(-tx*tx*pc_ylen)); + """, 'xamp yamp xlength=1 ylength=1') var(61, 'edisc', """ float tmp = tx*tx + ty*ty + 1.0f; @@ -662,7 +697,7 @@ var(62, 'elliptic', """ var(63, 'escher', """ float a = atan2f(ty,tx); float lnr = 0.5f * logf(tx*tx + ty*ty); - float ebeta = {{px.get('xf.escher_beta')}}; + float ebeta = {{pv.beta}}; float seb = sinf(ebeta); float ceb = cosf(ebeta); float vc = 0.5f * (1.0f + ceb); @@ -672,7 +707,7 @@ var(63, 'escher', """ ox += m * cosf(n); oy += m * sinf(n); - """) + """, 'beta') var(64, 'foci', """ float expx = expf(tx) * 0.5f; @@ -685,27 +720,26 @@ var(64, 'foci', """ """) var(65, 'lazysusan', """ - float lx = {{px.get('xf.lazysusan_x')}}; - float ly = {{px.get('xf.lazysusan_y')}}; + float lx = {{pv.x}}; + float ly = {{pv.y}}; float x = tx - lx; float y = ty + ly; float r = sqrtf(x*x + y*y); if (r < w) { - float a = atan2f(y,x) + {{px.get('xf.lazysusan_spin')}} + - {{px.get('xf.lazysusan_twist')}}*(w-r); + float a = atan2f(y,x) + {{pv.spin}} + {{pv.twist}} * (w - r); r = w * r; ox += r * cosf(a) + lx; oy += r * sinf(a) - ly; } else { - r = w * (1.0f + {{px.get('xf.lazysusan_space')}} / r); + r = w * (1.0f + {{pv.space}} / r); ox += r * x + lx; oy += r * y - ly; } - """) + """, 'x y twist space spin') var(66, 'loonie', """ float r2 = tx*tx + ty*ty;; @@ -732,8 +766,7 @@ var(67, 'pre_blur', """ """) var(68, 'modulus', """ - float mx = {{px.get('xf.modulus_x')}}; - float my = {{px.get('xf.modulus_y')}}; + float mx = {{pv.x}}, my = {{pv.y}}; float xr = 2.0f*mx; float yr = 2.0f*my; @@ -750,13 +783,13 @@ var(68, 'modulus', """ oy += w * ( my - fmodf(my - ty, yr)); else oy += w * ty; - """) + """, 'x y') var(69, 'oscope', """ - float tpf = 2.0f * M_PI * {{px.get('xf.oscope_frequency')}}; - float amp = {{px.get('xf.oscope_amplitude')}}; - float sep = {{px.get('xf.oscope_separation')}}; - float dmp = {{px.get('xf.oscope_damping')}}; + float tpf = 2.0f * M_PI * {{pv.frequency}}; + float amp = {{pv.amplitude}}; + float sep = {{pv.separation}}; + float dmp = {{pv.damping}}; float t = amp * expf(-fabsf(tx)*dmp) * cosf(tpf*tx) + sep; @@ -765,7 +798,7 @@ var(69, 'oscope', """ oy -= w*ty; else oy += w*ty; - """) + """, 'separation=1 frequency=M_PI amplitude=1 damping') var(70, 'polar2', """ float p2v = w / M_PI; @@ -774,10 +807,10 @@ var(70, 'polar2', """ """) var(71, 'popcorn2', """ - float c = {{px.get('xf.popcorn2_c')}}; - ox += w * (tx + {{px.get('xf.popcorn2_x')}} * sinf(tanf(ty*c))); - oy += w * (ty + {{px.get('xf.popcorn2_y')}} * sinf(tanf(tx*c))); - """) + float c = {{pv.c}}; + ox += w * (tx + {{pv.x}} * sinf(tanf(ty*c))); + oy += w * (ty + {{pv.y}} * sinf(tanf(tx*c))); + """, 'x y c') var(72, 'scry', """ /* note that scry does not multiply by weight, but as the */ @@ -790,68 +823,61 @@ var(72, 'scry', """ """) var(73, 'separation', """ - float sx2 = {{px.get('xf.separation_x * xf.separation_x', 'sx2')}}; - float sy2 = {{px.get('xf.separation_y * xf.separation_y', 'sy2')}}; + float sx2 = {{pv.x}} * {{pv.x}}; + float sy2 = {{pv.y}} * {{pv.y}}; if (tx > 0.0f) - ox += w * (sqrtf(tx*tx + sx2) - tx*{{px.get('xf.separation_xinside')}}); + ox += w * (sqrtf(tx*tx + sx2) - tx*{{pv.xinside}}); else - ox -= w * (sqrtf(tx*tx + sx2) + tx*{{px.get('xf.separation_xinside')}}); + ox -= w * (sqrtf(tx*tx + sx2) + tx*{{pv.xinside}}); if (ty > 0.0f) - oy += w * (sqrtf(ty*ty + sy2) - ty*{{px.get('xf.separation_yinside')}}); + oy += w * (sqrtf(ty*ty + sy2) - ty*{{pv.yinside}}); else - oy -= w * (sqrtf(ty*ty + sy2) + ty*{{px.get('xf.separation_yinside')}}); - """) + oy -= w * (sqrtf(ty*ty + sy2) + ty*{{pv.yinside}}); + """, 'x xinside y yinside') var(74, 'split', """ - if (cosf(tx*{{px.get('xf.split_xsize')}}*M_PI) >= 0.0f) + if (cosf(tx*{{pv.xsize}}*M_PI) >= 0.0f) oy += w*ty; else oy -= w*ty; - if (cosf(ty*{{px.get('xf.split_ysize')}}*M_PI) >= 0.0f) + if (cosf(ty*{{pv.ysize}}*M_PI) >= 0.0f) ox += w*tx; else ox -= w*tx; - """) + """, 'xsize ysize') var(75, 'splits', """ - if (tx >= 0.0f) - ox += w*(tx + {{px.get('xf.splits_x')}}); - else - ox += w*(tx - {{px.get('xf.splits_x')}}); - - if (ty >= 0) - oy += w*(ty + {{px.get('xf.splits_y')}}); - else - oy += w*(ty - {{px.get('xf.splits_y')}}); - """) + ox += w*(tx + copysignf({{pv.x}}, tx)); + oy += w*(ty + copysignf({{pv.y}}, ty)); + """, 'x y') var(76, 'stripes', """ float roundx = floorf(tx + 0.5f); float offsetx = tx - roundx; - ox += w * (offsetx * (1.0f - {{px.get('xf.stripes_space')}}) + roundx); - oy += w * (ty + offsetx*offsetx*{{px.get('xf.stripes_warp')}}); - """) + ox += w * (offsetx * (1.0f - {{pv.space}}) + roundx); + oy += w * (ty + offsetx*offsetx*{{pv.warp}}); + """, 'space warp') var(77, 'wedge', """ float r = sqrtf(tx*tx + ty*ty); - float a = atan2f(ty, tx) + {{px.get('xf.wedge_swirl')}} * r; - float wc = {{px.get('xf.wedge_count')}}; - float wa = {{px.get('xf.wedge_angle')}}; + float a = atan2f(ty, tx) + {{pv.swirl}} * r; + float wc = {{pv.count}}; + float wa = {{pv.angle}}; float c = floorf((wc * a + M_PI) * M_1_PI * 0.5f); float comp_fac = 1 - wa * wc * M_1_PI * 0.5f; a = a * comp_fac + c * wa; - r = w * (r + {{px.get('xf.wedge_hole')}}); + r = w * (r + {{pv.hole}}); ox += r * cosf(a); oy += r * sinf(a); - """) + """, 'angle hole count=1 swirl') var(81, 'waves2', """ - ox += w*(tx + {{px.get('xf.waves2_scalex')}}*sinf(ty * {{px.get('xf.waves2_freqx')}})); - oy += w*(ty + {{px.get('xf.waves2_scaley')}}*sinf(tx * {{px.get('xf.waves2_freqy')}})); - """) + ox += w*(tx + {{pv.scalex}}*sinf(ty * {{pv.freqx}})); + oy += w*(ty + {{pv.scaley}}*sinf(tx * {{pv.freqy}})); + """, 'scalex scaley freqx freqy') var(82, 'exp', """ float expe = expf(tx); @@ -935,21 +961,22 @@ var(95, 'coth', """ var(97, 'flux', """ float xpw = tx + w; float xmw = tx - w; - float avgr = w * (2.0f + {{px.get('xf.flux_spread')}}) * sqrtf(sqrtf(ty*ty+xpw*xpw)/sqrtf(ty*ty+xmw*xmw)); + float avgr = w * (2.0f + {{pv.spread}}) + * sqrtf(sqrtf(ty*ty+xpw*xpw)/sqrtf(ty*ty+xmw*xmw)); float avga = (atan2f(ty, xmw) - atan2f(ty,xpw))*0.5f; ox += avgr * cosf(avga); oy += avgr * sinf(avga); - """) + """, 'spread') var(98, 'mobius', """ - float rea = {{px.get('xf.mobius_re_a')}}; - float ima = {{px.get('xf.mobius_im_a')}}; - float reb = {{px.get('xf.mobius_re_b')}}; - float imb = {{px.get('xf.mobius_im_b')}}; - float rec = {{px.get('xf.mobius_re_c')}}; - float imc = {{px.get('xf.mobius_im_c')}}; - float red = {{px.get('xf.mobius_re_d')}}; - float imd = {{px.get('xf.mobius_im_d')}}; + float rea = {{pv.re_a}}; + float ima = {{pv.im_a}}; + float reb = {{pv.re_b}}; + float imb = {{pv.im_b}}; + float rec = {{pv.re_c}}; + float imc = {{pv.im_c}}; + float red = {{pv.re_d}}; + float imd = {{pv.im_d}}; float re_u, im_u, re_v, im_v, rad_v; @@ -962,5 +989,5 @@ var(98, 'mobius', """ ox += rad_v * (re_u*re_v + im_u*im_v); oy += rad_v * (im_u*re_v - re_u*im_v); - """) + """, 're_a im_a re_b im_b re_c im_c re_d im_d') diff --git a/cuburn/genome.py b/cuburn/genome.py new file mode 100644 index 0000000..bb0b041 --- /dev/null +++ b/cuburn/genome.py @@ -0,0 +1,151 @@ +import json +import numpy as np +import scipy.interpolate +from cuburn import affine + +class SplEval(object): + def __init__(self, knots): + # If a single value is passed, normalize to a constant set of knots + if isinstance(knots, (int, float)): + knots = [-0.1, knots, 0.0, knots, 1.0, knots, 1.1, knots] + + # If stabilizing knots are missing before or after the edges of the + # [0,1] interval, add them. TODO: choose better knots + if knots[0] >= 0: + knots = [-0.1, knots[1]] + knots + if knots[-2] <= 1: + knots.extend([1.1, knots[-1]]) + + self.knots = np.zeros((2, len(knots)/2)) + self.knots.T.flat[:] = knots + + def __call__(self, itime): + try: + return np.asarray(map(self, itime)) + except: + pass + idx = np.searchsorted(self.knots[0], itime) - 2 + idx = max(0, min(len(self.knots[0]) - 4, idx)) + + times = self.knots[0][idx:idx+4] + vals = self.knots[1][idx:idx+4] + # Normalize to [0,1] + t = itime - times[1] + times = times - times[1] + t = t / times[2] + times = times / times[2] + + return self._interp(times, vals, t) + + @staticmethod + def _interp(times, vals, t): + t2 = t * t + t3 = t * t2 + + m1 = (vals[2] - vals[0]) / (1.0 - times[0]) + m2 = (vals[3] - vals[1]) / times[3] + + r = ( m1 * (t3 - 2*t2 + t) + vals[1] * (2*t3 - 3*t2 + 1) + + m2 * (t3 - t2) + vals[2] * (-2*t3 + 3*t2) ) + return r + + def __str__(self): + return '[%g:%g]' % (self(0), self(1)) + def __repr__(self): + return '' % (self(0), self(1)) + + @classmethod + def wrap(cls, obj): + """ + Given a dict 'obj' representing, for instance, a Genome object, walk + through the object recursively and in-place, turning any number or + list of numbers into an SplEval. + """ + for k, v in obj.items(): + if (isinstance(v, (float, int)) or + (isinstance(v, list) and isinstance(v[1], (float, int)))): + obj[k] = cls(v) + elif isinstance(v, dict): + cls.wrap(v) + +class RenderInfo(object): + """ + Determine features and constants required to render a particular set of + genomes. The values of this class are fixed before compilation begins. + """ + # Constant parameters which control handling of out-of-frame samples: + # Number of iterations to iterate without write after new point + fuse = 10 + # Maximum consecutive out-of-bounds points before picking new point + max_oob = 10 + + # Height of the texture pallete which gets uploaded to the GPU (assuming + # that palette-from-texture is enabled). For most genomes, this doesn't + # need to be very large at all. However, since only an easily-cached + # fraction of this will be accessed per SM, larger values shouldn't hurt + # performance too much. Power-of-two, please. + palette_height = 16 + + # Maximum width of DE and other spatial filters, and thus in turn the + # amount of padding applied. Note that, for now, this must not be changed! + # The filtering code makes deep assumptions about this value. + gutter = 16 + + # TODO: for now, we always throw away the alpha channel before writing. + # All code is in place to not do this, we just need to find a way to expose + # this preference via the API (or push alpha blending entirely on the client, + # which I'm not opposed to) + alpha_output_channel = False + + # TODO: fix these + chaos_used = False + std_xforms = [0, 1, 2] + final_xform_index = 3 + pal_has_alpha = False + density = 2000 + + def __init__(self, db, **kwargs): + self.db = db + # Copy all args into this object's namespace + self.__dict__.update(kwargs) + + self.acc_width = self.width + 2 * self.gutter + self.acc_height = self.height + 2 * self.gutter + self.acc_stride = 32 * int(np.ceil(self.acc_width / 32.)) + + # Deref genome + self.genome = self.db.genomes[self.genome] + +class _AttrDict(dict): + def __getattr__(self, name): + return self[name] + +def load_info(contents): + result = json.loads(contents, object_hook=_AttrDict) + SplEval.wrap(result.genomes) + + # A Job object will have more details or something + result = RenderInfo(result, **result.renders.values()[0]) + return result + +class HacketyGenome(object): + """ + Holdover class to postpone a very deep refactoring as long as possible. + Converts property accesses into interpolations over predetermined times. + """ + def __init__(self, referent, times): + # Times can be singular + self.referent, self.times = referent, times + def __getattr__(self, name): + r = getattr(self.referent, str(name)) + if isinstance(r, _AttrDict): + return HacketyGenome(r, self.times) + elif isinstance(r, SplEval): + return r(self.times) + return r + __getitem__ = __getattr__ + +if __name__ == "__main__": + import sys + import pprint + pprint.pprint(read_genome(sys.stdin)) diff --git a/cuburn/render.py b/cuburn/render.py index 28dd6bd..83e7b0e 100644 --- a/cuburn/render.py +++ b/cuburn/render.py @@ -1,12 +1,15 @@ +import os import sys import math import re -import time +import time as timemod +import tempfile from itertools import cycle, repeat, chain, izip from ctypes import * from cStringIO import StringIO import numpy as np from scipy import ndimage +import base64 from fr0stlib import pyflam3 from fr0stlib.pyflam3._flam3 import * @@ -17,79 +20,11 @@ import pycuda.driver as cuda import pycuda.tools from pycuda.gpuarray import vec +import cuburn.genome from cuburn import affine from cuburn.code import util, mwc, iter, filtering -def _chunk(l, cs): - """ - Yield the contents of list ``l`` in chunks of size no more than ``cs``. - """ - for i in range(0, len(l), cs): - yield l[i:i+cs] - -class Genome(object): - """ - Normalizes and precalculates some properties of a Genome. Assumes that - Genome argument passed in will not change. - """ - # Fix the ctypes ugliness since switching to __getattribute__ in 2.7. - # There are more elegant ways to do this, but I can't be bothered. - def __getattr__(self, name): - return getattr(self.cp, name) - - def __init__(self, ctypes_genome): - self.cp = ctypes_genome - self.xforms = [self.xform[i] for i in range(self.num_xforms)] - dens = np.array([x.density for i, x in enumerate(self.xforms) - if i != self.final_xform_index]) - - - num_std_xf = len(dens) - self.chaos_densities = np.zeros( (num_std_xf,num_std_xf) ) - for r in range(num_std_xf): - chaos_row = np.array([ctypes_genome.chaos[r][c] - for c in range(num_std_xf)]) - chaos_row = chaos_row * dens - chaos_row /= np.sum(chaos_row) - chaos_row = np.cumsum(chaos_row) - self.chaos_densities[r,:] = chaos_row - - dens /= np.sum(dens) - self.norm_density = np.cumsum(dens) - - # For performance reasons, defer this calculation - self._camera_transform = None - - scale = property(lambda cp: 2.0 ** cp.zoom) - adj_density = property(lambda cp: cp.sample_density * (cp.scale ** 2)) - ppu = property(lambda cp: cp.pixels_per_unit * cp.scale) - - @property - def camera_transform(self): - """ - An affine matrix which will transform IFS coordinates to image width - and height. Assumes that width and height are constant. - """ - cp = self - if self._camera_transform is not None: - return self._camera_transform - g = Features.gutter - if cp.estimator: - # The filter shifts by this amount as a side effect of it being - # written in a confusing and sloppy manner - # TODO: this will be weird in an animation where one endpoint has - # a radius of 0, and the other does not - g -= Features.gutter / 2 - 1 - self._camera_transform = ( - affine.translate(0.5 * cp.width + g, 0.5 * cp.height + g) - * affine.scale(cp.ppu, cp.ppu) - * affine.translate(-cp._center[0], -cp._center[1]) - * affine.rotate(cp.rotate * 2 * np.pi / 360, - cp.rot_center[0], - cp.rot_center[1]) ) - return self._camera_transform - -class Animation(object): +class Renderer(object): """ Control structure for rendering a series of frames. @@ -108,27 +43,13 @@ class Animation(object): interpolated sequence between one or two genomes. """ - # Large launches lock the display for a considerable period and may be - # killed due to a device timeout; small launches are harder to load-balance - # on the GPU and incur overhead. This empirical value is multiplied by the - # number of SMs on the device to determine how many blocks should be in - # each launch. Extremely high quality, high resolution renders may still - # encounter a device timeout, requiring the user to increase the split - # amount. This factor is not used in async mode. - SM_FACTOR = 8 - cmp_options = ('-use_fast_math', '-maxrregcount', '42') - - keep = False - def __init__(self, ctypes_genome_array): - self._g_arr = type(ctypes_genome_array)() - libflam3.flam3_align(self._g_arr, ctypes_genome_array, - len(ctypes_genome_array)) - self.genomes = map(Genome, self._g_arr) - self.features = Features(self.genomes) + def __init__(self, info): + self.info = info self._iter = self._de = self.src = self.cubin = self.mod = None + self.packed_genome = None # Ensure class options don't get contaminated on an instance self.cmp_options = list(self.cmp_options) @@ -148,12 +69,14 @@ class Animation(object): keep = self.keep if keep is None else keep cmp_options = self.cmp_options if cmp_options is None else cmp_options - self._iter = iter.IterCode(self.features) - self._de = filtering.DensityEst(self.features, self.genomes[0]) - cclip = filtering.ColorClip(self.features) - # TODO: make choice of filtering explicit + self._iter = iter.IterCode(self.info) + self._de = filtering.DensityEst(self.info) + cclip = filtering.ColorClip(self.info) + self._iter.packer.finalize() self.src = util.assemble_code(util.BaseCode, mwc.MWC, self._iter.packer, self._iter, cclip, self._de) + with open(os.path.join(tempfile.gettempdir(), 'kernel.cu'), 'w') as fp: + fp.write(self.src) self.cubin = pycuda.compiler.compile( self.src, keep=keep, options=cmp_options, cache_dir=False if keep else None) @@ -183,11 +106,11 @@ class Animation(object): self.mod = cuda.module_from_buffer(self.cubin, jit_options) - def render_frames(self, times=None, sync=False): + def render(self, times): """ Render a flame for each genome in the iterable value 'genomes'. Returns a Python generator object which will yield a 2-tuple of - ``(time, buf)``, where ``time`` is the central time of the frame and + ``(time, buf)``, where ``time`` is the start time of the frame and ``buf`` is a 3D (width, height, channel) NumPy array containing [0,1]-valued RGBA components. @@ -196,73 +119,64 @@ class Animation(object): allowed to run until completion (by exhausting all items in the generator object). - A performance note: while any ready tasks will be scheduled on the GPU - before yielding a result, spending a lot of time before returning - control to this function can allow the GPU to become idle. It's best - to hand the resulting array to another thread after grabbing it from - the renderer for handling. - - ``times`` is a sequence of center times at which to render, or ``None`` - to render one frame for each genome used to create the animation. - - If ``sync`` is True, the CPU will sync with the GPU after every block - of temporal samples and yield None until the frame is ready. This - allows a single-card system to avoid having to go thirty seconds - between window refreshes while rendering. Otherwise, tasks will be - piled asynchronously on the card so that it is always under load. + ``times`` is a sequence of (start, stop) times defining the temporal + range to be rendered for each frame. This will change to be more + frame-centric in the future, allowing for interpolated temporal width. """ if times == []: return - f = self.features - - times = times if times is not None else [cp.time for cp in self.genomes] + info = self.info iter_stream = cuda.Stream() filt_stream = cuda.Stream() - cen_cp = pyflam3.Genome() - dst_cp = pyflam3.Genome() - nbins = f.acc_height * f.acc_stride + nbins = info.acc_height * info.acc_stride d_accum = cuda.mem_alloc(16 * nbins) d_out = cuda.mem_alloc(16 * nbins) num_sm = cuda.Context.get_device().multiprocessor_count - if sync: - cps_per_block = num_sm * self.SM_FACTOR - else: - cps_per_block = f.max_cps + cps_per_block = 1024 - info_size = self._iter.packer.align * cps_per_block + genome_times, genome_knots = self._iter.packer.pack() + d_genome_times = cuda.to_device(genome_times) + d_genome_knots = cuda.to_device(genome_knots) + info_size = 4 * len(self._iter.packer) * cps_per_block d_infos = cuda.mem_alloc(info_size) - d_palmem = cuda.mem_alloc(256 * f.palette_height * 4) + d_palmem = cuda.mem_alloc(256 * info.palette_height * 4) seeds = mwc.MWC.make_seeds(self._iter.NTHREADS * cps_per_block) d_seeds = cuda.to_device(seeds) - h_infos = cuda.pagelocked_empty((info_size / 4,), np.float32) h_palmem = cuda.pagelocked_empty( - (f.palette_height, 256, 4), np.uint8) - h_out = cuda.pagelocked_empty((f.acc_height, f.acc_stride, 4), np.float32) + (info.palette_height, 256, 4), np.uint8) + h_out = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4), + np.float32) filter_done_event = None - packer = self._iter.packer + packer_fun = self.mod.get_function("interp_iter_params") iter_fun = self.mod.get_function("iter") #iter_fun.set_cache_config(cuda.func_cache.PREFER_L1) util.BaseCode.zero_dptr(self.mod, d_accum, 4 * nbins, filt_stream) - last_time = times[0] + last_time = times[0][0] - for time in times: - self._interp(cen_cp, time) + # TODO: move palette stuff to separate class; do interp + pal = np.fromstring(base64.b64decode(info.db.palettes.values()[0]), + np.uint8) + pal = np.reshape(pal, (256, 3)) + h_palmem[0,:,:3] = pal + h_palmem[1:] = h_palmem[0] - h_palmem[:] = self._interp_colors(dst_cp, time, - cen_cp.temporal_filter_width) + for start, stop in times: + cen_cp = cuburn.genome.HacketyGenome(info.genome, (start+stop)/2) + + # "Interp" already done above, but... cuda.memcpy_htod_async(d_palmem, h_palmem, iter_stream) tref = self.mod.get_texref('palTex') array_info = cuda.ArrayDescriptor() - array_info.height = f.palette_height + array_info.height = info.palette_height array_info.width = 256 array_info.array_format = cuda.array_format.UNSIGNED_INT8 array_info.num_channels = 4 @@ -272,69 +186,52 @@ class Animation(object): tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) tref.set_filter_mode(cuda.filter_mode.LINEAR) - # Must be accumulated over all CPs - gam, vib = 0, 0 - bkgd = np.zeros(3) - mblur_times = enumerate( np.linspace(-0.5, 0.5, cen_cp.ntemporal_samples) - * cen_cp.temporal_filter_width + time ) + if filter_done_event: + iter_stream.wait_for_event(filter_done_event) - for block_times in _chunk(list(mblur_times), cps_per_block): - infos = [] - if len(self.genomes) > 1: - for n, t in block_times: - self._interp(dst_cp, t) - frac = float(n) / cen_cp.ntemporal_samples - info = packer.pack(cp=Genome(dst_cp), cp_step_frac=frac) - infos.append(info) - gam += dst_cp.gamma - vib += dst_cp.vibrancy - bkgd += np.array(dst_cp.background) - else: - # Can't interpolate normally; just pack copies - packed = packer.pack(cp=self.genomes[0], cp_step_frac=0) - infos = [packed] * len(block_times) - gam += self.genomes[0].gamma * len(block_times) - vib += self.genomes[0].vibrancy * len(block_times) - bkgd += np.array(self.genomes[0].background) * len(block_times) + width = np.float32((stop-start) / cps_per_block) + packer_fun(d_infos, d_genome_times, d_genome_knots, + np.float32(start), width, d_seeds, + block=(256,1,1), grid=(cps_per_block/256,1), + stream=iter_stream) - infos = np.concatenate(infos) - h_infos[:len(infos)] = infos - cuda.memcpy_htod_async(d_infos, h_infos) + # Get interpolated control points for debugging + #iter_stream.synchronize() + #d_temp = cuda.from_device(d_infos, + #(cps_per_block, len(self._iter.packer)), np.float32) + #for i, n in zip(d_temp[5], self._iter.packer.packed): + #print '%60s %g' % ('_'.join(n), i) - if filter_done_event: - iter_stream.wait_for_event(filter_done_event) + nsamps = info.density * info.width * info.height / cps_per_block + iter_fun(np.uint64(d_accum), d_seeds, d_infos, np.int32(nsamps), + block=(32, self._iter.NTHREADS/32, 1), + grid=(cps_per_block, 1), + texrefs=[tref], stream=iter_stream) - # TODO: replace with option to split long runs shorter ones - # for interactivity - for i in range(1): - iter_fun(d_seeds, d_infos, np.uint64(d_accum), - block=(32, self._iter.NTHREADS/32, 1), - grid=(len(block_times), 1), - texrefs=[tref], stream=iter_stream) - - if sync: - iter_stream.synchronize() - yield None - - if filter_done_event and not sync: + if filter_done_event: + while not filt_stream.is_done(): + timemod.sleep(0.01) filt_stream.synchronize() yield last_time, self._trim(h_out) - last_time = time + last_time = start util.BaseCode.zero_dptr(self.mod, d_out, 4 * nbins, filt_stream) - self._de.invoke(self.mod, Genome(cen_cp), d_accum, d_out, filt_stream) + self._de.invoke(self.mod, cen_cp, d_accum, d_out, filt_stream) util.BaseCode.zero_dptr(self.mod, d_accum, 4 * nbins, filt_stream) filter_done_event = cuda.Event().record(filt_stream) f32 = np.float32 - n = f32(cen_cp.ntemporal_samples) - gam = f32(n / gam) - vib = f32(vib / n) - hipow = f32(cen_cp.highlight_power) - lin = f32(cen_cp.gam_lin_thresh) - lingam = f32(math.pow(cen_cp.gam_lin_thresh, gam-1.0) if lin > 0 else 0) - bkgd = vec.make_float3(*(bkgd / n)) + # TODO: implement integration over cubic splines? + gam = f32(1 / cen_cp.color.gamma) + vib = f32(cen_cp.color.vibrancy) + hipow = f32(cen_cp.color.highlight_power) + lin = f32(cen_cp.color.gamma_threshold) + lingam = f32(math.pow(lin, gam-1.0) if lin > 0 else 0) + bkgd = vec.make_float3( + cen_cp.color.background.r, + cen_cp.color.background.g, + cen_cp.color.background.b) color_fun = self.mod.get_function("colorclip") blocks = int(np.ceil(np.sqrt(nbins / 256))) @@ -343,133 +240,10 @@ class Animation(object): stream=filt_stream) cuda.memcpy_dtoh_async(h_out, d_out, filt_stream) - if sync: - filt_stream.synchronize() - yield time, self._trim(h_out) - - if not sync: - filt_stream.synchronize() - yield time, self._trim(h_out) - - def _interp(self, cp, time): - flam3_interpolate(self._g_arr, len(self._g_arr), time, 0, byref(cp)) - - @staticmethod - def _pal_to_np(cp): - # Converting palettes by iteration has an enormous performance - # overhead. We cheat massively and dangerously here. - pal = cast(pointer(cp.palette), POINTER(c_double * (256 * 5))) - val = np.frombuffer(buffer(pal.contents), count=256*5) - return np.uint8(np.reshape(val, (256, 5))[:,1:] * 255.0) - - def _interp_colors(self, cp, time, twidth): - # TODO: any visible difference between uint8 and richer formats? - height = self.features.palette_height - pal = np.empty((height, 256, 4), dtype=np.uint8) - - if len(self.genomes) > 1: - # The typical case; applying real motion blur - times = np.linspace(-0.5, 0.5, height) * twidth + time - for n, t in enumerate(times): - self._interp(cp, t) - pal[n] = self._pal_to_np(cp) - else: - # Cannot call any interp functions on a single genome; rather than - # have alternate code-paths, just copy the same colors everywhere - pal[0] = self._pal_to_np(self.genomes[0]) - pal[1:] = pal[0] - return pal + filt_stream.synchronize() + yield start, self._trim(h_out) def _trim(self, result): - g = self.features.gutter + g = self.info.gutter return result[g:-g,g:-g].copy() - -class Features(object): - """ - Determine features and constants required to render a particular set of - genomes. The values of this class are fixed before compilation begins. - """ - # Constant parameters which control handling of out-of-frame samples: - # Number of iterations to iterate without write after new point - fuse = 10 - # Maximum consecutive out-of-bounds points before picking new point - max_oob = 10 - max_nxforms = 12 - - # Height of the texture pallete which gets uploaded to the GPU (assuming - # that palette-from-texture is enabled). For most genomes, this doesn't - # need to be very large at all. However, since only an easily-cached - # fraction of this will be accessed per SM, larger values shouldn't hurt - # performance too much. Power-of-two, please. - palette_height = 16 - - # Maximum width of DE and other spatial filters, and thus in turn the - # amount of padding applied. Note that, for now, this must not be changed! - # The filtering code makes deep assumptions about this value. - gutter = 16 - - # TODO: for now, we always throw away the alpha channel before writing. - # All code is in place to not do this, we just need to find a way to expose - # this preference via the API (or push alpha blending entirely on the client, - # which I'm not opposed to) - alpha_output_channel = False - - def __init__(self, genomes): - any = lambda l: bool(filter(None, map(l, genomes))) - self.max_ntemporal_samples = max( - [cp.nbatches * cp.ntemporal_samples for cp in genomes]) - self.non_box_temporal_filter = genomes[0].temporal_filter_type - self.palette_mode = genomes[0].palette_mode and "linear" or "nearest" - - assert len(set([len(cp.xforms) for cp in genomes])) == 1, ("genomes " - "must have same number of xforms! (use flam3-genome first)") - self.nxforms = len(genomes[0].xforms) - self.xforms = [XFormFeatures([cp.xforms[i] for cp in genomes], i) - for i in range(self.nxforms)] - if any(lambda cp: cp.final_xform_enable): - if not all([cp.final_xform_index == genomes[0].final_xform_index - for cp in genomes]): - raise ValueError("Differing final xform indexes") - self.final_xform_index = genomes[0].final_xform_index - else: - self.final_xform_index = None - - alphas = np.array([c.color[3] for g in genomes - for c in g.palette.entries]) - self.pal_has_alpha = np.any(alphas != 1.0) - - self.max_cps = max([cp.ntemporal_samples for cp in genomes]) - - self.width = genomes[0].width - self.height = genomes[0].height - self.acc_width = genomes[0].width + 2 * self.gutter - self.acc_height = genomes[0].height + 2 * self.gutter - self.acc_stride = 32 * int(math.ceil(self.acc_width / 32.)) - self.std_xforms = filter(lambda v: v != self.final_xform_index, - range(self.nxforms)) - self.chaos_used = False - for cp in genomes: - for r in range(len(self.std_xforms)): - for c in range(len(self.std_xforms)): - if cp.chaos[r][c] != 1.0: - self.chaos_used = True - - - -class XFormFeatures(object): - def __init__(self, xforms, xform_id): - self.id = xform_id - any = lambda l: bool(filter(None, map(l, xforms))) - - self.has_post = any(lambda xf: not self.id_matrix(xf.post)) - self.vars = set() - for x in xforms: - self.vars = ( - self.vars.union(set([i for i, v in enumerate(x.var) if v]))) - - @staticmethod - def id_matrix(m): - return (m[0][0] == 1 and m[1][0] == 0 and m[2][0] == 0 and - m[0][1] == 0 and m[1][1] == 1 and m[2][1] == 0) - diff --git a/run_job.py b/run_job.py new file mode 100644 index 0000000..03526ea --- /dev/null +++ b/run_job.py @@ -0,0 +1,52 @@ +#!/usr/bin/python + +# Temporary helper script while I update main.py + +import os +import sys +import time +import argparse +import multiprocessing +from subprocess import Popen +from ctypes import * +from itertools import ifilter + +import numpy as np +import Image +import scipy +import pycuda.autoinit + +import cuburn.render +import cuburn.genome + +import pycuda.compiler +import pycuda.driver as cuda +import cuburn.code.interp + +np.set_printoptions(precision=5, edgeitems=20) + +real_stdout = sys.stdout + +def save(time, raw): + noalpha = raw[:,:,:3] + name = 'out/%05d' % time + img = scipy.misc.toimage(noalpha, cmin=0, cmax=1) + img.save(name+'.png') + +def main(jobfilepath): + contents = '\n'.join([l.split('//', 1)[0] for l in open(jobfilepath)]) + # This includes the genomes and other cruft, a dedicated reader will be + # built in time tho + info = cuburn.genome.load_info(contents) + + times = np.linspace(0, 1, info.duration * info.fps + 1)[:3] + + renderer = cuburn.render.Renderer(info) + renderer.compile() + renderer.load() + + for idx, (ftime, out) in enumerate(renderer.render(zip(times, times[1:]))): + save(idx, out) + +if __name__ == "__main__": + main('04653_2.jsonc')