From 088299423e653d13d10c12f5d5987ba35456b6ef Mon Sep 17 00:00:00 2001 From: Steven Robertson Date: Sat, 30 Apr 2011 16:40:16 -0400 Subject: [PATCH] Some amount of dynamic rendering --- cuburn/code/__init__.py | 20 ++---- cuburn/code/iter.py | 110 +++++++++++++++++++------------ cuburn/code/mwc.py | 142 ++++++++++++++++++++-------------------- cuburn/render.py | 23 +++++-- main.py | 7 +- 5 files changed, 165 insertions(+), 137 deletions(-) diff --git a/cuburn/code/__init__.py b/cuburn/code/__init__.py index ba98613..949d19a 100644 --- a/cuburn/code/__init__.py +++ b/cuburn/code/__init__.py @@ -1,19 +1,9 @@ """ -Contains the PTX fragments which will drive the device. +Contains the PTX fragments which will drive the device, and helper functions +to combine those fragments. """ -# Basic headers, utility functions, and so on -base = """ -#include -#include - -// TODO: use launch parameter preconfig to eliminate unnecessary parts -__device__ -uint32_t gtid() { - return threadIdx.x + blockDim.x * - (threadIdx.y + blockDim.y * - (threadIdx.z + blockDim.z * - (blockIdx.x + (gridDim.x * blockIdx.y)))); -} -""" +import util +import mwc +import iter diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py index 4eabf38..7d519dd 100644 --- a/cuburn/code/iter.py +++ b/cuburn/code/iter.py @@ -7,62 +7,86 @@ from pycuda.driver import In, Out, InOut from pycuda.compiler import SourceModule import numpy as np -from cuburn import code from cuburn.code import mwc +from cuburn.code.util import * -src = r""" -#define FUSE 20 -#define MAXOOB 10 +import tempita -typedef struct { - // Number of iterations to perform, *per thread*. - uint32_t niters; +class IterCode(HunkOCode): + def __init__(self, features): + self.features = features + self.packer = DataPacker('iter_info') + iterbody = self._iterbody() + bodies = [self._xfbody(i,x) for i,x in enumerate(self.features.xforms)] + bodies.append(iterbody) + self.defs = '\n'.join(bodies) - // Number of accumulators per row and column in the accum buffer - uint32_t accwidth, accheight; -} iter_info; + def _xfbody(self, xfid, xform): + px = self.packer.view('info', 'xf%d_' % xfid) + px.sub('xf', 'cp.xforms[%d]' % xfid) + tmpl = tempita.Template(""" +__device__ +void apply_xf{{xfid}}(float *ix, float *iy, float *icolor, + const iter_info *info) { + float tx, ty, ox = *ix, oy = *iy; + {{apply_affine('ox', 'oy', 'tx', 'ty', px, 'xf.c', 'pre')}} + + // tiny little TODO: variations + + *ix = tx; + *iy = ty; + + float csp = {{px.get('xf.color_speed')}}; + *icolor = *icolor * (1.0f - csp) + {{px.get('xf.color')}} * csp; +}; +""") + g = dict(globals()) + g.update(locals()) + return tmpl.substitute(g) + + def _iterbody(self): + tmpl = tempita.Template(""" __global__ -void silly(mwc_st *msts, iter_info *infos, float *accbuf, float *denbuf) { +void iter(mwc_st *msts, const iter_info *infos, float *accbuf, float *denbuf) { mwc_st rctx = msts[gtid()]; - iter_info *info = &(infos[blockIdx.x]); + const iter_info *info = &(infos[blockIdx.x]); - float consec_bad = -FUSE; - float nsamps = info->niters; + int consec_bad = -{{features.fuse}}; + int nsamps = 500; float x, y, color; x = mwc_next_11(&rctx); y = mwc_next_11(&rctx); color = mwc_next_01(&rctx); - while (nsamps > 0.0f) { + while (nsamps > 0) { float xfsel = mwc_next_01(&rctx); - x *= 0.5f; - y *= 0.5f; - color *= 0.5f; - if (xfsel < 0.33f) { - color += 0.25f; - x += 0.5f; - } else if (xfsel < 0.66f) { - color += 0.5f; - y += 0.5f; + {{for xfid, xform in enumerate(features.xforms)}} + if (xfsel < {{packer.get('cp.norm_density[%d]' % xfid)}}) { + apply_xf{{xfid}}(&x, &y, &color, info); + } else + {{endfor}} + { + denbuf[0] = xfsel; + break; // TODO: fail here } - if (consec_bad < 0.0f) { + if (consec_bad < 0) { consec_bad++; continue; } if (x <= -1.0f || x >= 1.0f || y <= -1.0f || y >= 1.0f - || consec_bad < 0.0f) { + || consec_bad < 0) { consec_bad++; - if (consec_bad > MAXOOB) { + if (consec_bad > {{features.max_oob}}) { x = mwc_next_11(&rctx); y = mwc_next_11(&rctx); color = mwc_next_01(&rctx); - consec_bad = -FUSE; + consec_bad = -{{features.fuse}}; } continue; } @@ -80,26 +104,28 @@ void silly(mwc_st *msts, iter_info *infos, float *accbuf, float *denbuf) { nsamps--; } } -""" +""") + return tmpl.substitute( + features = self.features, + packer = self.packer.view('info')) -def silly(): - mod = SourceModule(code.base + mwc.src + src) + +def silly(features, cp): abuf = np.zeros((512, 512, 4), dtype=np.float32) dbuf = np.zeros((512, 512), dtype=np.float32) - seeds = mwc.build_mwc_seeds(512 * 24, seed=5) + seeds = mwc.MWC.make_seeds(512 * 24) - info = np.zeros(3, dtype=np.uint32) - info[0] = 5000 - info[1] = 512 - info[2] = 512 - info = np.repeat([info], 24, axis=0) + iter = IterCode(features) + code = assemble_code(BaseCode, mwc.MWC, iter, iter.packer) + print code + mod = SourceModule(code) - fun = mod.get_function("silly") + info = iter.packer.pack(cp=cp) + print info + + fun = mod.get_function("iter") fun(InOut(seeds), In(info), InOut(abuf), InOut(dbuf), - block=(512,1,1), grid=(24,1), time_kernel=True) + block=(512,1,1), grid=(1,1), time_kernel=True) - print abuf - print dbuf - print sum(dbuf) return abuf, dbuf diff --git a/cuburn/code/mwc.py b/cuburn/code/mwc.py index 216682c..588f801 100644 --- a/cuburn/code/mwc.py +++ b/cuburn/code/mwc.py @@ -2,23 +2,21 @@ The multiply-with-carry random number generator. """ -import time - -import pycuda.driver as cuda -from pycuda.compiler import SourceModule import numpy as np +import tempita -from jinja2 import Template +from cuburn.code.util import * -from cuburn import code - -src = r""" +class MWC(HunkOCode): + decls = """ typedef struct { uint32_t mul; uint32_t state; uint32_t carry; } mwc_st; +""" + defs = r""" __device__ uint32_t mwc_next(mwc_st *st) { asm("{\n\t.reg .u64 val;\n\t" "cvt.u64.u32 val, %0;\n\t" @@ -35,10 +33,38 @@ __device__ float mwc_next_01(mwc_st *st) { __device__ float mwc_next_11(mwc_st *st) { return ((int32_t) mwc_next(st)) * (1.0f / 2147483648.0f); } - """ -testsrc = code.base + src + """ + @staticmethod + def make_seeds(nthreads, host_seed=None): + if host_seed: + rand = np.random.RandomState(host_seed) + else: + rand = np.random + + # Load raw big-endian u32 multipliers from primes.bin. + with open('primes.bin') as primefp: + dt = np.dtype(np.uint32).newbyteorder('B') + mults = np.frombuffer(primefp.read(), dtype=dt) + + # Create the seed structures. TODO: check that struct is 4-byte aligned + seeds = np.empty((3, nthreads), dtype=np.uint32, order='F') + + # Randomness in choosing multipliers is good, but larger multipliers + # have longer periods, which is also good. This is a compromise. + mults = np.array(mults[:nthreads*4]) + rand.shuffle(mults) + seeds[0][:] = mults[:nthreads] + + # Intentionally excludes both 0 and (2^32-1), as they can lead to + # degenerate sequences of period 0 + seeds[1] = rand.randint(1, 0xffffffff, size=nthreads) + seeds[2] = rand.randint(1, 0xffffffff, size=nthreads) + + return seeds + +class MWCTest(HunkOCode): + defs = """ __global__ void test_mwc(mwc_st *msts, uint64_t *sums, float nrounds) { mwc_st rctx = msts[gtid()]; uint64_t sum = 0; @@ -48,69 +74,45 @@ __global__ void test_mwc(mwc_st *msts, uint64_t *sums, float nrounds) { } """ -def build_mwc_seeds(nthreads, seed=None): - if seed: - rand = np.random.RandomState(seed) - else: - rand = np.random + @classmethod + def test_mwc(cls, rounds=5000, nblocks=64, blockwidth=512): + import pycuda.driver as cuda + from pycuda.compiler import SourceModule + import time - # Load raw big-endian u32 multipliers from primes.bin. - with open('primes.bin') as primefp: - dt = np.dtype(np.uint32).newbyteorder('B') - mults = np.frombuffer(primefp.read(), dtype=dt) + nthreads = blockwidth * nblocks + seeds = MWC.make_seeds(nthreads, host_seed = 5) + dseeds = cuda.to_device(seeds) - # Create the seed structures. TODO: check that struct is 4-byte aligned - seeds = np.empty((3, nthreads), dtype=np.uint32, order='F') + mod = SourceModule(assemble_code(BaseCode, MWC, cls)) - # Randomness in choosing multipliers is good, but larger multipliers - # have longer periods, which is also good. This is a compromise. - mults = np.array(mults[:nthreads*4]) - rand.shuffle(mults) - seeds[0][:] = mults[:nthreads] + for trial in range(2): + print "Trial %d, on CPU: " % trial, + sums = np.zeros(nthreads, dtype=np.uint64) + ctime = time.time() + mults = seeds[0].astype(np.uint64) + states = seeds[1] + carries = seeds[2] + for i in range(rounds): + step = np.frombuffer((mults * states + carries).data, + dtype=np.uint32).reshape((2, nthreads), order='F') + states[:] = step[0] + carries[:] = step[1] + sums += states - # Intentionally excludes both 0 and (2^32-1), as they can lead to - # degenerate sequences of period 0 - seeds[1] = rand.randint(1, 0xffffffff, size=nthreads) - seeds[2] = rand.randint(1, 0xffffffff, size=nthreads) + ctime = time.time() - ctime + print "Took %g seconds." % ctime - return seeds - -def test_mwc(): - rounds = 5000 - nblocks = 64 - nthreads = 512 * nblocks - - seeds = build_mwc_seeds(nthreads, seed = 5) - dseeds = cuda.to_device(seeds) - - mod = SourceModule(testsrc) - - for trial in range(2): - print "Trial %d, on CPU: " % trial, - sums = np.zeros(nthreads, dtype=np.uint64) - ctime = time.time() - mults = seeds[0].astype(np.uint64) - states = seeds[1] - carries = seeds[2] - for i in range(rounds): - step = np.frombuffer((mults * states + carries).data, - dtype=np.uint32).reshape((2, nthreads), order='F') - states[:] = step[0] - carries[:] = step[1] - sums += states - - ctime = time.time() - ctime - print "Took %g seconds." % ctime - - print "Trial %d, on device: " % trial, - dsums = cuda.mem_alloc(8*nthreads) - fun = mod.get_function("test_mwc") - dtime = fun(dseeds, dsums, np.float32(rounds), - block=(512,1,1), grid=(nblocks,1), time_kernel=True) - print "Took %g seconds." % dtime - dsums = cuda.from_device(dsums, nthreads, np.uint64) - if not np.all(np.equal(sums, dsums)): - print "Sum discrepancy!" - print sums - print dsums + print "Trial %d, on device: " % trial, + dsums = cuda.mem_alloc(8*nthreads) + fun = mod.get_function("test_mwc") + dtime = fun(dseeds, dsums, np.float32(rounds), + block=(blockwidth,1,1), grid=(nblocks,1), + time_kernel=True) + print "Took %g seconds." % dtime + dsums = cuda.from_device(dsums, nthreads, np.uint64) + if not np.all(np.equal(sums, dsums)): + print "Sum discrepancy!" + print sums + print dsums diff --git a/cuburn/render.py b/cuburn/render.py index b56b78a..018590f 100644 --- a/cuburn/render.py +++ b/cuburn/render.py @@ -14,7 +14,17 @@ from cuburn.variations import Variations Point = lambda x, y: np.array([x, y], dtype=np.double) class Genome(pyflam3.Genome): - pass + @classmethod + def from_string(cls, *args, **kwargs): + gnms = super(Genome, cls).from_string(*args, **kwargs) + for g in gnms: g._init() + return gnms + + def _init(self): + self.xforms = [self.xform[i] for i in range(self.num_xforms)] + dens = np.array([x.density for x in self.xforms]) + dens /= np.sum(dens) + self.norm_density = [np.sum(dens[:i+1]) for i in range(len(dens))] class XForm(object): """ @@ -99,7 +109,7 @@ class Frame(object): cp.camera = Camera(self._frame, cp, filters) cp.nsamples = (cp.camera.sample_density * center.width * center.height) / ncps - cp.xforms = XForm.parse(cp) + print "Expected writes:", ( cp.camera.sample_density * center.width * center.height) @@ -190,9 +200,10 @@ class Features(object): """ # Constant parameters which control handling of out-of-frame samples: # Number of iterations to iterate without write after new point - fuse = 2 - # Maximum consecutive out-of-frame points before picking new point - max_bad = 3 + fuse = 20 + # Maximum consecutive out-of-bounds points before picking new point + max_oob = 10 + max_nxforms = 12 # Height of the texture pallete which gets uploaded to the GPU (assuming # that palette-from-texture is enabled). For most genomes, this doesn't @@ -205,7 +216,6 @@ class Features(object): any = lambda l: bool(filter(None, map(l, genomes))) self.max_ntemporal_samples = max( [cp.nbatches * cp.ntemporal_samples for cp in genomes]) - self.camera_rotation = any(lambda cp: cp.rotate) self.non_box_temporal_filter = genomes[0].temporal_filter_type self.palette_mode = genomes[0].palette_mode and "linear" or "nearest" @@ -214,6 +224,7 @@ class Features(object): "number of xforms! (try running through flam3-genome first)") self.xforms = [XFormFeatures([x[i] for x in xforms], i) for i in range(len(xforms[0]))] + self.nxforms = len(self.xforms) if any(lambda cp: cp.final_xform_enable): raise NotImplementedError("Final xform") diff --git a/main.py b/main.py index 8867882..eed759c 100644 --- a/main.py +++ b/main.py @@ -24,16 +24,15 @@ import pyglet import pycuda.autoinit from cuburn.render import * -from cuburn.code.mwc import test_mwc +from cuburn.code.mwc import MWCTest from cuburn.code.iter import silly - def main(args): + #MWCTest.test_mwc() with open(args[-1]) as fp: genomes = Genome.from_string(fp.read()) anim = Animation(genomes) - - accum, den = silly() + accum, den = silly(anim.features, genomes[0]) if False: bins = anim.render_frame()