Some amount of dynamic rendering

This commit is contained in:
Steven Robertson 2011-04-30 16:40:16 -04:00
parent 1302f31ec7
commit 088299423e
5 changed files with 165 additions and 137 deletions

View File

@ -1,19 +1,9 @@
""" """
Contains the PTX fragments which will drive the device. Contains the PTX fragments which will drive the device, and helper functions
to combine those fragments.
""" """
# Basic headers, utility functions, and so on import util
base = """ import mwc
#include<cuda.h> import iter
#include<stdint.h>
// TODO: use launch parameter preconfig to eliminate unnecessary parts
__device__
uint32_t gtid() {
return threadIdx.x + blockDim.x *
(threadIdx.y + blockDim.y *
(threadIdx.z + blockDim.z *
(blockIdx.x + (gridDim.x * blockIdx.y))));
}
"""

View File

@ -7,62 +7,86 @@ from pycuda.driver import In, Out, InOut
from pycuda.compiler import SourceModule from pycuda.compiler import SourceModule
import numpy as np import numpy as np
from cuburn import code
from cuburn.code import mwc from cuburn.code import mwc
from cuburn.code.util import *
src = r""" import tempita
#define FUSE 20
#define MAXOOB 10
typedef struct { class IterCode(HunkOCode):
// Number of iterations to perform, *per thread*. def __init__(self, features):
uint32_t niters; self.features = features
self.packer = DataPacker('iter_info')
iterbody = self._iterbody()
bodies = [self._xfbody(i,x) for i,x in enumerate(self.features.xforms)]
bodies.append(iterbody)
self.defs = '\n'.join(bodies)
// Number of accumulators per row and column in the accum buffer def _xfbody(self, xfid, xform):
uint32_t accwidth, accheight; px = self.packer.view('info', 'xf%d_' % xfid)
} iter_info; px.sub('xf', 'cp.xforms[%d]' % xfid)
tmpl = tempita.Template("""
__device__
void apply_xf{{xfid}}(float *ix, float *iy, float *icolor,
const iter_info *info) {
float tx, ty, ox = *ix, oy = *iy;
{{apply_affine('ox', 'oy', 'tx', 'ty', px, 'xf.c', 'pre')}}
// tiny little TODO: variations
*ix = tx;
*iy = ty;
float csp = {{px.get('xf.color_speed')}};
*icolor = *icolor * (1.0f - csp) + {{px.get('xf.color')}} * csp;
};
""")
g = dict(globals())
g.update(locals())
return tmpl.substitute(g)
def _iterbody(self):
tmpl = tempita.Template("""
__global__ __global__
void silly(mwc_st *msts, iter_info *infos, float *accbuf, float *denbuf) { void iter(mwc_st *msts, const iter_info *infos, float *accbuf, float *denbuf) {
mwc_st rctx = msts[gtid()]; mwc_st rctx = msts[gtid()];
iter_info *info = &(infos[blockIdx.x]); const iter_info *info = &(infos[blockIdx.x]);
float consec_bad = -FUSE; int consec_bad = -{{features.fuse}};
float nsamps = info->niters; int nsamps = 500;
float x, y, color; float x, y, color;
x = mwc_next_11(&rctx); x = mwc_next_11(&rctx);
y = mwc_next_11(&rctx); y = mwc_next_11(&rctx);
color = mwc_next_01(&rctx); color = mwc_next_01(&rctx);
while (nsamps > 0.0f) { while (nsamps > 0) {
float xfsel = mwc_next_01(&rctx); float xfsel = mwc_next_01(&rctx);
x *= 0.5f; {{for xfid, xform in enumerate(features.xforms)}}
y *= 0.5f; if (xfsel < {{packer.get('cp.norm_density[%d]' % xfid)}}) {
color *= 0.5f; apply_xf{{xfid}}(&x, &y, &color, info);
if (xfsel < 0.33f) { } else
color += 0.25f; {{endfor}}
x += 0.5f; {
} else if (xfsel < 0.66f) { denbuf[0] = xfsel;
color += 0.5f; break; // TODO: fail here
y += 0.5f;
} }
if (consec_bad < 0.0f) { if (consec_bad < 0) {
consec_bad++; consec_bad++;
continue; continue;
} }
if (x <= -1.0f || x >= 1.0f || y <= -1.0f || y >= 1.0f if (x <= -1.0f || x >= 1.0f || y <= -1.0f || y >= 1.0f
|| consec_bad < 0.0f) { || consec_bad < 0) {
consec_bad++; consec_bad++;
if (consec_bad > MAXOOB) { if (consec_bad > {{features.max_oob}}) {
x = mwc_next_11(&rctx); x = mwc_next_11(&rctx);
y = mwc_next_11(&rctx); y = mwc_next_11(&rctx);
color = mwc_next_01(&rctx); color = mwc_next_01(&rctx);
consec_bad = -FUSE; consec_bad = -{{features.fuse}};
} }
continue; continue;
} }
@ -80,26 +104,28 @@ void silly(mwc_st *msts, iter_info *infos, float *accbuf, float *denbuf) {
nsamps--; nsamps--;
} }
} }
""" """)
return tmpl.substitute(
features = self.features,
packer = self.packer.view('info'))
def silly():
mod = SourceModule(code.base + mwc.src + src) def silly(features, cp):
abuf = np.zeros((512, 512, 4), dtype=np.float32) abuf = np.zeros((512, 512, 4), dtype=np.float32)
dbuf = np.zeros((512, 512), dtype=np.float32) dbuf = np.zeros((512, 512), dtype=np.float32)
seeds = mwc.build_mwc_seeds(512 * 24, seed=5) seeds = mwc.MWC.make_seeds(512 * 24)
info = np.zeros(3, dtype=np.uint32) iter = IterCode(features)
info[0] = 5000 code = assemble_code(BaseCode, mwc.MWC, iter, iter.packer)
info[1] = 512 print code
info[2] = 512 mod = SourceModule(code)
info = np.repeat([info], 24, axis=0)
fun = mod.get_function("silly") info = iter.packer.pack(cp=cp)
print info
fun = mod.get_function("iter")
fun(InOut(seeds), In(info), InOut(abuf), InOut(dbuf), fun(InOut(seeds), In(info), InOut(abuf), InOut(dbuf),
block=(512,1,1), grid=(24,1), time_kernel=True) block=(512,1,1), grid=(1,1), time_kernel=True)
print abuf
print dbuf
print sum(dbuf)
return abuf, dbuf return abuf, dbuf

View File

@ -2,23 +2,21 @@
The multiply-with-carry random number generator. The multiply-with-carry random number generator.
""" """
import time
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import numpy as np import numpy as np
import tempita
from jinja2 import Template from cuburn.code.util import *
from cuburn import code class MWC(HunkOCode):
decls = """
src = r"""
typedef struct { typedef struct {
uint32_t mul; uint32_t mul;
uint32_t state; uint32_t state;
uint32_t carry; uint32_t carry;
} mwc_st; } mwc_st;
"""
defs = r"""
__device__ uint32_t mwc_next(mwc_st *st) { __device__ uint32_t mwc_next(mwc_st *st) {
asm("{\n\t.reg .u64 val;\n\t" asm("{\n\t.reg .u64 val;\n\t"
"cvt.u64.u32 val, %0;\n\t" "cvt.u64.u32 val, %0;\n\t"
@ -35,22 +33,12 @@ __device__ float mwc_next_01(mwc_st *st) {
__device__ float mwc_next_11(mwc_st *st) { __device__ float mwc_next_11(mwc_st *st) {
return ((int32_t) mwc_next(st)) * (1.0f / 2147483648.0f); return ((int32_t) mwc_next(st)) * (1.0f / 2147483648.0f);
} }
""" """
testsrc = code.base + src + """ @staticmethod
__global__ void test_mwc(mwc_st *msts, uint64_t *sums, float nrounds) { def make_seeds(nthreads, host_seed=None):
mwc_st rctx = msts[gtid()]; if host_seed:
uint64_t sum = 0; rand = np.random.RandomState(host_seed)
for (float i = 0; i < nrounds; i++) sum += mwc_next(&rctx);
sums[gtid()] = sum;
msts[gtid()] = rctx;
}
"""
def build_mwc_seeds(nthreads, seed=None):
if seed:
rand = np.random.RandomState(seed)
else: else:
rand = np.random rand = np.random
@ -75,15 +63,28 @@ def build_mwc_seeds(nthreads, seed=None):
return seeds return seeds
def test_mwc(): class MWCTest(HunkOCode):
rounds = 5000 defs = """
nblocks = 64 __global__ void test_mwc(mwc_st *msts, uint64_t *sums, float nrounds) {
nthreads = 512 * nblocks mwc_st rctx = msts[gtid()];
uint64_t sum = 0;
for (float i = 0; i < nrounds; i++) sum += mwc_next(&rctx);
sums[gtid()] = sum;
msts[gtid()] = rctx;
}
"""
seeds = build_mwc_seeds(nthreads, seed = 5) @classmethod
def test_mwc(cls, rounds=5000, nblocks=64, blockwidth=512):
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import time
nthreads = blockwidth * nblocks
seeds = MWC.make_seeds(nthreads, host_seed = 5)
dseeds = cuda.to_device(seeds) dseeds = cuda.to_device(seeds)
mod = SourceModule(testsrc) mod = SourceModule(assemble_code(BaseCode, MWC, cls))
for trial in range(2): for trial in range(2):
print "Trial %d, on CPU: " % trial, print "Trial %d, on CPU: " % trial,
@ -106,7 +107,8 @@ def test_mwc():
dsums = cuda.mem_alloc(8*nthreads) dsums = cuda.mem_alloc(8*nthreads)
fun = mod.get_function("test_mwc") fun = mod.get_function("test_mwc")
dtime = fun(dseeds, dsums, np.float32(rounds), dtime = fun(dseeds, dsums, np.float32(rounds),
block=(512,1,1), grid=(nblocks,1), time_kernel=True) block=(blockwidth,1,1), grid=(nblocks,1),
time_kernel=True)
print "Took %g seconds." % dtime print "Took %g seconds." % dtime
dsums = cuda.from_device(dsums, nthreads, np.uint64) dsums = cuda.from_device(dsums, nthreads, np.uint64)
if not np.all(np.equal(sums, dsums)): if not np.all(np.equal(sums, dsums)):

View File

@ -14,7 +14,17 @@ from cuburn.variations import Variations
Point = lambda x, y: np.array([x, y], dtype=np.double) Point = lambda x, y: np.array([x, y], dtype=np.double)
class Genome(pyflam3.Genome): class Genome(pyflam3.Genome):
pass @classmethod
def from_string(cls, *args, **kwargs):
gnms = super(Genome, cls).from_string(*args, **kwargs)
for g in gnms: g._init()
return gnms
def _init(self):
self.xforms = [self.xform[i] for i in range(self.num_xforms)]
dens = np.array([x.density for x in self.xforms])
dens /= np.sum(dens)
self.norm_density = [np.sum(dens[:i+1]) for i in range(len(dens))]
class XForm(object): class XForm(object):
""" """
@ -99,7 +109,7 @@ class Frame(object):
cp.camera = Camera(self._frame, cp, filters) cp.camera = Camera(self._frame, cp, filters)
cp.nsamples = (cp.camera.sample_density * cp.nsamples = (cp.camera.sample_density *
center.width * center.height) / ncps center.width * center.height) / ncps
cp.xforms = XForm.parse(cp)
print "Expected writes:", ( print "Expected writes:", (
cp.camera.sample_density * center.width * center.height) cp.camera.sample_density * center.width * center.height)
@ -190,9 +200,10 @@ class Features(object):
""" """
# Constant parameters which control handling of out-of-frame samples: # Constant parameters which control handling of out-of-frame samples:
# Number of iterations to iterate without write after new point # Number of iterations to iterate without write after new point
fuse = 2 fuse = 20
# Maximum consecutive out-of-frame points before picking new point # Maximum consecutive out-of-bounds points before picking new point
max_bad = 3 max_oob = 10
max_nxforms = 12
# Height of the texture palette which gets uploaded to the GPU (assuming # Height of the texture palette which gets uploaded to the GPU (assuming
# that palette-from-texture is enabled). For most genomes, this doesn't # that palette-from-texture is enabled). For most genomes, this doesn't
@ -205,7 +216,6 @@ class Features(object):
any = lambda l: bool(filter(None, map(l, genomes))) any = lambda l: bool(filter(None, map(l, genomes)))
self.max_ntemporal_samples = max( self.max_ntemporal_samples = max(
[cp.nbatches * cp.ntemporal_samples for cp in genomes]) [cp.nbatches * cp.ntemporal_samples for cp in genomes])
self.camera_rotation = any(lambda cp: cp.rotate)
self.non_box_temporal_filter = genomes[0].temporal_filter_type self.non_box_temporal_filter = genomes[0].temporal_filter_type
self.palette_mode = genomes[0].palette_mode and "linear" or "nearest" self.palette_mode = genomes[0].palette_mode and "linear" or "nearest"
@ -214,6 +224,7 @@ class Features(object):
"number of xforms! (try running through flam3-genome first)") "number of xforms! (try running through flam3-genome first)")
self.xforms = [XFormFeatures([x[i] for x in xforms], i) self.xforms = [XFormFeatures([x[i] for x in xforms], i)
for i in range(len(xforms[0]))] for i in range(len(xforms[0]))]
self.nxforms = len(self.xforms)
if any(lambda cp: cp.final_xform_enable): if any(lambda cp: cp.final_xform_enable):
raise NotImplementedError("Final xform") raise NotImplementedError("Final xform")

View File

@ -24,16 +24,15 @@ import pyglet
import pycuda.autoinit import pycuda.autoinit
from cuburn.render import * from cuburn.render import *
from cuburn.code.mwc import test_mwc from cuburn.code.mwc import MWCTest
from cuburn.code.iter import silly from cuburn.code.iter import silly
def main(args): def main(args):
#MWCTest.test_mwc()
with open(args[-1]) as fp: with open(args[-1]) as fp:
genomes = Genome.from_string(fp.read()) genomes = Genome.from_string(fp.read())
anim = Animation(genomes) anim = Animation(genomes)
accum, den = silly(anim.features, genomes[0])
accum, den = silly()
if False: if False:
bins = anim.render_frame() bins = anim.render_frame()