From fe6367821ff3bec512ec1274fe3eb3611f310eaa Mon Sep 17 00:00:00 2001 From: Steven Robertson Date: Fri, 29 Apr 2011 11:00:18 -0400 Subject: [PATCH] RNG, again. Hooray. --- cuburn/code/__init__.py | 19 ++ cuburn/code/mwc.py | 107 ++++++++ cuburn/device_code.py | 584 ---------------------------------------- main.py | 25 +- 4 files changed, 133 insertions(+), 602 deletions(-) create mode 100644 cuburn/code/__init__.py create mode 100644 cuburn/code/mwc.py delete mode 100644 cuburn/device_code.py diff --git a/cuburn/code/__init__.py b/cuburn/code/__init__.py new file mode 100644 index 0000000..ba98613 --- /dev/null +++ b/cuburn/code/__init__.py @@ -0,0 +1,19 @@ +""" +Contains the PTX fragments which will drive the device. +""" + +# Basic headers, utility functions, and so on +base = """ +#include +#include + +// TODO: use launch parameter preconfig to eliminate unnecessary parts +__device__ +uint32_t gtid() { + return threadIdx.x + blockDim.x * + (threadIdx.y + blockDim.y * + (threadIdx.z + blockDim.z * + (blockIdx.x + (gridDim.x * blockIdx.y)))); +} +""" + diff --git a/cuburn/code/mwc.py b/cuburn/code/mwc.py new file mode 100644 index 0000000..4665967 --- /dev/null +++ b/cuburn/code/mwc.py @@ -0,0 +1,107 @@ +""" +The multiply-with-carry random number generator. +""" + +import time + +import pycuda.driver as cuda +from pycuda.compiler import SourceModule +import numpy as np + +from jinja2 import Template + +from cuburn import code + +src = r""" +typedef struct { + uint32_t mul; + uint32_t state; + uint32_t carry; +} mwc_st; + +__device__ uint32_t mwc_next(mwc_st *st) { + asm(".reg .u64 val;\n\t" + "cvt.u64.u32 val, %0;\n\t" + "mad.wide.u32 val, %1, %2, val;\n\t" + "mov.b64 {%1, %0}, val;\n\t" + : "=r"(st->carry), "=r"(st->state) : "r"(st->mul)); + return st->state; +} +""" + +testsrc = code.base + src + """ +__global__ void test_mwc(mwc_st *msts, uint64_t *sums, float nrounds) { + mwc_st rctx = msts[gtid()]; + uint64_t sum = 0; + for (float i = 0; i < nrounds; i++) sum += mwc_next(&rctx); + sums[gtid()] = sum; + msts[gtid()] = rctx; +} +""" + +def build_mwc_seeds(nthreads, seed=None): + if seed: + rand = np.random.RandomState(seed) + else: + rand = np.random + + # Load raw big-endian u32 multipliers from primes.bin. + with open('primes.bin') as primefp: + dt = np.dtype(np.uint32).newbyteorder('B') + mults = np.frombuffer(primefp.read(), dtype=dt) + + # Create the seed structures. TODO: check that struct is 4-byte aligned + seeds = np.empty((3, nthreads), dtype=np.uint32, order='F') + + # Randomness in choosing multipliers is good, but larger multipliers + # have longer periods, which is also good. This is a compromise. + mults = np.array(mults[:nthreads*4]) + rand.shuffle(mults) + seeds[0][:] = mults[:nthreads] + + # Intentionally excludes both 0 and (2^32-1), as they can lead to + # degenerate sequences of period 0 + seeds[1] = rand.randint(1, 0xffffffff, size=nthreads) + seeds[2] = rand.randint(1, 0xffffffff, size=nthreads) + + return seeds + +def test_mwc(): + rounds = 5000 + nblocks = 64 + nthreads = 512 * nblocks + + seeds = build_mwc_seeds(nthreads, seed = 5) + dseeds = cuda.to_device(seeds) + + mod = SourceModule(testsrc) + + for trial in range(2): + print "Trial %d, on CPU: " % trial, + sums = np.zeros(nthreads, dtype=np.uint64) + ctime = time.time() + mults = seeds[0].astype(np.uint64) + states = seeds[1] + carries = seeds[2] + for i in range(rounds): + step = np.frombuffer((mults * states + carries).data, + dtype=np.uint32).reshape((2, nthreads), order='F') + states[:] = step[0] + carries[:] = step[1] + sums += states + + ctime = time.time() - ctime + print "Took %g seconds." % ctime + + print "Trial %d, on device: " % trial, + dsums = cuda.mem_alloc(8*nthreads) + fun = mod.get_function("test_mwc") + dtime = fun(dseeds, dsums, np.float32(rounds), + block=(512,1,1), grid=(nblocks,1), time_kernel=True) + print "Took %g seconds." % dtime + dsums = cuda.from_device(dsums, nthreads, np.uint64) + if not np.all(np.equal(sums, dsums)): + print "Sum discrepancy!" + print sums + print dsums + diff --git a/cuburn/device_code.py b/cuburn/device_code.py deleted file mode 100644 index 49228de..0000000 --- a/cuburn/device_code.py +++ /dev/null @@ -1,584 +0,0 @@ -""" -Contains the PTX fragments which will drive the device. -""" - -import os -import time -import struct - -import pycuda.driver as cuda -import numpy as np - -from pyptx import ptx, run, util -from cuburn.variations import Variations - -class IterThread(object): - def __init__(self, entry, features): - self.features = features - self.mwc = MWCRNG(entry) - self.cp = util.DataStream(entry) - self.vars = Variations(features) - self.camera = CameraTransform(features) - self.hist = HistScatter(entry, features) - self.shuf = ShufflePoints(entry) - - entry.add_param('u32', 'num_cps') - entry.add_ptr_param('u32', 'cp_started_count') - entry.add_ptr_param('u8', 'cp_data') - - with entry.body(): - self.entry_body(entry) - - def entry_body(self, entry): - e, r, o, m, p, s = entry.locals - # Index of this CTA's current CP - e.declare_mem('shared', 'u32', 'cp_idx') - - # Number of samples that have been generated so far in this CTA - # If this number is negative, we're still fusing points, so this - # behaves slightly differently (see ``fuse_loop_start``) - # TODO: replace (or at least simplify) this logic - e.declare_mem('shared', 'u32', 'num_samples') - - # The per-warp transform selection indices - e.declare_mem('shared', 'f32', 'xf_sel', e.nwarps_cta) - - # TODO: re-add this logic using the printf formatter. - #mem.local.u32('l_num_rounds') - #mem.local.u32('l_num_writes') - #op.st.local.u32(addr(l_num_rounds), 0) - #op.st.local.u32(addr(l_num_writes), 0) - - # Declare IFS-space coordinates for doing iterations - r.x, r.y, r.color = r.f32(), r.f32(), r.f32() - r.x, r.y = self.mwc.next_f32_11(), self.mwc.next_f32_11() - r.color = self.mwc.next_f32_01() - - # This thread's sample's good/bad/fusing state - r.consec_bad = r.f32(-self.features.fuse) - - e.comment("The main loop entry point") - cp_loop_start = e.label() - with s.tid_x == 0: - o.st(m.cp_idx.addr, o.atom.add(p.cp_started_count[0], 1)) - o.st(m.num_samples.addr, 0) - - e.comment("Load the CP index in all threads") - o.bar.sync(0) - cp_idx = o.ld.volatile(m.cp_idx.addr) - - e.comment("Check to see if this CP is valid (if not, we're done)") - all_cps_done = e.forward_label() - with cp_idx < p.num_cps.val: - o.bra.uni(all_cps_done) - self.cp.addr = p.cp_data[cp_idx * self.cp.stream_size] - - loop_start = e.forward_label() - with s.tid_x < e.nwarps_cta: - o.bra(loop_start) - - e.comment("Choose the xform for each warp") - choose_xform = e.label() - o.st.volatile(m.xf_sel[s.tid_x], self.mwc.next_f32_01()) - - e.declare_label(loop_start) - e.comment("Execute the xform given by xf_sel") - xf_labels = [e.forward_label() for xf in self.features.xforms] - xf_sel = o.ld.volatile(m.xf_sel[s.tid_x >> 5]) - for i, xf in enumerate(self.features.xforms): - xf_density = self.cp.get.f32('cp.xforms[%d].cweight'%xf.id) - with xf_density <= xf_sel: - o.bra.uni(xf_labels[i]) - - e.comment("This code should be unreachable") - o.trap() - - xforms_done = e.forward_label() - for i, xf in enumerate(self.features.xforms): - e.declare_label(xf_labels[i]) - r.x, r.y, r.color = self.vars.apply_xform( - e, self.cp, r.x, r.y, r.color, xf.id) - o.bra.uni(xforms_done) - - e.comment("Determine write location, and whether point is valid") - e.declare_label(xforms_done) - histidx, is_valid = self.camera.get_index(e, self.cp, r.x, r.y) - is_valid &= (r.consec_bad >= 0) - - e.comment("Scatter point to pointbuffer") - self.hist.scatter(self.cp, histidx, r.color, 0, is_valid) - - done_picking_new_point = e.forward_label() - with ~is_valid: - r.consec_bad += 1 - with r.consec_bad < self.features.max_bad: - o.bra(done_picking_new_point) - - e.comment("If too many consecutive bad values, pick a new point") - r.x, r.y = self.mwc.next_f32_11(), self.mwc.next_f32_11() - r.color = self.mwc.next_f32_01() - r.consec_bad = -self.features.fuse - - e.declare_label(done_picking_new_point) - - e.comment("Determine number of good samples, and whether we're done") - num_samples = o.ld(m.num_samples.addr) - num_samples += o.bar.red.popc(0, is_valid) - with s.tid_x == 0: - o.st(m.num_samples, num_samples) - with num_samples >= self.cp.get.u32('nsamples'): - o.bra.uni(cp_loop_start) - - self.shuf.shuffle(e, r.x, r.y, r.color, r.consec_bad) - - with s.tid_x < e.nwarps_cta: - o.bra(choose_xform) - o.bra(loop_start) - - e.declare_label(all_cps_done) - - def upload_cp_stream(self, ctx, cp_stream, num_cps): - cp_array_dp, cp_array_l = ctx.mod.get_global('g_cp_array') - assert len(cp_stream) <= cp_array_l, "Stream too big!" - cuda.memcpy_htod(cp_array_dp, cp_stream) - - num_cps_dp, num_cps_l = ctx.mod.get_global('g_num_cps') - cuda.memset_d32(num_cps_dp, num_cps, 1) - # TODO: "if debug >= 3" - print "Uploaded stream to card:" - CPDataStream.print_record(ctx, cp_stream, 5) - self.cps_uploaded = True - - def call_setup(self, ctx): - if not self.cps_uploaded: - raise Error("Cannot call IterThread before uploading CPs") - num_cps_st_dp, num_cps_st_l = ctx.mod.get_global('g_num_cps_started') - cuda.memset_d32(num_cps_st_dp, 0, 1) - - def _call(self, ctx, func): - # Get texture reference from the Palette - # TODO: more elegant method than reaching into ctx.ptx? - tr = ctx.ptx.instances[PaletteLookup].texref - super(IterThread, self)._call(ctx, func, texrefs=[tr]) - - def call_teardown(self, ctx): - def print_thing(s, a): - print '%s:' % s - for i, r in enumerate(a): - for j in range(0,len(r),ctx.warps_per_cta): - print '%2d' % i, - for k in range(j,j+ctx.warps_per_cta,8): - print '\t' + ' '.join( - ['%8g'%np.mean(r[l]) for l in range(k,k+8)]) - - rounds = ctx.get_per_thread('g_num_rounds', np.int32, shaped=True) - writes = ctx.get_per_thread('g_num_writes', np.int32, shaped=True) - print_thing("Rounds", rounds) - print_thing("Writes", writes) - print "Total number of rounds:", np.sum(rounds) - - dp, l = ctx.mod.get_global('g_num_cps_started') - cps_started = cuda.from_device(dp, 1, np.uint32) - print "CPs started:", cps_started - -class CameraTransform(object): - def __init__(self, features): - self.features = features - - def rotate(self, cp, x, y): - """ - Rotate an IFS-space coordinate as defined by the camera. - """ - if not self.features.camera_rotation: - return x, y - rot_center_x, rot_center_y = cp.get.v2.f32('cp.rot_center[0]', - 'cp.rot_center[1]') - tx, ty = x - rot_center_x, y - rot_center_y - rot_cos_t, rot_sin_t = cp.get.v2.f32('cos(cp.rotate * 2 * pi / 360.)', - '-sin(cp.rotate * 2 * pi / 360.)') - rx = tx * rot_cos_t + ty * rot_sin_t + rot_center_x - ry = tx * (-rot_sin_t) + ty * rot_cos_t + rot_center_y - return rx, ry - - def get_norm(self, cp, x, y): - """ - Find the [0,1]-normalized floating-point histogram coordinates - ``norm_x, norm_y`` from the given IFS-space coordinates ``x, y``. - """ - rx, ry = self.rotate(cp, x, y) - cam_scale, cam_offset = cp.get.v2.f32( - 'cp.camera.norm_scale[0]', 'cp.camera.norm_offset[0]') - norm_x = rx * cam_scale + cam_offset - cam_scale, cam_offset = cp.get.v2.f32( - 'cp.camera.norm_scale[1]', 'cp.camera.norm_offset[1]') - norm_y = ry * cam_scale + cam_offset - return norm_x, norm_y - - def get_index(self, entry, cp, x, y): - """ - Find the histogram index (as a u32) from the IFS spatial coordinate in - ``x, y``. Returns ``index, oob``, where ``oob`` is a predicate value - that is set if the result is out of bounds. - """ - o = entry.ops - rx, ry = self.rotate(cp, x, y) - cam_scale, cam_offset = cp.get.v2.f32( - 'cp.camera.idx_scale[0]', 'cp.camera.idx_offset[0]') - idx_x = rx * cam_scale + cam_offset - cam_scale, cam_offset = cp.get.v2.f32( - 'cp.camera.idx_scale[1]', 'cp.camera.idx_offset[1]') - idx_y = ry * cam_scale + cam_offset - - idx_x_u32 = o.cvt.rzi.s32(idx_x) - idx_y_u32 = o.cvt.rzi.s32(idx_y) - oob = o.setp.lt.u32(idx_x_u32, self.features.hist_width) - oob |= o.setp.lt.u32(idx_y_u32, self.features.hist_height) - - idx = idx_y_u32 * self.features.hist_stride + idx_x_u32 - return idx, oob - -class PaletteLookup(object): - def __init__(self, entry, features): - self.entry, self.features = entry, features - #entry.declare_mem('global', 'texref', 'palette') - - def look_up(self, color, norm_time): - """ - Look up the values of ``r, g, b, a`` corresponding to ``color_coord`` - at the CP indexed in ``timestamp_idx``. Note that both ``color_coord`` - and ``timestamp_idx`` should be [0,1]-normalized floats. - """ - n = self.entry.ops.mov.f32 - return n(1), n(1), n(1), n(1) - #o.tex._2d.v4.f32.f32(m.palette, norm_time, color) - if features.non_box_temporal_filter: - raise NotImplementedError("Non-box temporal filters not supported") - - def upload_palette(self, ctx, frame, cp_list): - """ - Extract the palette from the given list of interpolated CPs, and upload - it to the device as a texture. - """ - # TODO: figure out if storing the full list is an actual drag on - # performance/memory - if frame.center_cp.temporal_filter_type != 0: - # TODO: make texture sample based on time, not on CP index - raise NotImplementedError("Use box temporal filters for now") - pal = np.ndarray((self.texheight, 256, 4), dtype=np.float32) - inv = float(len(cp_list) - 1) / (self.texheight - 1) - for y in range(self.texheight): - for x in range(256): - for c in range(4): - # TODO: interpolate here? - cy = int(round(y * inv)) - pal[y][x][c] = cp_list[cy].palette.entries[x].color[c] - dev_array = cuda.make_multichannel_2d_array(pal, "C") - self.texref = ctx.mod.get_texref('t_palette') - # TODO: float16? or can we still use interp with int storage? - self.texref.set_format(cuda.array_format.FLOAT, 4) - self.texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) - self.texref.set_filter_mode(cuda.filter_mode.LINEAR) - self.texref.set_address_mode(0, cuda.address_mode.CLAMP) - self.texref.set_address_mode(1, cuda.address_mode.CLAMP) - self.texref.set_array(dev_array) - self.pal = pal - - def call_setup(self, ctx): - assert self.texref, "Must upload palette texture before launch!" - -class HistScatter(object): - def __init__(self, entry, features): - self.entry, self.features = entry, features - self.palette = PaletteLookup(entry, features) - entry.add_ptr_param('f32', 'hist_bins') - - def scatter(self, cp, hist_index, color, xf_idx, p_valid): - """ - Scatter the given point directly to the histogram bins. - """ - e, r, o, m, p, s = self.entry.locals - norm_time = cp.get.f32('cp.norm_time') - base = p.hist_bins[4*hist_index] - colors = self.palette.look_up(color, norm_time) - g_colors = o.ld.v4(base) - for col, gcol in zip(colors, g_colors): - gcol += col - o.st.v4(base, *g_colors) - - -class ShufflePoints(object): - """ - Shuffle points in shared memory. See helpers/shuf.py for details. - """ - def __init__(self, entry): - entry.declare_mem('shared', 'b32', 'shuf_data', entry.nthreads_cta) - - def shuffle(self, entry, *args, **kwargs): - """ - Shuffle the data from each register in args across threads. Keyword - argument ``bar`` specifies which barrier to use (default is 0). Each - register is overwritten in place. - """ - e, r, o, m, p, s = entry.locals - bar = kwargs.pop('bar', 0) - assert not kwargs, "Unrecognized keyword arguments." - - e.comment("Calculate read and write offsets for shuffle") - # See helpers/shuf.py for details - shuf_write = m.shuf_data[s.tid_x] - shuf_read = m.shuf_data[(s.tid_x + (32 * s.laneid)) & - (e.nthreads_cta - 1)] - for var in args: - o.bar.sync(bar) - o.st.volatile(shuf_write, var) - o.bar.sync(bar) - var.val = o.ld.volatile(shuf_read) - -class MWCRNG(object): - """ - Marsaglia multiply-with-carry random number generator. Produces very long - periods with sufficient statistical properties using only three 32-bit - state registers. Since each thread uses a separate multiplier, no two - threads will ever be on the same sequence, but beyond this the independence - of each thread's sequence was not explicitly tested. - - The RNG must be seeded at least once per entry point using the ``seed`` - method. - """ - def __init__(self, entry): - # TODO: install this in data directory or something - if not os.path.isfile('primes.bin'): - raise EnvironmentError('primes.bin not found') - self.nthreads_ready = 0 - self.mults, self.state = None, None - self.entry = entry - - entry.add_ptr_param('u32', 'mwc_mults') - entry.add_ptr_param('u32', 'mwc_states') - - with entry.head(): - self.entry_head() - entry.tail_callback(self.entry_tail) - - def entry_head(self): - e, r, o, m, p, s = self.entry.locals - gtid = s.ctaid_x * s.ntid_x + s.tid_x - r.mwc_mult, r.mwc_state, r.mwc_carry = r.u32(), r.u32(), r.u32() - r.mwc_mult = o.ld(p.mwc_mults[gtid]) - r.mwc_state, r.mwc_carry = o.ld.v2(p.mwc_states[2*gtid]) - - def entry_tail(self): - e, r, o, m, p, s = self.entry.locals - gtid = s.ctaid_x * s.ntid_x + s.tid_x - o.st.v2.u32(p.mwc_states[2*gtid], r.mwc_state, r.mwc_carry) - - def next_b32(self): - e, r, o, m, p, s = self.entry.locals - carry = o.cvt.u64(r.mwc_carry) - mwc_out = o.mad.wide(r.mwc_mult, r.mwc_state, carry) - r.mwc_state, r.mwc_carry = o.split.v2(mwc_out) - return r.mwc_state - - def next_f32_01(self): - e, r, o, m, p, s = self.entry.locals - mwc_float = o.cvt.rn.f32.u32(self.next_b32()) - return o.mul.f32(mwc_float, 1./(1<<32)) - - def next_f32_11(self): - e, r, o, m, p, s = self.entry.locals - mwc_float = o.cvt.rn.f32.s32(self.next_b32()) - return o.mul.f32(mwc_float, 1./(1<<31)) - - def seed(self, ctx, seed=None, force=False): - """ - Seed the random number generators with values taken from a - ``np.random`` instance. - """ - if force or self.nthreads_ready < ctx.nthreads: - if seed: - rand = np.random.RandomState(seed) - else: - rand = np.random - # Load raw big-endian u32 multipliers from primes.bin. - with open('primes.bin') as primefp: - dt = np.dtype(np.uint32).newbyteorder('B') - mults = np.frombuffer(primefp.read(), dtype=dt) - # Randomness in choosing multipliers is good, but larger multipliers - # have longer periods, which is also good. This is a compromise. - mults = np.array(mults[:ctx.nthreads*4]) - rand.shuffle(mults) - #locked_mults = ctx.hostpool.allocate(ctx.nthreads, np.uint32) - #locked_mults[:] = mults[ctx.nthreads] - #self.mults = ctx.pool.allocate(4*ctx.nthreads) - #cuda.memcpy_htod_async(self.mults, locked_mults.base, ctx.stream) - self.mults = cuda.mem_alloc(4*ctx.nthreads) - cuda.memcpy_htod(self.mults, mults[:ctx.nthreads].tostring()) - # Intentionally excludes both 0 and (2^32-1), as they can lead to - # degenerate sequences of period 0 - states = np.array(rand.randint(1, 0xffffffff, size=2*ctx.nthreads), - dtype=np.uint32) - #locked_states = ctx.hostpool.allocate(2*ctx.nthreads, np.uint32) - #locked_states[:] = states - #self.states = ctx.pool.allocate(8*ctx.nthreads) - #cuda.memcpy_htod_async(self.states, locked_states, ctx.stream) - self.states = cuda.mem_alloc(8*ctx.nthreads) - cuda.memcpy_htod(self.states, states.tostring()) - self.nthreads_ready = ctx.nthreads - ctx.set_param('mwc_mults', self.mults) - ctx.set_param('mwc_states', self.states) - -class MWCRNGTest(object): - """ - Test the ``MWCRNG`` class. This is not a test of the generator's - statistical properties, but merely a test that the generator is implemented - correctly on the GPU. - """ - rounds = 5000 - - def __init__(self, entry): - self.mwc = MWCRNG(entry) - entry.add_ptr_param('u64', 'mwc_test_sums') - - with entry.body(): - self.entry_body(entry) - - def entry_body(self, entry): - e, r, o, m, p, s = entry.locals - r.sum = r.u64(0) - r.count = r.f32(self.rounds) - start = e.label() - r.sum = r.sum + o.cvt.u64.u32(self.mwc.next_b32()) - r.count = r.count - 1 - with r.count > 0: - o.bra.uni(start) - e.comment('yay') - gtid = s.ctaid_x * s.ntid_x + s.tid_x - o.st(p.mwc_test_sums[gtid], r.sum) - - def run_test(self, ctx): - self.mwc.seed(ctx) - mults = cuda.from_device(self.mwc.mults, ctx.nthreads, np.uint32) - states = cuda.from_device(self.mwc.states, ctx.nthreads, np.uint64) - - for trial in range(2): - print "Trial %d, on CPU: " % trial, - sums = np.zeros_like(states) - ctime = time.time() - for i in range(self.rounds): - vals = states & 0xffffffff - carries = states >> 32 - states = mults * vals + carries - sums += states & 0xffffffff - ctime = time.time() - ctime - print "Took %g seconds." % ctime - - print "Trial %d, on device: " % trial, - dsums = cuda.mem_alloc(8*ctx.nthreads) - ctx.set_param('mwc_test_sums', dsums) - print "Took %g seconds." % ctx.call_timed() - dsums = cuda.from_device(dsums, ctx.nthreads, np.uint64) - if not np.all(np.equal(sums, dsums)): - print "Sum discrepancy!" - print sums - print dsums - -class MWCRNGFloatsTest(object): - """ - Note this only tests that the distributions are in the correct range, *not* - that they have good random properties. MWC is a suitable algorithm, but - implementation bugs may still lead to poor performance. - """ - rounds = 1024 - entry_name = 'MWC_RNG_floats_test' - - def deps(self): - return [MWCRNG] - - def module_setup(self): - mem.global_.f32('mwc_rng_float_01_test_sums', ctx.nthreads) - mem.global_.f32('mwc_rng_float_01_test_mins', ctx.nthreads) - mem.global_.f32('mwc_rng_float_01_test_maxs', ctx.nthreads) - mem.global_.f32('mwc_rng_float_11_test_sums', ctx.nthreads) - mem.global_.f32('mwc_rng_float_11_test_mins', ctx.nthreads) - mem.global_.f32('mwc_rng_float_11_test_maxs', ctx.nthreads) - - def loop(self, kind): - with block('Sum %d floats in %s' % (self.rounds, kind)): - reg.f32('loopct val rsum rmin rmax') - reg.pred('p_done') - op.mov.f32(loopct, 0.) - op.mov.f32(rsum, 0.) - op.mov.f32(rmin, 2.) - op.mov.f32(rmax, -2.) - label('loopstart' + kind) - getattr(mwc, 'next_f32_' + kind)(val) - op.add.f32(rsum, rsum, val) - op.min.f32(rmin, rmin, val) - op.max.f32(rmax, rmax, val) - op.add.f32(loopct, loopct, 1.) - op.setp.ge.f32(p_done, loopct, float(self.rounds)) - op.bra('loopstart' + kind, ifnotp=p_done) - op.mul.f32(rsum, rsum, 1./self.rounds) - std.store_per_thread('mwc_rng_float_%s_test_sums' % kind, rsum, - 'mwc_rng_float_%s_test_mins' % kind, rmin, - 'mwc_rng_float_%s_test_maxs' % kind, rmax) - - def entry(self): - self.loop('01') - self.loop('11') - - def call_teardown(self, ctx): - # Tolerance of all-threads averages - tol = 0.05 - # float distribution kind, test kind, expected value, limit func - tests = [ - ('01', 'sums', 0.5, None), - ('01', 'mins', 0.0, np.min), - ('01', 'maxs', 1.0, np.max), - ('11', 'sums', 0.0, None), - ('11', 'mins', -1.0, np.min), - ('11', 'maxs', 1.0, np.max) - ] - - for fkind, rkind, exp, lim in tests: - name = 'mwc_rng_float_%s_test_%s' % (fkind, rkind) - vals = ctx.get_per_thread(name, np.float32) - avg = np.mean(vals) - if np.abs(avg - exp) > tol: - raise PTXTestFailure("%s %s %g too far from %g" % - (fkind, rkind, avg, exp)) - if lim is None: continue - if lim([lim(vals), exp]) != exp: - raise PTXTestFailure("%s %s %g violates hard limit %g" % - (fkind, rkind, lim(vals), exp)) - -class CPDataStream(object): - """DataStream which stores the control points.""" - shortname = 'cp' - -class Timeouter(object): - """Time-out infinite loops so that data can still be retrieved.""" - shortname = 'timeout' - - def entry_setup(self): - mem.shared.u64('s_timeouter_start_time') - with block("Load start time for this block"): - reg.u64('now') - op.mov.u64(now, '%clock64') - op.st.shared.u64(addr(s_timeouter_start_time), now) - - def check_time(self, secs): - """ - Drop this into your mainloop somewhere. - """ - # TODO: if debug.device_timeout_loops or whatever - with block("Check current time for this loop"): - d = cuda.Context.get_device() - clks = int(secs * d.clock_rate * 1000) - reg.u64('now then') - op.mov.u64(now, '%clock64') - op.ld.shared.u64(then, addr(s_timeouter_start_time)) - op.sub.u64(now, now, then) - std.asrt("Loop timed out", 'lt.u64', now, clks) - - diff --git a/main.py b/main.py index c052a51..496af8e 100644 --- a/main.py +++ b/main.py @@ -21,29 +21,18 @@ from fr0stlib.pyflam3 import * from fr0stlib.pyflam3._flam3 import * import pyglet +import pycuda.autoinit + from cuburn.render import * +from cuburn.code.mwc import test_mwc -def mwctest(): - mwcent = ptx.Entry("mwc_test", 512) - mwctest = MWCRNGTest(mwcent) - - # Get the source for saving and disassembly before potentially crashing - mod = ptx.Module([mwcent]) - print '\n'.join(['%4d %s' % t for t in enumerate(mod.source.split('\n'))]) - util.disass(mod) - - mod = run.Module([mwcent]) - mod.print_func_info() - - ctx = mod.get_context('mwc_test', 14) - mwctest.run_test(ctx) - def main(args): - #mwctest() + test_mwc() + return with open(args[-1]) as fp: - genomes = Genome.from_string(fp.read()) - anim = Animation(genomes) + genomes, ngenomes = Genome.from_string(fp.read()) + anim = Animation(genomes, ngenomes) anim.compile() bins = anim.render_frame() w, h = anim.features.hist_width, anim.features.hist_height