mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 11:40:04 -05:00
562 lines
21 KiB
Python
562 lines
21 KiB
Python
"""
|
|
Contains the PTX fragments which will drive the device.
|
|
"""
|
|
|
|
import os
|
|
import time
|
|
import struct
|
|
|
|
import pycuda.driver as cuda
|
|
import numpy as np
|
|
|
|
from cuburnlib.ptx import *
|
|
|
|
class IterThread(PTXEntryPoint):
|
|
entry_name = 'iter_thread'
|
|
entry_params = []
|
|
|
|
def __init__(self):
|
|
self.cps_uploaded = False
|
|
|
|
def deps(self):
|
|
return [MWCRNG, CPDataStream, HistScatter]
|
|
|
|
@ptx_func
|
|
def module_setup(self):
|
|
mem.global_.u32('g_cp_array',
|
|
cp.stream_size*features.max_ntemporal_samples)
|
|
mem.global_.u32('g_num_cps')
|
|
mem.global_.u32('g_num_cps_started')
|
|
# TODO move into debug statement
|
|
mem.global_.u32('g_num_rounds', ctx.threads)
|
|
mem.global_.u32('g_num_writes', ctx.threads)
|
|
|
|
@ptx_func
|
|
def entry(self):
|
|
# For now, we indulge in the luxury of shared memory.
|
|
|
|
# Index number of current CP, shared across CTA
|
|
mem.shared.u32('s_cp_idx')
|
|
|
|
# Number of samples that have been generated so far in this CTA
|
|
# If this number is negative, we're still fusing points, so this
|
|
# behaves slightly differently (see ``fuse_loop_start``)
|
|
mem.shared.u32('s_num_samples')
|
|
op.st.shared.u32(addr(s_num_samples), -(features.num_fuse_samples+1))
|
|
|
|
# TODO: temporary, for testing
|
|
reg.u32('num_rounds num_writes')
|
|
op.mov.u32(num_rounds, 0)
|
|
op.mov.u32(num_writes, 0)
|
|
|
|
reg.f32('x_coord y_coord color_coord')
|
|
mwc.next_f32_11(x_coord)
|
|
mwc.next_f32_11(y_coord)
|
|
mwc.next_f32_01(color_coord)
|
|
|
|
comment("Ensure all init is done")
|
|
op.bar.sync(0)
|
|
|
|
label('cp_loop_start')
|
|
reg.u32('cp_idx cpA')
|
|
with block("Claim a CP"):
|
|
std.set_is_first_thread(reg.pred('p_is_first'))
|
|
op.atom.add.u32(cp_idx, addr(g_num_cps_started), 1, ifp=p_is_first)
|
|
op.st.shared.u32(addr(s_cp_idx), cp_idx, ifp=p_is_first)
|
|
op.st.shared.u32(addr(s_num_samples), 0, ifp=p_is_first)
|
|
|
|
comment("Load the CP index in all threads")
|
|
op.bar.sync(1)
|
|
op.ld.shared.u32(cp_idx, addr(s_cp_idx))
|
|
|
|
with block("Check to see if this CP is valid (if not, we're done)"):
|
|
reg.u32('num_cps')
|
|
reg.pred('p_last_cp')
|
|
op.ldu.u32(num_cps, addr(g_num_cps))
|
|
op.setp.ge.u32(p_last_cp, cp_idx, num_cps)
|
|
op.bra.uni('all_cps_done', ifp=p_last_cp)
|
|
|
|
with block('Load CP address'):
|
|
op.mov.u32(cpA, g_cp_array)
|
|
op.mad.lo.u32(cpA, cp_idx, cp.stream_size, cpA)
|
|
|
|
label('fuse_loop_start')
|
|
# When fusing, num_samples holds the (negative) number of iterations
|
|
# left across the CP, rather than the number of samples in total.
|
|
with block("If still fusing, increment count unconditionally"):
|
|
std.set_is_first_thread(reg.pred('p_is_first'))
|
|
op.red.shared.add.s32(addr(s_num_samples), 1, ifp=p_is_first)
|
|
op.bar.sync(2)
|
|
|
|
label('iter_loop_start')
|
|
|
|
comment('Do... well, most of everything')
|
|
|
|
mwc.next_f32_11(x_coord)
|
|
mwc.next_f32_11(y_coord)
|
|
mwc.next_f32_01(color_coord)
|
|
|
|
op.add.u32(num_rounds, num_rounds, 1)
|
|
|
|
with block("Test if we're still in FUSE"):
|
|
reg.s32('num_samples')
|
|
reg.pred('p_in_fuse')
|
|
op.ld.shared.s32(num_samples, addr(s_num_samples))
|
|
op.setp.lt.s32(p_in_fuse, num_samples, 0)
|
|
op.bra.uni(fuse_loop_start, ifp=p_in_fuse)
|
|
|
|
reg.pred('p_point_is_valid')
|
|
with block("Write the result"):
|
|
hist.scatter(x_coord, y_coord, color_coord, 0, p_point_is_valid)
|
|
op.add.u32(num_writes, num_writes, 1, ifp=p_point_is_valid)
|
|
|
|
with block("Increment number of samples by number of good values"):
|
|
reg.b32('good_samples laneid')
|
|
reg.pred('p_is_first')
|
|
op.vote.ballot.b32(good_samples, p_point_is_valid)
|
|
op.popc.b32(good_samples, good_samples)
|
|
op.mov.u32(laneid, '%laneid')
|
|
op.setp.eq.u32(p_is_first, laneid, 0)
|
|
op.red.shared.add.s32(addr(s_num_samples), good_samples,
|
|
ifp=p_is_first)
|
|
|
|
with block("Check to see if we're done with this CP"):
|
|
reg.pred('p_cp_done')
|
|
reg.s32('num_samples num_samples_needed')
|
|
op.ld.shared.s32(num_samples, addr(s_num_samples))
|
|
cp.get(cpA, num_samples_needed, 'cp.nsamples')
|
|
op.setp.ge.s32(p_cp_done, num_samples, num_samples_needed)
|
|
op.bra.uni(cp_loop_start, ifp=p_cp_done)
|
|
|
|
op.bra.uni(iter_loop_start)
|
|
|
|
label('all_cps_done')
|
|
# TODO this is for testing, move it to a debug statement
|
|
std.store_per_thread(g_num_rounds, num_rounds)
|
|
std.store_per_thread(g_num_writes, num_writes)
|
|
|
|
@instmethod
|
|
def upload_cp_stream(self, ctx, cp_stream, num_cps):
|
|
cp_array_dp, cp_array_l = ctx.mod.get_global('g_cp_array')
|
|
assert len(cp_stream) <= cp_array_l, "Stream too big!"
|
|
cuda.memcpy_htod_async(cp_array_dp, cp_stream)
|
|
|
|
num_cps_dp, num_cps_l = ctx.mod.get_global('g_num_cps')
|
|
cuda.memset_d32(num_cps_dp, num_cps, 1)
|
|
# TODO: "if debug >= 3"
|
|
print "Uploaded stream to card:"
|
|
CPDataStream.print_record(ctx, cp_stream, 5)
|
|
self.cps_uploaded = True
|
|
|
|
@instmethod
|
|
def call(self, ctx):
|
|
if not self.cps_uploaded:
|
|
raise Error("Cannot call IterThread before uploading CPs")
|
|
num_cps_st_dp, num_cps_st_l = ctx.mod.get_global('g_num_cps_started')
|
|
cuda.memset_d32(num_cps_st_dp, 0, 1)
|
|
|
|
func = ctx.mod.get_function('iter_thread')
|
|
tr = ctx.ptx.instances[PaletteLookup].texref
|
|
dtime = func(block=ctx.block, grid=ctx.grid, time_kernel=True,
|
|
texrefs=[tr])
|
|
|
|
shape = (ctx.grid[0], ctx.block[0]/32, 32)
|
|
num_rounds_dp, num_rounds_l = ctx.mod.get_global('g_num_rounds')
|
|
num_writes_dp, num_writes_l = ctx.mod.get_global('g_num_writes')
|
|
rounds = cuda.from_device(num_rounds_dp, shape, np.int32)
|
|
writes = cuda.from_device(num_writes_dp, shape, np.int32)
|
|
print "Rounds:", sum(rounds)
|
|
print "Writes:", sum(writes)
|
|
print rounds
|
|
print writes
|
|
|
|
class CameraTransform(PTXFragment):
|
|
shortname = 'camera'
|
|
def deps(self):
|
|
return [CPDataStream]
|
|
|
|
@ptx_func
|
|
def rotate(self, rotated_x, rotated_y, x, y):
|
|
"""
|
|
Rotate an IFS-space coordinate as defined by the camera.
|
|
"""
|
|
if features.camera_rotation:
|
|
assert rotated_x.name != x.name and rotated_y.name != y.name
|
|
with block("Rotate %s, %s to camera alignment" % (x, y)):
|
|
reg.f32('rot_center_x rot_center_y')
|
|
cp.get_v2(cpA, rot_center_x, 'cp.rot_center[0]',
|
|
rot_center_y, 'cp.rot_center[1]')
|
|
op.sub.f32(x, x, rot_center_x)
|
|
op.sub.f32(y, y, rot_center_y)
|
|
|
|
reg.f32('rot_sin_t rot_cos_t rot_old_x rot_old_y')
|
|
cp.get_v2(cpA, rot_cos_t, 'cos(cp.rotate * 2 * pi / 360.)',
|
|
rot_sin_t, '-sin(cp.rotate * 2 * pi / 360.)')
|
|
|
|
comment('rotated_x = x * cos(t) - y * sin(t) + rot_center_x')
|
|
op.fma.rn.f32(rotated_x, x, rot_cos_t, rot_center_x)
|
|
op.fma.rn.f32(rotated_x, y, rot_sin_t, rotated_x)
|
|
|
|
op.neg.f32(rot_sin_t, rot_sin_t)
|
|
comment('rotated_y = x * sin(t) + y * cos(t) + rot_center_y')
|
|
op.fma.rn.f32(rotated_y, x, rot_sin_t, rot_center_y)
|
|
op.fma.rn.f32(rotated_y, y, rot_cos_t, rotated_y)
|
|
|
|
# TODO: if this is a register-critical section, reloading
|
|
# rot_center_[xy] here should save two regs. OTOH, if this is
|
|
# *not* reg-crit, moving the subtraction above to new variables
|
|
# may save a few clocks
|
|
op.add.f32(x, x, rot_center_x)
|
|
op.add.f32(y, y, rot_center_y)
|
|
else:
|
|
comment("No camera rotation in this kernel")
|
|
op.mov.f32(rotated_x, x)
|
|
op.mov.f32(rotated_y, y)
|
|
|
|
@ptx_func
|
|
def get_norm(self, norm_x, norm_y, x, y):
|
|
"""
|
|
Find the [0,1]-normalized floating-point histogram coordinates
|
|
``norm_x, norm_y`` from the given IFS-space coordinates ``x, y``.
|
|
"""
|
|
self.rotate(norm_x, norm_y, x, y)
|
|
with block("Scale rotated points to [0,1]-normalized coordinates"):
|
|
reg.f32('cam_scale cam_offset')
|
|
cp.get_v2(cpA, cam_scale, 'cp.camera.norm_scale[0]',
|
|
cam_offset, 'cp.camera.norm_offset[0]')
|
|
op.fma.f32(norm_x, norm_x, cam_scale, cam_offset)
|
|
cp.get_v2(cpA, cam_scale, 'cp.camera.norm_scale[1]',
|
|
cam_offset, 'cp.camera.norm_offset[1]')
|
|
op.fma.f32(norm_y, norm_y, cam_scale, cam_offset)
|
|
|
|
@ptx_func
|
|
def get_index(self, index, x, y, pred=None):
|
|
"""
|
|
Find the histogram index (as a u32) from the IFS spatial coordinate in
|
|
``x, y``.
|
|
|
|
If the coordinates are out of bounds, 0xffffffff will be stored to
|
|
``index``. If ``pred`` is given, it will be set if the point is valid,
|
|
and cleared if not.
|
|
"""
|
|
# A few instructions could probably be shaved off of this one
|
|
with block("Find histogram index"):
|
|
reg.f32('norm_x norm_y')
|
|
self.rotate(norm_x, norm_y, x, y)
|
|
comment('Scale and offset from IFS to index coordinates')
|
|
reg.f32('cam_scale cam_offset')
|
|
cp.get_v2(cpA, cam_scale, 'cp.camera.idx_scale[0]',
|
|
cam_offset, 'cp.camera.idx_offset[0]')
|
|
op.fma.rn.f32(norm_x, norm_x, cam_scale, cam_offset)
|
|
|
|
cp.get_v2(cpA, cam_scale, 'cp.camera.idx_scale[1]',
|
|
cam_offset, 'cp.camera.idx_offset[1]')
|
|
op.fma.rn.f32(norm_y, norm_y, cam_scale, cam_offset)
|
|
|
|
comment('Check for bad value')
|
|
reg.u32('index_x index_y')
|
|
if not pred:
|
|
pred = reg.pred('p_valid')
|
|
|
|
op.cvt.rzi.s32.f32(index_x, norm_x)
|
|
op.setp.ge.s32(pred, index_x, 0)
|
|
op.setp.lt.and_.s32(pred, index_x, features.hist_width, pred)
|
|
|
|
op.cvt.rzi.s32.f32(index_y, norm_y)
|
|
op.setp.ge.and_.s32(pred, index_y, 0, pred)
|
|
op.setp.lt.and_.s32(pred, index_y, features.hist_height, pred)
|
|
|
|
op.mad.lo.u32(index, index_y, features.hist_stride, index_x)
|
|
op.mov.u32(index, 0xffffffff, ifnotp=pred)
|
|
|
|
class PaletteLookup(PTXFragment):
|
|
shortname = "palette"
|
|
# Resolution of texture on device. Bigger = more palette rez, maybe slower
|
|
texheight = 16
|
|
|
|
def __init__(self):
|
|
self.texref = None
|
|
|
|
def deps(self):
|
|
return [CPDataStream]
|
|
|
|
@ptx_func
|
|
def module_setup(self):
|
|
mem.global_.texref('t_palette')
|
|
|
|
@ptx_func
|
|
def look_up(self, r, g, b, a, color, norm_time):
|
|
"""
|
|
Look up the values of ``r, g, b, a`` corresponding to ``color_coord``
|
|
at the CP indexed in ``timestamp_idx``. Note that both ``color_coord``
|
|
and ``timestamp_idx`` should be [0,1]-normalized floats.
|
|
"""
|
|
op.tex._2d.v4.f32.f32(vec(r, g, b, a),
|
|
addr([t_palette, ', ', vec(norm_time, color)]))
|
|
if features.non_box_temporal_filter:
|
|
raise NotImplementedError("Non-box temporal filters not supported")
|
|
|
|
@instmethod
|
|
def upload_palette(self, ctx, frame, cp_list):
|
|
"""
|
|
Extract the palette from the given list of interpolated CPs, and upload
|
|
it to the device as a texture.
|
|
"""
|
|
# TODO: figure out if storing the full list is an actual drag on
|
|
# performance/memory
|
|
if frame.center_cp.temporal_filter_type != 0:
|
|
# TODO: make texture sample based on time, not on CP index
|
|
raise NotImplementedError("Use box temporal filters for now")
|
|
pal = np.ndarray((self.texheight, 256, 4), dtype=np.float32)
|
|
inv = float(len(cp_list) - 1) / (self.texheight - 1)
|
|
for y in range(self.texheight):
|
|
for x in range(256):
|
|
for c in range(4):
|
|
# TODO: interpolate here?
|
|
cy = int(round(y * inv))
|
|
pal[y][x][c] = cp_list[cy].palette.entries[x].color[c]
|
|
dev_array = cuda.make_multichannel_2d_array(pal, "C")
|
|
self.texref = ctx.mod.get_texref('t_palette')
|
|
# TODO: float16? or can we still use interp with int storage?
|
|
self.texref.set_format(cuda.array_format.FLOAT, 4)
|
|
self.texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
|
|
self.texref.set_filter_mode(cuda.filter_mode.LINEAR)
|
|
self.texref.set_address_mode(0, cuda.address_mode.CLAMP)
|
|
self.texref.set_address_mode(1, cuda.address_mode.CLAMP)
|
|
self.texref.set_array(dev_array)
|
|
|
|
def device_init(self, ctx):
|
|
assert self.texref, "Must upload palette texture before launch!"
|
|
|
|
class HistScatter(PTXFragment):
|
|
shortname = "hist"
|
|
def deps(self):
|
|
return [CPDataStream, CameraTransform, PaletteLookup]
|
|
|
|
@ptx_func
|
|
def module_setup(self):
|
|
mem.global_.f32('g_hist_bins',
|
|
features.hist_height * features.hist_stride * 4)
|
|
|
|
@ptx_func
|
|
def entry_setup(self):
|
|
comment("For now, assume histogram bins have been cleared by host")
|
|
|
|
@ptx_func
|
|
def scatter(self, x, y, color, xf_idx, p_valid=None):
|
|
"""
|
|
Scatter the given point directly to the histogram bins. I think this
|
|
technique has the worst performance of all of 'em. Accesses ``cpA``
|
|
directly.
|
|
"""
|
|
with block("Scatter directly to buffer"):
|
|
if p_valid is None:
|
|
p_valid = reg.pred('p_valid')
|
|
reg.u32('hist_index')
|
|
camera.get_index(hist_index, x, y, p_valid)
|
|
reg.u32('hist_bin_addr')
|
|
op.mov.u32(hist_bin_addr, g_hist_bins)
|
|
op.mad.lo.u32(hist_bin_addr, hist_index, 16, hist_bin_addr)
|
|
|
|
reg.f32('r g b a norm_time')
|
|
cp.get(cpA, norm_time, 'cp.norm_time')
|
|
palette.look_up(r, g, b, a, color, norm_time)
|
|
# TODO: look up, scale by xform visibility
|
|
op.red.add.f32(addr(hist_bin_addr), r)
|
|
op.red.add.f32(addr(hist_bin_addr,4), g)
|
|
op.red.add.f32(addr(hist_bin_addr,8), b)
|
|
op.red.add.f32(addr(hist_bin_addr,12), a)
|
|
|
|
|
|
def device_init(self, ctx):
|
|
hist_bins_dp, hist_bins_l = ctx.mod.get_global('g_hist_bins')
|
|
cuda.memset_d32(hist_bins_dp, 0, hist_bins_l/4)
|
|
|
|
@instmethod
|
|
def get_bins(self, ctx, features):
|
|
hist_bins_dp, hist_bins_l = ctx.mod.get_global('g_hist_bins')
|
|
return cuda.from_device(hist_bins_dp,
|
|
(features.hist_height, features.hist_stride, 4),
|
|
dtype=np.float32)
|
|
|
|
class MWCRNG(PTXFragment):
|
|
shortname = "mwc"
|
|
|
|
def __init__(self):
|
|
self.rand = np.random
|
|
self.threads_ready = 0
|
|
if not os.path.isfile('primes.bin'):
|
|
raise EnvironmentError('primes.bin not found')
|
|
|
|
def set_seed(self, seed):
|
|
self.rand = np.random.mtrand.RandomState(seed)
|
|
|
|
@ptx_func
|
|
def module_setup(self):
|
|
mem.global_.u32('mwc_rng_mults', ctx.threads)
|
|
mem.global_.u64('mwc_rng_state', ctx.threads)
|
|
|
|
@ptx_func
|
|
def entry_setup(self):
|
|
reg.u32('mwc_st mwc_mult mwc_car')
|
|
with block('Load MWC multipliers and states'):
|
|
reg.u32('mwc_off mwc_addr')
|
|
std.get_gtid(mwc_off)
|
|
op.mov.u32(mwc_addr, mwc_rng_mults)
|
|
op.mad.lo.u32(mwc_addr, mwc_off, 4, mwc_addr)
|
|
op.ld.global_.u32(mwc_mult, addr(mwc_addr))
|
|
|
|
op.mov.u32(mwc_addr, mwc_rng_state)
|
|
op.mad.lo.u32(mwc_addr, mwc_off, 8, mwc_addr)
|
|
op.ld.global_.v2.u32(vec(mwc_st, mwc_car), addr(mwc_addr))
|
|
|
|
@ptx_func
|
|
def entry_teardown(self):
|
|
with block('Save MWC states'):
|
|
reg.u32('mwc_off mwc_addr')
|
|
std.get_gtid(mwc_off)
|
|
op.mov.u32(mwc_addr, mwc_rng_state)
|
|
op.mad.lo.u32(mwc_addr, mwc_off, 8, mwc_addr)
|
|
op.st.global_.v2.u32(addr(mwc_addr), vec(mwc_st, mwc_car))
|
|
|
|
@ptx_func
|
|
def _next(self):
|
|
# Call from inside a block!
|
|
reg.u64('mwc_out')
|
|
op.cvt.u64.u32(mwc_out, mwc_car)
|
|
op.mad.wide.u32(mwc_out, mwc_st, mwc_mult, mwc_out)
|
|
op.mov.b64(vec(mwc_st, mwc_car), mwc_out)
|
|
|
|
@ptx_func
|
|
def next_b32(self, dst_reg):
|
|
with block('Load next random u32 into ' + dst_reg.name):
|
|
self._next()
|
|
op.mov.u32(dst_reg, mwc_st)
|
|
|
|
@ptx_func
|
|
def next_f32_01(self, dst_reg):
|
|
# TODO: verify that this is the fastest-performance method
|
|
# TODO: verify that this actually does what I think it does
|
|
with block('Load random float [0,1] into ' + dst_reg.name):
|
|
self._next()
|
|
op.cvt.rn.f32.u32(dst_reg, mwc_st)
|
|
op.mul.f32(dst_reg, dst_reg, '0f2F800000') # 1./(1<<32)
|
|
|
|
@ptx_func
|
|
def next_f32_11(self, dst_reg):
|
|
with block('Load random float [-1,1) into ' + dst_reg.name):
|
|
reg.u32('mwc_to_float')
|
|
self._next()
|
|
op.cvt.rn.f32.s32(dst_reg, mwc_st)
|
|
op.mul.f32(dst_reg, dst_reg, '0f30000000') # 1./(1<<31)
|
|
|
|
def device_init(self, ctx):
|
|
if self.threads_ready >= ctx.threads:
|
|
# Already set up enough random states, don't push again
|
|
return
|
|
|
|
# Load raw big-endian u32 multipliers from primes.bin.
|
|
with open('primes.bin') as primefp:
|
|
dt = np.dtype(np.uint32).newbyteorder('B')
|
|
mults = np.frombuffer(primefp.read(), dtype=dt)
|
|
stream = cuda.Stream()
|
|
# Randomness in choosing multipliers is good, but larger multipliers
|
|
# have longer periods, which is also good. This is a compromise.
|
|
mults = np.array(mults[:ctx.threads*4])
|
|
self.rand.shuffle(mults)
|
|
# Copy multipliers and seeds to the device
|
|
multdp, multl = ctx.mod.get_global('mwc_rng_mults')
|
|
cuda.memcpy_htod_async(multdp, mults.tostring()[:multl])
|
|
# Intentionally excludes both 0 and (2^32-1), as they can lead to
|
|
# degenerate sequences of period 0
|
|
states = np.array(self.rand.randint(1, 0xffffffff, size=2*ctx.threads),
|
|
dtype=np.uint32)
|
|
statedp, statel = ctx.mod.get_global('mwc_rng_state')
|
|
cuda.memcpy_htod_async(statedp, states.tostring())
|
|
self.threads_ready = ctx.threads
|
|
|
|
def tests(self):
|
|
return [MWCRNGTest]
|
|
|
|
class MWCRNGTest(PTXTest):
|
|
name = "MWC RNG sum-of-threads"
|
|
rounds = 5000
|
|
entry_name = 'MWC_RNG_test'
|
|
entry_params = ''
|
|
|
|
def deps(self):
|
|
return [MWCRNG]
|
|
|
|
@ptx_func
|
|
def module_setup(self):
|
|
mem.global_.u64('mwc_rng_test_sums', ctx.threads)
|
|
|
|
@ptx_func
|
|
def entry(self):
|
|
reg.u64('sum addl')
|
|
reg.u32('addend')
|
|
op.mov.u64(sum, 0)
|
|
with block('Sum next %d random numbers' % self.rounds):
|
|
reg.u32('loopct')
|
|
reg.pred('p')
|
|
op.mov.u32(loopct, self.rounds)
|
|
label('loopstart')
|
|
mwc.next_b32(addend)
|
|
op.cvt.u64.u32(addl, addend)
|
|
op.add.u64(sum, sum, addl)
|
|
op.sub.u32(loopct, loopct, 1)
|
|
op.setp.gt.u32(p, loopct, 0)
|
|
op.bra.uni(loopstart, ifp=p)
|
|
|
|
with block('Store sum and state'):
|
|
reg.u32('adr offset')
|
|
std.get_gtid(offset)
|
|
op.mov.u32(adr, mwc_rng_test_sums)
|
|
op.mad.lo.u32(adr, offset, 8, adr)
|
|
op.st.global_.u64(addr(adr), sum)
|
|
|
|
def call(self, ctx):
|
|
# Get current multipliers and seeds from the device
|
|
multdp, multl = ctx.mod.get_global('mwc_rng_mults')
|
|
mults = cuda.from_device(multdp, ctx.threads, np.uint32)
|
|
statedp, statel = ctx.mod.get_global('mwc_rng_state')
|
|
fullstates = cuda.from_device(statedp, ctx.threads, np.uint64)
|
|
sums = np.zeros(ctx.threads, np.uint64)
|
|
|
|
print "Running %d states forward %d rounds" % (len(mults), self.rounds)
|
|
ctime = time.time()
|
|
for i in range(self.rounds):
|
|
states = fullstates & 0xffffffff
|
|
carries = fullstates >> 32
|
|
fullstates = mults * states + carries
|
|
sums = sums + (fullstates & 0xffffffff)
|
|
ctime = time.time() - ctime
|
|
print "Done on host, took %g seconds" % ctime
|
|
|
|
func = ctx.mod.get_function('MWC_RNG_test')
|
|
dtime = func(block=ctx.block, grid=ctx.grid, time_kernel=True)
|
|
print "Done on device, took %g seconds (%gx)" % (dtime, ctime/dtime)
|
|
dfullstates = cuda.from_device(statedp, ctx.threads, np.uint64)
|
|
if not (dfullstates == fullstates).all():
|
|
print "State discrepancy"
|
|
print dfullstates
|
|
print fullstates
|
|
return False
|
|
|
|
sumdp, suml = ctx.mod.get_global('mwc_rng_test_sums')
|
|
dsums = cuda.from_device(sumdp, ctx.threads, np.uint64)
|
|
if not (dsums == sums).all():
|
|
print "Sum discrepancy"
|
|
print dsums
|
|
print sums
|
|
return False
|
|
return True
|
|
|
|
class CameraCoordTransform(PTXFragment):
|
|
pass
|
|
|
|
class CPDataStream(DataStream):
|
|
"""DataStream which stores the control points."""
|
|
shortname = 'cp'
|
|
|