mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-06 04:00:05 -05:00
123 lines
3.8 KiB
Python
123 lines
3.8 KiB
Python
"""
|
|
The multiply-with-carry random number generator.
|
|
"""
|
|
|
|
import numpy as np
|
|
|
|
from cuburn.code.util import *
|
|
|
|
class MWC(HunkOCode):
|
|
decls = """
|
|
typedef struct {
|
|
uint32_t mul;
|
|
uint32_t state;
|
|
uint32_t carry;
|
|
} mwc_st;
|
|
"""
|
|
|
|
defs = r"""
|
|
__device__ uint32_t mwc_next(mwc_st &st) {
|
|
asm("{\n\t.reg .u64 val;\n\t"
|
|
"cvt.u64.u32 val, %0;\n\t"
|
|
"mad.wide.u32 val, %1, %2, val;\n\t"
|
|
"mov.b64 {%1, %0}, val;\n\t}\n\t"
|
|
: "+r"(st.carry), "+r"(st.state) : "r"(st.mul));
|
|
return st.state;
|
|
}
|
|
|
|
__device__ float mwc_next_01(mwc_st &st) {
|
|
return mwc_next(st) * (1.0f / 4294967296.0f);
|
|
}
|
|
|
|
__device__ float mwc_next_11(mwc_st &st) {
|
|
uint32_t val = mwc_next(st);
|
|
float ret;
|
|
asm("cvt.rn.f32.s32 %0, %1;\n\t"
|
|
"mul.f32 %0, %0, (1.0 / 2147483648.0);"
|
|
: "=f"(ret) : "r"(val));
|
|
return ret;
|
|
}
|
|
"""
|
|
|
|
@staticmethod
|
|
def make_seeds(nthreads, host_seed=None):
|
|
if host_seed:
|
|
rand = np.random.RandomState(host_seed)
|
|
else:
|
|
rand = np.random
|
|
|
|
# Load raw big-endian u32 multipliers from primes.bin.
|
|
with open('primes.bin') as primefp:
|
|
dt = np.dtype(np.uint32).newbyteorder('B')
|
|
mults = np.frombuffer(primefp.read(), dtype=dt)
|
|
|
|
# Create the seed structures. TODO: check that struct is 4-byte aligned
|
|
seeds = np.empty((3, nthreads), dtype=np.uint32, order='F')
|
|
|
|
# Randomness in choosing multipliers is good, but larger multipliers
|
|
# have longer periods, which is also good. This is a compromise.
|
|
mults = np.array(mults[:nthreads*4])
|
|
rand.shuffle(mults)
|
|
seeds[0][:] = mults[:nthreads]
|
|
|
|
# Intentionally excludes both 0 and (2^32-1), as they can lead to
|
|
# degenerate sequences of period 0
|
|
seeds[1] = rand.randint(1, 0xffffffff, size=nthreads)
|
|
seeds[2] = rand.randint(1, 0xffffffff, size=nthreads)
|
|
|
|
return seeds
|
|
|
|
class MWCTest(HunkOCode):
|
|
defs = """
|
|
__global__ void test_mwc(mwc_st *msts, uint64_t *sums, float nrounds) {
|
|
mwc_st rctx = msts[gtid()];
|
|
uint64_t sum = 0;
|
|
for (float i = 0; i < nrounds; i++) sum += mwc_next(rctx);
|
|
sums[gtid()] = sum;
|
|
msts[gtid()] = rctx;
|
|
}
|
|
"""
|
|
|
|
@classmethod
|
|
def test_mwc(cls, rounds=5000, nblocks=64, blockwidth=512):
|
|
import pycuda.driver as cuda
|
|
from pycuda.compiler import SourceModule
|
|
import time
|
|
|
|
nthreads = blockwidth * nblocks
|
|
seeds = MWC.make_seeds(nthreads, host_seed = 5)
|
|
dseeds = cuda.to_device(seeds)
|
|
|
|
mod = SourceModule(assemble_code(BaseCode, MWC, cls))
|
|
|
|
for trial in range(2):
|
|
print "Trial %d, on CPU: " % trial,
|
|
sums = np.zeros(nthreads, dtype=np.uint64)
|
|
ctime = time.time()
|
|
mults = seeds[0].astype(np.uint64)
|
|
states = seeds[1]
|
|
carries = seeds[2]
|
|
for i in range(rounds):
|
|
step = np.frombuffer((mults * states + carries).data,
|
|
dtype=np.uint32).reshape((2, nthreads), order='F')
|
|
states[:] = step[0]
|
|
carries[:] = step[1]
|
|
sums += states
|
|
|
|
ctime = time.time() - ctime
|
|
print "Took %g seconds." % ctime
|
|
|
|
print "Trial %d, on device: " % trial,
|
|
dsums = cuda.mem_alloc(8*nthreads)
|
|
fun = mod.get_function("test_mwc")
|
|
dtime = fun(dseeds, dsums, np.float32(rounds),
|
|
block=(blockwidth,1,1), grid=(nblocks,1),
|
|
time_kernel=True)
|
|
print "Took %g seconds." % dtime
|
|
dsums = cuda.from_device(dsums, nthreads, np.uint64)
|
|
if not np.all(np.equal(sums, dsums)):
|
|
print "Sum discrepancy!"
|
|
print sums
|
|
print dsums
|
|
|