RNG, again. Hooray.

2025-11-04 02:10:45 -05:00 · 2011-04-29 11:00:18 -04:00
parent bd1a943914
commit fe6367821f
4 changed files with 133 additions and 602 deletions
--- a/cuburn/code/init.py
+++ b/cuburn/code/init.py
@ -0,0 +1,19 @@
+"""
+Contains the PTX fragments which will drive the device.
+"""
+
+# Basic headers, utility functions, and so on
+base = """
+#include<cuda.h>
+#include<stdint.h>
+
+// TODO: use launch parameter preconfig to eliminate unnecessary parts
+__device__
+uint32_t gtid() {
+    return threadIdx.x + blockDim.x *
+            (threadIdx.y + blockDim.y *
+                (threadIdx.z + blockDim.z *
+                    (blockIdx.x + (gridDim.x * blockIdx.y))));
+}
+"""
+
--- a/cuburn/code/mwc.py
+++ b/cuburn/code/mwc.py
@ -0,0 +1,107 @@
+"""
+The multiply-with-carry random number generator.
+"""
+
+import time
+
+import pycuda.driver as cuda
+from pycuda.compiler import SourceModule
+import numpy as np
+
+from jinja2 import Template
+
+from cuburn import code
+
+src = r"""
+typedef struct {
+    uint32_t    mul;
+    uint32_t    state;
+    uint32_t    carry;
+} mwc_st;
+
+__device__ uint32_t mwc_next(mwc_st *st) {
+    asm(".reg .u64 val;\n\t"
+        "cvt.u64.u32  val, %0;\n\t"
+        "mad.wide.u32 val, %1, %2, val;\n\t"
+        "mov.b64 {%1, %0}, val;\n\t"
+        : "=r"(st->carry), "=r"(st->state) : "r"(st->mul));
+    return st->state;
+}
+"""
+
+testsrc = code.base + src + """
+__global__ void test_mwc(mwc_st *msts, uint64_t *sums, float nrounds) {
+    mwc_st rctx = msts[gtid()];
+    uint64_t sum = 0;
+    for (float i = 0; i < nrounds; i++) sum += mwc_next(&rctx);
+    sums[gtid()] = sum;
+    msts[gtid()] = rctx;
+}
+"""
+
+def build_mwc_seeds(nthreads, seed=None):
+    if seed:
+        rand = np.random.RandomState(seed)
+    else:
+        rand = np.random
+
+    # Load raw big-endian u32 multipliers from primes.bin.
+    with open('primes.bin') as primefp:
+        dt = np.dtype(np.uint32).newbyteorder('B')
+        mults = np.frombuffer(primefp.read(), dtype=dt)
+
+    # Create the seed structures. TODO: check that struct is 4-byte aligned
+    seeds = np.empty((3, nthreads), dtype=np.uint32, order='F')
+
+    # Randomness in choosing multipliers is good, but larger multipliers
+    # have longer periods, which is also good. This is a compromise.
+    mults = np.array(mults[:nthreads*4])
+    rand.shuffle(mults)
+    seeds[0][:] = mults[:nthreads]
+
+    # Intentionally excludes both 0 and (2^32-1), as they can lead to
+    # degenerate sequences of period 0
+    seeds[1] = rand.randint(1, 0xffffffff, size=nthreads)
+    seeds[2] = rand.randint(1, 0xffffffff, size=nthreads)
+
+    return seeds
+
+def test_mwc():
+    rounds = 5000
+    nblocks = 64
+    nthreads = 512 * nblocks
+
+    seeds = build_mwc_seeds(nthreads, seed = 5)
+    dseeds = cuda.to_device(seeds)
+
+    mod = SourceModule(testsrc)
+
+    for trial in range(2):
+        print "Trial %d, on CPU: " % trial,
+        sums = np.zeros(nthreads, dtype=np.uint64)
+        ctime = time.time()
+        mults = seeds[0].astype(np.uint64)
+        states = seeds[1]
+        carries = seeds[2]
+        for i in range(rounds):
+            step = np.frombuffer((mults * states + carries).data,
+                       dtype=np.uint32).reshape((2, nthreads), order='F')
+            states[:] = step[0]
+            carries[:] = step[1]
+            sums += states
+
+        ctime = time.time() - ctime
+        print "Took %g seconds." % ctime
+
+        print "Trial %d, on device: " % trial,
+        dsums = cuda.mem_alloc(8*nthreads)
+        fun = mod.get_function("test_mwc")
+        dtime = fun(dseeds, dsums, np.float32(rounds),
+                    block=(512,1,1), grid=(nblocks,1), time_kernel=True)
+        print "Took %g seconds." % dtime
+        dsums = cuda.from_device(dsums, nthreads, np.uint64)
+        if not np.all(np.equal(sums, dsums)):
+            print "Sum discrepancy!"
+            print sums
+            print dsums
+