mirror of
				https://github.com/stevenrobertson/cuburn.git
				synced 2025-10-31 09:20:44 -04:00 
			
		
		
		
	
		
			
				
	
	
		
			123 lines
		
	
	
		
			3.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			123 lines
		
	
	
		
			3.8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| """
 | |
| The multiply-with-carry random number generator.
 | |
| """
 | |
| 
 | |
| import numpy as np
 | |
| 
 | |
| from cuburn.code.util import *
 | |
| 
 | |
| class MWC(HunkOCode):
 | |
|     decls = """
 | |
| typedef struct {
 | |
|     uint32_t    mul;
 | |
|     uint32_t    state;
 | |
|     uint32_t    carry;
 | |
| } mwc_st;
 | |
| """
 | |
| 
 | |
|     defs = r"""
 | |
| __device__ uint32_t mwc_next(mwc_st &st) {
 | |
|     asm("{\n\t.reg .u64 val;\n\t"
 | |
|         "cvt.u64.u32  val, %0;\n\t"
 | |
|         "mad.wide.u32 val, %1, %2, val;\n\t"
 | |
|         "mov.b64 {%1, %0}, val;\n\t}\n\t"
 | |
|         : "+r"(st.carry), "+r"(st.state) : "r"(st.mul));
 | |
|     return st.state;
 | |
| }
 | |
| 
 | |
| __device__ float mwc_next_01(mwc_st &st) {
 | |
|     return mwc_next(st) * (1.0f / 4294967296.0f);
 | |
| }
 | |
| 
 | |
| __device__ float mwc_next_11(mwc_st &st) {
 | |
|     uint32_t val = mwc_next(st);
 | |
|     float ret;
 | |
|     asm("cvt.rn.f32.s32 %0,     %1;\n\t"
 | |
|         "mul.f32        %0,     %0,     (1.0 / 2147483648.0);"
 | |
|         : "=f"(ret) : "r"(val));
 | |
|     return ret;
 | |
| }
 | |
| """
 | |
| 
 | |
|     @staticmethod
 | |
|     def make_seeds(nthreads, host_seed=None):
 | |
|         if host_seed:
 | |
|             rand = np.random.RandomState(host_seed)
 | |
|         else:
 | |
|             rand = np.random
 | |
| 
 | |
|         # Load raw big-endian u32 multipliers from primes.bin.
 | |
|         with open('primes.bin') as primefp:
 | |
|             dt = np.dtype(np.uint32).newbyteorder('B')
 | |
|             mults = np.frombuffer(primefp.read(), dtype=dt)
 | |
| 
 | |
|         # Create the seed structures. TODO: check that struct is 4-byte aligned
 | |
|         seeds = np.empty((3, nthreads), dtype=np.uint32, order='F')
 | |
| 
 | |
|         # Randomness in choosing multipliers is good, but larger multipliers
 | |
|         # have longer periods, which is also good. This is a compromise.
 | |
|         mults = np.array(mults[:nthreads*4])
 | |
|         rand.shuffle(mults)
 | |
|         seeds[0][:] = mults[:nthreads]
 | |
| 
 | |
|         # Intentionally excludes both 0 and (2^32-1), as they can lead to
 | |
|         # degenerate sequences of period 0
 | |
|         seeds[1] = rand.randint(1, 0xffffffff, size=nthreads)
 | |
|         seeds[2] = rand.randint(1, 0xffffffff, size=nthreads)
 | |
| 
 | |
|         return seeds
 | |
| 
 | |
| class MWCTest(HunkOCode):
 | |
|     defs = """
 | |
| __global__ void test_mwc(mwc_st *msts, uint64_t *sums, float nrounds) {
 | |
|     mwc_st rctx = msts[gtid()];
 | |
|     uint64_t sum = 0;
 | |
|     for (float i = 0; i < nrounds; i++) sum += mwc_next(rctx);
 | |
|     sums[gtid()] = sum;
 | |
|     msts[gtid()] = rctx;
 | |
| }
 | |
| """
 | |
| 
 | |
|     @classmethod
 | |
|     def test_mwc(cls, rounds=5000, nblocks=64, blockwidth=512):
 | |
|         import pycuda.driver as cuda
 | |
|         from pycuda.compiler import SourceModule
 | |
|         import time
 | |
| 
 | |
|         nthreads = blockwidth * nblocks
 | |
|         seeds = MWC.make_seeds(nthreads, host_seed = 5)
 | |
|         dseeds = cuda.to_device(seeds)
 | |
| 
 | |
|         mod = SourceModule(assemble_code(BaseCode, MWC, cls))
 | |
| 
 | |
|         for trial in range(2):
 | |
|             print "Trial %d, on CPU: " % trial,
 | |
|             sums = np.zeros(nthreads, dtype=np.uint64)
 | |
|             ctime = time.time()
 | |
|             mults = seeds[0].astype(np.uint64)
 | |
|             states = seeds[1]
 | |
|             carries = seeds[2]
 | |
|             for i in range(rounds):
 | |
|                 step = np.frombuffer((mults * states + carries).data,
 | |
|                            dtype=np.uint32).reshape((2, nthreads), order='F')
 | |
|                 states[:] = step[0]
 | |
|                 carries[:] = step[1]
 | |
|                 sums += states
 | |
| 
 | |
|             ctime = time.time() - ctime
 | |
|             print "Took %g seconds." % ctime
 | |
| 
 | |
|             print "Trial %d, on device: " % trial,
 | |
|             dsums = cuda.mem_alloc(8*nthreads)
 | |
|             fun = mod.get_function("test_mwc")
 | |
|             dtime = fun(dseeds, dsums, np.float32(rounds),
 | |
|                         block=(blockwidth,1,1), grid=(nblocks,1),
 | |
|                         time_kernel=True)
 | |
|             print "Took %g seconds." % dtime
 | |
|             dsums = cuda.from_device(dsums, nthreads, np.uint64)
 | |
|             if not np.all(np.equal(sums, dsums)):
 | |
|                 print "Sum discrepancy!"
 | |
|                 print sums
 | |
|                 print dsums
 | |
| 
 | 
