mirror of
				https://github.com/stevenrobertson/cuburn.git
				synced 2025-10-30 17:00:48 -04:00 
			
		
		
		
	Known broken checkin because I'm nervous.
This commit is contained in:
		| @ -0,0 +1,32 @@ | ||||
|  | ||||
| from collections import namedtuple | ||||
|  | ||||
| Flag = namedtuple('Flag', 'level desc') | ||||
|  | ||||
| class DebugSettings(object): | ||||
|     """ | ||||
|     Container for default debug settings. | ||||
|     """ | ||||
|     def __init__(self, items): | ||||
|         self.items = items | ||||
|         self.values = {} | ||||
|         self.level = 1 | ||||
|     def __getattr__(self, name): | ||||
|         if name not in self.items: | ||||
|             raise KeyError("Unknown debug flag name!") | ||||
|         if name in self.values: | ||||
|             return self.values[name] | ||||
|         return (self.items[name].level <= self.level) | ||||
|     def format_help(self): | ||||
|         name_len = min(30, max(map(len, self.items.keys()))) | ||||
|         fmt = '%-' + name_len + 's %d %s' | ||||
|         return '\n'.join([fmt % (k, v.level, v.desc) | ||||
|                           for k, v in self.items.items()]) | ||||
|  | ||||
| debug_flags = dict( | ||||
|     count_writes = Flag(3,  "Count the number of points written per thread " | ||||
|                             "when doing iterations."), | ||||
|     count_rounds = Flag(3,  "Count the number of times the iteration loop " | ||||
|                             "runs per thread when doing iterations.") | ||||
|     ) | ||||
|  | ||||
|  | ||||
| @ -1,39 +1,48 @@ | ||||
| # These imports are order-sensitive! | ||||
| import pyglet | ||||
| import pyglet.gl as gl | ||||
| gl.get_current_context() | ||||
| #import pyglet | ||||
| #import pyglet.gl as gl | ||||
| #gl.get_current_context() | ||||
|  | ||||
| import pycuda.driver as cuda | ||||
| from pycuda.compiler import SourceModule | ||||
| import pycuda.tools | ||||
| import pycuda.gl as cudagl | ||||
| import pycuda.gl.autoinit | ||||
| #import pycuda.gl as cudagl | ||||
| #import pycuda.gl.autoinit | ||||
| import pycuda.autoinit | ||||
|  | ||||
| import numpy as np | ||||
|  | ||||
| from cuburn.ptx import PTXModule, PTXTest, PTXTestFailure | ||||
| from cuburn.ptx import PTXFormatter | ||||
|  | ||||
| class Module(object): | ||||
|     def __init__(self, entries): | ||||
|         self.entries = entries | ||||
|         self.source = self.compile(entries) | ||||
|         self.mod = self.assemble(self.source) | ||||
|  | ||||
|     @staticmethod | ||||
|     def compile(entries): | ||||
|         formatter = PTXFormatter() | ||||
|         for entry in entries: | ||||
|             entry.format_source(formatter) | ||||
|         return formatter.get_source() | ||||
|  | ||||
|     def assemble(self, src): | ||||
|         # TODO: make this a debugging option | ||||
|         with open('/tmp/cuburn.ptx', 'w') as f: f.write(src) | ||||
|         try: | ||||
|             mod = cuda.module_from_buffer(src, | ||||
|                 [(cuda.jit_option.OPTIMIZATION_LEVEL, 0), | ||||
|                  (cuda.jit_option.TARGET_FROM_CUCONTEXT, 1)]) | ||||
|         except (cuda.CompileError, cuda.RuntimeError), e: | ||||
|             # TODO: if output not written above, print different message | ||||
|             # TODO: read assembler output and recover Python source lines | ||||
|             print "Compile error. Source is at /tmp/cuburn.ptx" | ||||
|             print e | ||||
|             raise e | ||||
|         return mod | ||||
|  | ||||
| class LaunchContext(object): | ||||
|     """ | ||||
|     Context collecting the information needed to create, run, and gather the | ||||
|     results of a device computation. This may eventually also include an actual | ||||
|     CUDA context, but for now it just uses the global one. | ||||
|  | ||||
|     To create the fastest device code across multiple device families, this | ||||
|     context may decide to iteratively refine the final PTX by regenerating | ||||
|     and recompiling it several times to optimize certain parameters of the | ||||
|     launch, such as the distribution of threads throughout the device. | ||||
|     The properties of this device which are tuned are listed below. Any PTX | ||||
|     fragments which use this information must emit valid PTX for any state | ||||
|     given below, but the PTX is only required to actually run with the final, | ||||
|     fixed values of all tuned parameters below. | ||||
|  | ||||
|         `block`:    3-tuple of (x,y,z); dimensions of each CTA. | ||||
|         `grid`:     2-tuple of (x,y); dimensions of the grid of CTAs. | ||||
|         `nthreads`: Number of active threads on device as a whole. | ||||
|         `mod`:      Final compiled module. Unavailable during assembly. | ||||
|  | ||||
|     """ | ||||
|     def __init__(self, entries, block=(1,1,1), grid=(1,1), tests=False): | ||||
|         self.entry_types = entries | ||||
|         self.block, self.grid, self.build_tests = block, grid, tests | ||||
| @ -60,18 +69,6 @@ class LaunchContext(object): | ||||
|         kwargs['ctx'] = self | ||||
|         self.ptx = PTXModule(self.entry_types, kwargs, self.build_tests) | ||||
|         # TODO: make this optional and let user choose path | ||||
|         with open('/tmp/cuburn.ptx', 'w') as f: f.write(self.ptx.source) | ||||
|         try: | ||||
|             # TODO: detect/customize arch, code; verbose setting; | ||||
|             # keep directory enable/disable via debug | ||||
|             self.mod = cuda.module_from_buffer(self.ptx.source, | ||||
|                 [(cuda.jit_option.OPTIMIZATION_LEVEL, 0), | ||||
|                  (cuda.jit_option.TARGET_FROM_CUCONTEXT, 1)]) | ||||
|         except (cuda.CompileError, cuda.RuntimeError), e: | ||||
|             # TODO: if output not written above, print different message | ||||
|             print "Compile error. Source is at /tmp/cuburn.ptx" | ||||
|             print e | ||||
|             raise e | ||||
|         if verbose: | ||||
|             for entry in self.ptx.entries: | ||||
|                 func = self.mod.get_function(entry.entry_name) | ||||
|  | ||||
| @ -523,175 +523,130 @@ class ShufflePoints(PTXFragment): | ||||
|                 op.bar.sync(bar) | ||||
|                 op.ld.volatile.shared.b32(var, addr(shuf_read)) | ||||
|  | ||||
| class MWCRNG(PTXFragment): | ||||
|     shortname = "mwc" | ||||
|  | ||||
|     def __init__(self): | ||||
|         self.threads_ready = 0 | ||||
| class MWCRNG(object): | ||||
|     def __init__(self, entry, seed=None): | ||||
|         # TODO: install this in data directory or something | ||||
|         if not os.path.isfile('primes.bin'): | ||||
|             raise EnvironmentError('primes.bin not found') | ||||
|         self.threads_ready = 0 | ||||
|         self.mults, self.state = None, None | ||||
|  | ||||
|     @ptx_func | ||||
|     def module_setup(self): | ||||
|         mem.global_.u32('mwc_rng_mults', ctx.nthreads) | ||||
|         mem.global_.u64('mwc_rng_state', ctx.nthreads) | ||||
|         self.entry = entry | ||||
|         entry.add_param('mwc_mults', entry.types.u32) | ||||
|         entry.add_param('mwc_states', entry.types.u32) | ||||
|         r, o = entry.regs, entry.ops | ||||
|         with entry.head as e: | ||||
|             #mwc_mult_addr = gtid * 4 + e.params.mwc_mults | ||||
|             gtid = o.mad.lo(e.special.ctaid_x, ctx.threads_per_cta, | ||||
|                             e.special.tid_x) | ||||
|             mwc_mult_addr = o.mad.lo.u32(gtid, 4, e.params.mwc_mults) | ||||
|             r.mwc_mult = o.load.u32(mwc_mult_addr) | ||||
|             mwc_state_addr = o.mad.lo.u32(gtid, 8, e.params.mwc_states) | ||||
|             r.mwc_state, r.mwc_carry = o.load.u64(mwc_state_addr) | ||||
|         with entry.tail as e: | ||||
|             #gtid = e.special.ctaid_x * ctx.threads_per_cta + e.special.tid_x | ||||
|             gtid = o.mad.lo(e.special.ctaid_x, ctx.threads_per_cta, | ||||
|                             e.special.tid_x) | ||||
|             mwc_state_addr = o.mad.lo.u32(gtid, 8, e.params.mwc_states) | ||||
|             o.store.v2(mwc_state_addr, (r.mwc_state, r.mwc_carry)) | ||||
|  | ||||
|     @ptx_func | ||||
|     def entry_setup(self): | ||||
|         reg.u32('mwc_st mwc_mult mwc_car') | ||||
|         with block('Load MWC multipliers and states'): | ||||
|             reg.u32('mwc_off mwc_addr') | ||||
|             std.get_gtid(mwc_off) | ||||
|             op.mov.u32(mwc_addr, mwc_rng_mults) | ||||
|             op.mad.lo.u32(mwc_addr, mwc_off, 4, mwc_addr) | ||||
|             op.ld.global_.u32(mwc_mult, addr(mwc_addr)) | ||||
|     def next_b32(self): | ||||
|         e, r, o = self.entry, self.entry.regs, self.entry.ops | ||||
|         mwc_out = o.cvt.u64(r.mwc_carry) | ||||
|         mwc_out = o.mad.wide.u32(r.mwc_mult, r.mwc_state, mwc_out) | ||||
|         r.mwc_state, r.mwc_carry = o.mov(mwc_out) | ||||
|         return r.mwc_state | ||||
|  | ||||
|             op.mov.u32(mwc_addr, mwc_rng_state) | ||||
|             op.mad.lo.u32(mwc_addr, mwc_off, 8, mwc_addr) | ||||
|             op.ld.global_.v2.u32(vec(mwc_st, mwc_car), addr(mwc_addr)) | ||||
|     def next_f32_01(self): | ||||
|         e, r, o = self.entry, self.entry.regs, self.entry.ops | ||||
|         mwc_float = o.cvt.rn.f32.u32(self.next_b32()) | ||||
|         # TODO: check the precision on the uploaded types here | ||||
|         return o.mul.f32(mwc_float, 1./(1<<32)) | ||||
|  | ||||
|     @ptx_func | ||||
|     def entry_teardown(self): | ||||
|         with block('Save MWC states'): | ||||
|             reg.u32('mwc_off mwc_addr') | ||||
|             std.get_gtid(mwc_off) | ||||
|             op.mov.u32(mwc_addr, mwc_rng_state) | ||||
|             op.mad.lo.u32(mwc_addr, mwc_off, 8, mwc_addr) | ||||
|             op.st.global_.v2.u32(addr(mwc_addr), vec(mwc_st, mwc_car)) | ||||
|     def next_f32_11(self): | ||||
|         e, r, o = self.entry, self.entry.regs, self.entry.ops | ||||
|         mwc_float = o.cvt.rn.f32.s32(self.next_b32()) | ||||
|         return o.mul.f32(mwc_float, 1./(1<<31)) | ||||
|  | ||||
|     @ptx_func | ||||
|     def _next(self): | ||||
|         # Call from inside a block! | ||||
|         reg.u64('mwc_out') | ||||
|         op.cvt.u64.u32(mwc_out, mwc_car) | ||||
|         op.mad.wide.u32(mwc_out, mwc_st, mwc_mult, mwc_out) | ||||
|         op.mov.b64(vec(mwc_st, mwc_car), mwc_out) | ||||
|  | ||||
|     @ptx_func | ||||
|     def next_b32(self, dst_reg): | ||||
|         with block('Load next random u32 into ' + dst_reg.name): | ||||
|             self._next() | ||||
|             op.mov.u32(dst_reg, mwc_st) | ||||
|  | ||||
|     @ptx_func | ||||
|     def next_f32_01(self, dst_reg): | ||||
|         # TODO: verify that this is the fastest-performance method | ||||
|         # TODO: verify that this actually does what I think it does | ||||
|         with block('Load random float [0,1] into ' + dst_reg.name): | ||||
|             self._next() | ||||
|             op.cvt.rn.f32.u32(dst_reg, mwc_st) | ||||
|             op.mul.f32(dst_reg, dst_reg, '0f2F800000') # 1./(1<<32) | ||||
|  | ||||
|     @ptx_func | ||||
|     def next_f32_11(self, dst_reg): | ||||
|         with block('Load random float [-1,1) into ' + dst_reg.name): | ||||
|             reg.u32('mwc_to_float') | ||||
|             self._next() | ||||
|             op.cvt.rn.f32.s32(dst_reg, mwc_st) | ||||
|             op.mul.f32(dst_reg, dst_reg, '0f30000000') # 1./(1<<31) | ||||
|  | ||||
|     @instmethod | ||||
|     def seed(self, ctx, rand=np.random): | ||||
|     def call_setup(self, ctx, force=False): | ||||
|         """ | ||||
|         Seed the random number generators with values taken from a | ||||
|         ``np.random`` instance. | ||||
|         """ | ||||
|         # Load raw big-endian u32 multipliers from primes.bin. | ||||
|         with open('primes.bin') as primefp: | ||||
|             dt = np.dtype(np.uint32).newbyteorder('B') | ||||
|             mults = np.frombuffer(primefp.read(), dtype=dt) | ||||
|         stream = cuda.Stream() | ||||
|         # Randomness in choosing multipliers is good, but larger multipliers | ||||
|         # have longer periods, which is also good. This is a compromise. | ||||
|         mults = np.array(mults[:ctx.nthreads*4]) | ||||
|         rand.shuffle(mults) | ||||
|         # Copy multipliers and seeds to the device | ||||
|         multdp, multl = ctx.mod.get_global('mwc_rng_mults') | ||||
|         cuda.memcpy_htod(multdp, mults.tostring()[:multl]) | ||||
|         # Intentionally excludes both 0 and (2^32-1), as they can lead to | ||||
|         # degenerate sequences of period 0 | ||||
|         states = np.array(rand.randint(1, 0xffffffff, size=2*ctx.nthreads), | ||||
|                           dtype=np.uint32) | ||||
|         statedp, statel = ctx.mod.get_global('mwc_rng_state') | ||||
|         cuda.memcpy_htod(statedp, states.tostring()) | ||||
|         self.threads_ready = ctx.nthreads | ||||
|         if force or self.nthreads_ready < ctx.nthreads: | ||||
|             # Load raw big-endian u32 multipliers from primes.bin. | ||||
|             with open('primes.bin') as primefp: | ||||
|                 dt = np.dtype(np.uint32).newbyteorder('B') | ||||
|                 mults = np.frombuffer(primefp.read(), dtype=dt) | ||||
|             # Randomness in choosing multipliers is good, but larger multipliers | ||||
|             # have longer periods, which is also good. This is a compromise. | ||||
|             mults = np.array(mults[:ctx.nthreads*4]) | ||||
|             rand.shuffle(mults) | ||||
|             locked_mults = ctx.hostpool.allocate(ctx.nthreads, np.uint32) | ||||
|             locked_mults[:] = mults[ctx.nthreads] | ||||
|             self.mults = ctx.pool.allocate(4*ctx.nthreads) | ||||
|             cuda.memcpy_htod_async(self.mults, locked_mults.base, ctx.stream) | ||||
|             # Intentionally excludes both 0 and (2^32-1), as they can lead to | ||||
|             # degenerate sequences of period 0 | ||||
|             states = np.array(rand.randint(1, 0xffffffff, size=2*ctx.nthreads), | ||||
|                               dtype=np.uint32) | ||||
|             locked_states = ctx.hostpool.allocate(2*ctx.nthreads, np.uint32) | ||||
|             locked_states[:] = states | ||||
|             self.states = ctx.pool.allocate(8*ctx.nthreads) | ||||
|             cuda.memcpy_htod_async(self.states, locked_states, ctx.stream) | ||||
|             self.nthreads_ready = ctx.nthreads | ||||
|         ctx.set_param('mwc_mults', self.mults) | ||||
|         ctx.set_param('mwc_states', self.states) | ||||
|  | ||||
|     def call_setup(self, ctx): | ||||
|         if self.threads_ready < ctx.nthreads: | ||||
|             self.seed(ctx) | ||||
|  | ||||
|     def tests(self): | ||||
|         return [MWCRNGTest, MWCRNGFloatsTest] | ||||
|  | ||||
| class MWCRNGTest(PTXTest): | ||||
|     name = "MWC RNG sum-of-threads" | ||||
| class MWCRNGTest(PTXEntry): | ||||
|     rounds = 5000 | ||||
|     entry_name = 'MWC_RNG_test' | ||||
|     entry_params = '' | ||||
|  | ||||
|     def deps(self): | ||||
|         return [MWCRNG] | ||||
|     def __init__(self, entry): | ||||
|         self.entry = entry | ||||
|         self.mwc = MWCRNG(entry) | ||||
|  | ||||
|     @ptx_func | ||||
|     def module_setup(self): | ||||
|         mem.global_.u64('mwc_rng_test_sums', ctx.nthreads) | ||||
|         entry.add_param('mwc_test_sums', entry.types.u32) | ||||
|         with entry.body(): | ||||
|             self.entry_body() | ||||
|  | ||||
|     @ptx_func | ||||
|     def entry(self): | ||||
|         reg.u64('sum addl') | ||||
|         reg.u32('addend') | ||||
|         op.mov.u64(sum, 0) | ||||
|         with block('Sum next %d random numbers' % self.rounds): | ||||
|             reg.u32('loopct') | ||||
|             reg.pred('p') | ||||
|             op.mov.u32(loopct, self.rounds) | ||||
|             label('loopstart') | ||||
|             mwc.next_b32(addend) | ||||
|             op.cvt.u64.u32(addl, addend) | ||||
|             op.add.u64(sum, sum, addl) | ||||
|             op.sub.u32(loopct, loopct, 1) | ||||
|             op.setp.gt.u32(p, loopct, 0) | ||||
|             op.bra.uni(loopstart, ifp=p) | ||||
|     def entry_body(self): | ||||
|         e, r, o = self.entry, self.entry.regs, self.entry.ops | ||||
|  | ||||
|         with block('Store sum and state'): | ||||
|             reg.u32('adr offset') | ||||
|             std.get_gtid(offset) | ||||
|             op.mov.u32(adr, mwc_rng_test_sums) | ||||
|             op.mad.lo.u32(adr, offset, 8, adr) | ||||
|             op.st.global_.u64(addr(adr), sum) | ||||
|         r.sum = 0 | ||||
|         with e.std.loop(self.rounds) as mwc_rng_sum: | ||||
|             addend = o.cvt.u64.u32(self.mwc.next_b32()) | ||||
|             r.sum = o.add.u64(r.sum, addend) | ||||
|  | ||||
|     def call_setup(self, ctx): | ||||
|         # Get current multipliers and seeds from the device | ||||
|         self.mults = ctx.get_per_thread('mwc_rng_mults', np.uint32) | ||||
|         self.fullstates = ctx.get_per_thread('mwc_rng_state', np.uint64) | ||||
|         self.sums = np.zeros(ctx.nthreads, np.uint64) | ||||
|         e.std.store_per_thread(e.params.mwc_test_sums, r.sum) | ||||
|  | ||||
|         print "Running %d states forward %d rounds" % \ | ||||
|               (len(self.mults), self.rounds) | ||||
|         ctime = time.time() | ||||
|         for i in range(self.rounds): | ||||
|             states = self.fullstates & 0xffffffff | ||||
|             carries = self.fullstates >> 32 | ||||
|             self.fullstates = self.mults * states + carries | ||||
|             self.sums += self.fullstates & 0xffffffff | ||||
|         ctime = time.time() - ctime | ||||
|         print "Done on host, took %g seconds" % ctime | ||||
|     def call(self, ctx): | ||||
|         # Generate current state, upload it to GPU | ||||
|         self.mwc.call_setup(ctx, force=True) | ||||
|         mults, fullstates = self.mwc.mults, self.mwc.fullstates | ||||
|         sums = np.zeros_like(fullstates) | ||||
|  | ||||
|     def call_teardown(self, ctx): | ||||
|         dfullstates = ctx.get_per_thread('mwc_rng_state', np.uint64) | ||||
|         if not (dfullstates == self.fullstates).all(): | ||||
|             print "State discrepancy" | ||||
|             print dfullstates | ||||
|             print self.fullstates | ||||
|             raise PTXTestFailure("MWC RNG state discrepancy") | ||||
|         # Run two trials, to ensure device state is getting saved properly | ||||
|         for trial in range(2): | ||||
|             print "Trial %d, on CPU: " % trial, | ||||
|             ctime = time.time() | ||||
|             for i in range(self.rounds): | ||||
|                 states = fullstates & 0xffffffff | ||||
|                 carries = fullstates >> 32 | ||||
|                 fullstates = self.mults * states + carries | ||||
|                 sums += fullstates & 0xffffffff | ||||
|             ctime = time.time() - ctime | ||||
|             print "Took %g seconds." % ctime | ||||
|  | ||||
|             print "Trial %d, on device: " % trial, | ||||
|             dsums = np.empty_like(sums) | ||||
|             ctx.set_param('mwc_test_sums', cuda.Out(dsums)) | ||||
|             print "Took %g seconds." % ctx.call() | ||||
|  | ||||
|         dsums = ctx.get_per_thread('mwc_rng_test_sums', np.uint64) | ||||
|         if not (dsums == self.sums).all(): | ||||
|             print "Sum discrepancy" | ||||
|             print dsums | ||||
|             print self.sums | ||||
|             raise PTXTestFailure("MWC RNG sum discrepancy") | ||||
|             if not np.all(np.equal(sums, dsums)): | ||||
|                 print "Sum discrepancy!" | ||||
|                 print sums | ||||
|                 print dsums | ||||
|                 raise TODOSomeKindOfException() | ||||
|  | ||||
| class MWCRNGFloatsTest(PTXTest): | ||||
|     """ | ||||
|  | ||||
							
								
								
									
										1594
									
								
								cuburn/ptx.py
									
									
									
									
									
								
							
							
						
						
									
										1594
									
								
								cuburn/ptx.py
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
		Reference in New Issue
	
	Block a user
	 Steven Robertson
					Steven Robertson