Known broken checkin because I'm nervous.

2025-07-14 12:15:18 -04:00 · 2010-10-01 01:20:20 -04:00
parent b938c320a8
commit c0e3c1d599
5 changed files with 786 additions and 1164 deletions
--- a/cuburn/init.py
+++ b/cuburn/init.py
@ -0,0 +1,32 @@
 from collections import namedtuple
 # One debug flag: `level` is compared against DebugSettings.level to decide
 # whether the flag is on by default; `desc` is the text shown by format_help().
 Flag = namedtuple('Flag', 'level desc')
 class DebugSettings(object):
    """
    Container for default debug settings.

    ``items`` maps a flag name to an object with ``level`` and ``desc``
    attributes (such as ``Flag``). Reading ``settings.<name>`` returns the
    explicit override stored in ``settings.values`` if there is one;
    otherwise the flag is considered enabled when its level is less than or
    equal to ``settings.level``.
    """

    class UnknownFlag(AttributeError, KeyError):
        """
        Raised when an attribute is not a known flag name.

        Derives from AttributeError so that hasattr(), getattr() with a
        default, copy and pickle behave normally, and from KeyError so that
        existing ``except KeyError`` handlers keep working.
        """

    def __init__(self, items):
        self.items = items
        self.values = {}
        self.level = 1

    def __getattr__(self, name):
        """
        Return the effective boolean state of the debug flag ``name``.

        Raises ``UnknownFlag`` (both an AttributeError and a KeyError) when
        ``name`` is not a key of ``items``.
        """
        # __getattr__ only runs after normal lookup has failed. Fetch `items`
        # straight from the instance dict: on an instance whose __init__ has
        # not run yet (copy/unpickle), `self.items` would re-enter this
        # method and recurse without bound.
        items = self.__dict__.get('items', {})
        if name not in items:
            raise self.UnknownFlag("Unknown debug flag name! (%s)" % name)
        if name in self.values:
            return self.values[name]
        return items[name].level <= self.level

    def format_help(self):
        """
        Return one ``name level description`` line per flag, sorted by name.

        The name column is padded to the longest flag name, capped at 30
        characters. Returns an empty string when there are no flags.
        """
        if not self.items:
            return ''
        name_len = min(30, max(len(name) for name in self.items))
        # Build the row format with %-interpolation; the width is an int and
        # cannot be concatenated onto a str.
        fmt = '%%-%ds %%d %%s' % name_len
        return '\n'.join(fmt % (name, flag.level, flag.desc)
                         for name, flag in sorted(self.items.items()))
 # Registry of the known debug flags, keyed by flag name.
 debug_flags = {
    'count_writes': Flag(
        3,
        "Count the number of points written per thread "
        "when doing iterations."),
    'count_rounds': Flag(
        3,
        "Count the number of times the iteration loop "
        "runs per thread when doing iterations."),
    }
--- a/cuburn/cuda.py
+++ b/cuburn/cuda.py
@ -1,39 +1,48 @@
 # These imports are order-sensitive!
-import pyglet
+#import pyglet
-import pyglet.gl as gl
+#import pyglet.gl as gl
-gl.get_current_context()
+#gl.get_current_context()
 import pycuda.driver as cuda
 from pycuda.compiler import SourceModule
 import pycuda.tools
-import pycuda.gl as cudagl
+#import pycuda.gl as cudagl
-import pycuda.gl.autoinit
+#import pycuda.gl.autoinit
 import pycuda.autoinit
 import numpy as np
-from cuburn.ptx import PTXModule, PTXTest, PTXTestFailure
+from cuburn.ptx import PTXFormatter
 class Module(object):
    """
    Device module built from a sequence of PTX entry points.

    Construction formats every entry into a single PTX source string
    (kept as ``source``) and hands it to the CUDA driver for assembly
    (the result is kept as ``mod``). ``entries`` is stored unchanged.
    """

    # File the generated PTX is written to before assembly, so it can be
    # inspected when assembly fails. Override on a subclass or instance.
    # TODO: make this a debugging option
    ptx_dump_path = '/tmp/cuburn.ptx'

    def __init__(self, entries):
        self.entries = entries
        self.source = self.compile(entries)
        self.mod = self.assemble(self.source)

    @staticmethod
    def compile(entries):
        """
        Return the PTX source produced by formatting each of ``entries``.

        Every entry's ``format_source`` is called in order with one shared
        ``PTXFormatter``; the formatter's accumulated source is returned.
        """
        formatter = PTXFormatter()
        for entry in entries:
            entry.format_source(formatter)
        return formatter.get_source()

    def assemble(self, src):
        """
        Write ``src`` to ``ptx_dump_path`` and load it as a CUDA module.

        Returns the object produced by ``cuda.module_from_buffer``. On
        ``cuda.CompileError`` or ``cuda.RuntimeError`` a short message and
        the error are printed, then the exception is re-raised.
        """
        with open(self.ptx_dump_path, 'w') as f:
            f.write(src)
        try:
            return cuda.module_from_buffer(src,
                [(cuda.jit_option.OPTIMIZATION_LEVEL, 0),
                 (cuda.jit_option.TARGET_FROM_CUCONTEXT, 1)])
        except (cuda.CompileError, cuda.RuntimeError) as e:
            # TODO: if output not written above, print different message
            # TODO: read assembler output and recover Python source lines
            # Single-argument print(...) behaves the same as a statement
            # (Python 2) and as a function (Python 3).
            print("Compile error. Source is at %s" % self.ptx_dump_path)
            print(e)
            # Bare raise keeps the original traceback; `raise e` would
            # restart it at this line under Python 2.
            raise
 class LaunchContext(object):
    """
    Context collecting the information needed to create, run, and gather the
    results of a device computation. This may eventually also include an actual
    CUDA context, but for now it just uses the global one.
    To create the fastest device code across multiple device families, this
    context may decide to iteratively refine the final PTX by regenerating
    and recompiling it several times to optimize certain parameters of the
    launch, such as the distribution of threads throughout the device.
    The properties of this device which are tuned are listed below. Any PTX
    fragments which use this information must emit valid PTX for any state
    given below, but the PTX is only required to actually run with the final,
    fixed values of all tuned parameters below.
        `block`:    3-tuple of (x,y,z); dimensions of each CTA.
        `grid`:     2-tuple of (x,y); dimensions of the grid of CTAs.
        `nthreads`: Number of active threads on device as a whole.
        `mod`:      Final compiled module. Unavailable during assembly.
    """
    def __init__(self, entries, block=(1,1,1), grid=(1,1), tests=False):
        self.entry_types = entries
        self.block, self.grid, self.build_tests = block, grid, tests
@ -60,18 +69,6 @@ class LaunchContext(object):
        kwargs['ctx'] = self
        self.ptx = PTXModule(self.entry_types, kwargs, self.build_tests)
        # TODO: make this optional and let user choose path
        with open('/tmp/cuburn.ptx', 'w') as f: f.write(self.ptx.source)
        try:
            # TODO: detect/customize arch, code; verbose setting;
            # keep directory enable/disable via debug
            self.mod = cuda.module_from_buffer(self.ptx.source,
                [(cuda.jit_option.OPTIMIZATION_LEVEL, 0),
                 (cuda.jit_option.TARGET_FROM_CUCONTEXT, 1)])
        except (cuda.CompileError, cuda.RuntimeError), e:
            # TODO: if output not written above, print different message
            print "Compile error. Source is at /tmp/cuburn.ptx"
            print e
            raise e
        if verbose:
            for entry in self.ptx.entries:
                func = self.mod.get_function(entry.entry_name)
--- a/cuburn/device_code.py
+++ b/cuburn/device_code.py
@ -523,175 +523,130 @@ class ShufflePoints(PTXFragment):
                op.bar.sync(bar)
                op.ld.volatile.shared.b32(var, addr(shuf_read))
-class MWCRNG(PTXFragment):
+class MWCRNG(object):
-    shortname = "mwc"
+    def __init__(self, entry, seed=None):
-
+        # TODO: install this in data directory or something
    def __init__(self):
        self.threads_ready = 0
        if not os.path.isfile('primes.bin'):
            raise EnvironmentError('primes.bin not found')
        self.threads_ready = 0
        self.mults, self.state = None, None
-    @ptx_func
+        self.entry = entry
-    def module_setup(self):
+        entry.add_param('mwc_mults', entry.types.u32)
-        mem.global_.u32('mwc_rng_mults', ctx.nthreads)
+        entry.add_param('mwc_states', entry.types.u32)
-        mem.global_.u64('mwc_rng_state', ctx.nthreads)
+        r, o = entry.regs, entry.ops
        with entry.head as e:
            #mwc_mult_addr = gtid * 4 + e.params.mwc_mults
            gtid = o.mad.lo(e.special.ctaid_x, ctx.threads_per_cta,
                            e.special.tid_x)
            mwc_mult_addr = o.mad.lo.u32(gtid, 4, e.params.mwc_mults)
            r.mwc_mult = o.load.u32(mwc_mult_addr)
            mwc_state_addr = o.mad.lo.u32(gtid, 8, e.params.mwc_states)
            r.mwc_state, r.mwc_carry = o.load.u64(mwc_state_addr)
        with entry.tail as e:
            #gtid = e.special.ctaid_x * ctx.threads_per_cta + e.special.tid_x
            gtid = o.mad.lo(e.special.ctaid_x, ctx.threads_per_cta,
                            e.special.tid_x)
            mwc_state_addr = o.mad.lo.u32(gtid, 8, e.params.mwc_states)
            o.store.v2(mwc_state_addr, (r.mwc_state, r.mwc_carry))
-    @ptx_func
+    def next_b32(self):
-    def entry_setup(self):
+        e, r, o = self.entry, self.entry.regs, self.entry.ops
-        reg.u32('mwc_st mwc_mult mwc_car')
+        mwc_out = o.cvt.u64(r.mwc_carry)
-        with block('Load MWC multipliers and states'):
+        mwc_out = o.mad.wide.u32(r.mwc_mult, r.mwc_state, mwc_out)
-            reg.u32('mwc_off mwc_addr')
+        r.mwc_state, r.mwc_carry = o.mov(mwc_out)
-            std.get_gtid(mwc_off)
+        return r.mwc_state
            op.mov.u32(mwc_addr, mwc_rng_mults)
            op.mad.lo.u32(mwc_addr, mwc_off, 4, mwc_addr)
            op.ld.global_.u32(mwc_mult, addr(mwc_addr))
-            op.mov.u32(mwc_addr, mwc_rng_state)
+    def next_f32_01(self):
-            op.mad.lo.u32(mwc_addr, mwc_off, 8, mwc_addr)
+        e, r, o = self.entry, self.entry.regs, self.entry.ops
-            op.ld.global_.v2.u32(vec(mwc_st, mwc_car), addr(mwc_addr))
+        mwc_float = o.cvt.rn.f32.u32(self.next_b32())
        # TODO: check the precision on the uploaded types here
        return o.mul.f32(mwc_float, 1./(1<<32))
-    @ptx_func
+    def next_f32_11(self):
-    def entry_teardown(self):
+        e, r, o = self.entry, self.entry.regs, self.entry.ops
-        with block('Save MWC states'):
+        mwc_float = o.cvt.rn.f32.s32(self.next_b32())
-            reg.u32('mwc_off mwc_addr')
+        return o.mul.f32(mwc_float, 1./(1<<31))
            std.get_gtid(mwc_off)
            op.mov.u32(mwc_addr, mwc_rng_state)
            op.mad.lo.u32(mwc_addr, mwc_off, 8, mwc_addr)
            op.st.global_.v2.u32(addr(mwc_addr), vec(mwc_st, mwc_car))
-    @ptx_func
+    def call_setup(self, ctx, force=False):
    def _next(self):
        # Call from inside a block!
        reg.u64('mwc_out')
        op.cvt.u64.u32(mwc_out, mwc_car)
        op.mad.wide.u32(mwc_out, mwc_st, mwc_mult, mwc_out)
        op.mov.b64(vec(mwc_st, mwc_car), mwc_out)
    @ptx_func
    def next_b32(self, dst_reg):
        with block('Load next random u32 into ' + dst_reg.name):
            self._next()
            op.mov.u32(dst_reg, mwc_st)
    @ptx_func
    def next_f32_01(self, dst_reg):
        # TODO: verify that this is the fastest-performance method
        # TODO: verify that this actually does what I think it does
        with block('Load random float [0,1] into ' + dst_reg.name):
            self._next()
            op.cvt.rn.f32.u32(dst_reg, mwc_st)
            op.mul.f32(dst_reg, dst_reg, '0f2F800000') # 1./(1<<32)
    @ptx_func
    def next_f32_11(self, dst_reg):
        with block('Load random float [-1,1) into ' + dst_reg.name):
            reg.u32('mwc_to_float')
            self._next()
            op.cvt.rn.f32.s32(dst_reg, mwc_st)
            op.mul.f32(dst_reg, dst_reg, '0f30000000') # 1./(1<<31)
    @instmethod
    def seed(self, ctx, rand=np.random):
        """
        Seed the random number generators with values taken from a
        ``np.random`` instance.
        """
        if force or self.nthreads_ready < ctx.nthreads:
            # Load raw big-endian u32 multipliers from primes.bin.
            with open('primes.bin') as primefp:
                dt = np.dtype(np.uint32).newbyteorder('B')
                mults = np.frombuffer(primefp.read(), dtype=dt)
        stream = cuda.Stream()
            # Randomness in choosing multipliers is good, but larger multipliers
            # have longer periods, which is also good. This is a compromise.
            mults = np.array(mults[:ctx.nthreads*4])
            rand.shuffle(mults)
-        # Copy multipliers and seeds to the device
+            locked_mults = ctx.hostpool.allocate(ctx.nthreads, np.uint32)
-        multdp, multl = ctx.mod.get_global('mwc_rng_mults')
+            locked_mults[:] = mults[ctx.nthreads]
-        cuda.memcpy_htod(multdp, mults.tostring()[:multl])
+            self.mults = ctx.pool.allocate(4*ctx.nthreads)
            cuda.memcpy_htod_async(self.mults, locked_mults.base, ctx.stream)
            # Intentionally excludes both 0 and (2^32-1), as they can lead to
            # degenerate sequences of period 0
            states = np.array(rand.randint(1, 0xffffffff, size=2*ctx.nthreads),
                              dtype=np.uint32)
-        statedp, statel = ctx.mod.get_global('mwc_rng_state')
+            locked_states = ctx.hostpool.allocate(2*ctx.nthreads, np.uint32)
-        cuda.memcpy_htod(statedp, states.tostring())
+            locked_states[:] = states
-        self.threads_ready = ctx.nthreads
+            self.states = ctx.pool.allocate(8*ctx.nthreads)
            cuda.memcpy_htod_async(self.states, locked_states, ctx.stream)
            self.nthreads_ready = ctx.nthreads
        ctx.set_param('mwc_mults', self.mults)
        ctx.set_param('mwc_states', self.states)
-    def call_setup(self, ctx):
+class MWCRNGTest(PTXEntry):
        if self.threads_ready < ctx.nthreads:
            self.seed(ctx)
    def tests(self):
        return [MWCRNGTest, MWCRNGFloatsTest]
 class MWCRNGTest(PTXTest):
    name = "MWC RNG sum-of-threads"
    rounds = 5000
    entry_name = 'MWC_RNG_test'
    entry_params = ''
-    def deps(self):
+    def __init__(self, entry):
-        return [MWCRNG]
+        self.entry = entry
        self.mwc = MWCRNG(entry)
-    @ptx_func
+        entry.add_param('mwc_test_sums', entry.types.u32)
-    def module_setup(self):
+        with entry.body():
-        mem.global_.u64('mwc_rng_test_sums', ctx.nthreads)
+            self.entry_body()
-    @ptx_func
+    def entry_body(self):
-    def entry(self):
+        e, r, o = self.entry, self.entry.regs, self.entry.ops
        reg.u64('sum addl')
        reg.u32('addend')
        op.mov.u64(sum, 0)
        with block('Sum next %d random numbers' % self.rounds):
            reg.u32('loopct')
            reg.pred('p')
            op.mov.u32(loopct, self.rounds)
            label('loopstart')
            mwc.next_b32(addend)
            op.cvt.u64.u32(addl, addend)
            op.add.u64(sum, sum, addl)
            op.sub.u32(loopct, loopct, 1)
            op.setp.gt.u32(p, loopct, 0)
            op.bra.uni(loopstart, ifp=p)
-        with block('Store sum and state'):
+        r.sum = 0
-            reg.u32('adr offset')
+        with e.std.loop(self.rounds) as mwc_rng_sum:
-            std.get_gtid(offset)
+            addend = o.cvt.u64.u32(self.mwc.next_b32())
-            op.mov.u32(adr, mwc_rng_test_sums)
+            r.sum = o.add.u64(r.sum, addend)
            op.mad.lo.u32(adr, offset, 8, adr)
            op.st.global_.u64(addr(adr), sum)
-    def call_setup(self, ctx):
+        e.std.store_per_thread(e.params.mwc_test_sums, r.sum)
        # Get current multipliers and seeds from the device
        self.mults = ctx.get_per_thread('mwc_rng_mults', np.uint32)
        self.fullstates = ctx.get_per_thread('mwc_rng_state', np.uint64)
        self.sums = np.zeros(ctx.nthreads, np.uint64)
-        print "Running %d states forward %d rounds" % \
+    def call(self, ctx):
-              (len(self.mults), self.rounds)
+        # Generate current state, upload it to GPU
        self.mwc.call_setup(ctx, force=True)
        mults, fullstates = self.mwc.mults, self.mwc.fullstates
        sums = np.zeros_like(fullstates)
        # Run two trials, to ensure device state is getting saved properly
        for trial in range(2):
            print "Trial %d, on CPU: " % trial,
            ctime = time.time()
            for i in range(self.rounds):
-            states = self.fullstates & 0xffffffff
+                states = fullstates & 0xffffffff
-            carries = self.fullstates >> 32
+                carries = fullstates >> 32
-            self.fullstates = self.mults * states + carries
+                fullstates = self.mults * states + carries
-            self.sums += self.fullstates & 0xffffffff
+                sums += fullstates & 0xffffffff
            ctime = time.time() - ctime
-        print "Done on host, took %g seconds" % ctime
+            print "Took %g seconds." % ctime
-    def call_teardown(self, ctx):
+            print "Trial %d, on device: " % trial,
-        dfullstates = ctx.get_per_thread('mwc_rng_state', np.uint64)
+            dsums = np.empty_like(sums)
-        if not (dfullstates == self.fullstates).all():
+            ctx.set_param('mwc_test_sums', cuda.Out(dsums))
-            print "State discrepancy"
+            print "Took %g seconds." % ctx.call()
            print dfullstates
            print self.fullstates
            raise PTXTestFailure("MWC RNG state discrepancy")
-
+            if not np.all(np.equal(sums, dsums)):
-        dsums = ctx.get_per_thread('mwc_rng_test_sums', np.uint64)
+                print "Sum discrepancy!"
-        if not (dsums == self.sums).all():
+                print sums
            print "Sum discrepancy"
                print dsums
-            print self.sums
+                raise TODOSomeKindOfException()
            raise PTXTestFailure("MWC RNG sum discrepancy")
 class MWCRNGFloatsTest(PTXTest):
    """
--- a/cuburn/ptx.py
+++ b/cuburn/ptx.py
--- a/main.py
+++ b/main.py
@ -16,6 +16,8 @@ from ctypes import *
 import numpy as np
 np.set_printoptions(precision=5, edgeitems=20)
 from cuburn.device_code import *
 from cuburn.cuda import LaunchContext
 from fr0stlib.pyflam3 import *