Known broken checkin because I'm nervous.

2025-08-01 13:05:25 -04:00 · 2010-10-01 01:20:20 -04:00
parent b938c320a8
commit c0e3c1d599
5 changed files with 786 additions and 1164 deletions
--- a/cuburn/init.py
+++ b/cuburn/init.py
@ -0,0 +1,32 @@
+
+from collections import namedtuple
+
+Flag = namedtuple('Flag', 'level desc')
+
+class DebugSettings(object):
+    """
+    Container for default debug settings.
+
+    `items` maps each flag name to a ``Flag(level, desc)``. Reading a flag
+    name as an attribute gives True when the flag's level is at or below
+    `self.level`, unless an explicit value for that name has been stored in
+    `self.values`, in which case that value is returned.
+    """
+    def __init__(self, items):
+        self.items = items
+        self.values = {}
+        self.level = 1
+
+    def __getattr__(self, name):
+        """
+        Return the effective setting for the debug flag `name`.
+
+        Raises ``KeyError`` for names that are not known flags. Raises
+        ``AttributeError`` for ``__dunder__`` names, and for any name when
+        `items` has not been set yet.
+        """
+        # Only reached when normal attribute lookup fails. Without this
+        # guard, a missing `items` would re-enter this method forever, and
+        # protocol probes such as copy's getattr(x, '__deepcopy__', None)
+        # would see a KeyError instead of a missing attribute.
+        is_dunder = name.startswith('__') and name.endswith('__')
+        if is_dunder or 'items' not in self.__dict__:
+            raise AttributeError(name)
+        if name not in self.items:
+            raise KeyError("Unknown debug flag name!")
+        if name in self.values:
+            return self.values[name]
+        return (self.items[name].level <= self.level)
+
+    def format_help(self):
+        """
+        Return one line per flag: the name left-justified to the longest
+        name (capped at 30 columns), then its level and description.
+        Returns an empty string when there are no flags.
+        """
+        if not self.items:
+            return ''
+        name_len = min(30, max(map(len, self.items.keys())))
+        # Interpolate the width; concatenating an int onto a str raises
+        # TypeError.
+        fmt = '%%-%ds %%d %%s' % name_len
+        return '\n'.join([fmt % (k, v.level, v.desc)
+                          for k, v in self.items.items()])
+
+# Table of known debug flags: flag name -> Flag(level, desc).
+debug_flags = {
+    'count_writes': Flag(3, "Count the number of points written per thread "
+                            "when doing iterations."),
+    'count_rounds': Flag(3, "Count the number of times the iteration loop "
+                            "runs per thread when doing iterations."),
+}
+
--- a/cuburn/cuda.py
+++ b/cuburn/cuda.py
@ -1,39 +1,48 @@
 # These imports are order-sensitive!
-import pyglet
-import pyglet.gl as gl
-gl.get_current_context()
+#import pyglet
+#import pyglet.gl as gl
+#gl.get_current_context()

 import pycuda.driver as cuda
 from pycuda.compiler import SourceModule
 import pycuda.tools
-import pycuda.gl as cudagl
-import pycuda.gl.autoinit
+#import pycuda.gl as cudagl
+#import pycuda.gl.autoinit
+import pycuda.autoinit

 import numpy as np

-from cuburn.ptx import PTXModule, PTXTest, PTXTestFailure
+from cuburn.ptx import PTXFormatter
+
+class Module(object):
+    """
+    Formats a list of entries into source text and loads the result as a
+    CUDA module.
+
+        `entries`:  The entries passed to the constructor.
+        `source`:   Source text returned by `compile`.
+        `mod`:      Module returned by `assemble`.
+    """
+    def __init__(self, entries):
+        self.entries = entries
+        self.source = self.compile(entries)
+        self.mod = self.assemble(self.source)
+
+    @staticmethod
+    def compile(entries):
+        """
+        Call `format_source` on every entry with one shared `PTXFormatter`
+        and return what the formatter's `get_source` produces.
+        """
+        formatter = PTXFormatter()
+        for entry in entries:
+            entry.format_source(formatter)
+        return formatter.get_source()
+
+    def assemble(self, src, dump_path='/tmp/cuburn.ptx'):
+        """
+        Load `src` with `cuda.module_from_buffer` and return the module.
+
+        `dump_path` is where `src` is written before loading so that it can
+        be inspected after a failure; pass None to skip writing it. On
+        `cuda.CompileError` or `cuda.RuntimeError`, a message and the error
+        are printed and the same exception is re-raised.
+        """
+        if dump_path is not None:
+            with open(dump_path, 'w') as f:
+                f.write(src)
+        try:
+            mod = cuda.module_from_buffer(src,
+                [(cuda.jit_option.OPTIMIZATION_LEVEL, 0),
+                 (cuda.jit_option.TARGET_FROM_CUCONTEXT, 1)])
+        except (cuda.CompileError, cuda.RuntimeError), e:
+            # TODO: read assembler output and recover Python source lines
+            if dump_path is not None:
+                print "Compile error. Source is at %s" % dump_path
+            else:
+                print "Compile error."
+            print e
+            # A bare raise keeps the original traceback; `raise e` would
+            # restart it from this line under Python 2.
+            raise
+        return mod

 class LaunchContext(object):
-    """
-    Context collecting the information needed to create, run, and gather the
-    results of a device computation. This may eventually also include an actual
-    CUDA context, but for now it just uses the global one.
-
-    To create the fastest device code across multiple device families, this
-    context may decide to iteratively refine the final PTX by regenerating
-    and recompiling it several times to optimize certain parameters of the
-    launch, such as the distribution of threads throughout the device.
-    The properties of this device which are tuned are listed below. Any PTX
-    fragments which use this information must emit valid PTX for any state
-    given below, but the PTX is only required to actually run with the final,
-    fixed values of all tuned parameters below.
-
-        `block`:    3-tuple of (x,y,z); dimensions of each CTA.
-        `grid`:     2-tuple of (x,y); dimensions of the grid of CTAs.
-        `nthreads`: Number of active threads on device as a whole.
-        `mod`:      Final compiled module. Unavailable during assembly.
-
-    """
    def __init__(self, entries, block=(1,1,1), grid=(1,1), tests=False):
        self.entry_types = entries
        self.block, self.grid, self.build_tests = block, grid, tests
@ -60,18 +69,6 @@ class LaunchContext(object):
        kwargs['ctx'] = self
        self.ptx = PTXModule(self.entry_types, kwargs, self.build_tests)
        # TODO: make this optional and let user choose path
-        with open('/tmp/cuburn.ptx', 'w') as f: f.write(self.ptx.source)
-        try:
-            # TODO: detect/customize arch, code; verbose setting;
-            # keep directory enable/disable via debug
-            self.mod = cuda.module_from_buffer(self.ptx.source,
-                [(cuda.jit_option.OPTIMIZATION_LEVEL, 0),
-                 (cuda.jit_option.TARGET_FROM_CUCONTEXT, 1)])
-        except (cuda.CompileError, cuda.RuntimeError), e:
-            # TODO: if output not written above, print different message
-            print "Compile error. Source is at /tmp/cuburn.ptx"
-            print e
-            raise e
        if verbose:
            for entry in self.ptx.entries:
                func = self.mod.get_function(entry.entry_name)
--- a/cuburn/device_code.py
+++ b/cuburn/device_code.py
@ -523,175 +523,130 @@ class ShufflePoints(PTXFragment):
                op.bar.sync(bar)
                op.ld.volatile.shared.b32(var, addr(shuf_read))

-class MWCRNG(PTXFragment):
-    shortname = "mwc"
-
-    def __init__(self):
-        self.threads_ready = 0
+class MWCRNG(object):
+    def __init__(self, entry, seed=None):
+        """
+        Attach the generator to `entry`: register the `mwc_mults` and
+        `mwc_states` parameters, load this thread's multiplier, state and
+        carry in the entry's head, and store state and carry back in its
+        tail.
+
+        Raises ``EnvironmentError`` when `primes.bin` is not a file in the
+        current directory. `seed` is accepted but not used in this method.
+        """
+        # TODO: install this in data directory or something
        if not os.path.isfile('primes.bin'):
            raise EnvironmentError('primes.bin not found')
+        self.threads_ready = 0
+        # Named `states` so it is the same attribute `call_setup` assigns
+        # and passes to `ctx.set_param`.
+        self.mults, self.states = None, None

-    @ptx_func
-    def module_setup(self):
-        mem.global_.u32('mwc_rng_mults', ctx.nthreads)
-        mem.global_.u64('mwc_rng_state', ctx.nthreads)
+        self.entry = entry
+        entry.add_param('mwc_mults', entry.types.u32)
+        entry.add_param('mwc_states', entry.types.u32)
+        r, o = entry.regs, entry.ops
+        # NOTE(review): `ctx` is not a parameter or local of this method;
+        # confirm where `threads_per_cta` is meant to come from.
+        with entry.head as e:
+            # Multipliers are indexed at 4 bytes per thread, state/carry
+            # pairs at 8 bytes per thread.
+            #mwc_mult_addr = gtid * 4 + e.params.mwc_mults
+            gtid = o.mad.lo(e.special.ctaid_x, ctx.threads_per_cta,
+                            e.special.tid_x)
+            mwc_mult_addr = o.mad.lo.u32(gtid, 4, e.params.mwc_mults)
+            r.mwc_mult = o.load.u32(mwc_mult_addr)
+            mwc_state_addr = o.mad.lo.u32(gtid, 8, e.params.mwc_states)
+            r.mwc_state, r.mwc_carry = o.load.u64(mwc_state_addr)
+        with entry.tail as e:
+            #gtid = e.special.ctaid_x * ctx.threads_per_cta + e.special.tid_x
+            gtid = o.mad.lo(e.special.ctaid_x, ctx.threads_per_cta,
+                            e.special.tid_x)
+            mwc_state_addr = o.mad.lo.u32(gtid, 8, e.params.mwc_states)
+            o.store.v2(mwc_state_addr, (r.mwc_state, r.mwc_carry))

-    @ptx_func
-    def entry_setup(self):
-        reg.u32('mwc_st mwc_mult mwc_car')
-        with block('Load MWC multipliers and states'):
-            reg.u32('mwc_off mwc_addr')
-            std.get_gtid(mwc_off)
-            op.mov.u32(mwc_addr, mwc_rng_mults)
-            op.mad.lo.u32(mwc_addr, mwc_off, 4, mwc_addr)
-            op.ld.global_.u32(mwc_mult, addr(mwc_addr))
+    def next_b32(self):
+        """
+        Advance the generator one step and return `r.mwc_state`, the
+        register holding the new 32-bit output.
+        """
+        e, r, o = self.entry, self.entry.regs, self.entry.ops
+        # Widen the carry, then form mult * state + carry as one wide value.
+        mwc_out = o.cvt.u64(r.mwc_carry)
+        mwc_out = o.mad.wide.u32(r.mwc_mult, r.mwc_state, mwc_out)
+        # The wide result is unpacked into the new state and carry
+        # (presumably low half then high half -- confirm against `o.mov`).
+        r.mwc_state, r.mwc_carry = o.mov(mwc_out)
+        return r.mwc_state

-            op.mov.u32(mwc_addr, mwc_rng_state)
-            op.mad.lo.u32(mwc_addr, mwc_off, 8, mwc_addr)
-            op.ld.global_.v2.u32(vec(mwc_st, mwc_car), addr(mwc_addr))
+    def next_f32_01(self):
+        """
+        Advance the generator and return its output converted with the
+        `cvt.rn.f32.u32` op and multiplied by 1/2**32.
+        """
+        e, r, o = self.entry, self.entry.regs, self.entry.ops
+        mwc_float = o.cvt.rn.f32.u32(self.next_b32())
+        # TODO: check the precision on the uploaded types here
+        return o.mul.f32(mwc_float, 1./(1<<32))

-    @ptx_func
-    def entry_teardown(self):
-        with block('Save MWC states'):
-            reg.u32('mwc_off mwc_addr')
-            std.get_gtid(mwc_off)
-            op.mov.u32(mwc_addr, mwc_rng_state)
-            op.mad.lo.u32(mwc_addr, mwc_off, 8, mwc_addr)
-            op.st.global_.v2.u32(addr(mwc_addr), vec(mwc_st, mwc_car))
+    def next_f32_11(self):
+        """
+        Advance the generator and return its output converted with the
+        `cvt.rn.f32.s32` op (treated as signed) and multiplied by 1/2**31.
+        """
+        e, r, o = self.entry, self.entry.regs, self.entry.ops
+        mwc_float = o.cvt.rn.f32.s32(self.next_b32())
+        return o.mul.f32(mwc_float, 1./(1<<31))

-    @ptx_func
-    def _next(self):
-        # Call from inside a block!
-        reg.u64('mwc_out')
-        op.cvt.u64.u32(mwc_out, mwc_car)
-        op.mad.wide.u32(mwc_out, mwc_st, mwc_mult, mwc_out)
-        op.mov.b64(vec(mwc_st, mwc_car), mwc_out)
-
-    @ptx_func
-    def next_b32(self, dst_reg):
-        with block('Load next random u32 into ' + dst_reg.name):
-            self._next()
-            op.mov.u32(dst_reg, mwc_st)
-
-    @ptx_func
-    def next_f32_01(self, dst_reg):
-        # TODO: verify that this is the fastest-performance method
-        # TODO: verify that this actually does what I think it does
-        with block('Load random float [0,1] into ' + dst_reg.name):
-            self._next()
-            op.cvt.rn.f32.u32(dst_reg, mwc_st)
-            op.mul.f32(dst_reg, dst_reg, '0f2F800000') # 1./(1<<32)
-
-    @ptx_func
-    def next_f32_11(self, dst_reg):
-        with block('Load random float [-1,1) into ' + dst_reg.name):
-            reg.u32('mwc_to_float')
-            self._next()
-            op.cvt.rn.f32.s32(dst_reg, mwc_st)
-            op.mul.f32(dst_reg, dst_reg, '0f30000000') # 1./(1<<31)
-
-    @instmethod
-    def seed(self, ctx, rand=np.random):
+    def call_setup(self, ctx, force=False, rand=np.random):
        """
        Seed the random number generators with values taken from a
        ``np.random`` instance.
+
+        Multipliers and states are regenerated and uploaded only when
+        `force` is true or fewer threads have been seeded than
+        `ctx.nthreads`; the `mwc_mults` and `mwc_states` parameters are set
+        on `ctx` on every call.
+
+            `ctx`:    Supplies `nthreads`, `hostpool`, `pool`, `stream` and
+                      `set_param`.
+            `force`:  Reseed even if enough threads are already seeded.
+            `rand`:   Source of `shuffle` and `randint`.
        """
-        # Load raw big-endian u32 multipliers from primes.bin.
-        with open('primes.bin') as primefp:
-            dt = np.dtype(np.uint32).newbyteorder('B')
-            mults = np.frombuffer(primefp.read(), dtype=dt)
-        stream = cuda.Stream()
-        # Randomness in choosing multipliers is good, but larger multipliers
-        # have longer periods, which is also good. This is a compromise.
-        mults = np.array(mults[:ctx.nthreads*4])
-        rand.shuffle(mults)
-        # Copy multipliers and seeds to the device
-        multdp, multl = ctx.mod.get_global('mwc_rng_mults')
-        cuda.memcpy_htod(multdp, mults.tostring()[:multl])
-        # Intentionally excludes both 0 and (2^32-1), as they can lead to
-        # degenerate sequences of period 0
-        states = np.array(rand.randint(1, 0xffffffff, size=2*ctx.nthreads),
-                          dtype=np.uint32)
-        statedp, statel = ctx.mod.get_global('mwc_rng_state')
-        cuda.memcpy_htod(statedp, states.tostring())
-        self.threads_ready = ctx.nthreads
+        # `threads_ready` is the counter the constructor initialises.
+        if force or self.threads_ready < ctx.nthreads:
+            # Load raw big-endian u32 multipliers from primes.bin.
+            with open('primes.bin', 'rb') as primefp:
+                dt = np.dtype(np.uint32).newbyteorder('B')
+                mults = np.frombuffer(primefp.read(), dtype=dt)
+            # Randomness in choosing multipliers is good, but larger multipliers
+            # have longer periods, which is also good. This is a compromise.
+            mults = np.array(mults[:ctx.nthreads*4])
+            rand.shuffle(mults)
+            locked_mults = ctx.hostpool.allocate(ctx.nthreads, np.uint32)
+            # Take the first nthreads shuffled multipliers; indexing with
+            # [ctx.nthreads] would copy one value to every thread.
+            locked_mults[:] = mults[:ctx.nthreads]
+            self.mults = ctx.pool.allocate(4*ctx.nthreads)
+            # NOTE(review): this copy passes `locked_mults.base` while the
+            # states copy below passes the array itself -- confirm which
+            # form memcpy_htod_async expects.
+            cuda.memcpy_htod_async(self.mults, locked_mults.base, ctx.stream)
+            # Intentionally excludes both 0 and (2^32-1), as they can lead to
+            # degenerate sequences of period 0
+            states = np.array(rand.randint(1, 0xffffffff, size=2*ctx.nthreads),
+                              dtype=np.uint32)
+            locked_states = ctx.hostpool.allocate(2*ctx.nthreads, np.uint32)
+            locked_states[:] = states
+            self.states = ctx.pool.allocate(8*ctx.nthreads)
+            cuda.memcpy_htod_async(self.states, locked_states, ctx.stream)
+            self.threads_ready = ctx.nthreads
+        ctx.set_param('mwc_mults', self.mults)
+        ctx.set_param('mwc_states', self.states)

-    def call_setup(self, ctx):
-        if self.threads_ready < ctx.nthreads:
-            self.seed(ctx)
-
-    def tests(self):
-        return [MWCRNGTest, MWCRNGFloatsTest]
-
-class MWCRNGTest(PTXTest):
-    name = "MWC RNG sum-of-threads"
+class MWCRNGTest(PTXEntry):
    rounds = 5000
-    entry_name = 'MWC_RNG_test'
-    entry_params = ''

-    def deps(self):
-        return [MWCRNG]
+    def __init__(self, entry):
+        """
+        Attach an `MWCRNG` to `entry`, register the `mwc_test_sums`
+        parameter, and build the entry body via `entry_body`.
+        """
+        self.entry = entry
+        self.mwc = MWCRNG(entry)

-    @ptx_func
-    def module_setup(self):
-        mem.global_.u64('mwc_rng_test_sums', ctx.nthreads)
+        entry.add_param('mwc_test_sums', entry.types.u32)
+        # NOTE(review): `entry.body` is called here, whereas MWCRNG uses
+        # `entry.head` and `entry.tail` without calling them -- confirm.
+        with entry.body():
+            self.entry_body()

-    @ptx_func
-    def entry(self):
-        reg.u64('sum addl')
-        reg.u32('addend')
-        op.mov.u64(sum, 0)
-        with block('Sum next %d random numbers' % self.rounds):
-            reg.u32('loopct')
-            reg.pred('p')
-            op.mov.u32(loopct, self.rounds)
-            label('loopstart')
-            mwc.next_b32(addend)
-            op.cvt.u64.u32(addl, addend)
-            op.add.u64(sum, sum, addl)
-            op.sub.u32(loopct, loopct, 1)
-            op.setp.gt.u32(p, loopct, 0)
-            op.bra.uni(loopstart, ifp=p)
+    def entry_body(self):
+        """
+        Build the test body: add `self.rounds` successive generator outputs
+        into `r.sum`, then hand the sum to `store_per_thread` with the
+        `mwc_test_sums` parameter.
+        """
+        e, r, o = self.entry, self.entry.regs, self.entry.ops

-        with block('Store sum and state'):
-            reg.u32('adr offset')
-            std.get_gtid(offset)
-            op.mov.u32(adr, mwc_rng_test_sums)
-            op.mad.lo.u32(adr, offset, 8, adr)
-            op.st.global_.u64(addr(adr), sum)
+        r.sum = 0
+        with e.std.loop(self.rounds) as mwc_rng_sum:
+            # Each output goes through the u32 -> u64 convert op before the
+            # 64-bit add.
+            addend = o.cvt.u64.u32(self.mwc.next_b32())
+            r.sum = o.add.u64(r.sum, addend)

-    def call_setup(self, ctx):
-        # Get current multipliers and seeds from the device
-        self.mults = ctx.get_per_thread('mwc_rng_mults', np.uint32)
-        self.fullstates = ctx.get_per_thread('mwc_rng_state', np.uint64)
-        self.sums = np.zeros(ctx.nthreads, np.uint64)
+        e.std.store_per_thread(e.params.mwc_test_sums, r.sum)

-        print "Running %d states forward %d rounds" % \
-              (len(self.mults), self.rounds)
-        ctime = time.time()
-        for i in range(self.rounds):
-            states = self.fullstates & 0xffffffff
-            carries = self.fullstates >> 32
-            self.fullstates = self.mults * states + carries
-            self.sums += self.fullstates & 0xffffffff
-        ctime = time.time() - ctime
-        print "Done on host, took %g seconds" % ctime
+    def call(self, ctx):
+        """
+        Reseed the generator, then for two trials run `self.rounds` steps
+        on the host, run the entry via `ctx.call()`, and compare the host
+        sums with the sums returned from the device. Raises when they
+        differ.
+        """
+        # Generate current state, upload it to GPU
+        self.mwc.call_setup(ctx, force=True)
+        # NOTE(review): nothing in view assigns `fullstates` on MWCRNG
+        # (`call_setup` assigns `mults` and `states`) -- confirm.
+        mults, fullstates = self.mwc.mults, self.mwc.fullstates
+        sums = np.zeros_like(fullstates)

-    def call_teardown(self, ctx):
-        dfullstates = ctx.get_per_thread('mwc_rng_state', np.uint64)
-        if not (dfullstates == self.fullstates).all():
-            print "State discrepancy"
-            print dfullstates
-            print self.fullstates
-            raise PTXTestFailure("MWC RNG state discrepancy")
+        # Run two trials, to ensure device state is getting saved properly
+        for trial in range(2):
+            print "Trial %d, on CPU: " % trial,
+            ctime = time.time()
+            for i in range(self.rounds):
+                # Low 32 bits are the state, high 32 bits the carry.
+                states = fullstates & 0xffffffff
+                carries = fullstates >> 32
+                # NOTE(review): `self.mults` is not assigned on this class
+                # in view, and the local `mults` is otherwise unused --
+                # confirm which was meant.
+                fullstates = self.mults * states + carries
+                sums += fullstates & 0xffffffff
+            ctime = time.time() - ctime
+            print "Took %g seconds." % ctime

+            print "Trial %d, on device: " % trial,
+            dsums = np.empty_like(sums)
+            ctx.set_param('mwc_test_sums', cuda.Out(dsums))
+            print "Took %g seconds." % ctx.call()

-        dsums = ctx.get_per_thread('mwc_rng_test_sums', np.uint64)
-        if not (dsums == self.sums).all():
-            print "Sum discrepancy"
-            print dsums
-            print self.sums
-            raise PTXTestFailure("MWC RNG sum discrepancy")
+            if not np.all(np.equal(sums, dsums)):
+                print "Sum discrepancy!"
+                print sums
+                print dsums
+                # NOTE(review): `TODOSomeKindOfException` is not defined in
+                # view; as written this line raises NameError.
+                raise TODOSomeKindOfException()

 class MWCRNGFloatsTest(PTXTest):
    """
--- a/cuburn/ptx.py
+++ b/cuburn/ptx.py
--- a/main.py
+++ b/main.py
@ -16,6 +16,8 @@ from ctypes import *

 import numpy as np

+np.set_printoptions(precision=5, edgeitems=20)
+
 from cuburn.device_code import *
 from cuburn.cuda import LaunchContext
 from fr0stlib.pyflam3 import *