diff --git a/bench.py b/bench.py
deleted file mode 100644
index 8151775..0000000
--- a/bench.py
+++ /dev/null
@@ -1,237 +0,0 @@
-#!/usr/bin/python
-# -*- encoding: utf-8 -*-
-
-"""
-Various micro-benchmarks and other experiments.
-"""
-import numpy as np
-import pycuda.autoinit
-import pycuda.driver as cuda
-from cuburn.ptx import PTXFragment, PTXTest, ptx_func, instmethod
-from cuburn.cuda import LaunchContext
-from cuburn.device_code import MWCRNG, MWCRNGTest
-
-class L2WriteCombining(PTXTest):
-    """
-    Test of L2 write combining.
-    """
-    entry_name = 'l2_write_combining'
-    entry_params = [('u64', 'a_report_addr'), ('u64', 'a_scratch_addr')]
-
-    block_size = 2**20 # 1MB/CTA.
-    rounds = int(1e6)
-
-    @ptx_func
-    def entry(self):
-        mem.shared.u32('s_offset')
-        reg.u32('bytes_written offset write_size laneid ctaid rounds x')
-        reg.u64('scratch_addr scratch_offset clka clkb bytes')
-        reg.pred('p_write p_loop_wrsz p_is_first p_done p_coalesced')
-
-        op.mov.u32(laneid, '%laneid')
-        op.setp.eq.u32(p_is_first, laneid, 0)
-
-        op.ld.param.u32(scratch_addr, addr(a_scratch_addr))
-        op.mov.u32(ctaid, '%ctaid.x')
-        op.cvt.u64.u32(scratch_offset, ctaid)
-        op.mad.lo.u64(scratch_addr, scratch_offset, self.block_size,
-                      scratch_addr)
-
-        op.mov.u32(x, 0)
-
-        label('l2_restart')
-        comment("If CTA is even, do coalesced first")
-        op.and_.b32(ctaid, ctaid, 1)
-        op.setp.eq.u32(p_coalesced, ctaid, 0)
-        op.bra.uni('l2_loop_start')
-
-        label('l2_loop_start')
-        op.st.shared.u32(addr(s_offset), 0, ifp=p_is_first)
-        op.mov.u32(rounds, 0)
-        op.mov.u32(write_size, 16)
-        op.mov.u64(clka, '%clock64')
-        op.mov.u64(bytes, 0)
-
-        label('l2_loop')
-        comment("Increment offset across the CTA")
-        op.atom.shared.add.u32(offset, addr(s_offset), write_size,
-                               ifp=p_is_first)
-
-        comment("Find write address from current offset and lane")
-        op.ld.shared.u32(offset, addr(s_offset))
-        op.add.u32(offset, offset, laneid)
-        op.mul.lo.u32(offset, offset, 8)
-        op.and_.b32(offset, offset, self.block_size-1)
-
-        op.cvt.u64.u32(scratch_offset, offset)
-        op.add.u64(scratch_offset, scratch_offset, scratch_addr)
-
-        comment("If lane < write_size, write to address")
-        op.setp.lt.u32(p_write, laneid, write_size)
-        op.st.u64(addr(scratch_offset), scratch_offset, ifp=p_write)
-
-        comment("Add to number of bytes written")
-        op.add.u64(bytes, bytes, 8, ifp=p_write)
-
-        comment("If uncoalesced, store new write size")
-        op.add.u32(write_size, write_size, 1, ifnotp=p_coalesced)
-        op.setp.gt.u32(p_loop_wrsz, write_size, 32)
-        op.mov.u32(write_size, 2, ifp=p_loop_wrsz)
-
-        comment("Loop!")
-        op.add.u32(rounds, rounds, 1)
-        op.setp.ge.u32(p_done, rounds, self.rounds)
-        op.bra.uni(l2_loop, ifnotp=p_done)
-
-        label('l2_loop_end')
-        op.mov.u64(clkb, '%clock64')
-        op.sub.u64(clka, clkb, clka)
-        with block("Store the time l2_loop took"):
-            reg.u64('report_addr report_offset')
-            reg.u32('gtid')
-            std.get_gtid(gtid)
-            op.mul.lo.u32(gtid, gtid, 32)
-            op.add.u32(gtid, gtid, 16, ifnotp=p_coalesced)
-            op.cvt.u64.u32(report_offset, gtid)
-            op.ld.param.u64(report_addr, addr(a_report_addr))
-            op.add.u64(report_addr, report_addr, report_offset)
-            op.st.u64(addr(report_addr), clka)
-            op.st.u64(addr(report_addr,8), bytes)
-
-        comment("If we did coalesced, go back and do uncoalesced")
-        op.add.u32(ctaid, ctaid, 1)
-        op.add.u32(x, x, 1)
-        op.setp.ge.u32(p_done, x, 2)
-        op.bra.uni(l2_restart, ifnotp=p_done)
-
-    def call_setup(self, ctx):
-        self.scratch = np.zeros(self.block_size*ctx.nctas/4, np.uint64)
-        self.times_bytes = np.zeros((4, ctx.nthreads), np.uint64, 'F')
-
-    def _call(self, ctx, func):
-        super(L2WriteCombining, self)._call(ctx, func,
-                cuda.InOut(self.times_bytes), cuda.InOut(self.scratch))
-
-    def call_teardown(self, ctx):
-        pm = lambda a: (np.mean(a), np.std(a) / np.sqrt(len(a)))
-        print "Clks for coa was %g ± %g" % pm(self.times_bytes[0])
-        print "Bytes for coa was %g ± %g" % pm(self.times_bytes[1])
-        print "Clks for uncoa was %g ± %g" % pm(self.times_bytes[2])
-        print "Bytes for uncoa was %g ± %g" % pm(self.times_bytes[3])
-        print
-
-class SimulOccupancy(PTXTest):
-    """
-    Test to discover whether Fermi GPUs will launch multiple entry points
-    in the same kernel on the same CTA simultaneously.
-    """
-    entry_name = 'simul1'
-    # Only has to be big enough to hold the kernel on the device for a while
-    rounds = 1000000
-
-    def deps(self):
-        return [MWCRNG]
-
-    @ptx_func
-    def module_setup(self):
-        n = self.entry_name + '_'
-        mem.global_.u64(n+'start', ctx.nthreads)
-        mem.global_.u64(n+'end', ctx.nthreads)
-        mem.global_.u32(n+'smid', ctx.nthreads)
-        mem.global_.u32(n+'warpid_start', ctx.nthreads)
-        mem.global_.u32(n+'warpid_end', ctx.nthreads)
-
-    @ptx_func
-    def entry(self):
-        n = self.entry_name + '_'
-        reg.u64('now')
-        reg.u32('warpid')
-        op.mov.u64(now, '%clock64')
-        op.mov.u32(warpid, '%warpid')
-        std.store_per_thread(n+'start', now,
-                             n+'warpid_start', warpid)
-
-        reg.u32('loopct rnd')
-        reg.pred('p_done')
-        op.mov.u32(loopct, self.rounds)
-        label('loopstart')
-        mwc.next_b32(rnd)
-        std.store_per_thread(n+'smid', rnd)
-        op.sub.u32(loopct, loopct, 1)
-        op.setp.eq.u32(p_done, loopct, 0)
-        op.bra.uni(loopstart, ifnotp=p_done)
-
-        reg.u32('smid')
-        op.mov.u32(smid, '%smid')
-        op.mov.u32(warpid, '%warpid')
-        op.mov.u64(now, '%clock64')
-        std.store_per_thread(n+'end', now,
-                             n+'smid', smid,
-                             n+'warpid_end', warpid)
-
-    def _call(self, ctx, func):
-        stream1, stream2 = cuda.Stream(), cuda.Stream()
-        self._call2(ctx, stream1)
-        _SimulOccupancy._call2(ctx, stream2)
-        stream2.synchronize()
-        stream1.synchronize()
-
-    @instmethod
-    def _call2(self, ctx, stream):
-        func = ctx.mod.get_function(self.entry_name)
-        func.prepare([], ctx.block)
-        # TODO: load number of SMs from ctx
-        func.launch_grid_async(7, 1, stream)
-
-    def call_teardown(self, ctx):
-        sm_log = [[] for i in range(7)]
-        self._teardown(ctx, sm_log)
-        _SimulOccupancy._teardown(ctx, sm_log)
-        for sm in range(len(sm_log)):
-            print "\nPrinting log for SM %d" % sm
-            for t, ev in sorted(sm_log[sm]):
-                print '%6d %s' % (t/1000, ev)
-
-    @instmethod
-    def _teardown(self, ctx, sm_log):
-        # For this method, the GPU is intentionally underloaded; trim results
-        th = 7 * ctx.threads_per_cta
-        n = self.entry_name + '_'
-        start = ctx.get_per_thread(n+'start', np.uint64)[:th]
-        end = ctx.get_per_thread(n+'end', np.uint64)[:th]
-        smid = ctx.get_per_thread(n+'smid', np.uint32)[:th]
-        warpid_start = ctx.get_per_thread(n+'warpid_start', np.uint32)[:th]
-        warpid_end = ctx.get_per_thread(n+'warpid_end', np.uint32)[:th]
-        for i in range(0, th, 32):
-            sm_log[smid[i]].append((start[i], "%s%4d entered SM" % (n, i/32)))
-            sm_log[smid[i]].append((end[i],   "%s%4d left SM" % (n, i/32)))
-        if not np.alltrue(np.equal(warpid_start, warpid_end)):
-            print "Warp IDs changed. Do further research."
-
-class _SimulOccupancy(SimulOccupancy):
-    # Don't call this one
-    entry_name = 'simul2'
-    def call(self, ctx):
-        pass
-    def call_teardown(self, ctx):
-        pass
-
-def printover(a, r, s=1):
-    for i in range(0, len(a), r*s):
-        for j in range(i, i+r*s, s):
-            if j < len(a): print a[j],
-        print
-
-def main():
-    # TODO: block/grid auto-optimization
-    ctx = LaunchContext([L2WriteCombining, SimulOccupancy, _SimulOccupancy],
-                        block=(128,1,1), grid=(7*8,1), tests=True)
-    ctx.compile(verbose=3)
-    ctx.run_tests()
-    SimulOccupancy.call(ctx)
-    L2WriteCombining.call(ctx)
-
-if __name__ == "__main__":
-    main()
-
-