diff --git a/bench.py b/bench.py
index e385870..c0d7df1 100644
--- a/bench.py
+++ b/bench.py
@@ -116,7 +116,7 @@ class L2WriteCombining(PTXTest):
         print "Bytes for coa was %g ± %g" % pm(self.times_bytes[1])
         print "Clks for uncoa was %g ± %g" % pm(self.times_bytes[2])
         print "Bytes for uncoa was %g ± %g" % pm(self.times_bytes[3])
-        print ''
+        print

 def printover(a, r, s=1):
     for i in range(0, len(a), r*s):
diff --git a/cuburn/cuda.py b/cuburn/cuda.py
index ae83b06..42eba12 100644
--- a/cuburn/cuda.py
+++ b/cuburn/cuda.py
@@ -104,7 +104,7 @@ class LaunchContext(object):
         for test in self.ptx.tests:
             cuda.Context.synchronize()
             if test.call(self):
-                print "Test %s passed." % test.entry_name
+                print "Test %s passed.\n" % test.entry_name
             else:
                 all_okay = False
         return all_okay
diff --git a/cuburn/device_code.py b/cuburn/device_code.py
index dd399d4..a5e727b 100644
--- a/cuburn/device_code.py
+++ b/cuburn/device_code.py
@@ -478,7 +478,7 @@ class MWCRNG(PTXFragment):
             self.seed(ctx)

     def tests(self):
-        return [MWCRNGTest]
+        return [MWCRNGTest, MWCRNGFloatsTest]

 class MWCRNGTest(PTXTest):
     name = "MWC RNG sum-of-threads"
@@ -555,6 +555,79 @@ class MWCRNGTest(PTXTest):
             print self.sums
             raise PTXTestFailure("MWC RNG sum discrepancy")

+class MWCRNGFloatsTest(PTXTest):
+    """
+    Note this only tests that the distributions are in the correct range, *not*
+    that they have good random properties. MWC is a suitable algorithm, but
+    implementation bugs may still lead to poor performance.
+    """
+    rounds = 1024
+    entry_name = 'MWC_RNG_floats_test'
+
+    def deps(self):
+        return [MWCRNG]
+
+    @ptx_func
+    def module_setup(self):
+        mem.global_.f32('mwc_rng_float_01_test_sums', ctx.threads)
+        mem.global_.f32('mwc_rng_float_01_test_mins', ctx.threads)
+        mem.global_.f32('mwc_rng_float_01_test_maxs', ctx.threads)
+        mem.global_.f32('mwc_rng_float_11_test_sums', ctx.threads)
+        mem.global_.f32('mwc_rng_float_11_test_mins', ctx.threads)
+        mem.global_.f32('mwc_rng_float_11_test_maxs', ctx.threads)
+
+    @ptx_func
+    def loop(self, kind):
+        with block('Sum %d floats in %s' % (self.rounds, kind)):
+            reg.f32('loopct val sum rmin rmax')
+            reg.pred('p_done')
+            op.mov.f32(loopct, 0.)
+            op.mov.f32(sum, 0.)
+            op.mov.f32(rmin, 2.)
+            op.mov.f32(rmax, -2.)
+            label('loopstart' + kind)
+            getattr(mwc, 'next_f32_' + kind)(val)
+            op.add.f32(sum, sum, val)
+            op.min.f32(rmin, rmin, val)
+            op.max.f32(rmax, rmax, val)
+            op.add.f32(loopct, loopct, 1.)
+            op.setp.ge.f32(p_done, loopct, float(self.rounds))
+            op.bra('loopstart' + kind, ifnotp=p_done)
+            op.mul.f32(sum, sum, 1./self.rounds)
+            std.store_per_thread('mwc_rng_float_%s_test_sums' % kind, sum,
+                                 'mwc_rng_float_%s_test_mins' % kind, rmin,
+                                 'mwc_rng_float_%s_test_maxs' % kind, rmax)
+
+    @ptx_func
+    def entry(self):
+        self.loop('01')
+        self.loop('11')
+
+    def call_teardown(self, ctx):
+        # Tolerance of all-threads averages
+        tol = 0.05
+        # float distribution kind, test kind, expected value, limit func
+        tests = [
+            ('01', 'sums', 0.5, None),
+            ('01', 'mins', 0.0, np.min),
+            ('01', 'maxs', 1.0, np.max),
+            ('11', 'sums', 0.0, None),
+            ('11', 'mins', -1.0, np.min),
+            ('11', 'maxs', 1.0, np.max)
+        ]
+
+        for fkind, rkind, exp, lim in tests:
+            dp, l = ctx.mod.get_global(
+                    'mwc_rng_float_%s_test_%s' % (fkind, rkind))
+            vals = cuda.from_device(dp, ctx.threads, np.float32)
+            avg = np.mean(vals)
+            if np.abs(avg - exp) > tol:
+                raise PTXTestFailure("%s %s %g too far from %g" %
+                                     (fkind, rkind, avg, exp))
+            if lim is None: continue
+            if lim([lim(vals), exp]) != exp:
+                raise PTXTestFailure("%s %s %g violates hard limit %g" %
+                                     (fkind, rkind, lim(vals), exp))

 class CPDataStream(DataStream):
     """DataStream which stores the control points."""
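
For anyone who wants to sanity-check the teardown logic without a GPU, here is a rough host-side sketch of the same two checks call_teardown applies to the cuda.from_device results: a per-thread mean within `tol` of the expected value, and hard range limits enforced via the min/max trick. The check_dist helper and the numpy uniform stand-ins are hypothetical illustrations, not part of cuburn or this patch.

    import numpy as np

    def check_dist(vals, exp_mean, exp_min, exp_max, tol=0.05):
        # All-samples average must land within tol of the expected mean,
        # mirroring the 'sums' rows of the tests table above.
        avg = np.mean(vals)
        if np.abs(avg - exp_mean) > tol:
            raise AssertionError("mean %g too far from %g" % (avg, exp_mean))
        # Hard limits, same trick as call_teardown: taking min(observed min,
        # expected min) must give the expected min back, i.e. no sample may
        # fall below it; likewise for the max.
        if min(np.min(vals), exp_min) != exp_min:
            raise AssertionError("min %g violates hard limit %g" %
                                 (np.min(vals), exp_min))
        if max(np.max(vals), exp_max) != exp_max:
            raise AssertionError("max %g violates hard limit %g" %
                                 (np.max(vals), exp_max))

    # Uniform stand-ins for the '01' (unit interval) and '11' (signed unit
    # interval) streams produced by next_f32_01 / next_f32_11.
    check_dist(np.random.uniform(0.0, 1.0, 1 << 16), 0.5, 0.0, 1.0)
    check_dist(np.random.uniform(-1.0, 1.0, 1 << 16), 0.0, -1.0, 1.0)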