From ada0fe20c72bc8cc39ace9ba7e17068533c2c002 Mon Sep 17 00:00:00 2001
From: Steven Robertson <steven@strobe.cc>
Date: Mon, 6 Sep 2010 14:19:06 -0400
Subject: [PATCH] Random floats (I think)

---
 TODO                     | 59 ++++++++++++++++++++++++----------------
 cuburnlib/device_code.py | 43 ++++++++++++++++++++++-------
 2 files changed, 69 insertions(+), 33 deletions(-)

diff --git a/TODO b/TODO
index 8ac22ee..ebc2be4 100644
--- a/TODO
+++ b/TODO
@@ -1,23 +1,23 @@
-Status: currently broken (syntax errors, incomplete sections)
+Status: passes rudimentary tests
 
 Current goals:
-- Test DeviceStream, and get it working. Bugs are expected.
-    - Test allocator
-    - Test statement evaluator
-    - Test packing correctly
-    - Test that device instructions get injected correctly
-    - Test in working implementation
-- Load a set of genomes and calculate a bare minimum `Feature` set (no xforms,
-  no filters, no oversample)
-- Get frames loaded for rendering
-- Get IterThread running in device kernel
-    - For now, implement as `PTXTest`
-    - For each frame, loop for FUSE times, then loop through expected number of
-      points for each CP. Keep a count of number of times looped, and number of
-      stores that would be done. Verify against expected counts.
 
+- Draw some dang points!
+    - Allocate buffer (can it be pre-allocated?)
+    - Direct scatter linear points by GTID from flame number
+    - Re-enable preview window
+    - Execute frame, update texture, repeat
+- Writeback of points to the buffer
+    - Define writeback class, args
+    - Do camera rotation across frameset
+    - Postpone other kinds of testing and address clamping for now
+- Start xforms
+    - At first, fixed Sierpinski triangle or something
+    - xform selection, pre- and post-transform in xform
+    - first of the variations
 
 Things to do (rather severely incomplete):
+
 - LaunchContext thread distribution based on generated code register count and
   shared memory size
 - qlocal storage
@@ -27,9 +27,6 @@ Things to do (rather severely incomplete):
 - The `Feature` class
     - Transform count and per-transform code layout
     - Filter size, oversample, final buffer size
-- Palette storage
-    - Performance implications of different state spaces
-    - Performance and quality of 2D texture interpolation
 - Buffer allocation, clearing, reading from device
 - Preview window
     - When/how to sample?
@@ -41,8 +38,24 @@ Things to do (rather severely incomplete):
     - Implement
     - Test effects on quality by masking off writes on all but one lane and
       boosting the sample density to compensate (muuuuuch later on)
-- MWC RNG output types
-    - float in range [0, 1]
-- Debug statements
-    - Some code can't be tested separately (notably IterThread). Make a debug
-      flag which embeds extra tests into the kernel
+- DE
+
+Things to test:
+
+- DeviceStream allocator and proper handling of corner cases
+- Debug flag/dict/whatever for entire project in general
+    - Iteration counters for IterThread
+
+Things to benchmark:
+
+- Kernel invocation and/or interrupt times (will high load freeze X?)
+- 1D/2D texture load+interpolation speeds vs constant memory loading
+    - Must test under high SFU load
+    - Tex uses separate cache? Has lower bandwidth penalty for gather?
+- MWC float conversion
+- The entire scatter process
+    - Radix sort of writeback coordinates
+    - Log-copy-histogram approach
+    - Direct reductions
+    - Surface loads, stores, reductions
+
diff --git a/cuburnlib/device_code.py b/cuburnlib/device_code.py
index 8401fcf..bed0e38 100644
--- a/cuburnlib/device_code.py
+++ b/cuburnlib/device_code.py
@@ -40,10 +40,10 @@ class IterThread(PTXTest):
         op.mov.u32(num_writes, 0)
 
         # TODO: MWC float output types
-        #mwc_next_f32_01(x_coord)
-        #mwc_next_f32_01(y_coord)
-        #mwc_next_f32_01(color_coord)
-        #mwc_next_f32_01(alpha_coord)
+        mwc_next_f32_01(x_coord)
+        mwc_next_f32_01(y_coord)
+        mwc_next_f32_01(color_coord)
+        mwc_next_f32_01(alpha_coord)
 
         # Registers are hard to come by. To avoid having to track both the count
         # of samples processed and the number of samples to generate,
@@ -189,17 +189,40 @@ class MWCRNG(PTXFragment):
             op.mad.lo.u32(mwc_addr, mwc_off, 8, mwc_addr)
             op.st.global_.v2.u32(addr(mwc_addr), vec(mwc_st, mwc_car))
 
+    @ptx_func
+    def _next(self):
+        # Advance the MWC state by one step. Call from inside a block!
+        reg.u64('mwc_out')  # 64-bit scratch for the multiply-add result
+        op.cvt.u64.u32(mwc_out, mwc_car)  # start the sum from the carry
+        op.mad.wide.u32(mwc_out, mwc_st, mwc_mult, mwc_out)  # st*mult + carry
+        op.mov.b64(vec(mwc_st, mwc_car), mwc_out)  # unpack to (state, carry)
+
     @ptx_func
     def next_b32(self, dst_reg):
-        with block('Load next random into ' + dst_reg.name):
-            reg.u64('mwc_out')
-            op.cvt.u64.u32(mwc_out, mwc_car)
-            op.mad.wide.u32(mwc_out, mwc_st, mwc_mult, mwc_out)
-            op.mov.b64(vec(mwc_st, mwc_car), mwc_out)
+        with block('Load next random u32 into ' + dst_reg.name):
+            self._next()  # advance the generator; result is read from mwc_st
             op.mov.u32(dst_reg, mwc_st)
 
+    @ptx_func
+    def next_f32_01(self, dst_reg):
+        # Advance the MWC state, convert the new u32 state word to f32, then
+        # scale by 2**-32. TODO: verify this is the fastest-performance method
+        with block('Load random float [0,1] into ' + dst_reg.name):
+            self._next()
+            op.cvt.rn.f32.u32(dst_reg, mwc_st)
+            op.mul.f32(dst_reg, dst_reg, '0f2F800000') # 1./(1<<32), MSB-first
+
+    @ptx_func
+    def next_f32_11(self, dst_reg):
+        with block('Load random float [-1,1) into ' + dst_reg.name):
+            self._next()  # advance the generator; new word is read from mwc_st
+            op.cvt.rn.f32.s32(dst_reg, mwc_st)  # treat the state word as signed
+            op.mul.f32(dst_reg, dst_reg, '0f30000000') # 1./(1<<31), MSB-first
+
     def to_inject(self):
-        return dict(mwc_next_b32=self.next_b32)
+        return {'mwc_next_b32': self.next_b32,  # names injected for PTX code
+                'mwc_next_f32_01': self.next_f32_01,
+                'mwc_next_f32_11': self.next_f32_11}
 
     def device_init(self, ctx):
         if self.threads_ready >= ctx.threads: