From a5d7c2cc1aa7fc216dde14fe8324026867b55cda Mon Sep 17 00:00:00 2001
From: Steven Robertson <steven@strobe.cc>
Date: Sat, 11 Sep 2010 13:15:36 -0400
Subject: [PATCH] Use variations. This works, but is still fragile.

---
 cuburn/cuda.py        |   1 +
 cuburn/device_code.py | 169 +++++++++++++++++++++++++++++++++++-------
 main.py               |  11 ++-
 3 files changed, 150 insertions(+), 31 deletions(-)

diff --git a/cuburn/cuda.py b/cuburn/cuda.py
index 42eba12..08aaa5f 100644
--- a/cuburn/cuda.py
+++ b/cuburn/cuda.py
@@ -38,6 +38,7 @@ class LaunchContext(object):
         self.entry_types = entries
         self.block, self.grid, self.build_tests = block, grid, tests
         self.setup_done = False
+        self.stream = cuda.Stream()
 
     @property
     def threads(self):
diff --git a/cuburn/device_code.py b/cuburn/device_code.py
index 858309b..53559bc 100644
--- a/cuburn/device_code.py
+++ b/cuburn/device_code.py
@@ -10,16 +10,18 @@ import pycuda.driver as cuda
 import numpy as np
 
 from cuburn.ptx import *
+from cuburn.variations import Variations
 
 class IterThread(PTXEntryPoint):
     entry_name = 'iter_thread'
     entry_params = []
+    maxnreg = 16
 
     def __init__(self):
         self.cps_uploaded = False
 
     def deps(self):
-        return [MWCRNG, CPDataStream, HistScatter]
+        return [MWCRNG, CPDataStream, HistScatter, Variations]
 
     @ptx_func
     def module_setup(self):
@@ -30,24 +32,32 @@ class IterThread(PTXEntryPoint):
         # TODO move into debug statement
         mem.global_.u32('g_num_rounds', ctx.threads)
         mem.global_.u32('g_num_writes', ctx.threads)
+        mem.global_.b32('g_whatever', ctx.threads)
 
     @ptx_func
     def entry(self):
         # For now, we indulge in the luxury of shared memory.
-
         # Index number of current CP, shared across CTA
         mem.shared.u32('s_cp_idx')
 
         # Number of samples that have been generated so far in this CTA
         # If this number is negative, we're still fusing points, so this
         # behaves slightly differently (see ``fuse_loop_start``)
-        mem.shared.u32('s_num_samples')
-        op.st.shared.u32(addr(s_num_samples), -(features.num_fuse_samples+1))
+        mem.shared.s32('s_num_samples')
+        op.st.shared.s32(addr(s_num_samples), -(features.num_fuse_samples+1))
+
+        mem.shared.f32('s_xf_sel', ctx.warps_per_cta)
+
+        std.store_per_thread(g_whatever, 1234)
 
         # TODO: temporary, for testing
-        reg.u32('num_rounds num_writes')
-        op.mov.u32(num_rounds, 0)
-        op.mov.u32(num_writes, 0)
+        mem.local.u32('l_num_rounds')
+        mem.local.u32('l_num_writes')
+        op.st.local.u32(addr(l_num_rounds), 0)
+        op.st.local.u32(addr(l_num_writes), 0)
+
+        mem.local.f32('l_consec')
+        op.st.local.f32(addr(l_consec), 0.)
 
         reg.f32('x_coord y_coord color_coord')
         mwc.next_f32_11(x_coord)
@@ -57,16 +67,24 @@ class IterThread(PTXEntryPoint):
         comment("Ensure all init is done")
         op.bar.sync(0)
 
+
+
         label('cp_loop_start')
         reg.u32('cp_idx cpA')
         with block("Claim a CP"):
             std.set_is_first_thread(reg.pred('p_is_first'))
             op.atom.add.u32(cp_idx, addr(g_num_cps_started), 1, ifp=p_is_first)
             op.st.shared.u32(addr(s_cp_idx), cp_idx, ifp=p_is_first)
-            op.st.shared.u32(addr(s_num_samples), 0, ifp=p_is_first)
+
+        with block("If done fusing, reset the sample count now"):
+            reg.pred("p_done_fusing")
+            reg.s32('num_samples')
+            op.ld.shared.s32(num_samples, addr(s_num_samples))
+            op.setp.gt.s32(p_done_fusing, num_samples, 0)
+            op.st.shared.s32(addr(s_num_samples), 0, ifp=p_done_fusing)
 
         comment("Load the CP index in all threads")
-        op.bar.sync(1)
+        op.bar.sync(0)
         op.ld.shared.u32(cp_idx, addr(s_cp_idx))
 
         with block("Check to see if this CP is valid (if not, we're done)"):
@@ -80,24 +98,68 @@ class IterThread(PTXEntryPoint):
             op.mov.u32(cpA, g_cp_array)
             op.mad.lo.u32(cpA, cp_idx, cp.stream_size, cpA)
 
+
+
         label('fuse_loop_start')
         # When fusing, num_samples holds the (negative) number of iterations
         # left across the CP, rather than the number of samples in total.
         with block("If still fusing, increment count unconditionally"):
             std.set_is_first_thread(reg.pred('p_is_first'))
             op.red.shared.add.s32(addr(s_num_samples), 1, ifp=p_is_first)
-            op.bar.sync(2)
+
+        label('iter_loop_choose_xform')
+        with block("Choose the xform for each warp"):
+            comment("On subsequent runs, only warp 0 will hit this code")
+            reg.u32('x_addr x_offset')
+            reg.f32('xf_sel')
+            op.mov.u32(x_addr, s_xf_sel)
+            op.mov.u32(x_offset, '%tid.x')
+            op.and_.b32(x_offset, x_offset, ctx.warps_per_cta-1)
+            op.mad.lo.u32(x_addr, x_offset, 4, x_addr)
+            mwc.next_f32_01(xf_sel)
+            op.st.volatile.shared.f32(addr(x_addr), xf_sel)
 
         label('iter_loop_start')
 
-        comment('Do... well, most of everything')
+        comment("I really didn't want to have to sync each loop, but it seems")
+        comment("like the highest-performance strategy right now")
+        #op.bar.sync(1)
 
-        mwc.next_f32_11(x_coord)
-        mwc.next_f32_11(y_coord)
-        mwc.next_f32_01(color_coord)
+        with block():
+            reg.u32('num_rounds')
+            reg.pred('overload')
+            op.ld.local.u32(num_rounds, addr(l_num_rounds))
+            op.add.u32(num_rounds, num_rounds, 1)
+            op.st.local.u32(addr(l_num_rounds), num_rounds)
 
-        op.add.u32(num_rounds, num_rounds, 1)
 
+
+        with block("Select an xform"):
+            reg.f32('xf_sel')
+            reg.u32('warp_offset xf_sel_addr')
+            op.mov.u32(warp_offset, '%tid.x')
+            op.mov.u32(xf_sel_addr, s_xf_sel)
+            op.shr.u32(warp_offset, warp_offset, 5)
+            op.mad.lo.u32(xf_sel_addr, warp_offset, 4, xf_sel_addr)
+            op.ld.volatile.shared.f32(xf_sel, addr(xf_sel_addr))
+
+            reg.f32('xf_density')
+            reg.pred('xf_jump')
+            for xf in features.xforms:
+                cp.get(cpA, xf_density, 'cp.xforms[%d].cweight' % xf.id)
+                op.setp.le.f32(xf_jump, xf_sel, xf_density)
+                op.bra('XFORM_%d' % xf.id, ifp=xf_jump)
+            std.asrt("Reached end of xforms without choosing one")
+
+            for xf in features.xforms:
+                label('XFORM_%d' % xf.id)
+                variations.apply_xform(x_coord, y_coord, color_coord,
+                                       x_coord, y_coord, color_coord, xf.id)
+                op.bra.uni("xform_done")
+
+
+
+        label("xform_done")
         with block("Test if we're still in FUSE"):
             reg.s32('num_samples')
             reg.pred('p_in_fuse')
@@ -108,7 +170,26 @@ class IterThread(PTXEntryPoint):
         reg.pred('p_point_is_valid')
         with block("Write the result"):
             hist.scatter(x_coord, y_coord, color_coord, 0, p_point_is_valid)
-            op.add.u32(num_writes, num_writes, 1, ifp=p_point_is_valid)
+            with block():
+                reg.u32('num_writes')
+                op.ld.local.u32(num_writes, addr(l_num_writes))
+                op.add.u32(num_writes, num_writes, 1, ifp=p_point_is_valid)
+                op.st.local.u32(addr(l_num_writes), num_writes)
+
+        with block("If the result was invalid, handle badvals"):
+            reg.f32('consec')
+            reg.pred('need_new_point')
+            op.ld.local.f32(consec, addr(l_consec))
+            op.mov.f32(consec, 0., ifp=p_point_is_valid)
+            op.add.f32(consec, consec, 1., ifnotp=p_point_is_valid)
+            op.setp.ge.f32(need_new_point, consec, 5.)
+            op.bra('badval_done', ifnotp=need_new_point)
+            mwc.next_f32_11(x_coord)
+            mwc.next_f32_11(y_coord)
+            mwc.next_f32_01(color_coord)
+            op.mov.f32(consec, 0.)
+            label('badval_done')
+            op.st.local.f32(addr(l_consec), consec)
 
         with block("Increment number of samples by number of good values"):
             reg.b32('good_samples laneid')
@@ -125,21 +206,35 @@ class IterThread(PTXEntryPoint):
             reg.s32('num_samples num_samples_needed')
             op.ld.shared.s32(num_samples, addr(s_num_samples))
             cp.get(cpA, num_samples_needed, 'cp.nsamples')
+            std.store_per_thread(g_whatever, num_samples_needed)
             op.setp.ge.s32(p_cp_done, num_samples, num_samples_needed)
             op.bra.uni(cp_loop_start, ifp=p_cp_done)
 
+        with block("If first warp, pick new thread offset"):
+            reg.u32('warpid')
+            reg.pred('first_warp')
+            op.mov.u32(warpid, '%tid.x')
+            op.shr.b32(warpid, warpid, 5)
+            op.setp.eq.u32(first_warp, warpid, 0)
+            #std.asrt("Looks like we're not the first warp", notp=first_warp,
+                    #ret=True)
+            op.bra.uni(iter_loop_choose_xform, ifp=first_warp)
         op.bra.uni(iter_loop_start)
 
         label('all_cps_done')
         # TODO this is for testing, move it to a debug statement
-        std.store_per_thread(g_num_rounds, num_rounds,
-                             g_num_writes, num_writes)
+        with block():
+            reg.u32('num_rounds num_writes')
+            op.ld.local.u32(num_rounds, addr(l_num_rounds))
+            op.ld.local.u32(num_writes, addr(l_num_writes))
+            std.store_per_thread(g_num_rounds, num_rounds,
+                                 g_num_writes, num_writes)
 
     @instmethod
     def upload_cp_stream(self, ctx, cp_stream, num_cps):
         cp_array_dp, cp_array_l = ctx.mod.get_global('g_cp_array')
         assert len(cp_stream) <= cp_array_l, "Stream too big!"
-        cuda.memcpy_htod_async(cp_array_dp, cp_stream)
+        cuda.memcpy_htod(cp_array_dp, cp_stream)
 
         num_cps_dp, num_cps_l = ctx.mod.get_global('g_num_cps')
         cuda.memset_d32(num_cps_dp, num_cps, 1)
@@ -162,14 +257,29 @@ class IterThread(PTXEntryPoint):
 
     def call_teardown(self, ctx):
         shape = (ctx.grid[0], ctx.block[0]/32, 32)
+
+        def print_thing(s, a):
+            print '%s:' % s
+            for i, r in enumerate(a):
+                for j in range(0,len(r),8):
+                    print '%2d\t%s' % (i,
+                        '\t'.join(['%g '%np.mean(r[k]) for k in range(j,j+8)]))
+
         num_rounds_dp, num_rounds_l = ctx.mod.get_global('g_num_rounds')
         num_writes_dp, num_writes_l = ctx.mod.get_global('g_num_writes')
+        whatever_dp, whatever_l = ctx.mod.get_global('g_whatever')
         rounds = cuda.from_device(num_rounds_dp, shape, np.int32)
         writes = cuda.from_device(num_writes_dp, shape, np.int32)
-        print "Rounds:", sum(rounds)
-        print "Writes:", sum(writes)
-        print rounds
-        print writes
+        whatever = cuda.from_device(whatever_dp, shape, np.int32)
+        print_thing("Rounds", rounds)
+        print_thing("Writes", writes)
+        print_thing("Whatever", whatever)
+
+        print np.sum(rounds)
+
+        dp, l = ctx.mod.get_global('g_num_cps_started')
+        cps_started = cuda.from_device(dp, 1, np.uint32)
+        print "CPs started:", cps_started
 
 class CameraTransform(PTXFragment):
     shortname = 'camera'
@@ -363,10 +473,13 @@ class HistScatter(PTXFragment):
             cp.get(cpA, norm_time, 'cp.norm_time')
             palette.look_up(r, g, b, a, color, norm_time)
             # TODO: look up, scale by xform visibility
-            op.red.add.f32(addr(hist_bin_addr), r)
-            op.red.add.f32(addr(hist_bin_addr,4), g)
-            op.red.add.f32(addr(hist_bin_addr,8), b)
-            op.red.add.f32(addr(hist_bin_addr,12), a)
+            # TODO: Make this more performant
+            reg.f32('gval')
+            for i, val in enumerate([r, g, b, a]):
+                #op.red.add.f32(addr(hist_bin_addr,4*i), val)
+                op.ld.f32(gval,addr(hist_bin_addr,4*i))
+                op.add.f32(gval, gval, val)
+                op.st.f32(addr(hist_bin_addr,4*i),gval)
 
 
     def call_setup(self, ctx):
@@ -380,6 +493,8 @@ class HistScatter(PTXFragment):
                 (features.hist_height, features.hist_stride, 4),
                 dtype=np.float32)
 
+
+
 class MWCRNG(PTXFragment):
     shortname = "mwc"
 
diff --git a/main.py b/main.py
index bf45876..f313676 100644
--- a/main.py
+++ b/main.py
@@ -11,6 +11,7 @@
 
 import os
 import sys
+from pprint import pprint
 from ctypes import *
 
 import numpy as np
@@ -39,10 +40,12 @@ def main(args):
     anim = Animation(genomes)
     anim.compile()
     bins = anim.render_frame()
-    #dump_3d(bins)
-    bins /= ((np.mean(bins)+1e-9)/128.)
-    bins.astype(np.uint8)
-
+    #bins = np.log2(bins + 1)
+    bins *= (512./(np.mean([bins[y][x][3]
+                           for x in range(anim.features.hist_width)
+                           for y in range(anim.features.hist_height)])+1e-9))
+    bins = np.minimum(bins, 255)
+    bins = bins.astype(np.uint8)
 
     if '-g' not in args:
         return