From 6bac3b3a95e95f29b37a645c620ea8aabfa86478 Mon Sep 17 00:00:00 2001
From: Steven Robertson
Date: Fri, 9 Dec 2011 14:14:36 -0500
Subject: [PATCH] Use reordered, lossy bit handling

---
 cuburn/code/iter.py | 47 ++++++++++++++++++++++++++-------------------
 cuburn/render.py    | 12 ++++++++----
 2 files changed, 35 insertions(+), 24 deletions(-)

diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py
index d7d29ae..cde620c 100644
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@@ -401,7 +401,8 @@ __launch_bounds__(BS, 1)
 write_shmem(
     float4 *acc,
     const uint32_t *log,
-    const uint32_t *log_bounds
+    const uint32_t *log_bounds,
+    uint32_t nbins
 ) {
     const int tid = threadIdx.x;
     const int bid = blockIdx.x;
@@ -443,16 +444,15 @@ write_shmem(
     float time = tid * rnrounds;
     float time_step = BS * rnrounds;
 
-    int glo_base = bid << SHAB;
-    float4* glo_ptr = &acc[glo_base];
+    int magic = ((blockIdx.x & 0xff) << 3) + ((blockIdx.x & 0xf00) << 12);
+    int magic_mask = 0xf007f8;
 
     for (int i = idx_lo + tid; i < idx_hi; i += BS) {
         int entry = log[i];
+        time += time_step;
 
-        // Constant '12' is 32 - 8 - SHAB, where 8 is the
-        // number of bits assigned to color. TODO: This ignores opacity.
-        bfe_decl(glob_addr, entry, SHAB, 12);
-        if (glob_addr != bid) continue;
+        // TODO: opacity
+        if ((entry & magic_mask) != magic) continue;
 
         asm volatile ({{crep("""
 {
@@ -461,7 +461,10 @@ write_shmem(
     .reg .u64   ptr;
     .reg .f32   rf, gf, bf, df;
 
-    bfi.b32         shoff,  %0,     0,      2,      12;
+    and.b32         shoff,  %0,     0xff800;
+    shr.b32         shoff,  shoff,  6;
+    bfi.b32         shoff,  %0,     shoff,  2,      3;
+
     bfe.u32         color,  %0,     24,     8;
     shl.b32         color,  color,  3;
     cvt.rni.u32.f32 time,   %1;
@@ -476,15 +479,16 @@ acc_write_start:
 @p  ld.shared.volatile.u32  hiw,    [shoff+0x4000];
     add.cc.u32      lo,     los,    low;
     addc.u32        hi,     his,    hiw;
-    setp.lo.u32     q,      hi,     (1023 << 22);
-    selp.b32        hiw,    hi,     0,      q;
-    selp.b32        low,    lo,     0,      q;
+    setp.hs.and.u32 q,      hi,     (1023 << 22),   p;
+    selp.b32        hiw,    0,      hi,     q;
+    selp.b32        low,    0,      lo,     q;
 @p  st.shared.volatile.u32  [shoff+0x4000],     hiw;
     // This instruction will get replaced with an STSUL
 @p  st.shared.volatile.u32  [shoff+0xffff],     low;
-@!p bra         acc_write_start;
-@q  bra         oflow_write_end;
-    shl.b32         shoff,  shoff,  2;
+//@!p bra         acc_write_start;
+@!q bra         oflow_write_end;
+    // TODO: opacity
+    bfi.b32         shoff,  %0,     0,      4,      24;
     cvt.u64.u32     ptr,    shoff;
     add.u64         ptr,    ptr,    %2;
     bfe.u32         r,      hi,     4,      18;
@@ -504,16 +508,18 @@ acc_write_start:
 
 oflow_write_end:
 }
-    """)}} :: "r"(entry), "f"(time), "l"(glo_ptr));
-    // TODO: go through the pain of manual address calculation for global ptr
-    time += time_step;
+    """)}} :: "r"(entry), "f"(time), "l"(acc));
     }
 
     __syncthreads();
+
+    int idx = tid;
 
-    for (int i = 0; i < (SHAW / BS); i++) {
+    int glo_idx = magic | (((idx << 8) | idx) & 0xff807);
+
+    for (int i = 0; i < (SHAW / BS) && glo_idx < nbins; i++) {
         int d, r, g, b;
-        float4 pix = acc[glo_base + idx];
+        float4 pix = acc[glo_idx];
         asm({{crep("""
 {
     .reg .u32   hi, lo;
@@ -530,8 +536,9 @@ oflow_write_end:
         pix.y += g / 255.0f;
         pix.z += b / 255.0f;
         pix.w += d;
-        acc[glo_base + idx] = pix;
+        acc[glo_idx] = pix;
         idx += BS;
+        glo_idx += (BS << 8);
     }
 }
 
diff --git a/cuburn/render.py b/cuburn/render.py
index 4876f54..d3251bf 100644
--- a/cuburn/render.py
+++ b/cuburn/render.py
@@ -8,6 +8,7 @@ from itertools import cycle, repeat, chain, izip
 from ctypes import *
 from cStringIO import StringIO
 import numpy as np
+from numpy import int32 as i32
 from scipy import ndimage
 
 from fr0stlib import pyflam3
@@ -147,7 +148,9 @@ class Renderer(object):
         d_log = cuda.mem_alloc(log_size * 4)
         d_log_sorted = cuda.mem_alloc(log_size * 4)
         sorter = sort.Sorter(log_size)
-        nwriteblocks = int(np.ceil(nbins / float(1<<12)))
+        # We need to cover each unique tag - address bits 20-23 - with one
+        # write block per sort bin. Or something like that.
+        nwriteblocks = int(np.ceil(nbins / float(1<<20))) * 256
 
         # Calculate 'nslots', the number of simultaneous running threads that
         # can be active on the GPU during iteration (and thus the number of
@@ -267,10 +270,11 @@ class Renderer(object):
                      block=(32, self._iter.NTHREADS/32, 1),
                      grid=(ntemporal_samples, 1), stream=iter_stream)
             _sync_stream(write_stream, iter_stream)
-            sorter.sort(d_log_sorted, d_log, log_size, 12, True,
+            sorter.sort(d_log_sorted, d_log, log_size, 3, True,
                         stream=write_stream)
+            #print cuda.from_device(sorter.dglobal, (256,), np.uint32)
             _sync_stream(iter_stream, write_stream)
-            write_fun(d_accum, d_log_sorted, sorter.dglobal,
+            write_fun(d_accum, d_log_sorted, sorter.dglobal, i32(nbins),
                       block=(1024, 1, 1), grid=(nwriteblocks, 1),
                       stream=write_stream)
         else:
@@ -302,5 +306,5 @@ class Renderer(object):
 
     def _trim(self, result):
         g = self.info.gutter
-        return result[g:-g,g:-g].copy()
+        return result[g:-g,g:g+self.info.width].copy()
 
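Notes on the reordered bit handling (commentary on the patch, not part of it):

The 24-bit accumulator-bin index is no longer split into a contiguous
(block << SHAB) | offset pair. Reading the masks above, the write-block tag
now sits in bits 3-10 and 20-23 of each log entry, the shared-memory cell
index in bits 0-2 and 11-19, and the color index in the top byte. The Python
sketch below restates that layout in plain code; the bit positions are taken
from the masks in the patch, but the field names are descriptive guesses.

    # Sketch of the entry layout implied by the masks above. Bit positions
    # come from the patch; field names are guesses.

    def decode_entry(entry):
        return {
            'cell_lo': entry & 0x7,            # bits 0-2:   low shared-cell bits
            'tag_lo':  (entry >> 3) & 0xff,    # bits 3-10:  low byte of block tag
            'cell_hi': (entry >> 11) & 0x1ff,  # bits 11-19: high shared-cell bits
            'tag_hi':  (entry >> 20) & 0xf,    # bits 20-23: the 'unique tag' bits
            'color':   (entry >> 24) & 0xff,   # bits 24-31: color index
        }

    def owned_by_block(entry, bid):
        # The kernel's filter: if ((entry & magic_mask) != magic) continue;
        magic = ((bid & 0xff) << 3) + ((bid & 0xf00) << 12)
        magic_mask = 0xf007f8
        return (entry & magic_mask) == magic

    def shared_byte_offset(entry):
        # Mirrors the PTX: and/shr moves bits 11-19 down to bits 5-13, then
        # bfi inserts bits 0-2 at position 2 (one 32-bit word per cell).
        return ((entry & 0xff800) >> 6) | ((entry & 0x7) << 2)

    def global_byte_offset(entry):
        # Mirrors 'bfi.b32 shoff, %0, 0, 4, 24' in the overflow path: the
        # low 24 bits are the bin index, shifted by log2(sizeof(float4)).
        return (entry & 0xffffff) << 4

Since shared_byte_offset() tops out at 0x3ffc, the low and high accumulator
words each fit in a 16 KB shared window, which is why the high word lives at
[shoff+0x4000].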
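The odd-looking glo_idx expression in the drain loop is the inverse mapping:
each write block copies its SHAW shared cells back to the global bins that
carry its tag. Below is a self-contained coverage check, assuming BS = 1024
and SHAW = 4096 (values consistent with the 1024-thread launch and the
0x4000-byte shared windows, but not stated in this hunk):

    # Check that the 256 write blocks with tag_hi == 0 tile the first 2**20
    # bins exactly once. BS and SHAW are assumptions, as noted above.
    BS, SHAW = 1024, 4096

    def bins_for_block(bid):
        magic = ((bid & 0xff) << 3) + ((bid & 0xf00) << 12)
        seen = set()
        for tid in range(BS):
            idx = tid
            glo_idx = magic | (((idx << 8) | idx) & 0xff807)
            for _ in range(SHAW // BS):     # same trip count as the kernel
                seen.add(glo_idx)
                idx += BS
                glo_idx += BS << 8
        return seen

    covered = set()
    for bid in range(256):
        bins = bins_for_block(bid)
        assert len(bins) == SHAW and not (covered & bins)
        covered |= bins
    assert covered == set(range(1 << 20))   # no gaps, no overlap

The 'glo_idx < nbins' guard in the kernel then handles buffers whose size is
not a multiple of 2**20: blocks in the last slice simply skip bins that fall
past the end of the accumulator.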
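On the host side, the new nwriteblocks expression in render.py follows from
the same layout: each 2**20-bin slice of the accumulator needs 256 write
blocks, one per value of the 8-bit low tag. A worked example, using an
illustrative buffer size that is not taken from the patch:

    import numpy as np

    nbins = 2048 * 1024                     # hypothetical 2**21-bin buffer
    nwriteblocks = int(np.ceil(nbins / float(1 << 20))) * 256
    assert nwriteblocks == 512              # 2 slices x 256 low-tag values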