Deferred works again. Time to break it.

2026-07-21 19:23:23 -04:00 · 2011-12-08 15:28:10 -05:00
parent e106524701
commit b76208078f
3 changed files with 118 additions and 60 deletions
@@ -363,9 +363,9 @@ void iter(
        pix.w += 1.0f;
        *accbuf = pix;
 {{elif info.acc_mode == 'deferred'}}
-        // 'color' gets the top 9 bits. TODO: add dithering via precalc.
+        // 'color' gets the top 8 bits. TODO: add dithering via precalc.
-        uint32_t icolor = fminf(1.0f, cc) * 511.0f + color_dither;
+        uint32_t icolor = fminf(1.0f, cc) * 255.0f + color_dither;
-        asm("bfi.b32    %0, %1, %0, 23, 9;" : "+r"(i) : "r"(icolor));
+        asm("bfi.b32    %0, %1, %0, 24, 8;" : "+r"(i) : "r"(icolor));
        *log = i;
 {{endif}}
    }
@@ -394,10 +394,10 @@ void write_shmem_helper(
        const uint32_t gb
 ) {
    float4 pix = acc[glo_idx];
-    pix.x += (dr & 0xffff) / 255.0f;
+    pix.x += (dr & 0xffff) / 127.0f;
    pix.w += dr >> 16;
-    pix.y += (gb & 0xffff) / 255.0f;
+    pix.y += (gb & 0xffff) / 127.0f;
-    pix.z += (gb >> 16) / 255.0f;
+    pix.z += (gb >> 16) / 127.0f;
    acc[glo_idx] = pix;
 }
@@ -417,8 +417,7 @@ __launch_bounds__(BS, 1)
 write_shmem(
        float4 *acc,
        const uint32_t *log,
-        const uint32_t *log_bounds,
+        const uint32_t *log_bounds
        const int log_bounds_shift
 ) {
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
@@ -441,62 +440,94 @@ write_shmem(
    s_acc_gb[tid+3*BS] = 0;
    __syncthreads();
-    // TODO: share across threads - discernable performance impact?
+    // This predicate is used for the horrible monkey-patching magic.
-    int lb_idx_lo, lb_idx_hi;
+    asm volatile(".reg .pred p; setp.lt.u32 p, %0, 42;" :: "r"(s_acc_dr[0]));
    if (log_bounds_shift > 0) {
        lb_idx_hi = ((bid + 1) << log_bounds_shift) - 1;
        lb_idx_lo = (bid << log_bounds_shift) - 1;
    } else {
        lb_idx_hi = bid >> (-log_bounds_shift);
        lb_idx_lo = lb_idx_hi - 1;
    }
-    int idx_lo, idx_hi;
+    // log_bounds[] holds inclusive prefix sums, so that log_bounds[0] is the
-    if (lb_idx_lo < 0) idx_lo = 0;
+    // largest index with radix 0, and so on.
-    else idx_lo = log_bounds[lb_idx_lo] & ~(BS-1);
+    int lb_idx_hi = bid & 0xff;
-    idx_hi = (log_bounds[lb_idx_hi] & ~(BS - 1)) + BS;
+    int lb_idx_lo = lb_idx_hi - 1;
    int idx_lo;
    if (lb_idx_lo > 0) idx_lo = log_bounds[lb_idx_lo] & ~(BS - 1);
    else               idx_lo = 0;
    int idx_hi = (log_bounds[lb_idx_hi] & ~(BS - 1)) + BS;
    float rnrounds = 1.0f / (idx_hi - idx_lo);
    float time = tid * rnrounds;
    float time_step = BS * rnrounds;
    int glo_base = bid << SHAB;
    float4* glo_ptr = &acc[glo_base];
    for (int i = idx_lo + tid; i < idx_hi; i += BS) {
        int entry = log[i];
-
+        // Constant '12' is 32 - 8 - SHAB, where 8 is the
-        // TODO: constant '11' is really just 32 - 9 - SHAB, where 9 is the
+        // number of bits assigned to color. TODO: This ignores opacity.
-        // number of bits assigned to color. This ignores opacity.
+        bfe_decl(glob_addr, entry, SHAB, 12);
        bfe_decl(glob_addr, entry, SHAB, 11);
        if (glob_addr != bid) continue;
-        bfe_decl(shr_addr, entry, 0, SHAB);
+        // Shared memory address, pre-shifted
-        bfe_decl(color, entry, 23, 9);
+        int shr_addr;
        asm("bfi.b32 %0, %1, 0, 2, 12;" : "=r"(shr_addr) : "r"(entry));
        bfe_decl(color, entry, 24, 8);
-        float colorf = color / 511.0f;
+        float colorf = color / 255.0f;
        float4 outcol = tex2D(palTex, colorf, time);
        // TODO: change texture sampler to return shorts and avoid this
-        uint32_t r = 255.0f * outcol.x;
+        uint32_t r = 127.0f * outcol.x;
-        uint32_t g = 255.0f * outcol.y;
+        uint32_t g = 127.0f * outcol.y;
-        uint32_t b = 255.0f * outcol.z;
+        uint32_t b = 127.0f * outcol.z;
-        uint32_t dr = atomicAdd(s_acc_dr + shr_addr, r + 0x10000);
+        uint32_t dr = r + 0x10000, gb = g + (b << 16);
        uint32_t gb = atomicAdd(s_acc_gb + shr_addr, g + (b << 16));
        uint32_t d = dr >> 16;
-        // Neat trick: if overflow is about to happen, write the accumulator,
+        asm volatile ({{crep("""
-        // and subtract the last known values from the accumulator again.
+{
-        // Even if the ints end up wrapping around once before the subtraction
+    .reg .pred q;
-        // can occur, the results after the subtraction will be correct.
+    .reg .u32 d, r, g, b, dr, gb, drw, gbw, off;
-        // (Wrapping twice will mess up the intermediate write, but is pretty
+    .reg .u64 ptr;
-        // unlikely.)
+    .reg .f32 rf, gf, bf, df;
-        if (d == 250) {
+
-            atomicSub(s_acc_dr + shr_addr, dr);
+acc_write_start:
-            atomicSub(s_acc_gb + shr_addr, gb);
+    // This instruction will get replaced with a LDSLK that sets 'p'.
-            write_shmem_helper(acc, glo_base + shr_addr, dr, gb);
+    // The 0xffff is a signature to make sure we get the right instruction,
-        }
+    // and will get replaced with a 0-offset when patching.
    ld.shared.volatile.u32   dr,     [%2+0xffff];
@p  ld.shared.volatile.u32   gb,     [%2+0x4000];
@p  add.u32         %0,     dr,     %0;
@p  add.u32         %1,     gb,     %1;
    // TODO: clever use of slct could remove an instruction?
@p  setp.lo.u32     q,      %0,     32768000;
@p  selp.b32        drw,    %0,     0,      q;
@p  selp.b32        gbw,    %1,     0,      q;
@p  st.shared.volatile.u32   [%2+0x4000],    gbw;
    // This instruction will get replaced with an STSUL
@p  st.shared.volatile.u32   [%2+0xffff],    drw;
@!p bra             acc_write_start;
@q  bra             oflow_write_end;
    shl.b32         %2,     %2,     2;
    cvt.u64.u32     ptr,    %2;
    add.u64         ptr,    ptr,    %3;
    and.b32         r,      %0,     0xffff;
    shr.b32         g,      %1,     16;
    and.b32         b,      %1,     0xffff;
    cvt.rn.f32.u32  rf,     r;
    cvt.rn.f32.u32  gf,     g;
    cvt.rn.f32.u32  bf,     b;
    mul.ftz.f32     rf,     rf,     0.007874015748031496;
    mul.ftz.f32     gf,     gf,     0.007874015748031496;
    mul.ftz.f32     bf,     bf,     0.007874015748031496;
    red.add.f32     [ptr],   500.0;
    red.add.f32     [ptr+4], bf;
    red.add.f32     [ptr+8], gf;
    red.add.f32     [ptr+12], rf;
 oflow_write_end:
 }
        """)}}  ::  "r"(dr), "r"(gb), "r"(shr_addr), "l"(glo_ptr));
        // TODO: go through the pain of manual address calculation for global ptr
        time += time_step;
    }
@@ -519,3 +550,36 @@ write_shmem(
                              if n != 'final'],
                **globals())
    @staticmethod
    def monkey_patch(cubin):
        LD      = np.uint64(0x851c00fcff0300c1)
        LDSLK   = np.uint64(0x851c0000000000c4)
        ST      = np.uint64(0x850000fcff0300c9)
        STSUL   = np.uint64(0x85000000000000cc)
        regmask = np.uint64(0x00c0ff0300000000)
        prdmask = np.uint64(0x003c000000000000)
        O = 64  # Expected offset to last instruction
        offset = cubin.find('\x85')
        while offset >= 0:
            # Using these fixed offsets makes this code intentionally
            # intolerant of compiler instruction reordering
            if cubin[offset+7] == '\xc1' and cubin[offset+O] == '\x85':
                ld = np.frombuffer(cubin[offset:offset+8], dtype='>u8')
                st = np.frombuffer(cubin[offset+O:offset+8+O], dtype='>u8')
                if ((ld & (~regmask)) == LD and
                    ((st & (~regmask)) & (~prdmask)) == ST):
                    break
            offset = cubin.find('\x85', offset+1)
        assert offset > 0, 'Could not find patch point!'
        # Note that these bits are still reversed, and we ignore the
        # (im)possibility of a negative predicate in this case
        pred = (st & prdmask) >> 50
        ld = LDSLK | (ld & regmask) | (pred << 10)
        st = STSUL | (st & regmask) | (st & prdmask)
        return ( cubin[:offset] + ld.byteswap().tostring()
               + cubin[offset+8:offset+O]
               + st.byteswap().tostring() + cubin[offset+8+O:] )
@@ -131,7 +131,7 @@ class RenderInfo(object):
    # position into a single 32-bit int for now, which limits resolution to
    # 1080p when xform opacity is respected, so the other two modes will hang
    # around until that can be extended to be memory-limited again.
-    acc_mode = 'global'
+    acc_mode = 'deferred'
    # TODO: fix this
    chaos_used = False
@@ -81,6 +81,11 @@ class Renderer(object):
        self.cubin = pycuda.compiler.compile(
                self.src, keep=keep, options=cmp_options,
                cache_dir=False if keep else None)
        with open('/tmp/iter_kern.cubin', 'wb') as fp:
            fp.write(self.cubin)
        # For now, we apply the monkey-patch manually. May eventually make
        # this more of a framework if I do it in more than one code segment.
        self.cubin = self._iter.monkey_patch(self.cubin)
        self.mod = cuda.module_from_buffer(self.cubin, jit_options)
        with open('/tmp/iter_kern.cubin', 'wb') as fp:
            fp.write(self.cubin)
@@ -142,18 +147,7 @@ class Renderer(object):
            d_log = cuda.mem_alloc(log_size * 4)
            d_log_sorted = cuda.mem_alloc(log_size * 4)
            sorter = sort.Sorter(log_size)
-
+            nwriteblocks = int(np.ceil(nbins / float(1<<12)))
            # Shared accumulators take care of the lowest 12 bits, but due to
            # a quirk of the sort implementation, asking the sort to handle
            # fewer bits than it is compiled for will make it considerably
            # slower (and it can't be compiled for <7b), so we actually dig in
            # to the accumulator's SHAB window for those cases.
            SHAB = np.int32(12)
            address_bits = np.int32(np.ceil(np.log2(nbins+1)))
            start_bit = address_bits - sorter.radix_bits
            log_shift = np.int32(SHAB - start_bit)
            nwriteblocks = int(np.ceil(nbins / (1<<SHAB)))
            print start_bit, log_shift, nwriteblocks
        # Calculate 'nslots', the number of simultaneous running threads that
        # can be active on the GPU during iteration (and thus the number of
@@ -252,10 +246,10 @@ class Renderer(object):
                             block=(32, self._iter.NTHREADS/32, 1),
                             grid=(ntemporal_samples, 1), stream=iter_stream)
                    _sync_stream(write_stream, iter_stream)
-                    sorter.sort(d_log_sorted, d_log, log_size, start_bit, True,
+                    sorter.sort(d_log_sorted, d_log, log_size, 12, True,
                                stream=write_stream)
                    _sync_stream(iter_stream, write_stream)
-                    write_fun(d_accum, d_log_sorted, sorter.dglobal, log_shift,
+                    write_fun(d_accum, d_log_sorted, sorter.dglobal,
                              block=(1024, 1, 1), grid=(nwriteblocks, 1),
                              texrefs=[tref], stream=write_stream)
            else: