diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py index 6d483b4..366bf1d 100644 --- a/cuburn/code/iter.py +++ b/cuburn/code/iter.py @@ -419,6 +419,9 @@ write_shmem( } __syncthreads(); + // Shut the compiler up + idx = s_acc[0]; + // log_bounds[] holds inclusive prefix sums, so that log_bounds[0] is the // largest index with radix 0, and so on. int lb_idx_hi = bid & 0xff; @@ -438,18 +441,19 @@ write_shmem( for (int i = idx_lo + tid; i < idx_hi; i += BS) { int entry = log[i]; - time += time_step; - - // TODO: opacity - if ((entry & magic_mask) != magic) continue; asm volatile ({{crep(""" { .reg .pred q; - .reg .u32 shoff, color, time, d, r, g, b, hi, lo, hiw, low; + .reg .u32 shoff, color, time, d, r, g, b, hi, lo, hiw, low, tmp; .reg .u64 ptr; .reg .f32 rf, gf, bf, df, rg, gg, dg, bg; + // TODO: opacity + and.b32 tmp, %0, %4; + setp.eq.u32 q, tmp, %3; +@!q bra before_sync; + and.b32 shoff, %0, 0xff800; shr.b32 shoff, shoff, 5; bfi.b32 shoff, %0, shoff, 3, 3; @@ -462,9 +466,13 @@ write_shmem( ld.shared.v2.u32 {hiw, low}, [shoff]; add.cc.u32 lo, lo, low; addc.u32 hi, hi, hiw; - st.shared.v2.u32 [shoff], {hi, lo}; setp.hs.u32 q, hi, (1023 << 22); -@!q bra oflow_write_end; +@q bra oflow_sync; + st.shared.v2.u32 [shoff], {hi, lo}; +before_sync: + bar.sync 0; + bra oflow_write_end; +oflow_sync: st.shared.v2.u32 [shoff], {0, 0}; // TODO: opacity @@ -472,6 +480,8 @@ write_shmem( cvt.u64.u32 ptr, shoff; add.u64 ptr, ptr, %2; ld.global.v4.f32 {dg,bg,gg,rg}, [ptr]; + bar.sync 0; + bfe.u32 r, hi, 4, 18; bfe.u32 g, lo, 18, 14; bfi.b32 g, hi, g, 14, 4; @@ -487,7 +497,8 @@ write_shmem( oflow_write_end: } - """)}} :: "r"(entry), "f"(time), "l"(acc)); + """)}} :: "r"(entry), "f"(time), "l"(acc), "r"(magic), "r"(magic_mask)); + time += time_step; } __syncthreads();