Fix overflow-related bug (hopefully)

This commit is contained in:
Steven Robertson 2011-12-09 18:46:40 -05:00
parent c5da1efc74
commit a509e58b43

View File

@@ -419,6 +419,9 @@ write_shmem(
} }
__syncthreads(); __syncthreads();
// Shut the compiler up
idx = s_acc[0];
// log_bounds[] holds inclusive prefix sums, so that log_bounds[0] is the // log_bounds[] holds inclusive prefix sums, so that log_bounds[0] is the
// largest index with radix 0, and so on. // largest index with radix 0, and so on.
int lb_idx_hi = bid & 0xff; int lb_idx_hi = bid & 0xff;
@@ -438,18 +441,19 @@ write_shmem(
for (int i = idx_lo + tid; i < idx_hi; i += BS) { for (int i = idx_lo + tid; i < idx_hi; i += BS) {
int entry = log[i]; int entry = log[i];
time += time_step;
// TODO: opacity
if ((entry & magic_mask) != magic) continue;
asm volatile ({{crep(""" asm volatile ({{crep("""
{ {
.reg .pred q; .reg .pred q;
.reg .u32 shoff, color, time, d, r, g, b, hi, lo, hiw, low; .reg .u32 shoff, color, time, d, r, g, b, hi, lo, hiw, low, tmp;
.reg .u64 ptr; .reg .u64 ptr;
.reg .f32 rf, gf, bf, df, rg, gg, dg, bg; .reg .f32 rf, gf, bf, df, rg, gg, dg, bg;
// TODO: opacity
and.b32 tmp, %0, %4;
setp.eq.u32 q, tmp, %3;
@!q bra before_sync;
and.b32 shoff, %0, 0xff800; and.b32 shoff, %0, 0xff800;
shr.b32 shoff, shoff, 5; shr.b32 shoff, shoff, 5;
bfi.b32 shoff, %0, shoff, 3, 3; bfi.b32 shoff, %0, shoff, 3, 3;
@@ -462,9 +466,13 @@ write_shmem(
ld.shared.v2.u32 {hiw, low}, [shoff]; ld.shared.v2.u32 {hiw, low}, [shoff];
add.cc.u32 lo, lo, low; add.cc.u32 lo, lo, low;
addc.u32 hi, hi, hiw; addc.u32 hi, hi, hiw;
st.shared.v2.u32 [shoff], {hi, lo};
setp.hs.u32 q, hi, (1023 << 22); setp.hs.u32 q, hi, (1023 << 22);
@!q bra oflow_write_end; @q bra oflow_sync;
st.shared.v2.u32 [shoff], {hi, lo};
before_sync:
bar.sync 0;
bra oflow_write_end;
oflow_sync:
st.shared.v2.u32 [shoff], {0, 0}; st.shared.v2.u32 [shoff], {0, 0};
// TODO: opacity // TODO: opacity
@@ -472,6 +480,8 @@ write_shmem(
cvt.u64.u32 ptr, shoff; cvt.u64.u32 ptr, shoff;
add.u64 ptr, ptr, %2; add.u64 ptr, ptr, %2;
ld.global.v4.f32 {dg,bg,gg,rg}, [ptr]; ld.global.v4.f32 {dg,bg,gg,rg}, [ptr];
bar.sync 0;
bfe.u32 r, hi, 4, 18; bfe.u32 r, hi, 4, 18;
bfe.u32 g, lo, 18, 14; bfe.u32 g, lo, 18, 14;
bfi.b32 g, hi, g, 14, 4; bfi.b32 g, hi, g, 14, 4;
@@ -487,7 +497,8 @@ write_shmem(
oflow_write_end: oflow_write_end:
} }
""")}} :: "r"(entry), "f"(time), "l"(acc)); """)}} :: "r"(entry), "f"(time), "l"(acc), "r"(magic), "r"(magic_mask));
time += time_step;
} }
__syncthreads(); __syncthreads();