Use reordered, lossy bit handling

This commit is contained in:
Steven Robertson 2011-12-09 14:14:36 -05:00
parent b592cda3db
commit 6bac3b3a95
2 changed files with 35 additions and 24 deletions

View File

@ -401,7 +401,8 @@ __launch_bounds__(BS, 1)
write_shmem( write_shmem(
float4 *acc, float4 *acc,
const uint32_t *log, const uint32_t *log,
const uint32_t *log_bounds const uint32_t *log_bounds,
uint32_t nbins
) { ) {
const int tid = threadIdx.x; const int tid = threadIdx.x;
const int bid = blockIdx.x; const int bid = blockIdx.x;
@ -443,16 +444,15 @@ write_shmem(
float time = tid * rnrounds; float time = tid * rnrounds;
float time_step = BS * rnrounds; float time_step = BS * rnrounds;
int glo_base = bid << SHAB; int magic = ((blockIdx.x & 0xff) << 3) + ((blockIdx.x & 0xf00) << 12);
float4* glo_ptr = &acc[glo_base]; int magic_mask = 0xf007f8;
for (int i = idx_lo + tid; i < idx_hi; i += BS) { for (int i = idx_lo + tid; i < idx_hi; i += BS) {
int entry = log[i]; int entry = log[i];
time += time_step;
// Constant '12' is 32 - 8 - SHAB, where 8 is the // TODO: opacity
// number of bits assigned to color. TODO: This ignores opacity. if ((entry & magic_mask) != magic) continue;
bfe_decl(glob_addr, entry, SHAB, 12);
if (glob_addr != bid) continue;
asm volatile ({{crep(""" asm volatile ({{crep("""
{ {
@ -461,7 +461,10 @@ write_shmem(
.reg .u64 ptr; .reg .u64 ptr;
.reg .f32 rf, gf, bf, df; .reg .f32 rf, gf, bf, df;
bfi.b32 shoff, %0, 0, 2, 12; and.b32 shoff, %0, 0xff800;
shr.b32 shoff, shoff, 6;
bfi.b32 shoff, %0, shoff, 2, 3;
bfe.u32 color, %0, 24, 8; bfe.u32 color, %0, 24, 8;
shl.b32 color, color, 3; shl.b32 color, color, 3;
cvt.rni.u32.f32 time, %1; cvt.rni.u32.f32 time, %1;
@ -476,15 +479,16 @@ acc_write_start:
@p ld.shared.volatile.u32 hiw, [shoff+0x4000]; @p ld.shared.volatile.u32 hiw, [shoff+0x4000];
add.cc.u32 lo, los, low; add.cc.u32 lo, los, low;
addc.u32 hi, his, hiw; addc.u32 hi, his, hiw;
setp.lo.u32 q, hi, (1023 << 22); setp.hs.and.u32 q, hi, (1023 << 22), p;
selp.b32 hiw, hi, 0, q; selp.b32 hiw, 0, hi, q;
selp.b32 low, lo, 0, q; selp.b32 low, 0, lo, q;
@p st.shared.volatile.u32 [shoff+0x4000], hiw; @p st.shared.volatile.u32 [shoff+0x4000], hiw;
// This instruction will get replaced with an STSUL // This instruction will get replaced with an STSUL
@p st.shared.volatile.u32 [shoff+0xffff], low; @p st.shared.volatile.u32 [shoff+0xffff], low;
@!p bra acc_write_start; //@!p bra acc_write_start;
@q bra oflow_write_end; @!q bra oflow_write_end;
shl.b32 shoff, shoff, 2; // TODO: opacity
bfi.b32 shoff, %0, 0, 4, 24;
cvt.u64.u32 ptr, shoff; cvt.u64.u32 ptr, shoff;
add.u64 ptr, ptr, %2; add.u64 ptr, ptr, %2;
bfe.u32 r, hi, 4, 18; bfe.u32 r, hi, 4, 18;
@ -504,16 +508,18 @@ acc_write_start:
oflow_write_end: oflow_write_end:
} }
""")}} :: "r"(entry), "f"(time), "l"(glo_ptr)); """)}} :: "r"(entry), "f"(time), "l"(acc));
// TODO: go through the pain of manual address calculation for global ptr
time += time_step;
} }
__syncthreads(); __syncthreads();
int idx = tid; int idx = tid;
for (int i = 0; i < (SHAW / BS); i++) { int glo_idx = magic | (((idx << 8) | idx) & 0xff807);
for (int i = 0; i < (SHAW / BS) && glo_idx < nbins; i++) {
int d, r, g, b; int d, r, g, b;
float4 pix = acc[glo_base + idx]; float4 pix = acc[glo_idx];
asm({{crep(""" asm({{crep("""
{ {
.reg .u32 hi, lo; .reg .u32 hi, lo;
@ -530,8 +536,9 @@ oflow_write_end:
pix.y += g / 255.0f; pix.y += g / 255.0f;
pix.z += b / 255.0f; pix.z += b / 255.0f;
pix.w += d; pix.w += d;
acc[glo_base + idx] = pix; acc[glo_idx] = pix;
idx += BS; idx += BS;
glo_idx += (BS << 8);
} }
} }

View File

@ -8,6 +8,7 @@ from itertools import cycle, repeat, chain, izip
from ctypes import * from ctypes import *
from cStringIO import StringIO from cStringIO import StringIO
import numpy as np import numpy as np
from numpy import int32 as i32
from scipy import ndimage from scipy import ndimage
from fr0stlib import pyflam3 from fr0stlib import pyflam3
@ -147,7 +148,9 @@ class Renderer(object):
d_log = cuda.mem_alloc(log_size * 4) d_log = cuda.mem_alloc(log_size * 4)
d_log_sorted = cuda.mem_alloc(log_size * 4) d_log_sorted = cuda.mem_alloc(log_size * 4)
sorter = sort.Sorter(log_size) sorter = sort.Sorter(log_size)
nwriteblocks = int(np.ceil(nbins / float(1<<12))) # We need to cover each unique tag - address bits 20-23 - with one
# write block per sort bin. Or somethinig like that.
nwriteblocks = int(np.ceil(nbins / float(1<<20))) * 256
# Calculate 'nslots', the number of simultaneous running threads that # Calculate 'nslots', the number of simultaneous running threads that
# can be active on the GPU during iteration (and thus the number of # can be active on the GPU during iteration (and thus the number of
@ -267,10 +270,11 @@ class Renderer(object):
block=(32, self._iter.NTHREADS/32, 1), block=(32, self._iter.NTHREADS/32, 1),
grid=(ntemporal_samples, 1), stream=iter_stream) grid=(ntemporal_samples, 1), stream=iter_stream)
_sync_stream(write_stream, iter_stream) _sync_stream(write_stream, iter_stream)
sorter.sort(d_log_sorted, d_log, log_size, 12, True, sorter.sort(d_log_sorted, d_log, log_size, 3, True,
stream=write_stream) stream=write_stream)
#print cuda.from_device(sorter.dglobal, (256,), np.uint32)
_sync_stream(iter_stream, write_stream) _sync_stream(iter_stream, write_stream)
write_fun(d_accum, d_log_sorted, sorter.dglobal, write_fun(d_accum, d_log_sorted, sorter.dglobal, i32(nbins),
block=(1024, 1, 1), grid=(nwriteblocks, 1), block=(1024, 1, 1), grid=(nwriteblocks, 1),
stream=write_stream) stream=write_stream)
else: else:
@ -302,5 +306,5 @@ class Renderer(object):
def _trim(self, result): def _trim(self, result):
g = self.info.gutter g = self.info.gutter
return result[g:-g,g:-g].copy() return result[g:-g,g:g+self.info.width].copy()