mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 11:40:04 -05:00
Deferred works again. Time to break it.
This commit is contained in:
parent
e106524701
commit
b76208078f
@ -363,9 +363,9 @@ void iter(
|
|||||||
pix.w += 1.0f;
|
pix.w += 1.0f;
|
||||||
*accbuf = pix;
|
*accbuf = pix;
|
||||||
{{elif info.acc_mode == 'deferred'}}
|
{{elif info.acc_mode == 'deferred'}}
|
||||||
// 'color' gets the top 9 bits. TODO: add dithering via precalc.
|
// 'color' gets the top 8 bits. TODO: add dithering via precalc.
|
||||||
uint32_t icolor = fminf(1.0f, cc) * 511.0f + color_dither;
|
uint32_t icolor = fminf(1.0f, cc) * 255.0f + color_dither;
|
||||||
asm("bfi.b32 %0, %1, %0, 23, 9;" : "+r"(i) : "r"(icolor));
|
asm("bfi.b32 %0, %1, %0, 24, 8;" : "+r"(i) : "r"(icolor));
|
||||||
*log = i;
|
*log = i;
|
||||||
{{endif}}
|
{{endif}}
|
||||||
}
|
}
|
||||||
@ -394,10 +394,10 @@ void write_shmem_helper(
|
|||||||
const uint32_t gb
|
const uint32_t gb
|
||||||
) {
|
) {
|
||||||
float4 pix = acc[glo_idx];
|
float4 pix = acc[glo_idx];
|
||||||
pix.x += (dr & 0xffff) / 255.0f;
|
pix.x += (dr & 0xffff) / 127.0f;
|
||||||
pix.w += dr >> 16;
|
pix.w += dr >> 16;
|
||||||
pix.y += (gb & 0xffff) / 255.0f;
|
pix.y += (gb & 0xffff) / 127.0f;
|
||||||
pix.z += (gb >> 16) / 255.0f;
|
pix.z += (gb >> 16) / 127.0f;
|
||||||
acc[glo_idx] = pix;
|
acc[glo_idx] = pix;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -417,8 +417,7 @@ __launch_bounds__(BS, 1)
|
|||||||
write_shmem(
|
write_shmem(
|
||||||
float4 *acc,
|
float4 *acc,
|
||||||
const uint32_t *log,
|
const uint32_t *log,
|
||||||
const uint32_t *log_bounds,
|
const uint32_t *log_bounds
|
||||||
const int log_bounds_shift
|
|
||||||
) {
|
) {
|
||||||
const int tid = threadIdx.x;
|
const int tid = threadIdx.x;
|
||||||
const int bid = blockIdx.x;
|
const int bid = blockIdx.x;
|
||||||
@ -441,62 +440,94 @@ write_shmem(
|
|||||||
s_acc_gb[tid+3*BS] = 0;
|
s_acc_gb[tid+3*BS] = 0;
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
// TODO: share across threads - discernable performance impact?
|
// This predicate is used for the horrible monkey-patching magic.
|
||||||
int lb_idx_lo, lb_idx_hi;
|
asm volatile(".reg .pred p; setp.lt.u32 p, %0, 42;" :: "r"(s_acc_dr[0]));
|
||||||
if (log_bounds_shift > 0) {
|
|
||||||
lb_idx_hi = ((bid + 1) << log_bounds_shift) - 1;
|
|
||||||
lb_idx_lo = (bid << log_bounds_shift) - 1;
|
|
||||||
} else {
|
|
||||||
lb_idx_hi = bid >> (-log_bounds_shift);
|
|
||||||
lb_idx_lo = lb_idx_hi - 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
int idx_lo, idx_hi;
|
// log_bounds[] holds inclusive prefix sums, so that log_bounds[0] is the
|
||||||
if (lb_idx_lo < 0) idx_lo = 0;
|
// largest index with radix 0, and so on.
|
||||||
else idx_lo = log_bounds[lb_idx_lo] & ~(BS-1);
|
int lb_idx_hi = bid & 0xff;
|
||||||
idx_hi = (log_bounds[lb_idx_hi] & ~(BS - 1)) + BS;
|
int lb_idx_lo = lb_idx_hi - 1;
|
||||||
|
|
||||||
|
int idx_lo;
|
||||||
|
if (lb_idx_lo > 0) idx_lo = log_bounds[lb_idx_lo] & ~(BS - 1);
|
||||||
|
else idx_lo = 0;
|
||||||
|
int idx_hi = (log_bounds[lb_idx_hi] & ~(BS - 1)) + BS;
|
||||||
|
|
||||||
float rnrounds = 1.0f / (idx_hi - idx_lo);
|
float rnrounds = 1.0f / (idx_hi - idx_lo);
|
||||||
float time = tid * rnrounds;
|
float time = tid * rnrounds;
|
||||||
float time_step = BS * rnrounds;
|
float time_step = BS * rnrounds;
|
||||||
|
|
||||||
int glo_base = bid << SHAB;
|
int glo_base = bid << SHAB;
|
||||||
|
float4* glo_ptr = &acc[glo_base];
|
||||||
|
|
||||||
for (int i = idx_lo + tid; i < idx_hi; i += BS) {
|
for (int i = idx_lo + tid; i < idx_hi; i += BS) {
|
||||||
int entry = log[i];
|
int entry = log[i];
|
||||||
|
|
||||||
|
// Constant '12' is 32 - 8 - SHAB, where 8 is the
|
||||||
// TODO: constant '11' is really just 32 - 9 - SHAB, where 9 is the
|
// number of bits assigned to color. TODO: This ignores opacity.
|
||||||
// number of bits assigned to color. This ignores opacity.
|
bfe_decl(glob_addr, entry, SHAB, 12);
|
||||||
bfe_decl(glob_addr, entry, SHAB, 11);
|
|
||||||
if (glob_addr != bid) continue;
|
if (glob_addr != bid) continue;
|
||||||
|
|
||||||
bfe_decl(shr_addr, entry, 0, SHAB);
|
// Shared memory address, pre-shifted
|
||||||
bfe_decl(color, entry, 23, 9);
|
int shr_addr;
|
||||||
|
asm("bfi.b32 %0, %1, 0, 2, 12;" : "=r"(shr_addr) : "r"(entry));
|
||||||
|
bfe_decl(color, entry, 24, 8);
|
||||||
|
|
||||||
float colorf = color / 511.0f;
|
float colorf = color / 255.0f;
|
||||||
float4 outcol = tex2D(palTex, colorf, time);
|
float4 outcol = tex2D(palTex, colorf, time);
|
||||||
|
|
||||||
// TODO: change texture sampler to return shorts and avoid this
|
// TODO: change texture sampler to return shorts and avoid this
|
||||||
uint32_t r = 255.0f * outcol.x;
|
uint32_t r = 127.0f * outcol.x;
|
||||||
uint32_t g = 255.0f * outcol.y;
|
uint32_t g = 127.0f * outcol.y;
|
||||||
uint32_t b = 255.0f * outcol.z;
|
uint32_t b = 127.0f * outcol.z;
|
||||||
|
|
||||||
uint32_t dr = atomicAdd(s_acc_dr + shr_addr, r + 0x10000);
|
uint32_t dr = r + 0x10000, gb = g + (b << 16);
|
||||||
uint32_t gb = atomicAdd(s_acc_gb + shr_addr, g + (b << 16));
|
|
||||||
uint32_t d = dr >> 16;
|
|
||||||
|
|
||||||
// Neat trick: if overflow is about to happen, write the accumulator,
|
asm volatile ({{crep("""
|
||||||
// and subtract the last known values from the accumulator again.
|
{
|
||||||
// Even if the ints end up wrapping around once before the subtraction
|
.reg .pred q;
|
||||||
// can occur, the results after the subtraction will be correct.
|
.reg .u32 d, r, g, b, dr, gb, drw, gbw, off;
|
||||||
// (Wrapping twice will mess up the intermediate write, but is pretty
|
.reg .u64 ptr;
|
||||||
// unlikely.)
|
.reg .f32 rf, gf, bf, df;
|
||||||
if (d == 250) {
|
|
||||||
atomicSub(s_acc_dr + shr_addr, dr);
|
acc_write_start:
|
||||||
atomicSub(s_acc_gb + shr_addr, gb);
|
// This instruction will get replaced with a LDSLK that sets 'p'.
|
||||||
write_shmem_helper(acc, glo_base + shr_addr, dr, gb);
|
// The 0xffff is a signature to make sure we get the right instruction,
|
||||||
}
|
// and will get replaced with a 0-offset when patching.
|
||||||
|
ld.shared.volatile.u32 dr, [%2+0xffff];
|
||||||
|
@p ld.shared.volatile.u32 gb, [%2+0x4000];
|
||||||
|
@p add.u32 %0, dr, %0;
|
||||||
|
@p add.u32 %1, gb, %1;
|
||||||
|
// TODO: clever use of slct could remove an instruction?
|
||||||
|
@p setp.lo.u32 q, %0, 32768000;
|
||||||
|
@p selp.b32 drw, %0, 0, q;
|
||||||
|
@p selp.b32 gbw, %1, 0, q;
|
||||||
|
@p st.shared.volatile.u32 [%2+0x4000], gbw;
|
||||||
|
// This instruction will get replaced with an STSUL
|
||||||
|
@p st.shared.volatile.u32 [%2+0xffff], drw;
|
||||||
|
@!p bra acc_write_start;
|
||||||
|
@q bra oflow_write_end;
|
||||||
|
shl.b32 %2, %2, 2;
|
||||||
|
cvt.u64.u32 ptr, %2;
|
||||||
|
add.u64 ptr, ptr, %3;
|
||||||
|
and.b32 r, %0, 0xffff;
|
||||||
|
shr.b32 g, %1, 16;
|
||||||
|
and.b32 b, %1, 0xffff;
|
||||||
|
cvt.rn.f32.u32 rf, r;
|
||||||
|
cvt.rn.f32.u32 gf, g;
|
||||||
|
cvt.rn.f32.u32 bf, b;
|
||||||
|
mul.ftz.f32 rf, rf, 0.007874015748031496;
|
||||||
|
mul.ftz.f32 gf, gf, 0.007874015748031496;
|
||||||
|
mul.ftz.f32 bf, bf, 0.007874015748031496;
|
||||||
|
red.add.f32 [ptr], 500.0;
|
||||||
|
red.add.f32 [ptr+4], bf;
|
||||||
|
red.add.f32 [ptr+8], gf;
|
||||||
|
red.add.f32 [ptr+12], rf;
|
||||||
|
|
||||||
|
oflow_write_end:
|
||||||
|
}
|
||||||
|
""")}} :: "r"(dr), "r"(gb), "r"(shr_addr), "l"(glo_ptr));
|
||||||
|
// TODO: go through the pain of manual address calculation for global ptr
|
||||||
time += time_step;
|
time += time_step;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -519,3 +550,36 @@ write_shmem(
|
|||||||
if n != 'final'],
|
if n != 'final'],
|
||||||
**globals())
|
**globals())
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def monkey_patch(cubin):
|
||||||
|
LD = np.uint64(0x851c00fcff0300c1)
|
||||||
|
LDSLK = np.uint64(0x851c0000000000c4)
|
||||||
|
ST = np.uint64(0x850000fcff0300c9)
|
||||||
|
STSUL = np.uint64(0x85000000000000cc)
|
||||||
|
regmask = np.uint64(0x00c0ff0300000000)
|
||||||
|
prdmask = np.uint64(0x003c000000000000)
|
||||||
|
|
||||||
|
O = 64 # Expected offset to last instruction
|
||||||
|
|
||||||
|
offset = cubin.find('\x85')
|
||||||
|
while offset >= 0:
|
||||||
|
# Using these fixed offsets makes this code intentionally
|
||||||
|
# intolerant of compiler instruction reordering
|
||||||
|
if cubin[offset+7] == '\xc1' and cubin[offset+O] == '\x85':
|
||||||
|
ld = np.frombuffer(cubin[offset:offset+8], dtype='>u8')
|
||||||
|
st = np.frombuffer(cubin[offset+O:offset+8+O], dtype='>u8')
|
||||||
|
if ((ld & (~regmask)) == LD and
|
||||||
|
((st & (~regmask)) & (~prdmask)) == ST):
|
||||||
|
break
|
||||||
|
offset = cubin.find('\x85', offset+1)
|
||||||
|
assert offset > 0, 'Could not find patch point!'
|
||||||
|
|
||||||
|
# Note that these bits are still reversed, and we ignore the
|
||||||
|
# (im)possibility of a negative predicate in this case
|
||||||
|
pred = (st & prdmask) >> 50
|
||||||
|
ld = LDSLK | (ld & regmask) | (pred << 10)
|
||||||
|
st = STSUL | (st & regmask) | (st & prdmask)
|
||||||
|
|
||||||
|
return ( cubin[:offset] + ld.byteswap().tostring()
|
||||||
|
+ cubin[offset+8:offset+O]
|
||||||
|
+ st.byteswap().tostring() + cubin[offset+8+O:] )
|
||||||
|
@ -131,7 +131,7 @@ class RenderInfo(object):
|
|||||||
# position into a single 32-bit int for now, which limits resolution to
|
# position into a single 32-bit int for now, which limits resolution to
|
||||||
# 1080p when xform opacity is respected, so the other two modes will hang
|
# 1080p when xform opacity is respected, so the other two modes will hang
|
||||||
# around until that can be extended to be memory-limited again.
|
# around until that can be extended to be memory-limited again.
|
||||||
acc_mode = 'global'
|
acc_mode = 'deferred'
|
||||||
|
|
||||||
# TODO: fix this
|
# TODO: fix this
|
||||||
chaos_used = False
|
chaos_used = False
|
||||||
|
@ -81,6 +81,11 @@ class Renderer(object):
|
|||||||
self.cubin = pycuda.compiler.compile(
|
self.cubin = pycuda.compiler.compile(
|
||||||
self.src, keep=keep, options=cmp_options,
|
self.src, keep=keep, options=cmp_options,
|
||||||
cache_dir=False if keep else None)
|
cache_dir=False if keep else None)
|
||||||
|
with open('/tmp/iter_kern.cubin', 'wb') as fp:
|
||||||
|
fp.write(self.cubin)
|
||||||
|
# For now, we apply the monkey-patch manually. May eventually make
|
||||||
|
# this more of a framework if I do it in more than one code segment.
|
||||||
|
self.cubin = self._iter.monkey_patch(self.cubin)
|
||||||
self.mod = cuda.module_from_buffer(self.cubin, jit_options)
|
self.mod = cuda.module_from_buffer(self.cubin, jit_options)
|
||||||
with open('/tmp/iter_kern.cubin', 'wb') as fp:
|
with open('/tmp/iter_kern.cubin', 'wb') as fp:
|
||||||
fp.write(self.cubin)
|
fp.write(self.cubin)
|
||||||
@ -142,18 +147,7 @@ class Renderer(object):
|
|||||||
d_log = cuda.mem_alloc(log_size * 4)
|
d_log = cuda.mem_alloc(log_size * 4)
|
||||||
d_log_sorted = cuda.mem_alloc(log_size * 4)
|
d_log_sorted = cuda.mem_alloc(log_size * 4)
|
||||||
sorter = sort.Sorter(log_size)
|
sorter = sort.Sorter(log_size)
|
||||||
|
nwriteblocks = int(np.ceil(nbins / float(1<<12)))
|
||||||
# Shared accumulators take care of the lowest 12 bits, but due to
|
|
||||||
# a quirk of the sort implementation, asking the sort to handle
|
|
||||||
# fewer bits than it is compiled for will make it considerably
|
|
||||||
# slower (and it can't be compiled for <7b), so we actually dig in
|
|
||||||
# to the accumulator's SHAB window for those cases.
|
|
||||||
SHAB = np.int32(12)
|
|
||||||
address_bits = np.int32(np.ceil(np.log2(nbins+1)))
|
|
||||||
start_bit = address_bits - sorter.radix_bits
|
|
||||||
log_shift = np.int32(SHAB - start_bit)
|
|
||||||
nwriteblocks = int(np.ceil(nbins / (1<<SHAB)))
|
|
||||||
print start_bit, log_shift, nwriteblocks
|
|
||||||
|
|
||||||
# Calculate 'nslots', the number of simultaneous running threads that
|
# Calculate 'nslots', the number of simultaneous running threads that
|
||||||
# can be active on the GPU during iteration (and thus the number of
|
# can be active on the GPU during iteration (and thus the number of
|
||||||
@ -252,10 +246,10 @@ class Renderer(object):
|
|||||||
block=(32, self._iter.NTHREADS/32, 1),
|
block=(32, self._iter.NTHREADS/32, 1),
|
||||||
grid=(ntemporal_samples, 1), stream=iter_stream)
|
grid=(ntemporal_samples, 1), stream=iter_stream)
|
||||||
_sync_stream(write_stream, iter_stream)
|
_sync_stream(write_stream, iter_stream)
|
||||||
sorter.sort(d_log_sorted, d_log, log_size, start_bit, True,
|
sorter.sort(d_log_sorted, d_log, log_size, 12, True,
|
||||||
stream=write_stream)
|
stream=write_stream)
|
||||||
_sync_stream(iter_stream, write_stream)
|
_sync_stream(iter_stream, write_stream)
|
||||||
write_fun(d_accum, d_log_sorted, sorter.dglobal, log_shift,
|
write_fun(d_accum, d_log_sorted, sorter.dglobal,
|
||||||
block=(1024, 1, 1), grid=(nwriteblocks, 1),
|
block=(1024, 1, 1), grid=(nwriteblocks, 1),
|
||||||
texrefs=[tref], stream=write_stream)
|
texrefs=[tref], stream=write_stream)
|
||||||
else:
|
else:
|
||||||
|
Loading…
Reference in New Issue
Block a user