mirror of https://github.com/stevenrobertson/cuburn.git

commit 6bac3b3a95
parent b592cda3db

Use reordered, lossy bit handling
@@ -401,7 +401,8 @@ __launch_bounds__(BS, 1)
 write_shmem(
     float4 *acc,
     const uint32_t *log,
-    const uint32_t *log_bounds
+    const uint32_t *log_bounds,
+    uint32_t nbins
 ) {
     const int tid = threadIdx.x;
     const int bid = blockIdx.x;
@@ -443,16 +444,15 @@ write_shmem(
     float time = tid * rnrounds;
     float time_step = BS * rnrounds;
 
-    int glo_base = bid << SHAB;
-    float4* glo_ptr = &acc[glo_base];
+    int magic = ((blockIdx.x & 0xff) << 3) + ((blockIdx.x & 0xf00) << 12);
+    int magic_mask = 0xf007f8;
 
     for (int i = idx_lo + tid; i < idx_hi; i += BS) {
         int entry = log[i];
+        time += time_step;
 
-        // Constant '12' is 32 - 8 - SHAB, where 8 is the
-        // number of bits assigned to color. TODO: This ignores opacity.
-        bfe_decl(glob_addr, entry, SHAB, 12);
-        if (glob_addr != bid) continue;
+        // TODO: opacity
+        if ((entry & magic_mask) != magic) continue;
 
         asm volatile ({{crep("""
         {
@@ -461,7 +461,10 @@ write_shmem(
            .reg .u64 ptr;
            .reg .f32 rf, gf, bf, df;
 
-           bfi.b32 shoff, %0, 0, 2, 12;
+           and.b32 shoff, %0, 0xff800;
+           shr.b32 shoff, shoff, 6;
+           bfi.b32 shoff, %0, shoff, 2, 3;
+
            bfe.u32 color, %0, 24, 8;
            shl.b32 color, color, 3;
            cvt.rni.u32.f32 time, %1;
@@ -476,15 +479,16 @@ acc_write_start:
 @p         ld.shared.volatile.u32 hiw, [shoff+0x4000];
            add.cc.u32 lo, los, low;
            addc.u32 hi, his, hiw;
-           setp.lo.u32 q, hi, (1023 << 22);
-           selp.b32 hiw, hi, 0, q;
-           selp.b32 low, lo, 0, q;
+           setp.hs.and.u32 q, hi, (1023 << 22), p;
+           selp.b32 hiw, 0, hi, q;
+           selp.b32 low, 0, lo, q;
 @p         st.shared.volatile.u32 [shoff+0x4000], hiw;
            // This instruction will get replaced with an STSUL
 @p         st.shared.volatile.u32 [shoff+0xffff], low;
-@!p        bra acc_write_start;
-@q         bra oflow_write_end;
-           shl.b32 shoff, shoff, 2;
+//@!p      bra acc_write_start;
+@!q        bra oflow_write_end;
+           // TODO: opacity
+           bfi.b32 shoff, %0, 0, 4, 24;
            cvt.u64.u32 ptr, shoff;
            add.u64 ptr, ptr, %2;
            bfe.u32 r, hi, 4, 18;
@@ -504,16 +508,18 @@ acc_write_start:
 
 oflow_write_end:
         }
-        """)}} :: "r"(entry), "f"(time), "l"(glo_ptr));
-        // TODO: go through the pain of manual address calculation for global ptr
-        time += time_step;
+        """)}} :: "r"(entry), "f"(time), "l"(acc));
     }
 
     __syncthreads();
+
+
     int idx = tid;
-    for (int i = 0; i < (SHAW / BS); i++) {
+    int glo_idx = magic | (((idx << 8) | idx) & 0xff807);
+
+    for (int i = 0; i < (SHAW / BS) && glo_idx < nbins; i++) {
         int d, r, g, b;
-        float4 pix = acc[glo_base + idx];
+        float4 pix = acc[glo_idx];
         asm({{crep("""
         {
            .reg .u32 hi, lo;
@@ -530,8 +536,9 @@ oflow_write_end:
         pix.y += g / 255.0f;
         pix.z += b / 255.0f;
         pix.w += d;
-        acc[glo_base + idx] = pix;
+        acc[glo_idx] = pix;
         idx += BS;
+        glo_idx += (BS << 8);
     }
 }
 
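The kernel side of this commit replaces the old contiguous per-block addressing (glo_base = bid << SHAB, with a 12-bit bfe_decl match against the block id) with a split tag: address bits 3-10 and 20-23 of each log entry form a per-block magic value, entries are filtered with a single mask-and-compare, and the remaining bits 0-2 and 11-19 index the block's shared accumulator. The same fields are recombined into glo_idx when the shared tile is flushed to the global float4 buffer. The accumulate also becomes lossy: setp.hs.and.u32 detects a high word that has reached 1023 << 22, the selp pair then zeroes the shared counter, and the @!q branch falls through to the overflow path that writes the saturated value to global memory. The sketch below is my reading of that bit layout, not code from the repo; pack_entry is an invented helper, and SHAW = 4096 slots / BS = 1024 threads are assumptions (BS matching the block=(1024, 1, 1) launch in the Python hunks below). It checks that the magic filter, the PTX shared-offset decode, and the glo_idx reconstruction all agree:

    # Hypothetical model of this commit's entry layout (not repo code):
    #   bits 24-31  color index
    #   bits 20-23  high half of the write-block tag
    #   bits 11-19  high bits of the in-block accumulator index
    #   bits  3-10  low half of the write-block tag
    #   bits  0- 2  low bits of the in-block accumulator index
    MAGIC_MASK = 0xf007f8   # the two tag fields, as in the kernel

    def magic_for_block(bid):
        # same expression as the kernel's 'magic'
        return ((bid & 0xff) << 3) + ((bid & 0xf00) << 12)

    def pack_entry(bid, sh_idx, color):
        # hypothetical inverse of the decode: scatter a 12-bit shared
        # index around the tag fields
        return (magic_for_block(bid)
                | (sh_idx & 0x7) | ((sh_idx >> 3) << 11)
                | (color << 24))

    def shared_word_index(entry):
        # mirrors the PTX: and 0xff800; shr 6; bfi of bits 0-2 at bit 2;
        # then >> 2 here to undo the byte-offset scaling of 32-bit words
        shoff = (entry & 0xff800) >> 6
        shoff = (shoff & ~0x1c) | ((entry & 0x7) << 2)
        return shoff >> 2

    def global_indices(bid, tid, BS=1024, SHAW=4096):
        # the write-back loop: glo_idx from the kernel's formula, then
        # advanced by (BS << 8) as idx steps by BS
        glo = magic_for_block(bid) | (((tid << 8) | tid) & 0xff807)
        idx = tid
        while idx < SHAW:
            yield idx, glo
            idx += BS
            glo += BS << 8

    for bid in (0, 1, 0xab, 0xfff):
        for tid in (0, 7, 8, 1023):
            for idx, glo in global_indices(bid, tid):
                entry = pack_entry(bid, idx, color=0x80)
                assert (entry & MAGIC_MASK) == magic_for_block(bid)  # filter hits
                assert shared_word_index(entry) == idx               # PTX decode
                assert glo == (entry & 0xffffff)                     # flush target
    print("tag filter, shared decode, and glo_idx agree")

Note that the glo_idx formula is only evaluated at idx = tid (< BS); later indices come from the += (BS << 8) update, which keeps the tag bits intact.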
@@ -8,6 +8,7 @@ from itertools import cycle, repeat, chain, izip
 from ctypes import *
 from cStringIO import StringIO
 import numpy as np
+from numpy import int32 as i32
 from scipy import ndimage
 
 from fr0stlib import pyflam3
@@ -147,7 +148,9 @@ class Renderer(object):
         d_log = cuda.mem_alloc(log_size * 4)
         d_log_sorted = cuda.mem_alloc(log_size * 4)
         sorter = sort.Sorter(log_size)
-        nwriteblocks = int(np.ceil(nbins / float(1<<12)))
+        # We need to cover each unique tag - address bits 20-23 - with one
+        # write block per sort bin. Or somethinig like that.
+        nwriteblocks = int(np.ceil(nbins / float(1<<20))) * 256
 
         # Calculate 'nslots', the number of simultaneous running threads that
         # can be active on the GPU during iteration (and thus the number of
@@ -267,10 +270,11 @@ class Renderer(object):
                      block=(32, self._iter.NTHREADS/32, 1),
                      grid=(ntemporal_samples, 1), stream=iter_stream)
             _sync_stream(write_stream, iter_stream)
-            sorter.sort(d_log_sorted, d_log, log_size, 12, True,
+            sorter.sort(d_log_sorted, d_log, log_size, 3, True,
                         stream=write_stream)
+            #print cuda.from_device(sorter.dglobal, (256,), np.uint32)
             _sync_stream(iter_stream, write_stream)
-            write_fun(d_accum, d_log_sorted, sorter.dglobal,
+            write_fun(d_accum, d_log_sorted, sorter.dglobal, i32(nbins),
                       block=(1024, 1, 1), grid=(nwriteblocks, 1),
                       stream=write_stream)
         else:
@@ -302,5 +306,5 @@ class Renderer(object):
 
     def _trim(self, result):
         g = self.info.gutter
-        return result[g:-g,g:-g].copy()
+        return result[g:-g,g:g+self.info.width].copy()
 
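On the host side, the write grid changes to match the tag fields above: instead of one block per 2**12-bin tile, each 2**20-bin chunk of the accumulation buffer gets 256 write blocks, one per 8-bit low tag, with address bits 20-23 selecting the chunk. The sort drops from 12 key bits to 3, and nbins is now passed to the kernel explicitly (as a numpy int32) so the write-back loop can clamp glo_idx. A small sketch of the new grid arithmetic, with an illustrative buffer size of my choosing:

    import numpy as np

    def write_grid(nbins):
        # one 2**20-bin chunk per high-tag value, 256 low-tag blocks per
        # chunk, mirroring the commit's new nwriteblocks expression
        nchunks = int(np.ceil(nbins / float(1 << 20)))
        return nchunks * 256

    nbins = 2048 * 1152          # illustrative padded buffer: 2.25 * 2**20 bins
    print(write_grid(nbins))     # -> 768 blocks, launched as a (768, 1) grid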