mirror of
				https://github.com/stevenrobertson/cuburn.git
				synced 2025-11-03 18:00:55 -05:00 
			
		
		
		
	Use reordered, lossy bit handling
This commit is contained in:
		@ -401,7 +401,8 @@ __launch_bounds__(BS, 1)
 | 
				
			|||||||
write_shmem(
 | 
					write_shmem(
 | 
				
			||||||
        float4 *acc,
 | 
					        float4 *acc,
 | 
				
			||||||
        const uint32_t *log,
 | 
					        const uint32_t *log,
 | 
				
			||||||
        const uint32_t *log_bounds
 | 
					        const uint32_t *log_bounds,
 | 
				
			||||||
 | 
					        uint32_t nbins
 | 
				
			||||||
) {
 | 
					) {
 | 
				
			||||||
    const int tid = threadIdx.x;
 | 
					    const int tid = threadIdx.x;
 | 
				
			||||||
    const int bid = blockIdx.x;
 | 
					    const int bid = blockIdx.x;
 | 
				
			||||||
@ -443,16 +444,15 @@ write_shmem(
 | 
				
			|||||||
    float time = tid * rnrounds;
 | 
					    float time = tid * rnrounds;
 | 
				
			||||||
    float time_step = BS * rnrounds;
 | 
					    float time_step = BS * rnrounds;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    int glo_base = bid << SHAB;
 | 
					    int magic = ((blockIdx.x & 0xff) << 3) + ((blockIdx.x & 0xf00) << 12);
 | 
				
			||||||
    float4* glo_ptr = &acc[glo_base];
 | 
					    int magic_mask = 0xf007f8;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for (int i = idx_lo + tid; i < idx_hi; i += BS) {
 | 
					    for (int i = idx_lo + tid; i < idx_hi; i += BS) {
 | 
				
			||||||
        int entry = log[i];
 | 
					        int entry = log[i];
 | 
				
			||||||
 | 
					        time += time_step;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        // Constant '12' is 32 - 8 - SHAB, where 8 is the
 | 
					        // TODO: opacity
 | 
				
			||||||
        // number of bits assigned to color. TODO: This ignores opacity.
 | 
					        if ((entry & magic_mask) != magic) continue;
 | 
				
			||||||
        bfe_decl(glob_addr, entry, SHAB, 12);
 | 
					 | 
				
			||||||
        if (glob_addr != bid) continue;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        asm volatile ({{crep("""
 | 
					        asm volatile ({{crep("""
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
@ -461,7 +461,10 @@ write_shmem(
 | 
				
			|||||||
    .reg .u64 ptr;
 | 
					    .reg .u64 ptr;
 | 
				
			||||||
    .reg .f32 rf, gf, bf, df;
 | 
					    .reg .f32 rf, gf, bf, df;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    bfi.b32         shoff,  %0,     0,  2,  12;
 | 
					    and.b32         shoff,  %0,     0xff800;
 | 
				
			||||||
 | 
					    shr.b32         shoff,  shoff,  6;
 | 
				
			||||||
 | 
					    bfi.b32         shoff,  %0,     shoff,  2,  3;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    bfe.u32         color,  %0,     24, 8;
 | 
					    bfe.u32         color,  %0,     24, 8;
 | 
				
			||||||
    shl.b32         color,  color,  3;
 | 
					    shl.b32         color,  color,  3;
 | 
				
			||||||
    cvt.rni.u32.f32 time,   %1;
 | 
					    cvt.rni.u32.f32 time,   %1;
 | 
				
			||||||
@ -476,15 +479,16 @@ acc_write_start:
 | 
				
			|||||||
@p  ld.shared.volatile.u32  hiw,    [shoff+0x4000];
 | 
					@p  ld.shared.volatile.u32  hiw,    [shoff+0x4000];
 | 
				
			||||||
    add.cc.u32      lo,     los,    low;
 | 
					    add.cc.u32      lo,     los,    low;
 | 
				
			||||||
    addc.u32        hi,     his,    hiw;
 | 
					    addc.u32        hi,     his,    hiw;
 | 
				
			||||||
    setp.lo.u32     q,      hi,     (1023 << 22);
 | 
					    setp.hs.and.u32 q,      hi,     (1023 << 22),   p;
 | 
				
			||||||
    selp.b32        hiw,    hi,     0,      q;
 | 
					    selp.b32        hiw,    0,      hi,     q;
 | 
				
			||||||
    selp.b32        low,    lo,     0,      q;
 | 
					    selp.b32        low,    0,      lo,     q;
 | 
				
			||||||
@p  st.shared.volatile.u32   [shoff+0x4000],    hiw;
 | 
					@p  st.shared.volatile.u32   [shoff+0x4000],    hiw;
 | 
				
			||||||
    // This instruction will get replaced with an STSUL
 | 
					    // This instruction will get replaced with an STSUL
 | 
				
			||||||
@p  st.shared.volatile.u32   [shoff+0xffff],    low;
 | 
					@p  st.shared.volatile.u32   [shoff+0xffff],    low;
 | 
				
			||||||
@!p bra             acc_write_start;
 | 
					//@!p bra             acc_write_start;
 | 
				
			||||||
@q  bra             oflow_write_end;
 | 
					@!q bra             oflow_write_end;
 | 
				
			||||||
    shl.b32         shoff,  shoff,  2;
 | 
					    // TODO: opacity
 | 
				
			||||||
 | 
					    bfi.b32         shoff,  %0,     0,  4,  24;
 | 
				
			||||||
    cvt.u64.u32     ptr,    shoff;
 | 
					    cvt.u64.u32     ptr,    shoff;
 | 
				
			||||||
    add.u64         ptr,    ptr,    %2;
 | 
					    add.u64         ptr,    ptr,    %2;
 | 
				
			||||||
    bfe.u32         r,      hi,     4,      18;
 | 
					    bfe.u32         r,      hi,     4,      18;
 | 
				
			||||||
@ -504,16 +508,18 @@ acc_write_start:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
oflow_write_end:
 | 
					oflow_write_end:
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
        """)}}  ::  "r"(entry), "f"(time), "l"(glo_ptr));
 | 
					        """)}}  ::  "r"(entry), "f"(time), "l"(acc));
 | 
				
			||||||
        // TODO: go through the pain of manual address calculation for global ptr
 | 
					 | 
				
			||||||
        time += time_step;
 | 
					 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    __syncthreads();
 | 
					    __syncthreads();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    int idx = tid;
 | 
					    int idx = tid;
 | 
				
			||||||
    for (int i = 0; i < (SHAW / BS); i++) {
 | 
					    int glo_idx = magic | (((idx << 8) | idx) & 0xff807);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for (int i = 0; i < (SHAW / BS) && glo_idx < nbins; i++) {
 | 
				
			||||||
        int d, r, g, b;
 | 
					        int d, r, g, b;
 | 
				
			||||||
        float4 pix = acc[glo_base + idx];
 | 
					        float4 pix = acc[glo_idx];
 | 
				
			||||||
        asm({{crep("""
 | 
					        asm({{crep("""
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
    .reg .u32 hi, lo;
 | 
					    .reg .u32 hi, lo;
 | 
				
			||||||
@ -530,8 +536,9 @@ oflow_write_end:
 | 
				
			|||||||
        pix.y += g / 255.0f;
 | 
					        pix.y += g / 255.0f;
 | 
				
			||||||
        pix.z += b / 255.0f;
 | 
					        pix.z += b / 255.0f;
 | 
				
			||||||
        pix.w += d;
 | 
					        pix.w += d;
 | 
				
			||||||
        acc[glo_base + idx] = pix;
 | 
					        acc[glo_idx] = pix;
 | 
				
			||||||
        idx += BS;
 | 
					        idx += BS;
 | 
				
			||||||
 | 
					        glo_idx += (BS << 8);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
@ -8,6 +8,7 @@ from itertools import cycle, repeat, chain, izip
 | 
				
			|||||||
from ctypes import *
 | 
					from ctypes import *
 | 
				
			||||||
from cStringIO import StringIO
 | 
					from cStringIO import StringIO
 | 
				
			||||||
import numpy as np
 | 
					import numpy as np
 | 
				
			||||||
 | 
					from numpy import int32 as i32
 | 
				
			||||||
from scipy import ndimage
 | 
					from scipy import ndimage
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from fr0stlib import pyflam3
 | 
					from fr0stlib import pyflam3
 | 
				
			||||||
@ -147,7 +148,9 @@ class Renderer(object):
 | 
				
			|||||||
            d_log = cuda.mem_alloc(log_size * 4)
 | 
					            d_log = cuda.mem_alloc(log_size * 4)
 | 
				
			||||||
            d_log_sorted = cuda.mem_alloc(log_size * 4)
 | 
					            d_log_sorted = cuda.mem_alloc(log_size * 4)
 | 
				
			||||||
            sorter = sort.Sorter(log_size)
 | 
					            sorter = sort.Sorter(log_size)
 | 
				
			||||||
            nwriteblocks = int(np.ceil(nbins / float(1<<12)))
 | 
					            # We need to cover each unique tag - address bits 20-23 - with one
 | 
				
			||||||
 | 
					            # write block per sort bin. Or something like that.
 | 
				
			||||||
 | 
					            nwriteblocks = int(np.ceil(nbins / float(1<<20))) * 256
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # Calculate 'nslots', the number of simultaneous running threads that
 | 
					        # Calculate 'nslots', the number of simultaneous running threads that
 | 
				
			||||||
        # can be active on the GPU during iteration (and thus the number of
 | 
					        # can be active on the GPU during iteration (and thus the number of
 | 
				
			||||||
@ -267,10 +270,11 @@ class Renderer(object):
 | 
				
			|||||||
                             block=(32, self._iter.NTHREADS/32, 1),
 | 
					                             block=(32, self._iter.NTHREADS/32, 1),
 | 
				
			||||||
                             grid=(ntemporal_samples, 1), stream=iter_stream)
 | 
					                             grid=(ntemporal_samples, 1), stream=iter_stream)
 | 
				
			||||||
                    _sync_stream(write_stream, iter_stream)
 | 
					                    _sync_stream(write_stream, iter_stream)
 | 
				
			||||||
                    sorter.sort(d_log_sorted, d_log, log_size, 12, True,
 | 
					                    sorter.sort(d_log_sorted, d_log, log_size, 3, True,
 | 
				
			||||||
                                stream=write_stream)
 | 
					                                stream=write_stream)
 | 
				
			||||||
 | 
					                    #print cuda.from_device(sorter.dglobal, (256,), np.uint32)
 | 
				
			||||||
                    _sync_stream(iter_stream, write_stream)
 | 
					                    _sync_stream(iter_stream, write_stream)
 | 
				
			||||||
                    write_fun(d_accum, d_log_sorted, sorter.dglobal,
 | 
					                    write_fun(d_accum, d_log_sorted, sorter.dglobal, i32(nbins),
 | 
				
			||||||
                              block=(1024, 1, 1), grid=(nwriteblocks, 1),
 | 
					                              block=(1024, 1, 1), grid=(nwriteblocks, 1),
 | 
				
			||||||
                              stream=write_stream)
 | 
					                              stream=write_stream)
 | 
				
			||||||
            else:
 | 
					            else:
 | 
				
			||||||
@ -302,5 +306,5 @@ class Renderer(object):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    def _trim(self, result):
 | 
					    def _trim(self, result):
 | 
				
			||||||
        g = self.info.gutter
 | 
					        g = self.info.gutter
 | 
				
			||||||
        return result[g:-g,g:-g].copy()
 | 
					        return result[g:-g,g:g+self.info.width].copy()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user