From 6bac3b3a95e95f29b37a645c620ea8aabfa86478 Mon Sep 17 00:00:00 2001
From: Steven Robertson
Date: Fri, 9 Dec 2011 14:14:36 -0500
Subject: [PATCH] Use reordered, lossy bit handling

---
 cuburn/code/iter.py | 47 ++++++++++++++++++++++++++-------------------
 cuburn/render.py    | 12 ++++++++----
 2 files changed, 35 insertions(+), 24 deletions(-)

diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py
index d7d29ae..cde620c 100644
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@@ -401,7 +401,8 @@ __launch_bounds__(BS, 1)
 write_shmem(
     float4 *acc,
     const uint32_t *log,
-    const uint32_t *log_bounds
+    const uint32_t *log_bounds,
+    uint32_t nbins
 ) {
     const int tid = threadIdx.x;
     const int bid = blockIdx.x;
@@ -443,16 +444,15 @@ write_shmem(
     float time = tid * rnrounds;
     float time_step = BS * rnrounds;
 
-    int glo_base = bid << SHAB;
-    float4* glo_ptr = &acc[glo_base];
+    int magic = ((blockIdx.x & 0xff) << 3) + ((blockIdx.x & 0xf00) << 12);
+    int magic_mask = 0xf007f8;
 
     for (int i = idx_lo + tid; i < idx_hi; i += BS) {
         int entry = log[i];
+        time += time_step;
 
-        // Constant '12' is 32 - 8 - SHAB, where 8 is the
-        // number of bits assigned to color. TODO: This ignores opacity.
-        bfe_decl(glob_addr, entry, SHAB, 12);
-        if (glob_addr != bid) continue;
+        // TODO: opacity
+        if ((entry & magic_mask) != magic) continue;
 
         asm volatile ({{crep("""
 {
@@ -461,7 +461,10 @@ write_shmem(
     .reg .u64   ptr;
     .reg .f32   rf, gf, bf, df;
 
-    bfi.b32         shoff,  %0,     0,      2,      12;
+    and.b32         shoff,  %0,     0xff800;
+    shr.b32         shoff,  shoff,  6;
+    bfi.b32         shoff,  %0,     shoff,  2,      3;
+
     bfe.u32         color,  %0,     24,     8;
     shl.b32         color,  color,  3;
     cvt.rni.u32.f32 time,   %1;
@@ -476,15 +479,16 @@ acc_write_start:
 @p  ld.shared.volatile.u32  hiw,    [shoff+0x4000];
     add.cc.u32      lo,     los,    low;
     addc.u32        hi,     his,    hiw;
-    setp.lo.u32     q,      hi,     (1023 << 22);
-    selp.b32        hiw,    hi,     0,      q;
-    selp.b32        low,    lo,     0,      q;
+    setp.hs.and.u32 q,      hi,     (1023 << 22),   p;
+    selp.b32        hiw,    0,      hi,     q;
+    selp.b32        low,    0,      lo,     q;
 @p  st.shared.volatile.u32  [shoff+0x4000],     hiw;
     // This instruction will get replaced with an STSUL
 @p  st.shared.volatile.u32  [shoff+0xffff],     low;
-@!p bra         acc_write_start;
-@q  bra         oflow_write_end;
-    shl.b32         shoff,  shoff,  2;
+//@!p bra         acc_write_start;
+@!q bra         oflow_write_end;
+    // TODO: opacity
+    bfi.b32         shoff,  %0,     0,      4,      24;
     cvt.u64.u32     ptr,    shoff;
     add.u64         ptr,    ptr,    %2;
     bfe.u32         r,      hi,     4,      18;
@@ -504,16 +508,18 @@ acc_write_start:
 
 oflow_write_end:
 }
-    """)}} :: "r"(entry), "f"(time), "l"(glo_ptr));
-    // TODO: go through the pain of manual address calculation for global ptr
-    time += time_step;
+    """)}} :: "r"(entry), "f"(time), "l"(acc));
     }
 
     __syncthreads();
+
+    int idx = tid;
 
-    for (int i = 0; i < (SHAW / BS); i++) {
+    int glo_idx = magic | (((idx << 8) | idx) & 0xff807);
+
+    for (int i = 0; i < (SHAW / BS) && glo_idx < nbins; i++) {
         int d, r, g, b;
-        float4 pix = acc[glo_base + idx];
+        float4 pix = acc[glo_idx];
         asm({{crep("""
 {
     .reg .u32   hi, lo;
@@ -530,8 +536,9 @@ oflow_write_end:
         pix.y += g / 255.0f;
         pix.z += b / 255.0f;
         pix.w += d;
-        acc[glo_base + idx] = pix;
+        acc[glo_idx] = pix;
         idx += BS;
+        glo_idx += (BS << 8);
     }
 }
 
diff --git a/cuburn/render.py b/cuburn/render.py
index 4876f54..d3251bf 100644
--- a/cuburn/render.py
+++ b/cuburn/render.py
@@ -8,6 +8,7 @@ from itertools import cycle, repeat, chain, izip
 from ctypes import *
 from cStringIO import StringIO
 import numpy as np
+from numpy import int32 as i32
 from scipy import ndimage
 
 from fr0stlib import pyflam3
@@ -147,7 +148,9 @@ class Renderer(object):
         d_log = cuda.mem_alloc(log_size * 4)
         d_log_sorted = cuda.mem_alloc(log_size * 4)
         sorter = sort.Sorter(log_size)
-        nwriteblocks = int(np.ceil(nbins / float(1<<12)))
+        # We need to cover each unique tag - address bits 20-23 - with one
+        # write block per sort bin. Or something like that.
+        nwriteblocks = int(np.ceil(nbins / float(1<<20))) * 256
 
         # Calculate 'nslots', the number of simultaneous running threads that
         # can be active on the GPU during iteration (and thus the number of
@@ -267,10 +270,11 @@ class Renderer(object):
                      block=(32, self._iter.NTHREADS/32, 1),
                      grid=(ntemporal_samples, 1), stream=iter_stream)
             _sync_stream(write_stream, iter_stream)
-            sorter.sort(d_log_sorted, d_log, log_size, 12, True,
+            sorter.sort(d_log_sorted, d_log, log_size, 3, True,
                         stream=write_stream)
+            #print cuda.from_device(sorter.dglobal, (256,), np.uint32)
             _sync_stream(iter_stream, write_stream)
-            write_fun(d_accum, d_log_sorted, sorter.dglobal,
+            write_fun(d_accum, d_log_sorted, sorter.dglobal, i32(nbins),
                       block=(1024, 1, 1), grid=(nwriteblocks, 1),
                       stream=write_stream)
         else:
@@ -302,5 +306,5 @@ class Renderer(object):
 
     def _trim(self, result):
         g = self.info.gutter
-        return result[g:-g,g:-g].copy()
+        return result[g:-g,g:g+self.info.width].copy()
 
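Notes on the reordered bit handling (commentary on the patch, not part of it):

The 24-bit accumulator-bin index is no longer split into a contiguous
(block << SHAB) | offset pair. Reading the masks above, the write-block tag
now sits in bits 3-10 and 20-23 of each log entry, the shared-memory cell
index in bits 0-2 and 11-19, and the color index in the top byte. The Python
sketch below restates that layout in plain code; the bit positions are taken
from the masks in the patch, but the field names are descriptive guesses.

    # Sketch of the entry layout implied by the masks above. Bit positions
    # come from the patch; field names are guesses.

    def decode_entry(entry):
        return {
            'cell_lo': entry & 0x7,            # bits 0-2:   low shared-cell bits
            'tag_lo':  (entry >> 3) & 0xff,    # bits 3-10:  low byte of block tag
            'cell_hi': (entry >> 11) & 0x1ff,  # bits 11-19: high shared-cell bits
            'tag_hi':  (entry >> 20) & 0xf,    # bits 20-23: the 'unique tag' bits
            'color':   (entry >> 24) & 0xff,   # bits 24-31: color index
        }

    def owned_by_block(entry, bid):
        # The kernel's filter: if ((entry & magic_mask) != magic) continue;
        magic = ((bid & 0xff) << 3) + ((bid & 0xf00) << 12)
        magic_mask = 0xf007f8
        return (entry & magic_mask) == magic

    def shared_byte_offset(entry):
        # Mirrors the PTX: and/shr moves bits 11-19 down to bits 5-13, then
        # bfi inserts bits 0-2 at position 2 (one 32-bit word per cell).
        return ((entry & 0xff800) >> 6) | ((entry & 0x7) << 2)

    def global_byte_offset(entry):
        # Mirrors 'bfi.b32 shoff, %0, 0, 4, 24' in the overflow path: the
        # low 24 bits are the bin index, shifted by log2(sizeof(float4)).
        return (entry & 0xffffff) << 4

Since shared_byte_offset() tops out at 0x3ffc, the low and high accumulator
words each fit in a 16 KB shared window, which is why the high word lives at
[shoff+0x4000].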
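The odd-looking glo_idx expression in the drain loop is the inverse mapping:
each write block copies its SHAW shared cells back to the global bins that
carry its tag. Below is a self-contained coverage check, assuming BS = 1024
and SHAW = 4096 (values consistent with the 1024-thread launch and the
0x4000-byte shared windows, but not stated in this hunk):

    # Check that the 256 write blocks with tag_hi == 0 tile the first 2**20
    # bins exactly once. BS and SHAW are assumptions, as noted above.
    BS, SHAW = 1024, 4096

    def bins_for_block(bid):
        magic = ((bid & 0xff) << 3) + ((bid & 0xf00) << 12)
        seen = set()
        for tid in range(BS):
            idx = tid
            glo_idx = magic | (((idx << 8) | idx) & 0xff807)
            for _ in range(SHAW // BS):     # same trip count as the kernel
                seen.add(glo_idx)
                idx += BS
                glo_idx += BS << 8
        return seen

    covered = set()
    for bid in range(256):
        bins = bins_for_block(bid)
        assert len(bins) == SHAW and not (covered & bins)
        covered |= bins
    assert covered == set(range(1 << 20))   # no gaps, no overlap

The 'glo_idx < nbins' guard in the kernel then handles buffers whose size is
not a multiple of 2**20: blocks in the last slice simply skip bins that fall
past the end of the accumulator.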
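On the host side, the new nwriteblocks expression in render.py follows from
the same layout: each 2**20-bin slice of the accumulator needs 256 write
blocks, one per value of the 8-bit low tag. A worked example, using an
illustrative buffer size that is not taken from the patch:

    import numpy as np

    nbins = 2048 * 1024                     # hypothetical 2**21-bin buffer
    nwriteblocks = int(np.ceil(nbins / float(1 << 20))) * 256
    assert nwriteblocks == 512              # 2 slices x 256 low-tag values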