mirror of
				https://github.com/stevenrobertson/cuburn.git
				synced 2025-11-03 18:00:55 -05:00 
			
		
		
		
	Deferred works again. Time to break it.
This commit is contained in:
		@ -363,9 +363,9 @@ void iter(
 | 
				
			|||||||
        pix.w += 1.0f;
 | 
					        pix.w += 1.0f;
 | 
				
			||||||
        *accbuf = pix;
 | 
					        *accbuf = pix;
 | 
				
			||||||
{{elif info.acc_mode == 'deferred'}}
 | 
					{{elif info.acc_mode == 'deferred'}}
 | 
				
			||||||
        // 'color' gets the top 9 bits. TODO: add dithering via precalc.
 | 
					        // 'color' gets the top 8 bits. TODO: add dithering via precalc.
 | 
				
			||||||
        uint32_t icolor = fminf(1.0f, cc) * 511.0f + color_dither;
 | 
					        uint32_t icolor = fminf(1.0f, cc) * 255.0f + color_dither;
 | 
				
			||||||
        asm("bfi.b32    %0, %1, %0, 23, 9;" : "+r"(i) : "r"(icolor));
 | 
					        asm("bfi.b32    %0, %1, %0, 24, 8;" : "+r"(i) : "r"(icolor));
 | 
				
			||||||
        *log = i;
 | 
					        *log = i;
 | 
				
			||||||
{{endif}}
 | 
					{{endif}}
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
@ -394,10 +394,10 @@ void write_shmem_helper(
 | 
				
			|||||||
        const uint32_t gb
 | 
					        const uint32_t gb
 | 
				
			||||||
) {
 | 
					) {
 | 
				
			||||||
    float4 pix = acc[glo_idx];
 | 
					    float4 pix = acc[glo_idx];
 | 
				
			||||||
    pix.x += (dr & 0xffff) / 255.0f;
 | 
					    pix.x += (dr & 0xffff) / 127.0f;
 | 
				
			||||||
    pix.w += dr >> 16;
 | 
					    pix.w += dr >> 16;
 | 
				
			||||||
    pix.y += (gb & 0xffff) / 255.0f;
 | 
					    pix.y += (gb & 0xffff) / 127.0f;
 | 
				
			||||||
    pix.z += (gb >> 16) / 255.0f;
 | 
					    pix.z += (gb >> 16) / 127.0f;
 | 
				
			||||||
    acc[glo_idx] = pix;
 | 
					    acc[glo_idx] = pix;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -417,8 +417,7 @@ __launch_bounds__(BS, 1)
 | 
				
			|||||||
write_shmem(
 | 
					write_shmem(
 | 
				
			||||||
        float4 *acc,
 | 
					        float4 *acc,
 | 
				
			||||||
        const uint32_t *log,
 | 
					        const uint32_t *log,
 | 
				
			||||||
        const uint32_t *log_bounds,
 | 
					        const uint32_t *log_bounds
 | 
				
			||||||
        const int log_bounds_shift
 | 
					 | 
				
			||||||
) {
 | 
					) {
 | 
				
			||||||
    const int tid = threadIdx.x;
 | 
					    const int tid = threadIdx.x;
 | 
				
			||||||
    const int bid = blockIdx.x;
 | 
					    const int bid = blockIdx.x;
 | 
				
			||||||
@ -441,62 +440,94 @@ write_shmem(
 | 
				
			|||||||
    s_acc_gb[tid+3*BS] = 0;
 | 
					    s_acc_gb[tid+3*BS] = 0;
 | 
				
			||||||
    __syncthreads();
 | 
					    __syncthreads();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // TODO: share across threads - discernable performance impact?
 | 
					    // This predicate is used for the horrible monkey-patching magic.
 | 
				
			||||||
    int lb_idx_lo, lb_idx_hi;
 | 
					    asm volatile(".reg .pred p; setp.lt.u32 p, %0, 42;" :: "r"(s_acc_dr[0]));
 | 
				
			||||||
    if (log_bounds_shift > 0) {
 | 
					 | 
				
			||||||
        lb_idx_hi = ((bid + 1) << log_bounds_shift) - 1;
 | 
					 | 
				
			||||||
        lb_idx_lo = (bid << log_bounds_shift) - 1;
 | 
					 | 
				
			||||||
    } else {
 | 
					 | 
				
			||||||
        lb_idx_hi = bid >> (-log_bounds_shift);
 | 
					 | 
				
			||||||
        lb_idx_lo = lb_idx_hi - 1;
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    int idx_lo, idx_hi;
 | 
					    // log_bounds[] holds inclusive prefix sums, so that log_bounds[0] is the
 | 
				
			||||||
    if (lb_idx_lo < 0) idx_lo = 0;
 | 
					    // largest index with radix 0, and so on.
 | 
				
			||||||
    else idx_lo = log_bounds[lb_idx_lo] & ~(BS-1);
 | 
					    int lb_idx_hi = bid & 0xff;
 | 
				
			||||||
    idx_hi = (log_bounds[lb_idx_hi] & ~(BS - 1)) + BS;
 | 
					    int lb_idx_lo = lb_idx_hi - 1;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    int idx_lo;
 | 
				
			||||||
 | 
					    if (lb_idx_lo > 0) idx_lo = log_bounds[lb_idx_lo] & ~(BS - 1);
 | 
				
			||||||
 | 
					    else               idx_lo = 0;
 | 
				
			||||||
 | 
					    int idx_hi = (log_bounds[lb_idx_hi] & ~(BS - 1)) + BS;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    float rnrounds = 1.0f / (idx_hi - idx_lo);
 | 
					    float rnrounds = 1.0f / (idx_hi - idx_lo);
 | 
				
			||||||
    float time = tid * rnrounds;
 | 
					    float time = tid * rnrounds;
 | 
				
			||||||
    float time_step = BS * rnrounds;
 | 
					    float time_step = BS * rnrounds;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    int glo_base = bid << SHAB;
 | 
					    int glo_base = bid << SHAB;
 | 
				
			||||||
 | 
					    float4* glo_ptr = &acc[glo_base];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for (int i = idx_lo + tid; i < idx_hi; i += BS) {
 | 
					    for (int i = idx_lo + tid; i < idx_hi; i += BS) {
 | 
				
			||||||
        int entry = log[i];
 | 
					        int entry = log[i];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        // Constant '12' is 32 - 8 - SHAB, where 8 is the
 | 
				
			||||||
        // TODO: constant '11' is really just 32 - 9 - SHAB, where 9 is the
 | 
					        // number of bits assigned to color. TODO: This ignores opacity.
 | 
				
			||||||
        // number of bits assigned to color. This ignores opacity.
 | 
					        bfe_decl(glob_addr, entry, SHAB, 12);
 | 
				
			||||||
        bfe_decl(glob_addr, entry, SHAB, 11);
 | 
					 | 
				
			||||||
        if (glob_addr != bid) continue;
 | 
					        if (glob_addr != bid) continue;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        bfe_decl(shr_addr, entry, 0, SHAB);
 | 
					        // Shared memory address, pre-shifted
 | 
				
			||||||
        bfe_decl(color, entry, 23, 9);
 | 
					        int shr_addr;
 | 
				
			||||||
 | 
					        asm("bfi.b32 %0, %1, 0, 2, 12;" : "=r"(shr_addr) : "r"(entry));
 | 
				
			||||||
 | 
					        bfe_decl(color, entry, 24, 8);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        float colorf = color / 511.0f;
 | 
					        float colorf = color / 255.0f;
 | 
				
			||||||
        float4 outcol = tex2D(palTex, colorf, time);
 | 
					        float4 outcol = tex2D(palTex, colorf, time);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        // TODO: change texture sampler to return shorts and avoid this
 | 
					        // TODO: change texture sampler to return shorts and avoid this
 | 
				
			||||||
        uint32_t r = 255.0f * outcol.x;
 | 
					        uint32_t r = 127.0f * outcol.x;
 | 
				
			||||||
        uint32_t g = 255.0f * outcol.y;
 | 
					        uint32_t g = 127.0f * outcol.y;
 | 
				
			||||||
        uint32_t b = 255.0f * outcol.z;
 | 
					        uint32_t b = 127.0f * outcol.z;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        uint32_t dr = atomicAdd(s_acc_dr + shr_addr, r + 0x10000);
 | 
					        uint32_t dr = r + 0x10000, gb = g + (b << 16);
 | 
				
			||||||
        uint32_t gb = atomicAdd(s_acc_gb + shr_addr, g + (b << 16));
 | 
					 | 
				
			||||||
        uint32_t d = dr >> 16;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        // Neat trick: if overflow is about to happen, write the accumulator,
 | 
					        asm volatile ({{crep("""
 | 
				
			||||||
        // and subtract the last known values from the accumulator again.
 | 
					{
 | 
				
			||||||
        // Even if the ints end up wrapping around once before the subtraction
 | 
					    .reg .pred q;
 | 
				
			||||||
        // can occur, the results after the subtraction will be correct.
 | 
					    .reg .u32 d, r, g, b, dr, gb, drw, gbw, off;
 | 
				
			||||||
        // (Wrapping twice will mess up the intermediate write, but is pretty
 | 
					    .reg .u64 ptr;
 | 
				
			||||||
        // unlikely.)
 | 
					    .reg .f32 rf, gf, bf, df;
 | 
				
			||||||
        if (d == 250) {
 | 
					
 | 
				
			||||||
            atomicSub(s_acc_dr + shr_addr, dr);
 | 
					acc_write_start:
 | 
				
			||||||
            atomicSub(s_acc_gb + shr_addr, gb);
 | 
					    // This instruction will get replaced with a LDSLK that sets 'p'.
 | 
				
			||||||
            write_shmem_helper(acc, glo_base + shr_addr, dr, gb);
 | 
					    // The 0xffff is a signature to make sure we get the right instruction,
 | 
				
			||||||
        }
 | 
					    // and will get replaced with a 0-offset when patching.
 | 
				
			||||||
 | 
					    ld.shared.volatile.u32   dr,     [%2+0xffff];
 | 
				
			||||||
 | 
					@p  ld.shared.volatile.u32   gb,     [%2+0x4000];
 | 
				
			||||||
 | 
					@p  add.u32         %0,     dr,     %0;
 | 
				
			||||||
 | 
					@p  add.u32         %1,     gb,     %1;
 | 
				
			||||||
 | 
					    // TODO: clever use of slct could remove an instruction?
 | 
				
			||||||
 | 
					@p  setp.lo.u32     q,      %0,     32768000;
 | 
				
			||||||
 | 
					@p  selp.b32        drw,    %0,     0,      q;
 | 
				
			||||||
 | 
					@p  selp.b32        gbw,    %1,     0,      q;
 | 
				
			||||||
 | 
					@p  st.shared.volatile.u32   [%2+0x4000],    gbw;
 | 
				
			||||||
 | 
					    // This instruction will get replaced with an STSUL
 | 
				
			||||||
 | 
					@p  st.shared.volatile.u32   [%2+0xffff],    drw;
 | 
				
			||||||
 | 
					@!p bra             acc_write_start;
 | 
				
			||||||
 | 
					@q  bra             oflow_write_end;
 | 
				
			||||||
 | 
					    shl.b32         %2,     %2,     2;
 | 
				
			||||||
 | 
					    cvt.u64.u32     ptr,    %2;
 | 
				
			||||||
 | 
					    add.u64         ptr,    ptr,    %3;
 | 
				
			||||||
 | 
					    and.b32         r,      %0,     0xffff;
 | 
				
			||||||
 | 
					    shr.b32         g,      %1,     16;
 | 
				
			||||||
 | 
					    and.b32         b,      %1,     0xffff;
 | 
				
			||||||
 | 
					    cvt.rn.f32.u32  rf,     r;
 | 
				
			||||||
 | 
					    cvt.rn.f32.u32  gf,     g;
 | 
				
			||||||
 | 
					    cvt.rn.f32.u32  bf,     b;
 | 
				
			||||||
 | 
					    mul.ftz.f32     rf,     rf,     0.007874015748031496;
 | 
				
			||||||
 | 
					    mul.ftz.f32     gf,     gf,     0.007874015748031496;
 | 
				
			||||||
 | 
					    mul.ftz.f32     bf,     bf,     0.007874015748031496;
 | 
				
			||||||
 | 
					    red.add.f32     [ptr],   500.0;
 | 
				
			||||||
 | 
					    red.add.f32     [ptr+4], bf;
 | 
				
			||||||
 | 
					    red.add.f32     [ptr+8], gf;
 | 
				
			||||||
 | 
					    red.add.f32     [ptr+12], rf;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					oflow_write_end:
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					        """)}}  ::  "r"(dr), "r"(gb), "r"(shr_addr), "l"(glo_ptr));
 | 
				
			||||||
 | 
					        // TODO: go through the pain of manual address calculation for global ptr
 | 
				
			||||||
        time += time_step;
 | 
					        time += time_step;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -519,3 +550,36 @@ write_shmem(
 | 
				
			|||||||
                              if n != 'final'],
 | 
					                              if n != 'final'],
 | 
				
			||||||
                **globals())
 | 
					                **globals())
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @staticmethod
 | 
				
			||||||
 | 
					    def monkey_patch(cubin):
 | 
				
			||||||
 | 
					        LD      = np.uint64(0x851c00fcff0300c1)
 | 
				
			||||||
 | 
					        LDSLK   = np.uint64(0x851c0000000000c4)
 | 
				
			||||||
 | 
					        ST      = np.uint64(0x850000fcff0300c9)
 | 
				
			||||||
 | 
					        STSUL   = np.uint64(0x85000000000000cc)
 | 
				
			||||||
 | 
					        regmask = np.uint64(0x00c0ff0300000000)
 | 
				
			||||||
 | 
					        prdmask = np.uint64(0x003c000000000000)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        O = 64  # Expected offset to last instruction
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        offset = cubin.find('\x85')
 | 
				
			||||||
 | 
					        while offset >= 0:
 | 
				
			||||||
 | 
					            # Using these fixed offsets makes this code intentionally
 | 
				
			||||||
 | 
					            # intolerant of compiler instruction reordering
 | 
				
			||||||
 | 
					            if cubin[offset+7] == '\xc1' and cubin[offset+O] == '\x85':
 | 
				
			||||||
 | 
					                ld = np.frombuffer(cubin[offset:offset+8], dtype='>u8')
 | 
				
			||||||
 | 
					                st = np.frombuffer(cubin[offset+O:offset+8+O], dtype='>u8')
 | 
				
			||||||
 | 
					                if ((ld & (~regmask)) == LD and
 | 
				
			||||||
 | 
					                    ((st & (~regmask)) & (~prdmask)) == ST):
 | 
				
			||||||
 | 
					                    break
 | 
				
			||||||
 | 
					            offset = cubin.find('\x85', offset+1)
 | 
				
			||||||
 | 
					        assert offset > 0, 'Could not find patch point!'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # Note that these bits are still reversed, and we ignore the
 | 
				
			||||||
 | 
					        # (im)possibility of a negative predicate in this case
 | 
				
			||||||
 | 
					        pred = (st & prdmask) >> 50
 | 
				
			||||||
 | 
					        ld = LDSLK | (ld & regmask) | (pred << 10)
 | 
				
			||||||
 | 
					        st = STSUL | (st & regmask) | (st & prdmask)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        return ( cubin[:offset] + ld.byteswap().tostring()
 | 
				
			||||||
 | 
					               + cubin[offset+8:offset+O]
 | 
				
			||||||
 | 
					               + st.byteswap().tostring() + cubin[offset+8+O:] )
 | 
				
			||||||
 | 
				
			|||||||
@ -131,7 +131,7 @@ class RenderInfo(object):
 | 
				
			|||||||
    # position into a single 32-bit int for now, which limits resolution to
 | 
					    # position into a single 32-bit int for now, which limits resolution to
 | 
				
			||||||
    # 1080p when xform opacity is respected, so the other two modes will hang
 | 
					    # 1080p when xform opacity is respected, so the other two modes will hang
 | 
				
			||||||
    # around until that can be extended to be memory-limited again.
 | 
					    # around until that can be extended to be memory-limited again.
 | 
				
			||||||
    acc_mode = 'global'
 | 
					    acc_mode = 'deferred'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # TODO: fix this
 | 
					    # TODO: fix this
 | 
				
			||||||
    chaos_used = False
 | 
					    chaos_used = False
 | 
				
			||||||
 | 
				
			|||||||
@ -81,6 +81,11 @@ class Renderer(object):
 | 
				
			|||||||
        self.cubin = pycuda.compiler.compile(
 | 
					        self.cubin = pycuda.compiler.compile(
 | 
				
			||||||
                self.src, keep=keep, options=cmp_options,
 | 
					                self.src, keep=keep, options=cmp_options,
 | 
				
			||||||
                cache_dir=False if keep else None)
 | 
					                cache_dir=False if keep else None)
 | 
				
			||||||
 | 
					        with open('/tmp/iter_kern.cubin', 'wb') as fp:
 | 
				
			||||||
 | 
					            fp.write(self.cubin)
 | 
				
			||||||
 | 
					        # For now, we apply the monkey-patch manually. May eventually make
 | 
				
			||||||
 | 
					        # this more of a framework if I do it in more than one code segment.
 | 
				
			||||||
 | 
					        self.cubin = self._iter.monkey_patch(self.cubin)
 | 
				
			||||||
        self.mod = cuda.module_from_buffer(self.cubin, jit_options)
 | 
					        self.mod = cuda.module_from_buffer(self.cubin, jit_options)
 | 
				
			||||||
        with open('/tmp/iter_kern.cubin', 'wb') as fp:
 | 
					        with open('/tmp/iter_kern.cubin', 'wb') as fp:
 | 
				
			||||||
            fp.write(self.cubin)
 | 
					            fp.write(self.cubin)
 | 
				
			||||||
@ -142,18 +147,7 @@ class Renderer(object):
 | 
				
			|||||||
            d_log = cuda.mem_alloc(log_size * 4)
 | 
					            d_log = cuda.mem_alloc(log_size * 4)
 | 
				
			||||||
            d_log_sorted = cuda.mem_alloc(log_size * 4)
 | 
					            d_log_sorted = cuda.mem_alloc(log_size * 4)
 | 
				
			||||||
            sorter = sort.Sorter(log_size)
 | 
					            sorter = sort.Sorter(log_size)
 | 
				
			||||||
 | 
					            nwriteblocks = int(np.ceil(nbins / float(1<<12)))
 | 
				
			||||||
            # Shared accumulators take care of the lowest 12 bits, but due to
 | 
					 | 
				
			||||||
            # a quirk of the sort implementation, asking the sort to handle
 | 
					 | 
				
			||||||
            # fewer bits than it is compiled for will make it considerably
 | 
					 | 
				
			||||||
            # slower (and it can't be compiled for <7b), so we actually dig in
 | 
					 | 
				
			||||||
            # to the accumulator's SHAB window for those cases.
 | 
					 | 
				
			||||||
            SHAB = np.int32(12)
 | 
					 | 
				
			||||||
            address_bits = np.int32(np.ceil(np.log2(nbins+1)))
 | 
					 | 
				
			||||||
            start_bit = address_bits - sorter.radix_bits
 | 
					 | 
				
			||||||
            log_shift = np.int32(SHAB - start_bit)
 | 
					 | 
				
			||||||
            nwriteblocks = int(np.ceil(nbins / (1<<SHAB)))
 | 
					 | 
				
			||||||
            print start_bit, log_shift, nwriteblocks
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # Calculate 'nslots', the number of simultaneous running threads that
 | 
					        # Calculate 'nslots', the number of simultaneous running threads that
 | 
				
			||||||
        # can be active on the GPU during iteration (and thus the number of
 | 
					        # can be active on the GPU during iteration (and thus the number of
 | 
				
			||||||
@ -252,10 +246,10 @@ class Renderer(object):
 | 
				
			|||||||
                             block=(32, self._iter.NTHREADS/32, 1),
 | 
					                             block=(32, self._iter.NTHREADS/32, 1),
 | 
				
			||||||
                             grid=(ntemporal_samples, 1), stream=iter_stream)
 | 
					                             grid=(ntemporal_samples, 1), stream=iter_stream)
 | 
				
			||||||
                    _sync_stream(write_stream, iter_stream)
 | 
					                    _sync_stream(write_stream, iter_stream)
 | 
				
			||||||
                    sorter.sort(d_log_sorted, d_log, log_size, start_bit, True,
 | 
					                    sorter.sort(d_log_sorted, d_log, log_size, 12, True,
 | 
				
			||||||
                                stream=write_stream)
 | 
					                                stream=write_stream)
 | 
				
			||||||
                    _sync_stream(iter_stream, write_stream)
 | 
					                    _sync_stream(iter_stream, write_stream)
 | 
				
			||||||
                    write_fun(d_accum, d_log_sorted, sorter.dglobal, log_shift,
 | 
					                    write_fun(d_accum, d_log_sorted, sorter.dglobal,
 | 
				
			||||||
                              block=(1024, 1, 1), grid=(nwriteblocks, 1),
 | 
					                              block=(1024, 1, 1), grid=(nwriteblocks, 1),
 | 
				
			||||||
                              texrefs=[tref], stream=write_stream)
 | 
					                              texrefs=[tref], stream=write_stream)
 | 
				
			||||||
            else:
 | 
					            else:
 | 
				
			||||||
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user