mirror of
				https://github.com/stevenrobertson/cuburn.git
				synced 2025-11-03 18:00:55 -05:00 
			
		
		
		
	Lockless lossy shared memory writeback.
Barely tested! And yet it's going straight into master. Lucky you!
This commit is contained in:
		@ -410,26 +410,15 @@ write_shmem(
 | 
				
			|||||||
    // These two accumulators, used in write_shmem, hold {density, red} and
 | 
					    // These two accumulators, used in write_shmem, hold {density, red} and
 | 
				
			||||||
    // {green, blue} values as packed u16 pairs. The fixed size represents
 | 
					    // {green, blue} values as packed u16 pairs. The fixed size represents
 | 
				
			||||||
    // 4,096 pixels in the accumulator.
 | 
					    // 4,096 pixels in the accumulator.
 | 
				
			||||||
    __shared__ uint32_t s_acc_dr[SHAW];
 | 
					    __shared__ uint32_t s_acc[SHAW*2];
 | 
				
			||||||
    __shared__ uint32_t s_acc_gb[SHAW];
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // TODO: doesn't respect SHAW/BS
 | 
					    int idx = tid;
 | 
				
			||||||
    // TODO: compare generated code with unrolled for-loop
 | 
					    for (int i = 0; i < (SHAW * 2 / BS); i++) {
 | 
				
			||||||
    s_acc_dr[tid] = 0;
 | 
					        s_acc[idx] = 0;
 | 
				
			||||||
    s_acc_gb[tid] = 0;
 | 
					        idx += BS;
 | 
				
			||||||
    s_acc_dr[tid+BS] = 0;
 | 
					    }
 | 
				
			||||||
    s_acc_gb[tid+BS] = 0;
 | 
					 | 
				
			||||||
    s_acc_dr[tid+2*BS] = 0;
 | 
					 | 
				
			||||||
    s_acc_gb[tid+2*BS] = 0;
 | 
					 | 
				
			||||||
    s_acc_dr[tid+3*BS] = 0;
 | 
					 | 
				
			||||||
    s_acc_gb[tid+3*BS] = 0;
 | 
					 | 
				
			||||||
    __syncthreads();
 | 
					    __syncthreads();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // This predicate is used for the horrible monkey-patching magic. Second
 | 
					 | 
				
			||||||
    // variable is just to shut the compiler up.
 | 
					 | 
				
			||||||
    asm volatile(".reg .pred p; setp.lt.u32 p, %0, 42;"
 | 
					 | 
				
			||||||
                 :: "r"(s_acc_dr[0]), "r"(s_acc_gb[0]));
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    // log_bounds[] holds inclusive prefix sums, so that log_bounds[0] is the
 | 
					    // log_bounds[] holds inclusive prefix sums, so that log_bounds[0] is the
 | 
				
			||||||
    // largest index with radix 0, and so on.
 | 
					    // largest index with radix 0, and so on.
 | 
				
			||||||
    int lb_idx_hi = bid & 0xff;
 | 
					    int lb_idx_hi = bid & 0xff;
 | 
				
			||||||
@ -457,40 +446,32 @@ write_shmem(
 | 
				
			|||||||
        asm volatile ({{crep("""
 | 
					        asm volatile ({{crep("""
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
    .reg .pred q;
 | 
					    .reg .pred q;
 | 
				
			||||||
    .reg .u32 shoff, color, time, d, r, g, b, hi, lo, his, los, hiw, low;
 | 
					    .reg .u32 shoff, color, time, d, r, g, b, hi, lo, hiw, low;
 | 
				
			||||||
    .reg .u64 ptr;
 | 
					    .reg .u64 ptr;
 | 
				
			||||||
    .reg .f32 rf, gf, bf, df;
 | 
					    .reg .f32 rf, gf, bf, df, rg, gg, dg, bg;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    and.b32         shoff,  %0,     0xff800;
 | 
					    and.b32         shoff,  %0,     0xff800;
 | 
				
			||||||
    shr.b32         shoff,  shoff,  6;
 | 
					    shr.b32         shoff,  shoff,  5;
 | 
				
			||||||
    bfi.b32         shoff,  %0,     shoff,  2,  3;
 | 
					    bfi.b32         shoff,  %0,     shoff,  3,  3;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    bfe.u32         color,  %0,     24, 8;
 | 
					    bfe.u32         color,  %0,     24, 8;
 | 
				
			||||||
    shl.b32         color,  color,  3;
 | 
					    shl.b32         color,  color,  3;
 | 
				
			||||||
    cvt.rni.u32.f32 time,   %1;
 | 
					    cvt.rni.u32.f32 time,   %1;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    suld.b.2d.v2.b32.clamp  {his, los},   [flatpal, {color, time}];
 | 
					    suld.b.2d.v2.b32.clamp  {hi, lo},   [flatpal, {color, time}];
 | 
				
			||||||
 | 
					    ld.shared.v2.u32    {hiw, low},     [shoff];
 | 
				
			||||||
acc_write_start:
 | 
					    add.cc.u32          lo,     lo,     low;
 | 
				
			||||||
    // This instruction will get replaced with a LDSLK that sets 'p'.
 | 
					    addc.u32            hi,     hi,     hiw;
 | 
				
			||||||
    // The 0xffff is a signature to make sure we get the right instruction,
 | 
					    st.shared.v2.u32    [shoff],        {hi, lo};
 | 
				
			||||||
    // and will get replaced with a 0-offset when patching.
 | 
					    setp.hs.u32         q,      hi,     (1023 << 22);
 | 
				
			||||||
    ld.shared.volatile.u32  low,    [shoff+0xffff];
 | 
					 | 
				
			||||||
@p  ld.shared.volatile.u32  hiw,    [shoff+0x4000];
 | 
					 | 
				
			||||||
    add.cc.u32      lo,     los,    low;
 | 
					 | 
				
			||||||
    addc.u32        hi,     his,    hiw;
 | 
					 | 
				
			||||||
    setp.hs.and.u32 q,      hi,     (1023 << 22),   p;
 | 
					 | 
				
			||||||
    selp.b32        hiw,    0,      hi,     q;
 | 
					 | 
				
			||||||
    selp.b32        low,    0,      lo,     q;
 | 
					 | 
				
			||||||
@p  st.shared.volatile.u32   [shoff+0x4000],    hiw;
 | 
					 | 
				
			||||||
    // This instruction will get replaced with an STSUL
 | 
					 | 
				
			||||||
@p  st.shared.volatile.u32   [shoff+0xffff],    low;
 | 
					 | 
				
			||||||
//@!p bra             acc_write_start;
 | 
					 | 
				
			||||||
@!q bra             oflow_write_end;
 | 
					@!q bra             oflow_write_end;
 | 
				
			||||||
 | 
					    st.shared.v2.u32    [shoff],    {0, 0};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // TODO: opacity
 | 
					    // TODO: opacity
 | 
				
			||||||
    bfi.b32         shoff,  %0,     0,  4,  24;
 | 
					    bfi.b32         shoff,  %0,     0,  4,  24;
 | 
				
			||||||
    cvt.u64.u32     ptr,    shoff;
 | 
					    cvt.u64.u32     ptr,    shoff;
 | 
				
			||||||
    add.u64         ptr,    ptr,    %2;
 | 
					    add.u64         ptr,    ptr,    %2;
 | 
				
			||||||
 | 
					    ld.global.v4.f32    {dg,bg,gg,rg},  [ptr];
 | 
				
			||||||
    bfe.u32         r,      hi,     4,      18;
 | 
					    bfe.u32         r,      hi,     4,      18;
 | 
				
			||||||
    bfe.u32         g,      lo,     18,     14;
 | 
					    bfe.u32         g,      lo,     18,     14;
 | 
				
			||||||
    bfi.b32         g,      hi,     g,      14,     4;
 | 
					    bfi.b32         g,      hi,     g,      14,     4;
 | 
				
			||||||
@ -498,13 +479,11 @@ acc_write_start:
 | 
				
			|||||||
    cvt.rn.f32.u32  rf,     r;
 | 
					    cvt.rn.f32.u32  rf,     r;
 | 
				
			||||||
    cvt.rn.f32.u32  gf,     g;
 | 
					    cvt.rn.f32.u32  gf,     g;
 | 
				
			||||||
    cvt.rn.f32.u32  bf,     b;
 | 
					    cvt.rn.f32.u32  bf,     b;
 | 
				
			||||||
    mul.ftz.f32     rf,     rf,     (1.0/255.0);
 | 
					    fma.rn.ftz.f32  rf,     rf,     (1.0/255.0),    rg;
 | 
				
			||||||
    mul.ftz.f32     gf,     gf,     (1.0/255.0);
 | 
					    fma.rn.ftz.f32  gf,     gf,     (1.0/255.0),    gg;
 | 
				
			||||||
    mul.ftz.f32     bf,     bf,     (1.0/255.0);
 | 
					    fma.rn.ftz.f32  bf,     bf,     (1.0/255.0),    bg;
 | 
				
			||||||
    red.add.f32     [ptr],  rf;
 | 
					    add.f32         df,     df,     dg;
 | 
				
			||||||
    red.add.f32     [ptr+4], gf;
 | 
					    st.global.v4.f32    [ptr],  {df,bf,gf,rf};
 | 
				
			||||||
    red.add.f32     [ptr+8], bf;
 | 
					 | 
				
			||||||
    red.add.f32     [ptr+12], 1023.0;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
oflow_write_end:
 | 
					oflow_write_end:
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
@ -514,7 +493,7 @@ oflow_write_end:
 | 
				
			|||||||
    __syncthreads();
 | 
					    __syncthreads();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    int idx = tid;
 | 
					    idx = tid;
 | 
				
			||||||
    int glo_idx = magic | (((idx << 8) | idx) & 0xff807);
 | 
					    int glo_idx = magic | (((idx << 8) | idx) & 0xff807);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for (int i = 0; i < (SHAW / BS) && glo_idx < nbins; i++) {
 | 
					    for (int i = 0; i < (SHAW / BS) && glo_idx < nbins; i++) {
 | 
				
			||||||
@ -523,15 +502,14 @@ oflow_write_end:
 | 
				
			|||||||
        asm({{crep("""
 | 
					        asm({{crep("""
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
    .reg .u32 hi, lo;
 | 
					    .reg .u32 hi, lo;
 | 
				
			||||||
    ld.shared.u32   lo,     [%4];
 | 
					    ld.shared.v2.u32   {hi, lo},    [%4];
 | 
				
			||||||
    ld.shared.u32   hi,     [%4+0x4000];
 | 
					 | 
				
			||||||
    shr.u32         %0,     hi,     22;
 | 
					    shr.u32         %0,     hi,     22;
 | 
				
			||||||
    bfe.u32         %1,     hi,     4,      18;
 | 
					    bfe.u32         %1,     hi,     4,      18;
 | 
				
			||||||
    bfe.u32         %2,     lo,     18,     14;
 | 
					    bfe.u32         %2,     lo,     18,     14;
 | 
				
			||||||
    bfi.b32         %2,     hi,     %2,     14,     4;
 | 
					    bfi.b32         %2,     hi,     %2,     14,     4;
 | 
				
			||||||
    and.b32         %3,     lo,     ((1<<18)-1);
 | 
					    and.b32         %3,     lo,     ((1<<18)-1);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
        """)}} : "=r"(d), "=r"(r), "=r"(g), "=r"(b) : "r"(idx*4));
 | 
					        """)}} : "=r"(d), "=r"(r), "=r"(g), "=r"(b) : "r"(idx*8));
 | 
				
			||||||
        pix.x += r / 255.0f;
 | 
					        pix.x += r / 255.0f;
 | 
				
			||||||
        pix.y += g / 255.0f;
 | 
					        pix.y += g / 255.0f;
 | 
				
			||||||
        pix.z += b / 255.0f;
 | 
					        pix.z += b / 255.0f;
 | 
				
			||||||
@ -553,36 +531,3 @@ oflow_write_end:
 | 
				
			|||||||
                              if n != 'final'],
 | 
					                              if n != 'final'],
 | 
				
			||||||
                **globals())
 | 
					                **globals())
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @staticmethod
 | 
					 | 
				
			||||||
    def monkey_patch(cubin):
 | 
					 | 
				
			||||||
        LD      = np.uint64(0x851c00fcff0300c1)
 | 
					 | 
				
			||||||
        LDSLK   = np.uint64(0x851c0000000000c4)
 | 
					 | 
				
			||||||
        ST      = np.uint64(0x850000fcff0300c9)
 | 
					 | 
				
			||||||
        STSUL   = np.uint64(0x85000000000000cc)
 | 
					 | 
				
			||||||
        regmask = np.uint64(0x00c0ff0300000000)
 | 
					 | 
				
			||||||
        prdmask = np.uint64(0x003c000000000000)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        O = 64  # Expected offset to last instruction
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        offset = cubin.find('\x85')
 | 
					 | 
				
			||||||
        while offset >= 0:
 | 
					 | 
				
			||||||
            # Using these fixed offsets makes this code intentionally
 | 
					 | 
				
			||||||
            # intolerant of compiler instruction reordering
 | 
					 | 
				
			||||||
            if cubin[offset+7] == '\xc1' and cubin[offset+O] == '\x85':
 | 
					 | 
				
			||||||
                ld = np.frombuffer(cubin[offset:offset+8], dtype='>u8')
 | 
					 | 
				
			||||||
                st = np.frombuffer(cubin[offset+O:offset+8+O], dtype='>u8')
 | 
					 | 
				
			||||||
                if ((ld & (~regmask)) == LD and
 | 
					 | 
				
			||||||
                    ((st & (~regmask)) & (~prdmask)) == ST):
 | 
					 | 
				
			||||||
                    break
 | 
					 | 
				
			||||||
            offset = cubin.find('\x85', offset+1)
 | 
					 | 
				
			||||||
        assert offset > 0, 'Could not find patch point!'
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        # Note that these bits are still reversed, and we ignore the
 | 
					 | 
				
			||||||
        # (im)possibility of a negative predicate in this case
 | 
					 | 
				
			||||||
        pred = (st & prdmask) >> 50
 | 
					 | 
				
			||||||
        ld = LDSLK | (ld & regmask) | (pred << 10)
 | 
					 | 
				
			||||||
        st = STSUL | (st & regmask) | (st & prdmask)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        return ( cubin[:offset] + ld.byteswap().tostring()
 | 
					 | 
				
			||||||
               + cubin[offset+8:offset+O]
 | 
					 | 
				
			||||||
               + st.byteswap().tostring() + cubin[offset+8+O:] )
 | 
					 | 
				
			||||||
 | 
				
			|||||||
@ -82,11 +82,6 @@ class Renderer(object):
 | 
				
			|||||||
        self.cubin = pycuda.compiler.compile(
 | 
					        self.cubin = pycuda.compiler.compile(
 | 
				
			||||||
                self.src, keep=keep, options=cmp_options,
 | 
					                self.src, keep=keep, options=cmp_options,
 | 
				
			||||||
                cache_dir=False if keep else None)
 | 
					                cache_dir=False if keep else None)
 | 
				
			||||||
        with open('/tmp/iter_kern.cubin', 'wb') as fp:
 | 
					 | 
				
			||||||
            fp.write(self.cubin)
 | 
					 | 
				
			||||||
        # For now, we apply the monkey-patch manually. May eventually make
 | 
					 | 
				
			||||||
        # this more of a framework if I do it in more than one code segment.
 | 
					 | 
				
			||||||
        self.cubin = self._iter.monkey_patch(self.cubin)
 | 
					 | 
				
			||||||
        self.mod = cuda.module_from_buffer(self.cubin, jit_options)
 | 
					        self.mod = cuda.module_from_buffer(self.cubin, jit_options)
 | 
				
			||||||
        with open('/tmp/iter_kern.cubin', 'wb') as fp:
 | 
					        with open('/tmp/iter_kern.cubin', 'wb') as fp:
 | 
				
			||||||
            fp.write(self.cubin)
 | 
					            fp.write(self.cubin)
 | 
				
			||||||
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user