mirror of
				https://github.com/stevenrobertson/cuburn.git
				synced 2025-11-03 18:00:55 -05:00 
			
		
		
		
	New atomic write mode
This commit is contained in:
		@ -270,8 +270,8 @@ void interp_palette_hsv_flat(mwc_st *rctxs,
 | 
				
			|||||||
    uint32_t r = min(255, (uint32_t) (rgba.x * 255.0f + 0.49f * mwc_next_11(rctx)));
 | 
					    uint32_t r = min(255, (uint32_t) (rgba.x * 255.0f + 0.49f * mwc_next_11(rctx)));
 | 
				
			||||||
    uint32_t g = min(255, (uint32_t) (rgba.y * 255.0f + 0.49f * mwc_next_11(rctx)));
 | 
					    uint32_t g = min(255, (uint32_t) (rgba.y * 255.0f + 0.49f * mwc_next_11(rctx)));
 | 
				
			||||||
    uint32_t b = min(255, (uint32_t) (rgba.z * 255.0f + 0.49f * mwc_next_11(rctx)));
 | 
					    uint32_t b = min(255, (uint32_t) (rgba.z * 255.0f + 0.49f * mwc_next_11(rctx)));
 | 
				
			||||||
    out.x = (1 << 22) | (r << 4);
 | 
					    out.y = (1 << 22) | (r << 4);
 | 
				
			||||||
    out.y = (g << 18) | b;
 | 
					    out.x = (g << 18) | b;
 | 
				
			||||||
    surf2Dwrite(out, flatpal, 8 * threadIdx.x, blockIdx.x);
 | 
					    surf2Dwrite(out, flatpal, 8 * threadIdx.x, blockIdx.x);
 | 
				
			||||||
    rctxs[gid] = rctx;
 | 
					    rctxs[gid] = rctx;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
				
			|||||||
@ -187,8 +187,10 @@ void iter(
 | 
				
			|||||||
        uint64_t out_ptr,
 | 
					        uint64_t out_ptr,
 | 
				
			||||||
        mwc_st *msts,
 | 
					        mwc_st *msts,
 | 
				
			||||||
        float4 *points,
 | 
					        float4 *points,
 | 
				
			||||||
        const iter_params *all_params,
 | 
					        const iter_params *all_params
 | 
				
			||||||
        int nsamps_to_generate
 | 
					{{if info.acc_mode == 'atomic'}}
 | 
				
			||||||
 | 
					      , uint64_t atom_ptr
 | 
				
			||||||
 | 
					{{endif}}
 | 
				
			||||||
) {
 | 
					) {
 | 
				
			||||||
    const iter_params *global_params = &(all_params[blockIdx.x]);
 | 
					    const iter_params *global_params = &(all_params[blockIdx.x]);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -207,10 +209,14 @@ void iter(
 | 
				
			|||||||
    int this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;
 | 
					    int this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;
 | 
				
			||||||
    mwc_st rctx = msts[this_rb_idx];
 | 
					    mwc_st rctx = msts[this_rb_idx];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
{{if info.acc_mode != 'deferred'}}
 | 
					{{if info.acc_mode == 'global'}}
 | 
				
			||||||
    __shared__ float time_frac;
 | 
					    __shared__ float time_frac;
 | 
				
			||||||
    time_frac = blockIdx.x / (float) gridDim.x;
 | 
					    time_frac = blockIdx.x / (float) gridDim.x;
 | 
				
			||||||
{{else}}
 | 
					{{else}}
 | 
				
			||||||
 | 
					    {{if info.acc_mode == 'atomic'}}
 | 
				
			||||||
 | 
					    // TODO: spare the register, reuse at call site?
 | 
				
			||||||
 | 
					    int time = blockIdx.x >> 4;
 | 
				
			||||||
 | 
					    {{endif}}
 | 
				
			||||||
    float color_dither = 0.49f * mwc_next_11(rctx);
 | 
					    float color_dither = 0.49f * mwc_next_11(rctx);
 | 
				
			||||||
{{endif}}
 | 
					{{endif}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -224,10 +230,10 @@ void iter(
 | 
				
			|||||||
    // Shared memory size can be reduced by a factor of four using a slower
 | 
					    // Shared memory size can be reduced by a factor of four using a slower
 | 
				
			||||||
    // 4-stage reduce, but on Fermi hardware shmem use isn't a bottleneck
 | 
					    // 4-stage reduce, but on Fermi hardware shmem use isn't a bottleneck
 | 
				
			||||||
    __shared__ float swap[{{4*NTHREADS}}];
 | 
					    __shared__ float swap[{{4*NTHREADS}}];
 | 
				
			||||||
    __shared__ float cosel[{{NWARPS}}];
 | 
					    __shared__ float cosel[{{2*NWARPS}}];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // This is normally done after the swap-sync in the main loop
 | 
					    // This is normally done after the swap-sync in the main loop
 | 
				
			||||||
    if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
 | 
					    if (threadIdx.y == 0 && threadIdx.x < {{NWARPS*2}})
 | 
				
			||||||
        cosel[threadIdx.x] = mwc_next_01(rctx);
 | 
					        cosel[threadIdx.x] = mwc_next_01(rctx);
 | 
				
			||||||
    __syncthreads();
 | 
					    __syncthreads();
 | 
				
			||||||
{{endif}}
 | 
					{{endif}}
 | 
				
			||||||
@ -295,7 +301,7 @@ void iter(
 | 
				
			|||||||
        __syncthreads();
 | 
					        __syncthreads();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        // We select the next xforms here, since we've just synced.
 | 
					        // We select the next xforms here, since we've just synced.
 | 
				
			||||||
        if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
 | 
					        if (threadIdx.y == 0 && threadIdx.x < {{NWARPS*2}})
 | 
				
			||||||
            cosel[threadIdx.x] = mwc_next_01(rctx);
 | 
					            cosel[threadIdx.x] = mwc_next_01(rctx);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        fuse = swap[sr];
 | 
					        fuse = swap[sr];
 | 
				
			||||||
@ -345,15 +351,60 @@ void iter(
 | 
				
			|||||||
        }
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        uint32_t i = iy * acc_size.stride + ix;
 | 
					        uint32_t i = iy * acc_size.stride + ix;
 | 
				
			||||||
 | 
					 | 
				
			||||||
{{if info.acc_mode == 'atomic'}}
 | 
					{{if info.acc_mode == 'atomic'}}
 | 
				
			||||||
        float4 outcol = tex2D(palTex, cc, time_frac);
 | 
					        asm volatile ({{crep("""
 | 
				
			||||||
        float *accbuf_f = reinterpret_cast<float*>(out_ptr + (16*i));
 | 
					{
 | 
				
			||||||
        atomicAdd(accbuf_f,   outcol.x);
 | 
					    .reg .pred  p;
 | 
				
			||||||
        atomicAdd(accbuf_f+1, outcol.y);
 | 
					    .reg .u32   off, color, hi, lo, d, r, g, b;
 | 
				
			||||||
        atomicAdd(accbuf_f+2, outcol.z);
 | 
					    .reg .f32   colorf, rf, gf, bf, df;
 | 
				
			||||||
        atomicAdd(accbuf_f+3, 1.0f);
 | 
					    .reg .u64   ptr, val;
 | 
				
			||||||
{{elif info.acc_mode == 'global'}}
 | 
					
 | 
				
			||||||
 | 
					    // TODO: coord dithering better, or pre-supersampled palette?
 | 
				
			||||||
 | 
					    fma.rn.ftz.f32      colorf, %0,     255.0,  %1;
 | 
				
			||||||
 | 
					    cvt.rni.u32.f32     color,  colorf;
 | 
				
			||||||
 | 
					    shl.b32             color,  color,  3;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    suld.b.2d.v2.b32.clamp      {lo, hi},   [flatpal, {color, %2}];
 | 
				
			||||||
 | 
					    mov.b64             val,    {lo, hi};
 | 
				
			||||||
 | 
					    shl.b32             off,    %3,     3;
 | 
				
			||||||
 | 
					    cvt.u64.u32         ptr,    off;
 | 
				
			||||||
 | 
					    add.u64             ptr,    ptr,    %4;
 | 
				
			||||||
 | 
					    setp.le.f32         p,      %5,     0.98;
 | 
				
			||||||
 | 
					@p  red.global.add.u64  [ptr],  val;
 | 
				
			||||||
 | 
					@p  bra                 oflow_end;
 | 
				
			||||||
 | 
					    atom.global.add.u64 val,    [ptr],  val;
 | 
				
			||||||
 | 
					    mov.b64             {lo, hi},       val;
 | 
				
			||||||
 | 
					    setp.lo.u32         p,      hi,     (256 << 22);
 | 
				
			||||||
 | 
					@p  bra                 oflow_end;
 | 
				
			||||||
 | 
					    atom.global.exch.b64 val,   [ptr],  0;
 | 
				
			||||||
 | 
					    mov.b64             {lo, hi},       val;
 | 
				
			||||||
 | 
					    shr.u32             d,      hi,     22;
 | 
				
			||||||
 | 
					    bfe.u32             r,      hi,     4,      18;
 | 
				
			||||||
 | 
					    bfe.u32             g,      lo,     18,     14;
 | 
				
			||||||
 | 
					    bfi.b32             g,      hi,     g,      14,     4;
 | 
				
			||||||
 | 
					    and.b32             b,      lo,     ((1<<18)-1);
 | 
				
			||||||
 | 
					    cvt.rn.f32.u32      rf,     r;
 | 
				
			||||||
 | 
					    cvt.rn.f32.u32      gf,     g;
 | 
				
			||||||
 | 
					    cvt.rn.f32.u32      bf,     b;
 | 
				
			||||||
 | 
					    cvt.rn.f32.u32      df,     d;
 | 
				
			||||||
 | 
					    mul.rn.ftz.f32      rf,     rf,     (1.0/255.0);
 | 
				
			||||||
 | 
					    mul.rn.ftz.f32      gf,     gf,     (1.0/255.0);
 | 
				
			||||||
 | 
					    mul.rn.ftz.f32      bf,     bf,     (1.0/255.0);
 | 
				
			||||||
 | 
					    shl.b32             off,    %3,     4;
 | 
				
			||||||
 | 
					    cvt.u64.u32         ptr,    off;
 | 
				
			||||||
 | 
					    add.u64             ptr,    ptr,    %6;
 | 
				
			||||||
 | 
					    red.global.add.f32  [ptr],          rf;
 | 
				
			||||||
 | 
					    red.global.add.f32  [ptr+4],        gf;
 | 
				
			||||||
 | 
					    red.global.add.f32  [ptr+8],        bf;
 | 
				
			||||||
 | 
					    red.global.add.f32  [ptr+12],       df;
 | 
				
			||||||
 | 
					oflow_end:
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					        """)}}  ::  "f"(cc), "f"(color_dither), "r"(time), "r"(i),
 | 
				
			||||||
 | 
					                    "l"(atom_ptr), "f"(cosel[threadIdx.y + {{NWARPS}}]),
 | 
				
			||||||
 | 
					                    "l"(out_ptr));
 | 
				
			||||||
 | 
					{{endif}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					{{if info.acc_mode == 'global'}}
 | 
				
			||||||
        float4 outcol = tex2D(palTex, cc, time_frac);
 | 
					        float4 outcol = tex2D(palTex, cc, time_frac);
 | 
				
			||||||
        float4 *accbuf = reinterpret_cast<float4*>(out_ptr + (16*i));
 | 
					        float4 *accbuf = reinterpret_cast<float4*>(out_ptr + (16*i));
 | 
				
			||||||
        float4 pix = *accbuf;
 | 
					        float4 pix = *accbuf;
 | 
				
			||||||
@ -380,6 +431,47 @@ void iter(
 | 
				
			|||||||
    return;
 | 
					    return;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					{{if info.acc_mode == 'atomic'}}
 | 
				
			||||||
 | 
					__global__
 | 
				
			||||||
 | 
					void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
 | 
				
			||||||
 | 
					    int i = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
 | 
				
			||||||
 | 
					    asm volatile ({{crep("""
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					    .reg .u32   off, hi, lo, d, r, g, b;
 | 
				
			||||||
 | 
					    .reg .u64   val, ptr;
 | 
				
			||||||
 | 
					    .reg .f32   rf, gf, bf, df, rg, gg, bg, dg;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // TODO: use explicit movs to handle this
 | 
				
			||||||
 | 
					    shl.b32             off,    %0,     3;
 | 
				
			||||||
 | 
					    cvt.u64.u32         ptr,    off;
 | 
				
			||||||
 | 
					    add.u64             ptr,    ptr,    %1;
 | 
				
			||||||
 | 
					    ld.global.v2.u32    {lo, hi},   [ptr];
 | 
				
			||||||
 | 
					    shl.b32             off,    %0,     4;
 | 
				
			||||||
 | 
					    cvt.u64.u32         ptr,    off;
 | 
				
			||||||
 | 
					    add.u64             ptr,    ptr,    %2;
 | 
				
			||||||
 | 
					    ld.global.v4.f32    {rg,gg,bg,dg},  [ptr];
 | 
				
			||||||
 | 
					    shr.u32             d,      hi,     22;
 | 
				
			||||||
 | 
					    bfe.u32             r,      hi,     4,      18;
 | 
				
			||||||
 | 
					    bfe.u32             g,      lo,     18,     14;
 | 
				
			||||||
 | 
					    bfi.b32             g,      hi,     g,      14,     4;
 | 
				
			||||||
 | 
					    and.b32             b,      lo,     ((1<<18)-1);
 | 
				
			||||||
 | 
					    cvt.rn.f32.u32      rf,     r;
 | 
				
			||||||
 | 
					    cvt.rn.f32.u32      gf,     g;
 | 
				
			||||||
 | 
					    cvt.rn.f32.u32      bf,     b;
 | 
				
			||||||
 | 
					    cvt.rn.f32.u32      df,     d;
 | 
				
			||||||
 | 
					    fma.rn.ftz.f32      rg,     rf,     (1.0/255.0),    rg;
 | 
				
			||||||
 | 
					    fma.rn.ftz.f32      gg,     gf,     (1.0/255.0),    gg;
 | 
				
			||||||
 | 
					    fma.rn.ftz.f32      bg,     bf,     (1.0/255.0),    bg;
 | 
				
			||||||
 | 
					    add.rn.ftz.f32      dg,     df,     dg;
 | 
				
			||||||
 | 
					    st.global.v4.f32    [ptr],  {rg,gg,bg,dg};
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					    """)}}  ::  "r"(i), "l"(atom_ptr), "l"(out_ptr));
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					{{endif}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					{{if info.acc_mode == 'deferred'}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// Block size, shared accumulation bits, shared accumulation width.
 | 
					// Block size, shared accumulation bits, shared accumulation width.
 | 
				
			||||||
#define BS 1024
 | 
					#define BS 1024
 | 
				
			||||||
#define SHAB 12
 | 
					#define SHAB 12
 | 
				
			||||||
@ -462,8 +554,8 @@ write_shmem(
 | 
				
			|||||||
    shl.b32         color,  color,  3;
 | 
					    shl.b32         color,  color,  3;
 | 
				
			||||||
    cvt.rni.u32.f32 time,   %1;
 | 
					    cvt.rni.u32.f32 time,   %1;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    suld.b.2d.v2.b32.clamp  {hi, lo},   [flatpal, {color, time}];
 | 
					    suld.b.2d.v2.b32.clamp  {lo, hi},   [flatpal, {color, time}];
 | 
				
			||||||
    ld.shared.v2.u32    {hiw, low},     [shoff];
 | 
					    ld.shared.v2.u32    {low, hiw},     [shoff];
 | 
				
			||||||
    add.cc.u32          lo,     lo,     low;
 | 
					    add.cc.u32          lo,     lo,     low;
 | 
				
			||||||
    addc.u32            hi,     hi,     hiw;
 | 
					    addc.u32            hi,     hi,     hiw;
 | 
				
			||||||
    setp.hs.u32         q,      hi,     (1023 << 22);
 | 
					    setp.hs.u32         q,      hi,     (1023 << 22);
 | 
				
			||||||
@ -513,7 +605,7 @@ oflow_write_end:
 | 
				
			|||||||
        asm({{crep("""
 | 
					        asm({{crep("""
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
    .reg .u32 hi, lo;
 | 
					    .reg .u32 hi, lo;
 | 
				
			||||||
    ld.shared.v2.u32   {hi, lo},    [%4];
 | 
					    ld.shared.v2.u32   {lo, hi},    [%4];
 | 
				
			||||||
    shr.u32         %0,     hi,     22;
 | 
					    shr.u32         %0,     hi,     22;
 | 
				
			||||||
    bfe.u32         %1,     hi,     4,      18;
 | 
					    bfe.u32         %1,     hi,     4,      18;
 | 
				
			||||||
    bfe.u32         %2,     lo,     18,     14;
 | 
					    bfe.u32         %2,     lo,     18,     14;
 | 
				
			||||||
@ -530,7 +622,7 @@ oflow_write_end:
 | 
				
			|||||||
        glo_idx += (BS << 8);
 | 
					        glo_idx += (BS << 8);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					{{endif}}
 | 
				
			||||||
''', 'iter_kern')
 | 
					''', 'iter_kern')
 | 
				
			||||||
        return tmpl.substitute(
 | 
					        return tmpl.substitute(
 | 
				
			||||||
                info = self.info,
 | 
					                info = self.info,
 | 
				
			||||||
 | 
				
			|||||||
@ -8,7 +8,7 @@ from itertools import cycle, repeat, chain, izip
 | 
				
			|||||||
from ctypes import *
 | 
					from ctypes import *
 | 
				
			||||||
from cStringIO import StringIO
 | 
					from cStringIO import StringIO
 | 
				
			||||||
import numpy as np
 | 
					import numpy as np
 | 
				
			||||||
from numpy import int32 as i32
 | 
					from numpy import int32 as i32, uint64 as u64
 | 
				
			||||||
from scipy import ndimage
 | 
					from scipy import ndimage
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from fr0stlib import pyflam3
 | 
					from fr0stlib import pyflam3
 | 
				
			||||||
@ -110,7 +110,6 @@ class Renderer(object):
 | 
				
			|||||||
        reset_rb_fun = self.mod.get_function("reset_rb")
 | 
					        reset_rb_fun = self.mod.get_function("reset_rb")
 | 
				
			||||||
        packer_fun = self.mod.get_function("interp_iter_params")
 | 
					        packer_fun = self.mod.get_function("interp_iter_params")
 | 
				
			||||||
        iter_fun = self.mod.get_function("iter")
 | 
					        iter_fun = self.mod.get_function("iter")
 | 
				
			||||||
        write_fun = self.mod.get_function("write_shmem")
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        info = self.info
 | 
					        info = self.info
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -119,6 +118,7 @@ class Renderer(object):
 | 
				
			|||||||
        filt_stream = cuda.Stream()
 | 
					        filt_stream = cuda.Stream()
 | 
				
			||||||
        if info.acc_mode == 'deferred':
 | 
					        if info.acc_mode == 'deferred':
 | 
				
			||||||
            write_stream = cuda.Stream()
 | 
					            write_stream = cuda.Stream()
 | 
				
			||||||
 | 
					            write_fun = self.mod.get_function("write_shmem")
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            write_stream = iter_stream
 | 
					            write_stream = iter_stream
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -132,6 +132,9 @@ class Renderer(object):
 | 
				
			|||||||
        # Extra padding in accum helps with write_shmem overruns
 | 
					        # Extra padding in accum helps with write_shmem overruns
 | 
				
			||||||
        d_accum = cuda.mem_alloc(16 * nbins + (1<<16))
 | 
					        d_accum = cuda.mem_alloc(16 * nbins + (1<<16))
 | 
				
			||||||
        d_out = cuda.mem_alloc(16 * nbins)
 | 
					        d_out = cuda.mem_alloc(16 * nbins)
 | 
				
			||||||
 | 
					        if info.acc_mode == 'atomic':
 | 
				
			||||||
 | 
					            d_atom = cuda.mem_alloc(8 * nbins)
 | 
				
			||||||
 | 
					            flush_fun = self.mod.get_function("flush_atom")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        acc_size = np.array([info.acc_width, info.acc_height, info.acc_stride])
 | 
					        acc_size = np.array([info.acc_width, info.acc_height, info.acc_stride])
 | 
				
			||||||
        d_acc_size = self.mod.get_global('acc_size')[0]
 | 
					        d_acc_size = self.mod.get_global('acc_size')[0]
 | 
				
			||||||
@ -191,7 +194,7 @@ class Renderer(object):
 | 
				
			|||||||
        d_palint_vals = cuda.to_device(
 | 
					        d_palint_vals = cuda.to_device(
 | 
				
			||||||
                np.concatenate(map(info.db.palettes.get, pals[1::2])))
 | 
					                np.concatenate(map(info.db.palettes.get, pals[1::2])))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if info.acc_mode == 'deferred':
 | 
					        if info.acc_mode in ('deferred', 'atomic'):
 | 
				
			||||||
            palette_fun = self.mod.get_function("interp_palette_hsv_flat")
 | 
					            palette_fun = self.mod.get_function("interp_palette_hsv_flat")
 | 
				
			||||||
            dsc = cuda.ArrayDescriptor3D()
 | 
					            dsc = cuda.ArrayDescriptor3D()
 | 
				
			||||||
            dsc.height = info.palette_height
 | 
					            dsc.height = info.palette_height
 | 
				
			||||||
@ -227,7 +230,7 @@ class Renderer(object):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        for idx, start, stop in times:
 | 
					        for idx, start, stop in times:
 | 
				
			||||||
            twidth = np.float32((stop-start) / info.palette_height)
 | 
					            twidth = np.float32((stop-start) / info.palette_height)
 | 
				
			||||||
            if info.acc_mode == 'deferred':
 | 
					            if info.acc_mode in ('deferred', 'atomic'):
 | 
				
			||||||
                palette_fun(d_seeds, d_palint_times, d_palint_vals,
 | 
					                palette_fun(d_seeds, d_palint_times, d_palint_vals,
 | 
				
			||||||
                            np.float32(start), twidth,
 | 
					                            np.float32(start), twidth,
 | 
				
			||||||
                            block=(256,1,1), grid=(info.palette_height,1),
 | 
					                            block=(256,1,1), grid=(info.palette_height,1),
 | 
				
			||||||
@ -257,6 +260,8 @@ class Renderer(object):
 | 
				
			|||||||
                #print '%60s %g' % ('_'.join(n), i)
 | 
					                #print '%60s %g' % ('_'.join(n), i)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, write_stream)
 | 
					            util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, write_stream)
 | 
				
			||||||
 | 
					            if info.acc_mode == 'atomic':
 | 
				
			||||||
 | 
					                util.BaseCode.fill_dptr(self.mod, d_atom, 2 * nbins, write_stream)
 | 
				
			||||||
            nrounds = ( (info.density * info.width * info.height)
 | 
					            nrounds = ( (info.density * info.width * info.height)
 | 
				
			||||||
                      / (ntemporal_samples * 256 * 256) ) + 1
 | 
					                      / (ntemporal_samples * 256 * 256) ) + 1
 | 
				
			||||||
            if info.acc_mode == 'deferred':
 | 
					            if info.acc_mode == 'deferred':
 | 
				
			||||||
@ -273,9 +278,16 @@ class Renderer(object):
 | 
				
			|||||||
                              block=(1024, 1, 1), grid=(nwriteblocks, 1),
 | 
					                              block=(1024, 1, 1), grid=(nwriteblocks, 1),
 | 
				
			||||||
                              stream=write_stream)
 | 
					                              stream=write_stream)
 | 
				
			||||||
            else:
 | 
					            else:
 | 
				
			||||||
                iter_fun(np.uint64(d_accum), d_seeds, d_points, d_infos,
 | 
					                args = [u64(d_accum), d_seeds, d_points, d_infos]
 | 
				
			||||||
                         block=(32, self._iter.NTHREADS/32, 1),
 | 
					                if info.acc_mode == 'atomic':
 | 
				
			||||||
 | 
					                    args.append(u64(d_atom))
 | 
				
			||||||
 | 
					                iter_fun(*args, block=(32, self._iter.NTHREADS/32, 1),
 | 
				
			||||||
                         grid=(ntemporal_samples, nrounds), stream=iter_stream)
 | 
					                         grid=(ntemporal_samples, nrounds), stream=iter_stream)
 | 
				
			||||||
 | 
					                if info.acc_mode == 'atomic':
 | 
				
			||||||
 | 
					                    nblocks = int(np.ceil(np.sqrt(nbins/float(512))))
 | 
				
			||||||
 | 
					                    flush_fun(u64(d_accum), u64(d_atom), i32(nbins),
 | 
				
			||||||
 | 
					                              block=(512, 1, 1), grid=(nblocks, nblocks),
 | 
				
			||||||
 | 
					                              stream=iter_stream)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
 | 
					            util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
 | 
				
			||||||
            _sync_stream(filt_stream, write_stream)
 | 
					            _sync_stream(filt_stream, write_stream)
 | 
				
			||||||
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user