diff --git a/cuburn/code/interp.py b/cuburn/code/interp.py
index 0cbae08..5b4961f 100644
--- a/cuburn/code/interp.py
+++ b/cuburn/code/interp.py
@@ -270,8 +270,8 @@ void interp_palette_hsv_flat(mwc_st *rctxs,
     uint32_t r = min(255, (uint32_t) (rgba.x * 255.0f + 0.49f * mwc_next_11(rctx)));
     uint32_t g = min(255, (uint32_t) (rgba.y * 255.0f + 0.49f * mwc_next_11(rctx)));
     uint32_t b = min(255, (uint32_t) (rgba.z * 255.0f + 0.49f * mwc_next_11(rctx)));
-    out.x = (1 << 22) | (r << 4);
-    out.y = (g << 18) | b;
+    out.y = (1 << 22) | (r << 4);
+    out.x = (g << 18) | b;
     surf2Dwrite(out, flatpal, 8 * threadIdx.x, blockIdx.x);
     rctxs[gid] = rctx;
 }
diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py
index b996481..45eae76 100644
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@@ -187,8 +187,10 @@ void iter(
         uint64_t out_ptr,
         mwc_st *msts,
         float4 *points,
-        const iter_params *all_params,
-        int nsamps_to_generate
+        const iter_params *all_params
+{{if info.acc_mode == 'atomic'}}
+        , uint64_t atom_ptr
+{{endif}}
 ) {
     const iter_params *global_params = &(all_params[blockIdx.x]);

@@ -207,10 +209,14 @@ void iter(
     int this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;
     mwc_st rctx = msts[this_rb_idx];

-{{if info.acc_mode != 'deferred'}}
+{{if info.acc_mode == 'global'}}
     __shared__ float time_frac;
     time_frac = blockIdx.x / (float) gridDim.x;
 {{else}}
+    {{if info.acc_mode == 'atomic'}}
+    // TODO: spare the register, reuse at call site?
+    int time = blockIdx.x >> 4;
+    {{endif}}
     float color_dither = 0.49f * mwc_next_11(rctx);
 {{endif}}

@@ -224,10 +230,10 @@ void iter(
     // Shared memory size can be reduced by a factor of four using a slower
     // 4-stage reduce, but on Fermi hardware shmem use isn't a bottleneck
     __shared__ float swap[{{4*NTHREADS}}];
-    __shared__ float cosel[{{NWARPS}}];
+    __shared__ float cosel[{{2*NWARPS}}];

     // This is normally done after the swap-sync in the main loop
-    if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
+    if (threadIdx.y == 0 && threadIdx.x < {{NWARPS*2}})
         cosel[threadIdx.x] = mwc_next_01(rctx);
     __syncthreads();
 {{endif}}
@@ -295,7 +301,7 @@ void iter(
         __syncthreads();

         // We select the next xforms here, since we've just synced.
-        if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
+        if (threadIdx.y == 0 && threadIdx.x < {{NWARPS*2}})
             cosel[threadIdx.x] = mwc_next_01(rctx);

         fuse = swap[sr];
@@ -345,15 +351,60 @@ void iter(
     }

     uint32_t i = iy * acc_size.stride + ix;

 {{if info.acc_mode == 'atomic'}}
-    float4 outcol = tex2D(palTex, cc, time_frac);
-    float *accbuf_f = reinterpret_cast<float*>(out_ptr + (16*i));
-    atomicAdd(accbuf_f, outcol.x);
-    atomicAdd(accbuf_f+1, outcol.y);
-    atomicAdd(accbuf_f+2, outcol.z);
-    atomicAdd(accbuf_f+3, 1.0f);
-{{elif info.acc_mode == 'global'}}
+    asm volatile ({{crep("""
+{
+    .reg .pred  p;
+    .reg .u32   off, color, hi, lo, d, r, g, b;
+    .reg .f32   colorf, rf, gf, bf, df;
+    .reg .u64   ptr, val;
+
+    // TODO: coord dithering better, or pre-supersampled palette?
+    fma.rn.ftz.f32      colorf, %0, 255.0, %1;
+    cvt.rni.u32.f32     color, colorf;
+    shl.b32             color, color, 3;
+
+    suld.b.2d.v2.b32.clamp {lo, hi}, [flatpal, {color, %2}];
+    mov.b64             val, {lo, hi};
+    shl.b32             off, %3, 3;
+    cvt.u64.u32         ptr, off;
+    add.u64             ptr, ptr, %4;
+    setp.le.f32         p, %5, 0.98;
+@p  red.global.add.u64  [ptr], val;
+@p  bra                 oflow_end;
+    atom.global.add.u64 val, [ptr], val;
+    mov.b64             {lo, hi}, val;
+    setp.lo.u32         p, hi, (256 << 22);
+@p  bra                 oflow_end;
+    atom.global.exch.b64 val, [ptr], 0;
+    mov.b64             {lo, hi}, val;
+    shr.u32             d, hi, 22;
+    bfe.u32             r, hi, 4, 18;
+    bfe.u32             g, lo, 18, 14;
+    bfi.b32             g, hi, g, 14, 4;
+    and.b32             b, lo, ((1<<18)-1);
+    cvt.rn.f32.u32      rf, r;
+    cvt.rn.f32.u32      gf, g;
+    cvt.rn.f32.u32      bf, b;
+    cvt.rn.f32.u32      df, d;
+    mul.rn.ftz.f32      rf, rf, (1.0/255.0);
+    mul.rn.ftz.f32      gf, gf, (1.0/255.0);
+    mul.rn.ftz.f32      bf, bf, (1.0/255.0);
+    shl.b32             off, %3, 4;
+    cvt.u64.u32         ptr, off;
+    add.u64             ptr, ptr, %6;
+    red.global.add.f32  [ptr], rf;
+    red.global.add.f32  [ptr+4], gf;
+    red.global.add.f32  [ptr+8], bf;
+    red.global.add.f32  [ptr+12], df;
+oflow_end:
+}
+    """)}} ::  "f"(cc), "f"(color_dither), "r"(time), "r"(i),
+               "l"(atom_ptr), "f"(cosel[threadIdx.y + {{NWARPS}}]),
+               "l"(out_ptr));
+{{endif}}
+
+{{if info.acc_mode == 'global'}}
     float4 outcol = tex2D(palTex, cc, time_frac);
     float4 *accbuf = reinterpret_cast<float4*>(out_ptr + (16*i));
     float4 pix = *accbuf;
@@ -380,6 +431,47 @@ void iter(
     return;
 }

+{{if info.acc_mode == 'atomic'}}
+__global__
+void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
+    int i = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
+    asm volatile ({{crep("""
+{
+    .reg .u32   off, hi, lo, d, r, g, b;
+    .reg .u64   val, ptr;
+    .reg .f32   rf, gf, bf, df, rg, gg, bg, dg;
+
+    // TODO: use explicit movs to handle this
+    shl.b32             off, %0, 3;
+    cvt.u64.u32         ptr, off;
+    add.u64             ptr, ptr, %1;
+    ld.global.v2.u32    {lo, hi}, [ptr];
+    shl.b32             off, %0, 4;
+    cvt.u64.u32         ptr, off;
+    add.u64             ptr, ptr, %2;
+    ld.global.v4.f32    {rg,gg,bg,dg}, [ptr];
+    shr.u32             d, hi, 22;
+    bfe.u32             r, hi, 4, 18;
+    bfe.u32             g, lo, 18, 14;
+    bfi.b32             g, hi, g, 14, 4;
+    and.b32             b, lo, ((1<<18)-1);
+    cvt.rn.f32.u32      rf, r;
+    cvt.rn.f32.u32      gf, g;
+    cvt.rn.f32.u32      bf, b;
+    cvt.rn.f32.u32      df, d;
+    fma.rn.ftz.f32      rg, rf, (1.0/255.0), rg;
+    fma.rn.ftz.f32      gg, gf, (1.0/255.0), gg;
+    fma.rn.ftz.f32      bg, bf, (1.0/255.0), bg;
+    add.rn.ftz.f32      dg, df, dg;
+    st.global.v4.f32    [ptr], {rg,gg,bg,dg};
+}
+    """)}} :: "r"(i), "l"(atom_ptr), "l"(out_ptr));
+}
+
+{{endif}}
+
+{{if info.acc_mode == 'deferred'}}
+
 // Block size, shared accumulation bits, shared accumulation width.
 #define BS 1024
 #define SHAB 12

@@ -462,8 +554,8 @@ write_shmem(
     shl.b32         color, color, 3;
     cvt.rni.u32.f32 time, %1;

-    suld.b.2d.v2.b32.clamp {hi, lo}, [flatpal, {color, time}];
-    ld.shared.v2.u32 {hiw, low}, [shoff];
+    suld.b.2d.v2.b32.clamp {lo, hi}, [flatpal, {color, time}];
+    ld.shared.v2.u32 {low, hiw}, [shoff];
     add.cc.u32      lo, lo, low;
     addc.u32        hi, hi, hiw;
     setp.hs.u32     q, hi, (1023 << 22);
@@ -513,7 +605,7 @@ oflow_write_end:
     asm({{crep("""
 {
     .reg .u32 hi, lo;
-    ld.shared.v2.u32 {hi, lo}, [%4];
+    ld.shared.v2.u32 {lo, hi}, [%4];
     shr.u32         %0, hi, 22;
     bfe.u32         %1, hi, 4, 18;
     bfe.u32         %2, lo, 18, 14;
@@ -530,7 +622,7 @@ oflow_write_end:
         glo_idx += (BS << 8);
     }
 }
-
+{{endif}}
 ''', 'iter_kern')
         return tmpl.substitute(
                 info = self.info,
diff --git a/cuburn/render.py b/cuburn/render.py
index 8986dae..977a73a 100644
--- a/cuburn/render.py
+++ b/cuburn/render.py
@@ -8,7 +8,7 @@ from itertools import cycle, repeat, chain, izip
 from ctypes import *
 from cStringIO import StringIO
 import numpy as np
-from numpy import int32 as i32
+from numpy import int32 as i32, uint64 as u64
 from scipy import ndimage

 from fr0stlib import pyflam3
@@ -110,7 +110,6 @@ class Renderer(object):
         reset_rb_fun = self.mod.get_function("reset_rb")
         packer_fun = self.mod.get_function("interp_iter_params")
         iter_fun = self.mod.get_function("iter")
-        write_fun = self.mod.get_function("write_shmem")

         info = self.info

@@ -119,6 +118,7 @@ class Renderer(object):
         filt_stream = cuda.Stream()
         if info.acc_mode == 'deferred':
             write_stream = cuda.Stream()
+            write_fun = self.mod.get_function("write_shmem")
         else:
             write_stream = iter_stream

@@ -132,6 +132,9 @@ class Renderer(object):
         # Extra padding in accum helps with write_shmem overruns
         d_accum = cuda.mem_alloc(16 * nbins + (1<<16))
         d_out = cuda.mem_alloc(16 * nbins)
+        if info.acc_mode == 'atomic':
+            d_atom = cuda.mem_alloc(8 * nbins)
+            flush_fun = self.mod.get_function("flush_atom")

         acc_size = np.array([info.acc_width, info.acc_height, info.acc_stride])
         d_acc_size = self.mod.get_global('acc_size')[0]
@@ -191,7 +194,7 @@ class Renderer(object):
         d_palint_vals = cuda.to_device(
                 np.concatenate(map(info.db.palettes.get, pals[1::2])))

-        if info.acc_mode == 'deferred':
+        if info.acc_mode in ('deferred', 'atomic'):
             palette_fun = self.mod.get_function("interp_palette_hsv_flat")
             dsc = cuda.ArrayDescriptor3D()
             dsc.height = info.palette_height
@@ -227,7 +230,7 @@ class Renderer(object):

         for idx, start, stop in times:
             twidth = np.float32((stop-start) / info.palette_height)
-            if info.acc_mode == 'deferred':
+            if info.acc_mode in ('deferred', 'atomic'):
                 palette_fun(d_seeds, d_palint_times, d_palint_vals,
                             np.float32(start), twidth,
                             block=(256,1,1), grid=(info.palette_height,1),
@@ -257,6 +260,8 @@ class Renderer(object):
                 #print '%60s %g' % ('_'.join(n), i)

             util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, write_stream)
+            if info.acc_mode == 'atomic':
+                util.BaseCode.fill_dptr(self.mod, d_atom, 2 * nbins, write_stream)
             nrounds = ( (info.density * info.width * info.height)
                       / (ntemporal_samples * 256 * 256) ) + 1
             if info.acc_mode == 'deferred':
@@ -273,9 +278,16 @@ class Renderer(object):
                           block=(1024, 1, 1), grid=(nwriteblocks, 1),
                           stream=write_stream)
             else:
-                iter_fun(np.uint64(d_accum), d_seeds, d_points, d_infos,
-                         block=(32, self._iter.NTHREADS/32, 1),
+                args = [u64(d_accum), d_seeds, d_points, d_infos]
+                if info.acc_mode == 'atomic':
+                    args.append(u64(d_atom))
+                iter_fun(*args, block=(32, self._iter.NTHREADS/32, 1),
                          grid=(ntemporal_samples, nrounds),
                          stream=iter_stream)
+                if info.acc_mode == 'atomic':
+                    nblocks = int(np.ceil(np.sqrt(nbins/float(512))))
+                    flush_fun(u64(d_accum), u64(d_atom), i32(nbins),
+                              block=(512, 1, 1), grid=(nblocks, nblocks),
+                              stream=iter_stream)
             util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
             _sync_stream(filt_stream, write_stream)
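
A note on the packed accumulator format used above (commentary only, not part of the patch): each bin of the d_atom buffer is one 64-bit word holding a 10-bit hit count plus 18-bit red, green, and blue sums, with the high word laid out as [count:10][r:18][g_hi:4] and the low word as [g_lo:14][b:18]. The texel written by interp_palette_hsv_flat seeds exactly one sample in that layout (the (1 << 22) term is the count increment), so a single 64-bit atomic add accumulates all four fields at once. Roughly 98% of the time a warp takes the fire-and-forget red.global.add path; the occasional atom.global.add checks whether the count has reached 256, and if so the word is exchanged out and folded into the float4 accumulator, the same fold flush_atom performs on whatever remains after the iter launch. A minimal C sketch of that unpacking step, assuming the bit layout above (the helper name unpack_atom_cell is made up for illustration):

    #include <stdint.h>

    /* Hypothetical helper mirroring the layout implied by the PTX:
     * hi = [count:10][r:18][g_hi:4], lo = [g_lo:14][b:18]. Channel sums
     * are scaled by 1/255, as in flush_atom. */
    static void unpack_atom_cell(uint64_t cell, float rgba[4])
    {
        uint32_t lo = (uint32_t) cell;          /* low word  (out.x in interp) */
        uint32_t hi = (uint32_t) (cell >> 32);  /* high word (out.y in interp) */

        uint32_t count = hi >> 22;                          /* 10-bit hit count */
        uint32_t r     = (hi >> 4) & ((1u << 18) - 1);      /* 18-bit red sum   */
        uint32_t g     = (lo >> 18) | ((hi & 0xfu) << 14);  /* 18-bit green sum */
        uint32_t b     = lo & ((1u << 18) - 1);             /* 18-bit blue sum  */

        rgba[0] = r * (1.0f / 255.0f);
        rgba[1] = g * (1.0f / 255.0f);
        rgba[2] = b * (1.0f / 255.0f);
        rgba[3] = (float) count;
    }

The 256-count flush threshold keeps the channel fields safe: 256 samples of at most 255 each sum to 65,280, comfortably below the 2^18 capacity of each field.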