mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 11:40:04 -05:00
New atomic write mode
This commit is contained in:
parent
a0fd3f965f
commit
6c50e6dadc
@ -270,8 +270,8 @@ void interp_palette_hsv_flat(mwc_st *rctxs,
|
|||||||
uint32_t r = min(255, (uint32_t) (rgba.x * 255.0f + 0.49f * mwc_next_11(rctx)));
|
uint32_t r = min(255, (uint32_t) (rgba.x * 255.0f + 0.49f * mwc_next_11(rctx)));
|
||||||
uint32_t g = min(255, (uint32_t) (rgba.y * 255.0f + 0.49f * mwc_next_11(rctx)));
|
uint32_t g = min(255, (uint32_t) (rgba.y * 255.0f + 0.49f * mwc_next_11(rctx)));
|
||||||
uint32_t b = min(255, (uint32_t) (rgba.z * 255.0f + 0.49f * mwc_next_11(rctx)));
|
uint32_t b = min(255, (uint32_t) (rgba.z * 255.0f + 0.49f * mwc_next_11(rctx)));
|
||||||
out.x = (1 << 22) | (r << 4);
|
out.y = (1 << 22) | (r << 4);
|
||||||
out.y = (g << 18) | b;
|
out.x = (g << 18) | b;
|
||||||
surf2Dwrite(out, flatpal, 8 * threadIdx.x, blockIdx.x);
|
surf2Dwrite(out, flatpal, 8 * threadIdx.x, blockIdx.x);
|
||||||
rctxs[gid] = rctx;
|
rctxs[gid] = rctx;
|
||||||
}
|
}
|
||||||
|
@ -187,8 +187,10 @@ void iter(
|
|||||||
uint64_t out_ptr,
|
uint64_t out_ptr,
|
||||||
mwc_st *msts,
|
mwc_st *msts,
|
||||||
float4 *points,
|
float4 *points,
|
||||||
const iter_params *all_params,
|
const iter_params *all_params
|
||||||
int nsamps_to_generate
|
{{if info.acc_mode == 'atomic'}}
|
||||||
|
, uint64_t atom_ptr
|
||||||
|
{{endif}}
|
||||||
) {
|
) {
|
||||||
const iter_params *global_params = &(all_params[blockIdx.x]);
|
const iter_params *global_params = &(all_params[blockIdx.x]);
|
||||||
|
|
||||||
@ -207,10 +209,14 @@ void iter(
|
|||||||
int this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;
|
int this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;
|
||||||
mwc_st rctx = msts[this_rb_idx];
|
mwc_st rctx = msts[this_rb_idx];
|
||||||
|
|
||||||
{{if info.acc_mode != 'deferred'}}
|
{{if info.acc_mode == 'global'}}
|
||||||
__shared__ float time_frac;
|
__shared__ float time_frac;
|
||||||
time_frac = blockIdx.x / (float) gridDim.x;
|
time_frac = blockIdx.x / (float) gridDim.x;
|
||||||
{{else}}
|
{{else}}
|
||||||
|
{{if info.acc_mode == 'atomic'}}
|
||||||
|
// TODO: spare the register, reuse at call site?
|
||||||
|
int time = blockIdx.x >> 4;
|
||||||
|
{{endif}}
|
||||||
float color_dither = 0.49f * mwc_next_11(rctx);
|
float color_dither = 0.49f * mwc_next_11(rctx);
|
||||||
{{endif}}
|
{{endif}}
|
||||||
|
|
||||||
@ -224,10 +230,10 @@ void iter(
|
|||||||
// Shared memory size can be reduced by a factor of four using a slower
|
// Shared memory size can be reduced by a factor of four using a slower
|
||||||
// 4-stage reduce, but on Fermi hardware shmem use isn't a bottleneck
|
// 4-stage reduce, but on Fermi hardware shmem use isn't a bottleneck
|
||||||
__shared__ float swap[{{4*NTHREADS}}];
|
__shared__ float swap[{{4*NTHREADS}}];
|
||||||
__shared__ float cosel[{{NWARPS}}];
|
__shared__ float cosel[{{2*NWARPS}}];
|
||||||
|
|
||||||
// This is normally done after the swap-sync in the main loop
|
// This is normally done after the swap-sync in the main loop
|
||||||
if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
|
if (threadIdx.y == 0 && threadIdx.x < {{NWARPS*2}})
|
||||||
cosel[threadIdx.x] = mwc_next_01(rctx);
|
cosel[threadIdx.x] = mwc_next_01(rctx);
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
{{endif}}
|
{{endif}}
|
||||||
@ -295,7 +301,7 @@ void iter(
|
|||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
// We select the next xforms here, since we've just synced.
|
// We select the next xforms here, since we've just synced.
|
||||||
if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
|
if (threadIdx.y == 0 && threadIdx.x < {{NWARPS*2}})
|
||||||
cosel[threadIdx.x] = mwc_next_01(rctx);
|
cosel[threadIdx.x] = mwc_next_01(rctx);
|
||||||
|
|
||||||
fuse = swap[sr];
|
fuse = swap[sr];
|
||||||
@ -345,15 +351,60 @@ void iter(
|
|||||||
}
|
}
|
||||||
|
|
||||||
uint32_t i = iy * acc_size.stride + ix;
|
uint32_t i = iy * acc_size.stride + ix;
|
||||||
|
|
||||||
{{if info.acc_mode == 'atomic'}}
|
{{if info.acc_mode == 'atomic'}}
|
||||||
float4 outcol = tex2D(palTex, cc, time_frac);
|
asm volatile ({{crep("""
|
||||||
float *accbuf_f = reinterpret_cast<float*>(out_ptr + (16*i));
|
{
|
||||||
atomicAdd(accbuf_f, outcol.x);
|
.reg .pred p;
|
||||||
atomicAdd(accbuf_f+1, outcol.y);
|
.reg .u32 off, color, hi, lo, d, r, g, b;
|
||||||
atomicAdd(accbuf_f+2, outcol.z);
|
.reg .f32 colorf, rf, gf, bf, df;
|
||||||
atomicAdd(accbuf_f+3, 1.0f);
|
.reg .u64 ptr, val;
|
||||||
{{elif info.acc_mode == 'global'}}
|
|
||||||
|
// TODO: coord dithering better, or pre-supersampled palette?
|
||||||
|
fma.rn.ftz.f32 colorf, %0, 255.0, %1;
|
||||||
|
cvt.rni.u32.f32 color, colorf;
|
||||||
|
shl.b32 color, color, 3;
|
||||||
|
|
||||||
|
suld.b.2d.v2.b32.clamp {lo, hi}, [flatpal, {color, %2}];
|
||||||
|
mov.b64 val, {lo, hi};
|
||||||
|
shl.b32 off, %3, 3;
|
||||||
|
cvt.u64.u32 ptr, off;
|
||||||
|
add.u64 ptr, ptr, %4;
|
||||||
|
setp.le.f32 p, %5, 0.98;
|
||||||
|
@p red.global.add.u64 [ptr], val;
|
||||||
|
@p bra oflow_end;
|
||||||
|
atom.global.add.u64 val, [ptr], val;
|
||||||
|
mov.b64 {lo, hi}, val;
|
||||||
|
setp.lo.u32 p, hi, (256 << 22);
|
||||||
|
@p bra oflow_end;
|
||||||
|
atom.global.exch.b64 val, [ptr], 0;
|
||||||
|
mov.b64 {lo, hi}, val;
|
||||||
|
shr.u32 d, hi, 22;
|
||||||
|
bfe.u32 r, hi, 4, 18;
|
||||||
|
bfe.u32 g, lo, 18, 14;
|
||||||
|
bfi.b32 g, hi, g, 14, 4;
|
||||||
|
and.b32 b, lo, ((1<<18)-1);
|
||||||
|
cvt.rn.f32.u32 rf, r;
|
||||||
|
cvt.rn.f32.u32 gf, g;
|
||||||
|
cvt.rn.f32.u32 bf, b;
|
||||||
|
cvt.rn.f32.u32 df, d;
|
||||||
|
mul.rn.ftz.f32 rf, rf, (1.0/255.0);
|
||||||
|
mul.rn.ftz.f32 gf, gf, (1.0/255.0);
|
||||||
|
mul.rn.ftz.f32 bf, bf, (1.0/255.0);
|
||||||
|
shl.b32 off, %3, 4;
|
||||||
|
cvt.u64.u32 ptr, off;
|
||||||
|
add.u64 ptr, ptr, %6;
|
||||||
|
red.global.add.f32 [ptr], rf;
|
||||||
|
red.global.add.f32 [ptr+4], gf;
|
||||||
|
red.global.add.f32 [ptr+8], bf;
|
||||||
|
red.global.add.f32 [ptr+12], df;
|
||||||
|
oflow_end:
|
||||||
|
}
|
||||||
|
""")}} :: "f"(cc), "f"(color_dither), "r"(time), "r"(i),
|
||||||
|
"l"(atom_ptr), "f"(cosel[threadIdx.y + {{NWARPS}}]),
|
||||||
|
"l"(out_ptr));
|
||||||
|
{{endif}}
|
||||||
|
|
||||||
|
{{if info.acc_mode == 'global'}}
|
||||||
float4 outcol = tex2D(palTex, cc, time_frac);
|
float4 outcol = tex2D(palTex, cc, time_frac);
|
||||||
float4 *accbuf = reinterpret_cast<float4*>(out_ptr + (16*i));
|
float4 *accbuf = reinterpret_cast<float4*>(out_ptr + (16*i));
|
||||||
float4 pix = *accbuf;
|
float4 pix = *accbuf;
|
||||||
@ -380,6 +431,47 @@ void iter(
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{{if info.acc_mode == 'atomic'}}
|
||||||
|
__global__
|
||||||
|
void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
|
||||||
|
int i = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
|
||||||
|
asm volatile ({{crep("""
|
||||||
|
{
|
||||||
|
.reg .u32 off, hi, lo, d, r, g, b;
|
||||||
|
.reg .u64 val, ptr;
|
||||||
|
.reg .f32 rf, gf, bf, df, rg, gg, bg, dg;
|
||||||
|
|
||||||
|
// TODO: use explicit movs to handle this
|
||||||
|
shl.b32 off, %0, 3;
|
||||||
|
cvt.u64.u32 ptr, off;
|
||||||
|
add.u64 ptr, ptr, %1;
|
||||||
|
ld.global.v2.u32 {lo, hi}, [ptr];
|
||||||
|
shl.b32 off, %0, 4;
|
||||||
|
cvt.u64.u32 ptr, off;
|
||||||
|
add.u64 ptr, ptr, %2;
|
||||||
|
ld.global.v4.f32 {rg,gg,bg,dg}, [ptr];
|
||||||
|
shr.u32 d, hi, 22;
|
||||||
|
bfe.u32 r, hi, 4, 18;
|
||||||
|
bfe.u32 g, lo, 18, 14;
|
||||||
|
bfi.b32 g, hi, g, 14, 4;
|
||||||
|
and.b32 b, lo, ((1<<18)-1);
|
||||||
|
cvt.rn.f32.u32 rf, r;
|
||||||
|
cvt.rn.f32.u32 gf, g;
|
||||||
|
cvt.rn.f32.u32 bf, b;
|
||||||
|
cvt.rn.f32.u32 df, d;
|
||||||
|
fma.rn.ftz.f32 rg, rf, (1.0/255.0), rg;
|
||||||
|
fma.rn.ftz.f32 gg, gf, (1.0/255.0), gg;
|
||||||
|
fma.rn.ftz.f32 bg, bf, (1.0/255.0), bg;
|
||||||
|
add.rn.ftz.f32 dg, df, dg;
|
||||||
|
st.global.v4.f32 [ptr], {rg,gg,bg,dg};
|
||||||
|
}
|
||||||
|
""")}} :: "r"(i), "l"(atom_ptr), "l"(out_ptr));
|
||||||
|
}
|
||||||
|
|
||||||
|
{{endif}}
|
||||||
|
|
||||||
|
{{if info.acc_mode == 'deferred'}}
|
||||||
|
|
||||||
// Block size, shared accumulation bits, shared accumulation width.
|
// Block size, shared accumulation bits, shared accumulation width.
|
||||||
#define BS 1024
|
#define BS 1024
|
||||||
#define SHAB 12
|
#define SHAB 12
|
||||||
@ -462,8 +554,8 @@ write_shmem(
|
|||||||
shl.b32 color, color, 3;
|
shl.b32 color, color, 3;
|
||||||
cvt.rni.u32.f32 time, %1;
|
cvt.rni.u32.f32 time, %1;
|
||||||
|
|
||||||
suld.b.2d.v2.b32.clamp {hi, lo}, [flatpal, {color, time}];
|
suld.b.2d.v2.b32.clamp {lo, hi}, [flatpal, {color, time}];
|
||||||
ld.shared.v2.u32 {hiw, low}, [shoff];
|
ld.shared.v2.u32 {low, hiw}, [shoff];
|
||||||
add.cc.u32 lo, lo, low;
|
add.cc.u32 lo, lo, low;
|
||||||
addc.u32 hi, hi, hiw;
|
addc.u32 hi, hi, hiw;
|
||||||
setp.hs.u32 q, hi, (1023 << 22);
|
setp.hs.u32 q, hi, (1023 << 22);
|
||||||
@ -513,7 +605,7 @@ oflow_write_end:
|
|||||||
asm({{crep("""
|
asm({{crep("""
|
||||||
{
|
{
|
||||||
.reg .u32 hi, lo;
|
.reg .u32 hi, lo;
|
||||||
ld.shared.v2.u32 {hi, lo}, [%4];
|
ld.shared.v2.u32 {lo, hi}, [%4];
|
||||||
shr.u32 %0, hi, 22;
|
shr.u32 %0, hi, 22;
|
||||||
bfe.u32 %1, hi, 4, 18;
|
bfe.u32 %1, hi, 4, 18;
|
||||||
bfe.u32 %2, lo, 18, 14;
|
bfe.u32 %2, lo, 18, 14;
|
||||||
@ -530,7 +622,7 @@ oflow_write_end:
|
|||||||
glo_idx += (BS << 8);
|
glo_idx += (BS << 8);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
{{endif}}
|
||||||
''', 'iter_kern')
|
''', 'iter_kern')
|
||||||
return tmpl.substitute(
|
return tmpl.substitute(
|
||||||
info = self.info,
|
info = self.info,
|
||||||
|
@ -8,7 +8,7 @@ from itertools import cycle, repeat, chain, izip
|
|||||||
from ctypes import *
|
from ctypes import *
|
||||||
from cStringIO import StringIO
|
from cStringIO import StringIO
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from numpy import int32 as i32
|
from numpy import int32 as i32, uint64 as u64
|
||||||
from scipy import ndimage
|
from scipy import ndimage
|
||||||
|
|
||||||
from fr0stlib import pyflam3
|
from fr0stlib import pyflam3
|
||||||
@ -110,7 +110,6 @@ class Renderer(object):
|
|||||||
reset_rb_fun = self.mod.get_function("reset_rb")
|
reset_rb_fun = self.mod.get_function("reset_rb")
|
||||||
packer_fun = self.mod.get_function("interp_iter_params")
|
packer_fun = self.mod.get_function("interp_iter_params")
|
||||||
iter_fun = self.mod.get_function("iter")
|
iter_fun = self.mod.get_function("iter")
|
||||||
write_fun = self.mod.get_function("write_shmem")
|
|
||||||
|
|
||||||
info = self.info
|
info = self.info
|
||||||
|
|
||||||
@ -119,6 +118,7 @@ class Renderer(object):
|
|||||||
filt_stream = cuda.Stream()
|
filt_stream = cuda.Stream()
|
||||||
if info.acc_mode == 'deferred':
|
if info.acc_mode == 'deferred':
|
||||||
write_stream = cuda.Stream()
|
write_stream = cuda.Stream()
|
||||||
|
write_fun = self.mod.get_function("write_shmem")
|
||||||
else:
|
else:
|
||||||
write_stream = iter_stream
|
write_stream = iter_stream
|
||||||
|
|
||||||
@ -132,6 +132,9 @@ class Renderer(object):
|
|||||||
# Extra padding in accum helps with write_shmem overruns
|
# Extra padding in accum helps with write_shmem overruns
|
||||||
d_accum = cuda.mem_alloc(16 * nbins + (1<<16))
|
d_accum = cuda.mem_alloc(16 * nbins + (1<<16))
|
||||||
d_out = cuda.mem_alloc(16 * nbins)
|
d_out = cuda.mem_alloc(16 * nbins)
|
||||||
|
if info.acc_mode == 'atomic':
|
||||||
|
d_atom = cuda.mem_alloc(8 * nbins)
|
||||||
|
flush_fun = self.mod.get_function("flush_atom")
|
||||||
|
|
||||||
acc_size = np.array([info.acc_width, info.acc_height, info.acc_stride])
|
acc_size = np.array([info.acc_width, info.acc_height, info.acc_stride])
|
||||||
d_acc_size = self.mod.get_global('acc_size')[0]
|
d_acc_size = self.mod.get_global('acc_size')[0]
|
||||||
@ -191,7 +194,7 @@ class Renderer(object):
|
|||||||
d_palint_vals = cuda.to_device(
|
d_palint_vals = cuda.to_device(
|
||||||
np.concatenate(map(info.db.palettes.get, pals[1::2])))
|
np.concatenate(map(info.db.palettes.get, pals[1::2])))
|
||||||
|
|
||||||
if info.acc_mode == 'deferred':
|
if info.acc_mode in ('deferred', 'atomic'):
|
||||||
palette_fun = self.mod.get_function("interp_palette_hsv_flat")
|
palette_fun = self.mod.get_function("interp_palette_hsv_flat")
|
||||||
dsc = cuda.ArrayDescriptor3D()
|
dsc = cuda.ArrayDescriptor3D()
|
||||||
dsc.height = info.palette_height
|
dsc.height = info.palette_height
|
||||||
@ -227,7 +230,7 @@ class Renderer(object):
|
|||||||
|
|
||||||
for idx, start, stop in times:
|
for idx, start, stop in times:
|
||||||
twidth = np.float32((stop-start) / info.palette_height)
|
twidth = np.float32((stop-start) / info.palette_height)
|
||||||
if info.acc_mode == 'deferred':
|
if info.acc_mode in ('deferred', 'atomic'):
|
||||||
palette_fun(d_seeds, d_palint_times, d_palint_vals,
|
palette_fun(d_seeds, d_palint_times, d_palint_vals,
|
||||||
np.float32(start), twidth,
|
np.float32(start), twidth,
|
||||||
block=(256,1,1), grid=(info.palette_height,1),
|
block=(256,1,1), grid=(info.palette_height,1),
|
||||||
@ -257,6 +260,8 @@ class Renderer(object):
|
|||||||
#print '%60s %g' % ('_'.join(n), i)
|
#print '%60s %g' % ('_'.join(n), i)
|
||||||
|
|
||||||
util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, write_stream)
|
util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, write_stream)
|
||||||
|
if info.acc_mode == 'atomic':
|
||||||
|
util.BaseCode.fill_dptr(self.mod, d_atom, 2 * nbins, write_stream)
|
||||||
nrounds = ( (info.density * info.width * info.height)
|
nrounds = ( (info.density * info.width * info.height)
|
||||||
/ (ntemporal_samples * 256 * 256) ) + 1
|
/ (ntemporal_samples * 256 * 256) ) + 1
|
||||||
if info.acc_mode == 'deferred':
|
if info.acc_mode == 'deferred':
|
||||||
@ -273,9 +278,16 @@ class Renderer(object):
|
|||||||
block=(1024, 1, 1), grid=(nwriteblocks, 1),
|
block=(1024, 1, 1), grid=(nwriteblocks, 1),
|
||||||
stream=write_stream)
|
stream=write_stream)
|
||||||
else:
|
else:
|
||||||
iter_fun(np.uint64(d_accum), d_seeds, d_points, d_infos,
|
args = [u64(d_accum), d_seeds, d_points, d_infos]
|
||||||
block=(32, self._iter.NTHREADS/32, 1),
|
if info.acc_mode == 'atomic':
|
||||||
|
args.append(u64(d_atom))
|
||||||
|
iter_fun(*args, block=(32, self._iter.NTHREADS/32, 1),
|
||||||
grid=(ntemporal_samples, nrounds), stream=iter_stream)
|
grid=(ntemporal_samples, nrounds), stream=iter_stream)
|
||||||
|
if info.acc_mode == 'atomic':
|
||||||
|
nblocks = int(np.ceil(np.sqrt(nbins/float(512))))
|
||||||
|
flush_fun(u64(d_accum), u64(d_atom), i32(nbins),
|
||||||
|
block=(512, 1, 1), grid=(nblocks, nblocks),
|
||||||
|
stream=iter_stream)
|
||||||
|
|
||||||
util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
|
util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
|
||||||
_sync_stream(filt_stream, write_stream)
|
_sync_stream(filt_stream, write_stream)
|
||||||
|
Loading…
Reference in New Issue
Block a user