mirror of https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 11:40:04 -05:00

Deferred writeback.

This commit is contained in:
parent 05e1d08681
commit eb43b151dc
@@ -122,7 +122,6 @@ class IterCode(HunkOCode):
         bodies = [self._xfbody(i,x) for i,x in sorted(info.genome.xforms.items())]
         bodies.append(iterbody)
         self.defs = '\n'.join(bodies)
-        self.decls += self.pix_helpers.substitute(info=info)

     decls = """
 // Note: for normalized lookups, uchar4 actually returns floats
@@ -132,78 +131,6 @@ __device__ int rb_head, rb_tail, rb_size;

 """

-    pix_helpers = Template("""
-__device__
-void read_pix(float4 &pix, float &den) {
-    den = pix.w;
-    {{if info.pal_has_alpha}}
-    read_half(pix.z, pix.w, pix.z, den);
-    {{endif}}
-}
-
-__device__
-void write_pix(float4 &pix, float den) {
-    {{if info.pal_has_alpha}}
-    write_half(pix.z, pix.z, pix.w, den);
-    {{endif}}
-    pix.w = den;
-}
-
-__device__
-void update_pix(uint64_t ptr, uint32_t i, float4 c) {
-    {{if info.pal_has_alpha}}
-    asm volatile ({{crep('''
-{
-    .reg .u16 sz, sw;
-    .reg .u64 base, off;
-    .reg .f32 x, y, z, w, den, rc, tz, tw;
-
-    // TODO: this limits the accumulation buffer to <4GB
-    shl.b32 %0, %0, 4;
-    cvt.u64.u32 off, %0;
-    add.u64 base, %1, off;
-    ld.cg.v4.f32 {x, y, z, den}, [base];
-    add.f32 x, x, %2;
-    add.f32 y, y, %3;
-    mov.b32 {sz, sw}, z;
-    cvt.rn.f32.u16 tz, sz;
-    cvt.rn.f32.u16 tw, sw;
-    mul.f32 tz, tz, den;
-    mul.f32 tw, tz, den;
-    fma.f32 tz, %4, 65535.0, tz;
-    fma.f32 tw, %5, 65535.0, tw;
-    add.f32 den, 1.0;
-    rcp.approx.f32 rc, den;
-    mul.f32 tz, tz, rc;
-    mul.f32 tw, tw, rc;
-    cvt.rni.u16.f32 sz, tz;
-    cvt.rni.u16.f32 sw, tw;
-    mov.b32 z, {sz, sw};
-    st.cs.v4.f32 [base], {x, y, z, den};
-}
-''')}} : "+r"(i) : "l"(ptr), "f"(c.x), "f"(c.y), "f"(c.z), "f"(c.w));
-    {{else}}
-    asm volatile ({{crep('''
-{
-    .reg .u64 base, off;
-    .reg .f32 x, y, z, den;
-
-    // TODO: this limits the accumulation buffer to <4GB
-    shl.b32 %0, %0, 4;
-    cvt.u64.u32 off, %0;
-    add.u64 base, %1, off;
-    ld.cg.v4.f32 {x, y, z, den}, [base];
-    add.f32 x, x, %2;
-    add.f32 y, y, %3;
-    add.f32 z, z, %4;
-    add.f32 den, den, 1.0;
-    st.cs.v4.f32 [base], {x, y, z, den};
-}
-''')}} : "+r"(i) : "l"(ptr), "f"(c.x), "f"(c.y), "f"(c.z));
-    {{endif}}
-}
-""")
-
     def _xfbody(self, xfid, xform):
         px = self.pcp.xforms[xfid]
         tmpl = Template(r"""
@@ -249,19 +176,23 @@ __global__ void reset_rb(int size) {
 }

 __global__
-void iter(uint64_t accbuf_ptr, mwc_st *msts, float4 *points,
-          const iter_params *all_params, int nsamps_to_generate) {
+void iter(
+        uint64_t out_ptr,
+        mwc_st *msts,
+        float4 *points,
+        const iter_params *all_params,
+        int nsamps_to_generate
+) {
     const iter_params *global_params = &(all_params[blockIdx.x]);

-    __shared__ int nsamps;
-    nsamps = nsamps_to_generate;
+{{if info.acc_mode != 'deferred'}}

     __shared__ float time_frac;
     time_frac = blockIdx.x / (float) gridDim.x;
+{{endif}}

     // load params to shared memory cooperatively
     for (int i = threadIdx.y * blockDim.x + threadIdx.x;
-            i * 4 < sizeof(iter_params); i += blockDim.x * blockDim.y)
+            i < (sizeof(iter_params) / 4); i += blockDim.x * blockDim.y)
         reinterpret_cast<float*>(&params)[i] =
             reinterpret_cast<const float*>(global_params)[i];
@@ -272,9 +203,10 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, float4 *points,
     __syncthreads();
     int this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;
     mwc_st rctx = msts[this_rb_idx];

+    // TODO: 4th channel unused. Kill or use for something helpful
     float4 old_point = points[this_rb_idx];
-    float x = old_point.x, y = old_point.y,
-          color = old_point.z, fuse_rounds = old_point.w;
+    float x = old_point.x, y = old_point.y, color = old_point.z;

     {{if info.chaos_used}}
     int last_xf_used = 0;
@@ -290,116 +222,139 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, float4 *points,
     __syncthreads();
     {{endif}}

+    bool fuse = false;

-    while (1) {
-        // This condition checks for large numbers, Infs, and NaNs.
-        if (!(-(fabsf(x) + fabsf(y) > -1.0e6f))) {
-            x = mwc_next_11(rctx);
-            y = mwc_next_11(rctx);
-            color = mwc_next_01(rctx);
-            fuse_rounds = {{info.fuse / 32}};
-        }
+    // This condition checks for large numbers, Infs, and NaNs.
+    if (!(-(fabsf(x) + fabsf(y)) > -1.0e6f)) {
+        x = mwc_next_11(rctx);
+        y = mwc_next_11(rctx);
+        color = mwc_next_01(rctx);
+        fuse = true;
+    }

-        // 32 rounds is somewhat arbitrary, but it has a pleasing 32-ness
-        for (int i = 0; i < 32; i++) {
+    // TODO: link up with FUSE, etc
+    for (int round = 0; round < 256; round++) {

     {{if info.chaos_used}}

     {{precalc_chaos(pcp, std_xforms)}}

     // For now, we don't attempt to use the swap buffer when chaos is used
     float xfsel = mwc_next_01(rctx);

     {{for prior_xform_idx, prior_xform_name in enumerate(std_xforms)}}
     if (last_xf_used == {{prior_xform_idx}}) {
         {{for xform_idx, xform_name in enumerate(std_xforms[:-1])}}
         if (xfsel <= {{pcp['chaos_'+prior_xform_name+'_'+xform_name]}}) {
             apply_xf_{{xform_name}}(x, y, color, rctx);
             last_xf_used = {{xform_idx}};
-        } else
-        {{endfor}}
-        {
-            apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);
-            last_xf_used = {{len(std_xforms)-1}};
-        }
     } else
     {{endfor}}
     {
-        printf("Something went *very* wrong.\n");
-        asm("trap;");
-    }
-
-    {{else}}
-    {{precalc_densities(pcp, std_xforms)}}
-    float xfsel = cosel[threadIdx.y];

-    {{for xform_name in std_xforms[:-1]}}
-    if (xfsel <= {{pcp['den_'+xform_name]}}) {
-        apply_xf_{{xform_name}}(x, y, color, rctx);
-    } else
-    {{endfor}}
     apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);
-    int sw = (threadIdx.y * 32 + threadIdx.x * 33) & {{NTHREADS-1}};
-    int sr = threadIdx.y * 32 + threadIdx.x;
-    swap[sw] = fuse_rounds;
-    swap[sw+{{NTHREADS}}] = x;
-    swap[sw+{{2*NTHREADS}}] = y;
-    swap[sw+{{3*NTHREADS}}] = color;
-    __syncthreads();
+        last_xf_used = {{len(std_xforms)-1}};
+    }
+    } else
+    {{endfor}}
+    {
+        printf("Something went *very* wrong.\n");
+        asm("trap;");

-    // We select the next xforms here, since we've just synced.
-    if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
-        cosel[threadIdx.x] = mwc_next_01(rctx);

-    fuse_rounds = swap[sr];
-    x = swap[sr+{{NTHREADS}}];
-    y = swap[sr+{{2*NTHREADS}}];
-    color = swap[sr+{{3*NTHREADS}}];

-    {{endif}}

-    if (fuse_rounds > 0.0f) continue;

-    {{if 'final' in cp.xforms}}
-    float fx = x, fy = y, fcolor = color;
-    apply_xf_final(fx, fy, fcolor, rctx);
-    {{endif}}

-    float cx, cy, cc;

-    {{precalc_camera(info, pcp.camera)}}

-    {{if 'final' in cp.xforms}}
-    {{apply_affine('fx', 'fy', 'cx', 'cy', pcp.camera)}}
-    cc = fcolor;
-    {{else}}
-    {{apply_affine('x', 'y', 'cx', 'cy', pcp.camera)}}
-    cc = color;
-    {{endif}}

-    uint32_t ix = trunca(cx), iy = trunca(cy);

-    if (ix >= {{info.acc_width}} || iy >= {{info.acc_height}})
-        continue;

-    uint32_t i = iy * {{info.acc_stride}} + ix;

-    float4 outcol = tex2D(palTex, cc, time_frac);
-    update_pix(accbuf_ptr, i, outcol);
     }

-    int num_okay = __popc(__ballot(fuse_rounds == 0.0f));
-    // Some xforms give so many badvals that a thread is almost guaranteed
-    // to hit another badval before the fuse is over, causing the card to
-    // spin forever. To avoid this, we count a fuse round as 1/4 of a
-    // sample below.
-    if (threadIdx.x == 0) atomicSub(&nsamps, 256 + num_okay * 24);
-    fuse_rounds = fmaxf(0.0f, fuse_rounds - 1.0f);
+    {{else}}
+    {{precalc_densities(pcp, std_xforms)}}
+    float xfsel = cosel[threadIdx.y];

+    {{for xform_name in std_xforms[:-1]}}
+    if (xfsel <= {{pcp['den_'+xform_name]}}) {
+        apply_xf_{{xform_name}}(x, y, color, rctx);
+    } else
+    {{endfor}}
+    apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);

+    int sw = (threadIdx.y * 32 + threadIdx.x * 33) & {{NTHREADS-1}};
+    int sr = threadIdx.y * 32 + threadIdx.x;

+    swap[sw] = fuse ? 1.0f : 0.0f;
+    swap[sw+{{NTHREADS}}] = x;
+    swap[sw+{{2*NTHREADS}}] = y;
+    swap[sw+{{3*NTHREADS}}] = color;
     __syncthreads();

-    if (nsamps <= 0) break;
+    // We select the next xforms here, since we've just synced.
+    if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
+        cosel[threadIdx.x] = mwc_next_01(rctx);

+    fuse = swap[sr];
+    x = swap[sr+{{NTHREADS}}];
+    y = swap[sr+{{2*NTHREADS}}];
+    color = swap[sr+{{3*NTHREADS}}];

+    {{endif}}

+    {{if info.acc_mode == 'deferred'}}
+    int tid = threadIdx.y * 32 + threadIdx.x;
+    int offset = 4 * (256 * (256 * blockIdx.x + round) + tid);
+    int *log = reinterpret_cast<int*>(out_ptr + offset);
+    {{endif}}

+    if (fuse) {
+    {{if info.acc_mode == 'deferred'}}
+        *log = 0xffffffff;
+    {{endif}}
+        continue;
+    }

+    {{if 'final' in cp.xforms}}
+    float fx = x, fy = y, fcolor = color;
+    apply_xf_final(fx, fy, fcolor, rctx);
+    {{endif}}

+    float cx, cy, cc;

+    {{precalc_camera(info, pcp.camera)}}

+    {{if 'final' in cp.xforms}}
+    {{apply_affine('fx', 'fy', 'cx', 'cy', pcp.camera)}}
+    cc = fcolor;
+    {{else}}
+    {{apply_affine('x', 'y', 'cx', 'cy', pcp.camera)}}
+    cc = color;
+    {{endif}}

+    uint32_t ix = trunca(cx), iy = trunca(cy);

+    if (ix >= {{info.acc_width}} || iy >= {{info.acc_height}}) {
+    {{if info.acc_mode == 'deferred'}}
+        *log = 0xffffffff;
+    {{endif}}
+        continue;
+    }

+    uint32_t i = iy * {{info.acc_stride}} + ix;

+    {{if info.acc_mode == 'atomic'}}
+    float4 outcol = tex2D(palTex, cc, time_frac);
+    float *accbuf_f = reinterpret_cast<float*>(out_ptr + (16*i));
+    atomicAdd(accbuf_f, outcol.x);
+    atomicAdd(accbuf_f+1, outcol.y);
+    atomicAdd(accbuf_f+2, outcol.z);
+    atomicAdd(accbuf_f+3, 1.0f);
+    {{elif info.acc_mode == 'global'}}
+    float4 outcol = tex2D(palTex, cc, time_frac);
+    float4 *accbuf = reinterpret_cast<float4*>(out_ptr + (16*i));
+    float4 pix = *accbuf;
+    pix.x += outcol.x;
+    pix.y += outcol.y;
+    pix.z += outcol.z;
+    pix.w += 1.0f;
+    *accbuf = pix;
+    {{elif info.acc_mode == 'deferred'}}
+    // 'color' gets the top 9 bits. TODO: add dithering via precalc.
+    uint32_t icolor = cc * 512.0f;
+    asm("bfi.b32 %0, %1, %0, 23, 9;" : "+r"(i) : "r"(icolor));
+    *log = i;
+    {{endif}}
     }

     if (threadIdx.x == 0 && threadIdx.y == 0)
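The deferred path above replaces the old immediate update_pix writeback: every round, each thread emits one 32-bit log entry, with the accumulator address in the low 23 bits and a 9-bit palette index inserted at bit offset 23 by the bfi.b32 instruction; 0xffffffff marks a fused or out-of-bounds sample. A minimal host-side sketch of that entry format (helper names are hypothetical, not from the source):

    def pack_log_entry(addr, cc):
        """Pack a 23-bit accumulator address and a color in [0, 1) into one
        u32, mirroring the device's "bfi.b32 %0, %1, %0, 23, 9"."""
        icolor = int(cc * 512.0) & 0x1ff        # 9-bit palette index
        return (icolor << 23) | (addr & 0x7fffff)

    def unpack_log_entry(entry):
        """Recover (address, palette index), as write_shmem's bfe_decl calls do."""
        return entry & 0x7fffff, (entry >> 23) & 0x1ff

    assert unpack_log_entry(pack_log_entry(123456, 0.25)) == (123456, 128)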
@@ -407,10 +362,140 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, float4 *points,
     __syncthreads();
     this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;

-    points[this_rb_idx] = make_float4(x, y, color, fuse_rounds);
+    points[this_rb_idx] = make_float4(x, y, color, 0.0f);
     msts[this_rb_idx] = rctx;
     return;
 }

+// Block size, shared accumulation bits, shared accumulation width.
+#define BS 1024
+#define SHAB 12
+#define SHAW (1<<SHAB)
+
+// These two accumulators, used in write_shmem, hold {density, red} and
+// {green, blue} values as packed u16 pairs. The fixed size represents 4,096
+// pixels in the accumulator.
+__shared__ uint32_t s_acc_dr[SHAW];
+__shared__ uint32_t s_acc_gb[SHAW];
+
+// Read from the shm accumulators and write to the global ones.
+__device__
+void write_shmem_helper(
+        float4 *acc,
+        const int glo_base,
+        const int idx
+) {
+    float4 pix = acc[glo_base+idx];
+    uint32_t dr = s_acc_dr[idx];
+    pix.x += (dr & 0xffff) / 255.0f;
+    pix.w += dr >> 16;
+    uint32_t gb = s_acc_gb[idx];
+    pix.y += (gb & 0xffff) / 255.0f;
+    pix.z += (gb >> 16) / 255.0f;
+    acc[glo_base+idx] = pix;
+}
+
+// Read the point log, accumulate in shared memory, and write the results.
+// This kernel is to be launched with one block for every 4,096 addresses to
+// be processed, and will handle those addresses.
+//
+// log_bounds is an array mapping radix values to the first index in the log
+// with that radix position. For performance reasons in other parts of the
+// code, the radix may actually include bits within the lower SHAB part of the
+// address, or it might not cover the first few bits after the SHAB part;
+// log_bounds_shift covers that. glob_addr_bits specifies the number of bits
+// above SHAB which are address bits.
+
+__global__ void
+__launch_bounds__(BS, 1)
+write_shmem(
+        float4 *acc,
+        const uint32_t *log,
+        const uint32_t *log_bounds,
+        const int log_bounds_shift
+) {
+    const int tid = threadIdx.x;
+    const int bid = blockIdx.x;
+
+    // TODO: doesn't respect SHAW/BS
+    // TODO: compare generated code with unrolled for-loop
+    s_acc_dr[tid] = 0;
+    s_acc_gb[tid] = 0;
+    s_acc_dr[tid+BS] = 0;
+    s_acc_gb[tid+BS] = 0;
+    s_acc_dr[tid+2*BS] = 0;
+    s_acc_gb[tid+2*BS] = 0;
+    s_acc_dr[tid+3*BS] = 0;
+    s_acc_gb[tid+3*BS] = 0;
+    __syncthreads();
+
+    // TODO: share across threads - discernable performance impact?
+    int lb_idx_lo, lb_idx_hi;
+    if (log_bounds_shift > 0) {
+        lb_idx_hi = ((bid + 1) << log_bounds_shift) - 1;
+        lb_idx_lo = (bid << log_bounds_shift) - 1;
+    } else {
+        lb_idx_hi = bid >> (-log_bounds_shift);
+        lb_idx_lo = lb_idx_hi - 1;
+    }
+
+    int idx_lo, idx_hi;
+    if (lb_idx_lo < 0) idx_lo = 0;
+    else idx_lo = log_bounds[lb_idx_lo] & ~(BS-1);
+    idx_hi = (log_bounds[lb_idx_hi] & ~(BS - 1)) + BS;
+
+    float rnrounds = 1.0f / (idx_hi - idx_lo);
+    float time = tid * rnrounds;
+    float time_step = BS * rnrounds;
+
+    int glo_base = bid << SHAB;
+
+    for (int i = idx_lo + tid; i < idx_hi; i += BS) {
+        int entry = log[i];
+
+        // TODO: constant '11' is really just 32 - 9 - SHAB, where 9 is the
+        // number of bits assigned to color. This ignores opacity.
+        bfe_decl(glob_addr, entry, SHAB, 11);
+        if (glob_addr != bid) continue;
+
+        bfe_decl(shr_addr, entry, 0, SHAB);
+        bfe_decl(color, entry, 23, 9);
+
+        float colorf = color / 512.0f;
+        float4 outcol = tex2D(palTex, colorf, time);
+
+        // TODO: change texture sampler to return shorts and avoid this
+        uint32_t r = 255.0f * outcol.x;
+        uint32_t g = 255.0f * outcol.y;
+        uint32_t b = 255.0f * outcol.z;
+
+        uint32_t dr = atomicAdd(s_acc_dr + shr_addr, r + 0x10000);
+        uint32_t gb = atomicAdd(s_acc_gb + shr_addr, g + (b << 16));
+        uint32_t d = dr >> 16;
+
+        // Neat trick: if overflow is about to happen, write the accumulator,
+        // and subtract the last known values from the accumulator again.
+        // Even if the ints end up wrapping around once before the subtraction
+        // can occur, the results after the subtraction will be correct.
+        // (Wrapping twice will mess up the intermediate write, but is pretty
+        // unlikely.)
+        if (d == 250) {
+            atomicSub(s_acc_dr + shr_addr, dr);
+            atomicSub(s_acc_gb + shr_addr, gb);
+            write_shmem_helper(acc, glo_base, shr_addr);
+        }
+        time += time_step;
+    }
+
+    __syncthreads();
+    int idx = tid;
+    for (int i = 0; i < (SHAW / BS); i++) {
+        write_shmem_helper(acc, glo_base, idx);
+        idx += BS;
+    }
+}
+
 ''')
         return tmpl.substitute(
                 info = self.info,
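The packed-u16 trick above deserves a note: one u32 cell holds density in its high half and a red sum in its low half, so a single atomicAdd of (r + 0x10000) bumps both counters at once. Each sample adds at most 255 to the low half, so a cell must be flushed before 65536 / 255 (about 257) samples land in it; the kernel flushes when the old density reads 250, and the subtract-after-flush keeps the running total correct even if another thread's add slipped in between. A toy single-threaded model of the {density, red} cell (hypothetical names, not from the source):

    MASK = 0xffffffff

    def add_sample(dr, r):
        """Add one sample with red value r; return (new_dr, flushed_or_None)."""
        old = dr                                # what atomicAdd would return
        dr = (dr + r + 0x10000) & MASK          # one add bumps both halves
        flushed = None
        if (old >> 16) == 250:                  # low half is near overflow
            dr = (dr - old) & MASK              # subtract the flushed portion
            flushed = (old >> 16, (old & 0xffff) / 255.0)   # (density, red)
        return dr, flushed

    dr, flushed = 0, None
    for _ in range(251):
        dr, f = add_sample(dr, 255)
        flushed = f or flushed
    assert flushed == (250, 250.0)   # 250 samples written out at the flush
    assert dr >> 16 == 1             # one sample still pending in the cell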
@@ -71,6 +71,13 @@ float3 hsv2rgb(float3 hsv);
 #define M_SQRT2 1.41421353816986f
 #define M_SQRT1_2 0.70710676908493f

+#define bfe(d, s, o, w) \
+    asm("bfe.u32 %0, %1, %2, %3;" : "=r"(d) : "r"(s), "r"(o), "r"(w))
+
+#define bfe_decl(d, s, o, w) \
+    int d; \
+    bfe(d, s, o, w)
+
 // TODO: use launch parameter preconfig to eliminate unnecessary parts
 __device__
 uint32_t gtid() {
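PTX's bfe.u32 extracts a w-bit field starting at bit o; in plain Python the same operation is a shift and a mask. A quick sanity check (a sketch) against the three fields write_shmem pulls out of a log entry:

    def bfe(s, o, w):
        """Bit-field extract, matching PTX bfe.u32 for in-range o and w."""
        return (s >> o) & ((1 << w) - 1)

    entry = (128 << 23) | (5 << 12) | 0xabc
    assert bfe(entry, 0, 12) == 0xabc    # shr_addr: low SHAB bits
    assert bfe(entry, 12, 11) == 5       # glob_addr: which write block
    assert bfe(entry, 23, 9) == 128      # 9-bit palette index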
@@ -99,8 +99,10 @@ class RenderInfo(object):
     genomes. The values of this class are fixed before compilation begins.
     """
     # Number of iterations to iterate without write after generating a new
-    # point, including the number of bad
-    fuse = 192
+    # point. This number is currently fixed pretty deeply in the set of magic
+    # constants which govern buffer sizes; changing the value here won't
+    # actually change the code on the device to do something different.
+    fuse = 256

     # Height of the texture pallete which gets uploaded to the GPU (assuming
     # that palette-from-texture is enabled). For most genomes, this doesn't
@@ -120,11 +122,19 @@ class RenderInfo(object):
     # which I'm not opposed to)
     alpha_output_channel = False

-    # TODO: fix these
+    # There are three settings for this somewhat ersatz paramater. 'global'
+    # uses unsynchronized global writes to accumulate sample points, 'atomic'
+    # uses atomic global writes, and 'deferred' stores color and position in a
+    # sample log, sorts the log by position, and uses shared memory to
+    # perform the accumulation. Deferred has the accuracy of 'atomic' and
+    # the speed of 'global' (it's actually faster!), but packs color and
+    # position into a single 32-bit int for now, which limits resolution to
+    # 1080p when xform opacity is respected, so the other two modes will hang
+    # around until that can be extended to be memory-limited again.
+    acc_mode = 'deferred'
+
+    # TODO: fix this
     chaos_used = False
-    final_xform_index = 3
-    pal_has_alpha = False
-    density = 2000

     def __init__(self, db, **kwargs):
         self.db = db
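The 1080p limit mentioned in that comment follows from the bit budget: 9 of a log entry's 32 bits carry the palette index, leaving 23 bits of accumulator address, or 2^23 addressable bins. A rough check (a sketch, assuming a hypothetical 16-pixel gutter on each side of the frame):

    color_bits = 9
    addr_bits = 32 - color_bits            # 23 bits of accumulator address
    max_bins = 1 << addr_bits              # 8,388,608 addressable bins

    assert (1920 + 32) * (1080 + 32) <= max_bins      # 1080p + gutters fits
    assert not (3840 + 32) * (2160 + 32) <= max_bins  # a 4K frame does not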
@@ -134,6 +144,7 @@ class RenderInfo(object):
         self.acc_width = self.width + 2 * self.gutter
         self.acc_height = self.height + 2 * self.gutter
         self.acc_stride = 32 * int(np.ceil(self.acc_width / 32.))
+        self.density = self.quality

         # Deref genome
         self.genome = self.db.genomes[self.genome]
@@ -20,10 +20,13 @@ import pycuda.tools

 import cuburn.genome
 from cuburn import affine
-from cuburn.code import util, mwc, iter, filtering
+from cuburn.code import util, mwc, iter, filtering, sort

 RenderedImage = namedtuple('RenderedImage', 'buf idx gpu_time')

+def _sync_stream(dst, src):
+    dst.wait_for_event(cuda.Event(cuda.event_flags.DISABLE_TIMING).record(src))
+
 class Renderer(object):
     """
     Control structure for rendering a series of frames.
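The new _sync_stream helper orders two streams on the device without blocking the host: it records an event on the source stream and makes the destination stream wait for it. A sketch of the intended usage (the kernels are hypothetical placeholders):

    import pycuda.autoinit            # creates a context so streams can be made
    import pycuda.driver as cuda

    def _sync_stream(dst, src):       # as defined in the diff above
        dst.wait_for_event(cuda.Event(cuda.event_flags.DISABLE_TIMING).record(src))

    producer, consumer = cuda.Stream(), cuda.Stream()
    # produce_kernel(..., stream=producer)   # fills a buffer on 'producer'
    _sync_stream(consumer, producer)         # 'consumer' waits for that work
    # consume_kernel(..., stream=consumer)   # now safe to read the buffer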
@@ -107,16 +110,47 @@ class Renderer(object):
         packer_fun = self.mod.get_function("interp_iter_params")
         palette_fun = self.mod.get_function("interp_palette_hsv")
         iter_fun = self.mod.get_function("iter")
+        write_fun = self.mod.get_function("write_shmem")

         info = self.info
-        stream = cuda.Stream()
-        event_a = cuda.Event().record(stream)
+        # The synchronization model is messy. See helpers/task_model.svg.
+        iter_stream = cuda.Stream()
+        filt_stream = cuda.Stream()
+        if info.acc_mode == 'deferred':
+            write_stream = cuda.Stream()
+        else:
+            write_stream = iter_stream
+
+        # These events fire when the corresponding buffer is available for
+        # reading on the host (i.e. the copy is done). On the first pass, 'a'
+        # will be ignored, and subsequently moved to 'b'.
+        event_a = cuda.Event().record(filt_stream)
         event_b = None

         nbins = info.acc_height * info.acc_stride
         d_accum = cuda.mem_alloc(16 * nbins)
         d_out = cuda.mem_alloc(16 * nbins)

+        if info.acc_mode == 'deferred':
+            # Having a fixed, power-of-two log size makes things much easier
+            log_size = 64 << 20
+            d_log = cuda.mem_alloc(log_size * 4)
+            d_log_sorted = cuda.mem_alloc(log_size * 4)
+            sorter = sort.Sorter(log_size)
+
+            # Shared accumulators take care of the lowest 12 bits, but due to
+            # a quirk of the sort implementation, asking the sort to handle
+            # fewer bits than it is compiled for will make it considerably
+            # slower (and it can't be compiled for <7b), so we actually dig in
+            # to the accumulator's SHAB window for those cases.
+            SHAB = np.int32(12)
+            address_bits = np.int32(np.ceil(np.log2(nbins+1)))
+            start_bit = address_bits - sorter.radix_bits
+            log_shift = np.int32(SHAB - start_bit)
+            nwriteblocks = int(np.ceil(nbins / (1<<SHAB)))
+            print start_bit, log_shift, nwriteblocks
+
         # Calculate 'nslots', the number of simultaneous running threads that
         # can be active on the GPU during iteration (and thus the number of
         # slots for loading and storing RNG and point context that will be
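A worked example (a sketch, with an assumed 8-bit-radix sorter and 16-pixel gutters) of the window math above: the sort only orders the log by the top radix_bits of each entry's address, and log_shift records how far that window reaches into, or falls short of, the SHAB bits handled in shared memory.

    import numpy as np

    nbins = (1080 + 32) * (1920 + 32)     # acc_height * acc_stride
    SHAB = np.int32(12)
    radix_bits = 8                        # assumed; set by the sort module

    address_bits = np.int32(np.ceil(np.log2(nbins + 1)))   # 22
    start_bit = address_bits - radix_bits                  # 14
    log_shift = np.int32(SHAB - start_bit)                 # -2
    nwriteblocks = int(np.ceil(nbins / float(1 << SHAB)))  # 530

    # log_shift < 0: the sorted radix is coarser than one write block, so
    # write_shmem finds each block's slice with lb_idx_hi = bid >> 2.
    assert (start_bit, log_shift, nwriteblocks) == (14, -2, 530)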
@@ -131,7 +165,6 @@ class Renderer(object):
         nsms = cuda.Context.get_device().multiprocessor_count
         rb_size = occupancy.warps_per_mp * nsms / (iter_threads_per_block / 32)
         nslots = iter_threads_per_block * rb_size
-        ntemporal_samples = int(np.ceil(1000. / rb_size) * rb_size)

         # Reset the ringbuffer info for the slots
         reset_rb_fun(np.int32(rb_size), block=(1,1,1))
@@ -140,6 +173,11 @@ class Renderer(object):
         seeds = mwc.MWC.make_seeds(nslots)
         d_seeds = cuda.to_device(seeds)

+        # We used to auto-calculate this to a multiple of the number of SMs on
+        # the device, but since we now use shorter launches and, to a certain
+        # extent, allow simultaneous occupancy, that's not as important. The
+        # 1024 is a magic constant, though: FUSE
+        ntemporal_samples = 1024
         genome_times, genome_knots = self._iter.packer.pack()
         d_genome_times = cuda.to_device(genome_times)
         d_genome_knots = cuda.to_device(genome_knots)
@@ -174,7 +212,7 @@ class Renderer(object):
             palette_fun(d_palmem, d_palint_times, d_palint_vals,
                         np.float32(start), width,
                         block=(256,1,1), grid=(info.palette_height,1),
-                        stream=stream)
+                        stream=write_stream)

             # TODO: do we need to do this each time in order to reset cache?
             tref = self.mod.get_texref('palTex')
@@ -188,11 +226,11 @@ class Renderer(object):
                        np.float32(start), width, d_seeds,
                        np.int32(ntemporal_samples), block=(256,1,1),
                        grid=(int(np.ceil(ntemporal_samples/256.)),1),
-                       stream=stream)
+                       stream=iter_stream)

-            # TODO: if we only do this once per anim, does quality improve?
+            # Reset points so that they will be FUSEd
             util.BaseCode.fill_dptr(self.mod, d_points, 4 * nslots,
-                                    stream, np.float32(np.nan))
+                                    iter_stream, np.float32(np.nan))

             # Get interpolated control points for debugging
             #stream.synchronize()
@@ -201,20 +239,34 @@ class Renderer(object):
             #for i, n in zip(d_temp[5], self._iter.packer.packed):
                 #print '%60s %g' % ('_'.join(n), i)

-            util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, stream)
-            nsamps = info.density * info.width * info.height / ntemporal_samples
-            iter_fun(np.uint64(d_accum), d_seeds, d_points,
-                     d_infos, np.int32(nsamps),
-                     block=(32, self._iter.NTHREADS/32, 1),
-                     grid=(ntemporal_samples, 1),
-                     texrefs=[tref], stream=stream)
+            util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, write_stream)
+            nrounds = ( (info.density * info.width * info.height)
+                      / (ntemporal_samples * 256 * 256) ) + 1
+            if info.acc_mode == 'deferred':
+                for i in range(nrounds):
+                    iter_fun(np.uint64(d_log), d_seeds, d_points, d_infos,
+                             block=(32, self._iter.NTHREADS/32, 1),
+                             grid=(ntemporal_samples, 1),
+                             texrefs=[tref], stream=iter_stream)
+                    _sync_stream(write_stream, iter_stream)
+                    sorter.sort(d_log_sorted, d_log, log_size, start_bit, True,
+                                stream=write_stream)
+                    _sync_stream(iter_stream, write_stream)
+                    write_fun(d_accum, d_log_sorted, sorter.dglobal, log_shift,
+                              block=(1024, 1, 1), grid=(nwriteblocks, 1),
+                              stream=write_stream)
+            else:
+                iter_fun(np.uint64(d_accum), d_seeds, d_points, d_infos,
+                         block=(32, self._iter.NTHREADS/32, 1),
+                         grid=(ntemporal_samples, nrounds),
+                         texrefs=[tref], stream=iter_stream)

-            stream.synchronize()
-            util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, stream)
-            filt.de(d_out, d_accum, info, start, stop, stream)
-            filt.colorclip(d_out, info, start, stop, stream)
-            cuda.memcpy_dtoh_async(h_out_a, d_out, stream)
+            util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
+            _sync_stream(filt_stream, write_stream)
+            filt.de(d_out, d_accum, info, start, stop, filt_stream)
+            _sync_stream(write_stream, filt_stream)
+            filt.colorclip(d_out, info, start, stop, filt_stream)
+            cuda.memcpy_dtoh_async(h_out_a, d_out, filt_stream)

             if event_b:
                 while not event_a.query():
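The nrounds arithmetic above sizes the launches: each iter block runs 256 threads for 256 rounds, so a launch of ntemporal_samples blocks emits ntemporal_samples * 65,536 samples, and at ntemporal_samples = 1024 that is exactly the 64M-entry log allocated earlier; each deferred pass fills the log once, sorts it, and writes it back. A quick check (a sketch, with an illustrative density):

    density, width, height = 2000, 1920, 1080     # density is illustrative
    ntemporal_samples = 1024

    samples_per_launch = ntemporal_samples * 256 * 256
    assert samples_per_launch == 64 << 20         # one launch fills the log

    total = density * width * height
    nrounds = total // samples_per_launch + 1     # the source's py2 "/" floors too
    assert nrounds == 62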
@@ -222,11 +274,10 @@ class Renderer(object):
                 gpu_time = event_a.time_since(event_b)
                 yield RenderedImage(self._trim(h_out_b), last_idx, gpu_time)

-            event_a, event_b = cuda.Event().record(stream), event_a
+            event_a, event_b = cuda.Event().record(filt_stream), event_a
             h_out_a, h_out_b = h_out_b, h_out_a
             last_idx = idx

         while not event_a.query():
             timemod.sleep(0.001)
         gpu_time = event_a.time_since(event_b)