cuburn/cuburn/code/iter.py
2011-10-15 19:08:16 -04:00

293 lines
9.3 KiB
Python

"""
The main iteration loop.
"""
from cuburn.code import mwc, variations
from cuburn.code.util import *
class IterCode(HunkOCode):
# The number of threads per block
NTHREADS = 256
def __init__(self, features):
self.features = features
self.packer = DataPacker('iter_info')
iterbody = self._iterbody()
bodies = [self._xfbody(i,x) for i,x in enumerate(self.features.xforms)]
bodies.append(iterbody)
self.defs = '\n'.join(bodies)
self.decls += self.pix_helpers.substitute(features=features)
decls = """
// Note: for normalized lookups, uchar4 actually returns floats
texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> palTex;
__shared__ iter_info info;
"""
pix_helpers = Template("""
__device__
void read_pix(float4 &pix, float &den) {
den = pix.w;
{{if features.pal_has_alpha}}
read_half(pix.z, pix.w, pix.z, den);
{{endif}}
}
__device__
void write_pix(float4 &pix, float den) {
{{if features.pal_has_alpha}}
write_half(pix.z, pix.z, pix.w, den);
{{endif}}
pix.w = den;
}
__device__
void update_pix(uint64_t ptr, uint32_t i, float4 c) {
{{if features.pal_has_alpha}}
asm volatile ({{crep('''
{
.reg .u16 sz, sw;
.reg .u64 base, off;
.reg .f32 x, y, z, w, den, rc, tz, tw;
// TODO: this limits the accumulation buffer to <4GB
shl.b32 %0, %0, 4;
cvt.u64.u32 off, %0;
add.u64 base, %1, off;
ld.cg.v4.f32 {x, y, z, den}, [base];
add.f32 x, x, %2;
add.f32 y, y, %3;
mov.b32 {sz, sw}, z;
cvt.rn.f32.u16 tz, sz;
cvt.rn.f32.u16 tw, sw;
mul.f32 tz, tz, den;
mul.f32 tw, tz, den;
fma.f32 tz, %4, 65535.0, tz;
fma.f32 tw, %5, 65535.0, tw;
add.f32 den, 1.0;
rcp.approx.f32 rc, den;
mul.f32 tz, tz, rc;
mul.f32 tw, tw, rc;
cvt.rni.u16.f32 sz, tz;
cvt.rni.u16.f32 sw, tw;
mov.b32 z, {sz, sw};
st.cs.v4.f32 [base], {x, y, z, den};
}
''')}} : "+r"(i) : "l"(ptr), "f"(c.x), "f"(c.y), "f"(c.z), "f"(c.w));
{{else}}
asm volatile ({{crep('''
{
.reg .u64 base, off;
.reg .f32 x, y, z, den;
// TODO: this limits the accumulation buffer to <4GB
shl.b32 %0, %0, 4;
cvt.u64.u32 off, %0;
add.u64 base, %1, off;
ld.cg.v4.f32 {x, y, z, den}, [base];
add.f32 x, x, %2;
add.f32 y, y, %3;
add.f32 z, z, %4;
add.f32 den, den, 1.0;
st.cs.v4.f32 [base], {x, y, z, den};
}
''')}} : "+r"(i) : "l"(ptr), "f"(c.x), "f"(c.y), "f"(c.z));
{{endif}}
}
""")
def _xfbody(self, xfid, xform):
px = self.packer.view('info', 'xf%d_' % xfid)
px.sub('xf', 'cp.xforms[%d]' % xfid)
tmpl = Template("""
__device__
void apply_xf{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
float tx, ty;
{{apply_affine_flam3('ox', 'oy', 'tx', 'ty', px, 'xf.c', 'pre')}}
ox = 0;
oy = 0;
{{for v in xform.vars}}
{{if variations.var_nos[v].startswith('pre_')}}
if (1) {
float w = {{px.get('xf.var[%d]' % v)}};
{{variations.var_code[variations.var_nos[v]].substitute(locals())}}
}
{{endif}}
{{endfor}}
{{for v in xform.vars}}
{{if not variations.var_nos[v].startswith('pre_')}}
if (1) {
float w = {{px.get('xf.var[%d]' % v)}};
{{variations.var_code[variations.var_nos[v]].substitute(locals())}}
}
{{endif}}
{{endfor}}
float csp = {{px.get('xf.color_speed')}};
color = color * (1.0f - csp) + {{px.get('xf.color')}} * csp;
};
""")
g = dict(globals())
g.update(locals())
return tmpl.substitute(g)
def _iterbody(self):
tmpl = Template(r'''
__global__
void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) {
__shared__ int nsamps;
mwc_st rctx = msts[gtid()];
iter_info *info_glob = &(infos[blockIdx.x]);
// load info to shared memory cooperatively
for (int i = threadIdx.y * blockDim.x + threadIdx.x;
i * 4 < sizeof(iter_info); i += blockDim.x * blockDim.y)
reinterpret_cast<float*>(&info)[i] =
reinterpret_cast<float*>(info_glob)[i];
if (threadIdx.y == 0 && threadIdx.x == 0)
nsamps = {{packer.get("cp.width * cp.height / cp.ntemporal_samples * cp.adj_density")}};
{{if features.chaos_used}}
int last_xf_used = 0;
{{else}}
// Size can be reduced by a factor of four using a slower 4-stage reduce
__shared__ float swap[{{4*NTHREADS}}];
__shared__ float cosel[{{NWARPS}}];
if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
cosel[threadIdx.x] = mwc_next_01(rctx);
{{endif}}
__syncthreads();
int consec_bad = -{{features.fuse}};
if (threadIdx.y == 1 && threadIdx.x == 0) {
float ditherwidth = {{packer.get("0.33 * cp.spatial_filter_radius")}};
float u0 = mwc_next_01(rctx);
float r = ditherwidth * sqrtf(-2.0f * log2f(u0) / M_LOG2E);
float u1 = 2.0f * M_PI * mwc_next_01(rctx);
info.cam_xo += r * cos(u1);
info.cam_yo += r * sin(u1);
}
float x, y, color;
x = mwc_next_11(rctx);
y = mwc_next_11(rctx);
color = mwc_next_01(rctx);
while (1) {
{{if features.chaos_used}}
// For now, we can't use the swap buffer with chaos enabled
float xfsel = mwc_next_01(rctx);
{{else}}
float xfsel = cosel[threadIdx.y];
{{endif}}
{{if features.chaos_used}}
{{for density_row_idx, prior_xform_idx in enumerate(features.std_xforms)}}
{{for density_col_idx, this_xform_idx in enumerate(features.std_xforms)}}
if (last_xf_used == {{prior_xform_idx}} &&
xfsel <= {{packer.get("cp.chaos_densities[%d][%d]" % (density_row_idx, density_col_idx))}}) {
apply_xf{{this_xform_idx}}(x, y, color, rctx);
last_xf_used = {{this_xform_idx}};
} else
{{endfor}}
{{endfor}}
{{else}}
{{for density_col_idx, this_xform_idx in enumerate(features.std_xforms)}}
if (xfsel <= {{packer.get("cp.norm_density[%d]" % (density_col_idx))}}) {
apply_xf{{this_xform_idx}}(x, y, color, rctx);
} else
{{endfor}}
{{endif}}
{
printf("Reached trap, aborting execution! %g (%d,%d,%d)\n",
xfsel, blockIdx.x, threadIdx.y, threadIdx.x);
asm volatile ("trap;");
}
{{if not features.chaos_used}}
// Swap thread states here so that writeback skipping logic doesn't die
int sw = (threadIdx.y * 32 + threadIdx.x * 33) & {{NTHREADS-1}};
int sr = threadIdx.y * 32 + threadIdx.x;
swap[sw] = consec_bad;
swap[sw+{{NTHREADS}}] = x;
swap[sw+{{2*NTHREADS}}] = y;
swap[sw+{{3*NTHREADS}}] = color;
__syncthreads();
// This is in the middle of the function so that only one sync is
// required per loop.
if (nsamps < 0) break;
{{if not features.chaos_used}}
// Similarly, we select the next xforms here.
if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
cosel[threadIdx.x] = mwc_next_01(rctx);
{{endif}}
consec_bad = swap[sr];
x = swap[sr+{{NTHREADS}}];
y = swap[sr+{{2*NTHREADS}}];
color = swap[sr+{{3*NTHREADS}}];
{{endif}}
if (consec_bad < 0) {
consec_bad++;
continue;
}
int remain = __popc(__ballot(1));
if (threadIdx.x == 0) atomicSub(&nsamps, remain);
{{if features.final_xform_index}}
float fx = x, fy = y, fcolor;
apply_xf{{features.final_xform_index}}(fx, fy, fcolor, rctx);
{{endif}}
// TODO: this may not optimize well, verify.
float cx, cy;
{{if features.final_xform_index}}
{{apply_affine('fx', 'fy', 'cx', 'cy', packer,
'cp.camera_transform', 'cam')}}
{{else}}
{{apply_affine('x', 'y', 'cx', 'cy', packer,
'cp.camera_transform', 'cam')}}
{{endif}}
uint32_t ix = trunca(cx), iy = trunca(cy);
if (ix >= {{features.acc_width}} || iy >= {{features.acc_height}} ) {
consec_bad++;
if (consec_bad > {{features.max_oob}}) {
x = mwc_next_11(rctx);
y = mwc_next_11(rctx);
color = mwc_next_01(rctx);
consec_bad = -{{features.fuse}};
}
continue;
}
uint32_t i = iy * {{features.acc_stride}} + ix;
float4 outcol = tex2D(palTex, color, {{packer.get("cp_step_frac")}});
update_pix(accbuf_ptr, i, outcol);
}
msts[gtid()] = rctx;
}
''')
return tmpl.substitute(
features = self.features,
packer = self.packer.view('info'),
NTHREADS = self.NTHREADS,
NWARPS = self.NTHREADS / 32,
**globals())