mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-04-21 09:01:31 -04:00
293 lines
9.3 KiB
Python
293 lines
9.3 KiB
Python
"""
|
|
The main iteration loop.
|
|
"""
|
|
|
|
from cuburn.code import mwc, variations
|
|
from cuburn.code.util import *
|
|
|
|
class IterCode(HunkOCode):
|
|
# The number of threads per block
|
|
NTHREADS = 256
|
|
|
|
def __init__(self, features):
|
|
self.features = features
|
|
self.packer = DataPacker('iter_info')
|
|
iterbody = self._iterbody()
|
|
bodies = [self._xfbody(i,x) for i,x in enumerate(self.features.xforms)]
|
|
bodies.append(iterbody)
|
|
self.defs = '\n'.join(bodies)
|
|
self.decls += self.pix_helpers.substitute(features=features)
|
|
|
|
decls = """
|
|
// Note: for normalized lookups, uchar4 actually returns floats
|
|
texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> palTex;
|
|
__shared__ iter_info info;
|
|
|
|
"""
|
|
|
|
pix_helpers = Template("""
|
|
__device__
|
|
void read_pix(float4 &pix, float &den) {
|
|
den = pix.w;
|
|
{{if features.pal_has_alpha}}
|
|
read_half(pix.z, pix.w, pix.z, den);
|
|
{{endif}}
|
|
}
|
|
|
|
__device__
|
|
void write_pix(float4 &pix, float den) {
|
|
{{if features.pal_has_alpha}}
|
|
write_half(pix.z, pix.z, pix.w, den);
|
|
{{endif}}
|
|
pix.w = den;
|
|
}
|
|
|
|
__device__
|
|
void update_pix(uint64_t ptr, uint32_t i, float4 c) {
|
|
{{if features.pal_has_alpha}}
|
|
asm volatile ({{crep('''
|
|
{
|
|
.reg .u16 sz, sw;
|
|
.reg .u64 base, off;
|
|
.reg .f32 x, y, z, w, den, rc, tz, tw;
|
|
|
|
// TODO: this limits the accumulation buffer to <4GB
|
|
shl.b32 %0, %0, 4;
|
|
cvt.u64.u32 off, %0;
|
|
add.u64 base, %1, off;
|
|
ld.cg.v4.f32 {x, y, z, den}, [base];
|
|
add.f32 x, x, %2;
|
|
add.f32 y, y, %3;
|
|
mov.b32 {sz, sw}, z;
|
|
cvt.rn.f32.u16 tz, sz;
|
|
cvt.rn.f32.u16 tw, sw;
|
|
mul.f32 tz, tz, den;
|
|
mul.f32 tw, tz, den;
|
|
fma.f32 tz, %4, 65535.0, tz;
|
|
fma.f32 tw, %5, 65535.0, tw;
|
|
add.f32 den, 1.0;
|
|
rcp.approx.f32 rc, den;
|
|
mul.f32 tz, tz, rc;
|
|
mul.f32 tw, tw, rc;
|
|
cvt.rni.u16.f32 sz, tz;
|
|
cvt.rni.u16.f32 sw, tw;
|
|
mov.b32 z, {sz, sw};
|
|
st.cs.v4.f32 [base], {x, y, z, den};
|
|
}
|
|
''')}} : "+r"(i) : "l"(ptr), "f"(c.x), "f"(c.y), "f"(c.z), "f"(c.w));
|
|
{{else}}
|
|
asm volatile ({{crep('''
|
|
{
|
|
.reg .u64 base, off;
|
|
.reg .f32 x, y, z, den;
|
|
|
|
// TODO: this limits the accumulation buffer to <4GB
|
|
shl.b32 %0, %0, 4;
|
|
cvt.u64.u32 off, %0;
|
|
add.u64 base, %1, off;
|
|
ld.cg.v4.f32 {x, y, z, den}, [base];
|
|
add.f32 x, x, %2;
|
|
add.f32 y, y, %3;
|
|
add.f32 z, z, %4;
|
|
add.f32 den, den, 1.0;
|
|
st.cs.v4.f32 [base], {x, y, z, den};
|
|
}
|
|
''')}} : "+r"(i) : "l"(ptr), "f"(c.x), "f"(c.y), "f"(c.z));
|
|
{{endif}}
|
|
}
|
|
""")
|
|
|
|
def _xfbody(self, xfid, xform):
|
|
px = self.packer.view('info', 'xf%d_' % xfid)
|
|
px.sub('xf', 'cp.xforms[%d]' % xfid)
|
|
|
|
tmpl = Template("""
|
|
__device__
|
|
void apply_xf{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
|
|
float tx, ty;
|
|
|
|
{{apply_affine_flam3('ox', 'oy', 'tx', 'ty', px, 'xf.c', 'pre')}}
|
|
|
|
ox = 0;
|
|
oy = 0;
|
|
|
|
{{for v in xform.vars}}
|
|
{{if variations.var_nos[v].startswith('pre_')}}
|
|
if (1) {
|
|
float w = {{px.get('xf.var[%d]' % v)}};
|
|
{{variations.var_code[variations.var_nos[v]].substitute(locals())}}
|
|
}
|
|
{{endif}}
|
|
{{endfor}}
|
|
|
|
{{for v in xform.vars}}
|
|
{{if not variations.var_nos[v].startswith('pre_')}}
|
|
if (1) {
|
|
float w = {{px.get('xf.var[%d]' % v)}};
|
|
{{variations.var_code[variations.var_nos[v]].substitute(locals())}}
|
|
}
|
|
{{endif}}
|
|
{{endfor}}
|
|
|
|
float csp = {{px.get('xf.color_speed')}};
|
|
color = color * (1.0f - csp) + {{px.get('xf.color')}} * csp;
|
|
};
|
|
""")
|
|
g = dict(globals())
|
|
g.update(locals())
|
|
return tmpl.substitute(g)
|
|
|
|
def _iterbody(self):
|
|
tmpl = Template(r'''
|
|
__global__
|
|
void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) {
|
|
__shared__ int nsamps;
|
|
mwc_st rctx = msts[gtid()];
|
|
iter_info *info_glob = &(infos[blockIdx.x]);
|
|
|
|
// load info to shared memory cooperatively
|
|
for (int i = threadIdx.y * blockDim.x + threadIdx.x;
|
|
i * 4 < sizeof(iter_info); i += blockDim.x * blockDim.y)
|
|
reinterpret_cast<float*>(&info)[i] =
|
|
reinterpret_cast<float*>(info_glob)[i];
|
|
|
|
if (threadIdx.y == 0 && threadIdx.x == 0)
|
|
nsamps = {{packer.get("cp.width * cp.height / cp.ntemporal_samples * cp.adj_density")}};
|
|
|
|
{{if features.chaos_used}}
|
|
int last_xf_used = 0;
|
|
{{else}}
|
|
// Size can be reduced by a factor of four using a slower 4-stage reduce
|
|
__shared__ float swap[{{4*NTHREADS}}];
|
|
__shared__ float cosel[{{NWARPS}}];
|
|
if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
|
|
cosel[threadIdx.x] = mwc_next_01(rctx);
|
|
{{endif}}
|
|
|
|
__syncthreads();
|
|
int consec_bad = -{{features.fuse}};
|
|
|
|
if (threadIdx.y == 1 && threadIdx.x == 0) {
|
|
float ditherwidth = {{packer.get("0.33 * cp.spatial_filter_radius")}};
|
|
float u0 = mwc_next_01(rctx);
|
|
float r = ditherwidth * sqrtf(-2.0f * log2f(u0) / M_LOG2E);
|
|
|
|
float u1 = 2.0f * M_PI * mwc_next_01(rctx);
|
|
info.cam_xo += r * cos(u1);
|
|
info.cam_yo += r * sin(u1);
|
|
}
|
|
|
|
float x, y, color;
|
|
x = mwc_next_11(rctx);
|
|
y = mwc_next_11(rctx);
|
|
color = mwc_next_01(rctx);
|
|
|
|
while (1) {
|
|
{{if features.chaos_used}}
|
|
// For now, we can't use the swap buffer with chaos enabled
|
|
float xfsel = mwc_next_01(rctx);
|
|
{{else}}
|
|
float xfsel = cosel[threadIdx.y];
|
|
{{endif}}
|
|
|
|
{{if features.chaos_used}}
|
|
{{for density_row_idx, prior_xform_idx in enumerate(features.std_xforms)}}
|
|
{{for density_col_idx, this_xform_idx in enumerate(features.std_xforms)}}
|
|
if (last_xf_used == {{prior_xform_idx}} &&
|
|
xfsel <= {{packer.get("cp.chaos_densities[%d][%d]" % (density_row_idx, density_col_idx))}}) {
|
|
apply_xf{{this_xform_idx}}(x, y, color, rctx);
|
|
last_xf_used = {{this_xform_idx}};
|
|
} else
|
|
{{endfor}}
|
|
{{endfor}}
|
|
{{else}}
|
|
{{for density_col_idx, this_xform_idx in enumerate(features.std_xforms)}}
|
|
if (xfsel <= {{packer.get("cp.norm_density[%d]" % (density_col_idx))}}) {
|
|
apply_xf{{this_xform_idx}}(x, y, color, rctx);
|
|
} else
|
|
{{endfor}}
|
|
{{endif}}
|
|
{
|
|
printf("Reached trap, aborting execution! %g (%d,%d,%d)\n",
|
|
xfsel, blockIdx.x, threadIdx.y, threadIdx.x);
|
|
asm volatile ("trap;");
|
|
}
|
|
|
|
{{if not features.chaos_used}}
|
|
// Swap thread states here so that writeback skipping logic doesn't die
|
|
int sw = (threadIdx.y * 32 + threadIdx.x * 33) & {{NTHREADS-1}};
|
|
int sr = threadIdx.y * 32 + threadIdx.x;
|
|
|
|
swap[sw] = consec_bad;
|
|
swap[sw+{{NTHREADS}}] = x;
|
|
swap[sw+{{2*NTHREADS}}] = y;
|
|
swap[sw+{{3*NTHREADS}}] = color;
|
|
__syncthreads();
|
|
// This is in the middle of the function so that only one sync is
|
|
// required per loop.
|
|
if (nsamps < 0) break;
|
|
|
|
{{if not features.chaos_used}}
|
|
// Similarly, we select the next xforms here.
|
|
if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
|
|
cosel[threadIdx.x] = mwc_next_01(rctx);
|
|
{{endif}}
|
|
|
|
consec_bad = swap[sr];
|
|
x = swap[sr+{{NTHREADS}}];
|
|
y = swap[sr+{{2*NTHREADS}}];
|
|
color = swap[sr+{{3*NTHREADS}}];
|
|
{{endif}}
|
|
|
|
if (consec_bad < 0) {
|
|
consec_bad++;
|
|
continue;
|
|
}
|
|
|
|
int remain = __popc(__ballot(1));
|
|
if (threadIdx.x == 0) atomicSub(&nsamps, remain);
|
|
|
|
{{if features.final_xform_index}}
|
|
float fx = x, fy = y, fcolor;
|
|
apply_xf{{features.final_xform_index}}(fx, fy, fcolor, rctx);
|
|
{{endif}}
|
|
|
|
// TODO: this may not optimize well, verify.
|
|
float cx, cy;
|
|
{{if features.final_xform_index}}
|
|
{{apply_affine('fx', 'fy', 'cx', 'cy', packer,
|
|
'cp.camera_transform', 'cam')}}
|
|
{{else}}
|
|
{{apply_affine('x', 'y', 'cx', 'cy', packer,
|
|
'cp.camera_transform', 'cam')}}
|
|
{{endif}}
|
|
uint32_t ix = trunca(cx), iy = trunca(cy);
|
|
|
|
if (ix >= {{features.acc_width}} || iy >= {{features.acc_height}} ) {
|
|
consec_bad++;
|
|
if (consec_bad > {{features.max_oob}}) {
|
|
x = mwc_next_11(rctx);
|
|
y = mwc_next_11(rctx);
|
|
color = mwc_next_01(rctx);
|
|
consec_bad = -{{features.fuse}};
|
|
}
|
|
continue;
|
|
}
|
|
|
|
uint32_t i = iy * {{features.acc_stride}} + ix;
|
|
|
|
float4 outcol = tex2D(palTex, color, {{packer.get("cp_step_frac")}});
|
|
update_pix(accbuf_ptr, i, outcol);
|
|
|
|
}
|
|
msts[gtid()] = rctx;
|
|
}
|
|
''')
|
|
return tmpl.substitute(
|
|
features = self.features,
|
|
packer = self.packer.view('info'),
|
|
NTHREADS = self.NTHREADS,
|
|
NWARPS = self.NTHREADS / 32,
|
|
**globals())
|
|
|