Major bugfix. Also include thread-swapping that works.

This commit is contained in:
Steven Robertson 2011-06-25 20:37:08 -04:00
parent 44f897f28e
commit 18a60ec066
2 changed files with 59 additions and 16 deletions

View File

@ -67,9 +67,10 @@ void apply_xf{{xfid}}(float *ix, float *iy, float *icolor, mwc_st *rctx) {
return tmpl.substitute(g) return tmpl.substitute(g)
def _iterbody(self): def _iterbody(self):
tmpl = Template(''' tmpl = Template(r'''
__global__ __global__
void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) { void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
__shared__ int nsamps;
mwc_st rctx = msts[gtid()]; mwc_st rctx = msts[gtid()];
iter_info *info_glob = &(infos[blockIdx.x]); iter_info *info_glob = &(infos[blockIdx.x]);
@ -79,24 +80,48 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
reinterpret_cast<float*>(&info)[i] = reinterpret_cast<float*>(&info)[i] =
reinterpret_cast<float*>(info_glob)[i]; reinterpret_cast<float*>(info_glob)[i];
if (threadIdx.y == 0 && threadIdx.x == 0)
nsamps = {{packer.get("cp.width * cp.height / cp.ntemporal_samples * cp.adj_density")}};
__syncthreads();
int consec_bad = -{{features.fuse}}; int consec_bad = -{{features.fuse}};
// TODO: remove '512' constant
int nsamps = {{packer.get("cp.width * cp.height / (cp.ntemporal_samples * 512.) * cp.adj_density")}};
float x, y, color; float x, y, color;
int last_xf_used = 0;
x = mwc_next_11(&rctx); x = mwc_next_11(&rctx);
y = mwc_next_11(&rctx); y = mwc_next_11(&rctx);
color = mwc_next_01(&rctx); color = mwc_next_01(&rctx);
while (nsamps > 0) { {{if features.chaos_used}}
int last_xf_used = 0;
{{else}}
// Size can be reduced by a factor of four using a slower 4-stage reduce
__shared__ float swap[2048];
__shared__ float cosel[16];
{{endif}}
while (1) {
{{if features.chaos_used}}
// For now, we can't use the swap buffer with chaos enabled
float xfsel = mwc_next_01(&rctx); float xfsel = mwc_next_01(&rctx);
// Needed to match the behavior of the loop with swapping
__syncthreads();
{{else}}
if (threadIdx.y == 0 && threadIdx.x < 16) {
cosel[threadIdx.x] = mwc_next_01(&rctx);
}
__syncthreads();
float xfsel = cosel[threadIdx.y];
{{endif}}
// This is moved from outside the conditional to avoid needing an extra
// __syncthreads on every loop
if (nsamps < 0) break;
{{if features.chaos_used}} {{if features.chaos_used}}
{{for density_row_idx, prior_xform_idx in enumerate(features.std_xforms)}} {{for density_row_idx, prior_xform_idx in enumerate(features.std_xforms)}}
{{for density_col_idx, this_xform_idx in enumerate(features.std_xforms)}} {{for density_col_idx, this_xform_idx in enumerate(features.std_xforms)}}
if (last_xf_used == {{prior_xform_idx}} && if (last_xf_used == {{prior_xform_idx}} &&
xfsel < {{packer.get("cp.chaos_densities[%d][%d]" % (density_row_idx, density_col_idx))}}) { xfsel <= {{packer.get("cp.chaos_densities[%d][%d]" % (density_row_idx, density_col_idx))}}) {
apply_xf{{this_xform_idx}}(&x, &y, &color, &rctx); apply_xf{{this_xform_idx}}(&x, &y, &color, &rctx);
last_xf_used = {{this_xform_idx}}; last_xf_used = {{this_xform_idx}};
} else } else
@ -104,22 +129,40 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
{{endfor}} {{endfor}}
{{else}} {{else}}
{{for density_col_idx, this_xform_idx in enumerate(features.std_xforms)}} {{for density_col_idx, this_xform_idx in enumerate(features.std_xforms)}}
if (xfsel < {{packer.get("cp.norm_density[%d]" % (density_col_idx))}}) { if (xfsel <= {{packer.get("cp.norm_density[%d]" % (density_col_idx))}}) {
apply_xf{{this_xform_idx}}(&x, &y, &color, &rctx); apply_xf{{this_xform_idx}}(&x, &y, &color, &rctx);
} else } else
{{endfor}} {{endfor}}
{{endif}} {{endif}}
{ {
//printf("%d ",last_xf_used); printf("Reached trap, aborting execution! %g (%d,%d,%d)\n",
denbuf[0] = xfsel; xfsel, blockIdx.x, threadIdx.y, threadIdx.x);
break; // TODO: fail here asm volatile ("trap;");
} }
{{if not features.chaos_used}}
// Swap thread states here so that writeback skipping logic doesn't die
int sw = (threadIdx.y * 32 + threadIdx.x * 33) & 0x1ff;
int sr = threadIdx.y * 32 + threadIdx.x;
swap[sw] = consec_bad;
swap[sw+512] = x;
swap[sw+1024] = y;
swap[sw+1536] = color;
__syncthreads();
consec_bad = swap[sr];
x = swap[sr+512];
y = swap[sr+1024];
color = swap[sr+1536];
{{endif}}
if (consec_bad < 0) { if (consec_bad < 0) {
consec_bad++; consec_bad++;
continue; continue;
} }
nsamps--;
int remain = __popc(__ballot(1));
if (threadIdx.x == 0) atomicSub(&nsamps, remain);
{{if features.final_xform_index}} {{if features.final_xform_index}}
float fx = x, fy = y, fcolor; float fx = x, fy = y, fcolor;

View File

@ -267,7 +267,7 @@ class _AnimRenderer(object):
packer = a._iter.packer packer = a._iter.packer
iter_fun = a.mod.get_function("iter") iter_fun = a.mod.get_function("iter")
iter_fun.set_cache_config(cuda.func_cache.PREFER_L1) #iter_fun.set_cache_config(cuda.func_cache.PREFER_L1)
# Must be accumulated over all CPs # Must be accumulated over all CPs
gam, vib = 0, 0 gam, vib = 0, 0
@ -454,7 +454,8 @@ class Features(object):
self.acc_width = genomes[0].width + 2 * self.gutter self.acc_width = genomes[0].width + 2 * self.gutter
self.acc_height = genomes[0].height + 2 * self.gutter self.acc_height = genomes[0].height + 2 * self.gutter
self.acc_stride = 32 * int(math.ceil(self.acc_width / 32.)) self.acc_stride = 32 * int(math.ceil(self.acc_width / 32.))
self.std_xforms = filter(lambda v: v != self.final_xform_index, range(self.nxforms)) self.std_xforms = filter(lambda v: v != self.final_xform_index,
range(self.nxforms))
self.chaos_used = False self.chaos_used = False
for cp in genomes: for cp in genomes:
for r in range(len(self.std_xforms)): for r in range(len(self.std_xforms)):
@ -474,4 +475,3 @@ class XFormFeatures(object):
self.vars = ( self.vars = (
self.vars.union(set([i for i, v in enumerate(x.var) if v]))) self.vars.union(set([i for i, v in enumerate(x.var) if v])))