New badvals mechanism.

This commit is contained in:
Steven Robertson 2011-10-27 12:59:58 -04:00
parent cac9b691a8
commit f3a79b200c
4 changed files with 144 additions and 112 deletions

View File

@ -242,10 +242,10 @@ void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
def _iterbody(self): def _iterbody(self):
tmpl = Template(r''' tmpl = Template(r'''
__global__ __global__
void iter(uint64_t accbuf_ptr, mwc_st *msts, iter_params *all_params, void iter(uint64_t accbuf_ptr, mwc_st *msts, float4 *points,
int nsamps_to_generate) { const iter_params *all_params, int nsamps_to_generate) {
mwc_st rctx = msts[gtid()]; mwc_st rctx = msts[devtid()];
iter_params *global_params = &(all_params[blockIdx.x]); const iter_params *global_params = &(all_params[blockIdx.x]);
__shared__ int nsamps; __shared__ int nsamps;
nsamps = nsamps_to_generate; nsamps = nsamps_to_generate;
@ -257,7 +257,7 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, iter_params *all_params,
for (int i = threadIdx.y * blockDim.x + threadIdx.x; for (int i = threadIdx.y * blockDim.x + threadIdx.x;
i * 4 < sizeof(iter_params); i += blockDim.x * blockDim.y) i * 4 < sizeof(iter_params); i += blockDim.x * blockDim.y)
reinterpret_cast<float*>(&params)[i] = reinterpret_cast<float*>(&params)[i] =
reinterpret_cast<float*>(global_params)[i]; reinterpret_cast<const float*>(global_params)[i];
{{if info.chaos_used}} {{if info.chaos_used}}
int last_xf_used = 0; int last_xf_used = 0;
@ -273,14 +273,21 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, iter_params *all_params,
{{endif}} {{endif}}
__syncthreads(); __syncthreads();
int consec_bad = -{{info.fuse}}; float4 old_point = points[devtid()];
float x = old_point.x, y = old_point.y,
color = old_point.z, fuse_rounds = old_point.w;
float x, y, color; while (1) {
// This condition checks for large numbers, Infs, and NaNs.
if (!(-(fabsf(x) + fabsf(y) > -1.0e6f))) {
x = mwc_next_11(rctx); x = mwc_next_11(rctx);
y = mwc_next_11(rctx); y = mwc_next_11(rctx);
color = mwc_next_01(rctx); color = mwc_next_01(rctx);
fuse_rounds = {{info.fuse / 32}};
}
while (1) { // 32 rounds is somewhat arbitrary, but it has a pleasing 32-ness
for (int i = 0; i < 32; i++) {
{{if info.chaos_used}} {{if info.chaos_used}}
@ -319,37 +326,27 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, iter_params *all_params,
{{endfor}} {{endfor}}
apply_xf_{{std_xforms[-1]}}(x, y, color, rctx); apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);
// Swap thread states here so that writeback skipping logic doesn't die
int sw = (threadIdx.y * 32 + threadIdx.x * 33) & {{NTHREADS-1}}; int sw = (threadIdx.y * 32 + threadIdx.x * 33) & {{NTHREADS-1}};
int sr = threadIdx.y * 32 + threadIdx.x; int sr = threadIdx.y * 32 + threadIdx.x;
swap[sw] = consec_bad; swap[sw] = fuse_rounds;
swap[sw+{{NTHREADS}}] = x; swap[sw+{{NTHREADS}}] = x;
swap[sw+{{2*NTHREADS}}] = y; swap[sw+{{2*NTHREADS}}] = y;
swap[sw+{{3*NTHREADS}}] = color; swap[sw+{{3*NTHREADS}}] = color;
__syncthreads(); __syncthreads();
// This is in the middle of the function so that only one sync is
// required per loop.
if (nsamps < 0) break;
// Similarly, we select the next xforms here. // We select the next xforms here, since we've just synced.
if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}}) if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
cosel[threadIdx.x] = mwc_next_01(rctx); cosel[threadIdx.x] = mwc_next_01(rctx);
consec_bad = swap[sr]; fuse_rounds = swap[sr];
x = swap[sr+{{NTHREADS}}]; x = swap[sr+{{NTHREADS}}];
y = swap[sr+{{2*NTHREADS}}]; y = swap[sr+{{2*NTHREADS}}];
color = swap[sr+{{3*NTHREADS}}]; color = swap[sr+{{3*NTHREADS}}];
{{endif}} {{endif}}
if (consec_bad < 0) { if (fuse_rounds > 0.0f) continue;
consec_bad++;
continue;
}
int remain = __popc(__ballot(1));
if (threadIdx.x == 0) atomicSub(&nsamps, remain);
{{if 'final' in cp.xforms}} {{if 'final' in cp.xforms}}
float fx = x, fy = y, fcolor = color; float fx = x, fy = y, fcolor = color;
@ -370,23 +367,24 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, iter_params *all_params,
uint32_t ix = trunca(cx), iy = trunca(cy); uint32_t ix = trunca(cx), iy = trunca(cy);
if (ix >= {{info.acc_width}} || iy >= {{info.acc_height}} ) { if (ix >= {{info.acc_width}} || iy >= {{info.acc_height}})
consec_bad++;
if (consec_bad > {{info.max_oob}}) {
x = mwc_next_11(rctx);
y = mwc_next_11(rctx);
color = mwc_next_01(rctx);
consec_bad = -{{info.fuse}};
}
continue; continue;
}
uint32_t i = iy * {{info.acc_stride}} + ix; uint32_t i = iy * {{info.acc_stride}} + ix;
float4 outcol = tex2D(palTex, cc, time_frac); float4 outcol = tex2D(palTex, cc, time_frac);
update_pix(accbuf_ptr, i, outcol); update_pix(accbuf_ptr, i, outcol);
} }
msts[gtid()] = rctx;
int num_okay = __popc(__ballot(fuse_rounds == 0.0f));
if (threadIdx.x == 0) atomicSub(&nsamps, num_okay * 32);
fuse_rounds = fmaxf(0.0f, fuse_rounds - 1.0f);
__syncthreads();
if (nsamps <= 0) break;
}
points[devtid()] = make_float4(x, y, color, fuse_rounds);
msts[devtid()] = rctx;
} }
''') ''')
return tmpl.substitute( return tmpl.substitute(

View File

@ -42,7 +42,7 @@ float3 rgb2hsv(float3 rgb);
float3 hsv2rgb(float3 hsv); float3 hsv2rgb(float3 hsv);
""" """
defs = r""" defs = Template(r"""
#undef M_E #undef M_E
#undef M_LOG2E #undef M_LOG2E
#undef M_LOG10E #undef M_LOG10E
@ -80,6 +80,28 @@ uint32_t gtid() {
(blockIdx.x + (gridDim.x * blockIdx.y)))); (blockIdx.x + (gridDim.x * blockIdx.y))));
} }
/* Returns the ID of this thread on the device. Note that this counter is
* volatile according to the PTX ISA. It should be used for loading and saving
* state that must be unique across running threads, not for accessing things
* in a known order. */
__device__
int devtid() {
int result;
asm({{crep('''
{
.reg .u32 tmp1, tmp2;
mov.u32 %0, %smid;
mov.u32 tmp1, %nsmid;
mov.u32 tmp2, %warpid;
mad.lo.u32 %0, %0, tmp1, tmp2;
mov.u32 tmp1, %nwarpid;
mov.u32 tmp2, %laneid;
mad.lo.u32 %0, %0, tmp1, tmp2;
}''')}} : "=r"(result) );
return result;
}
__device__ __device__
uint32_t trunca(float f) { uint32_t trunca(float f) {
// truncate as used in address calculations. note the use of a signed // truncate as used in address calculations. note the use of a signed
@ -182,7 +204,7 @@ float3 hsv2rgb(float3 hsv) {
else { out.x = val; out.y = min; out.z = mid; } else { out.x = val; out.y = min; out.z = mid; }
return out; return out;
} }
""" """).substitute()
@staticmethod @staticmethod
def fill_dptr(mod, dptr, size, stream=None, value=np.uint32(0)): def fill_dptr(mod, dptr, size, stream=None, value=np.uint32(0)):

View File

@ -82,11 +82,9 @@ class RenderInfo(object):
Determine features and constants required to render a particular set of Determine features and constants required to render a particular set of
genomes. The values of this class are fixed before compilation begins. genomes. The values of this class are fixed before compilation begins.
""" """
# Constant parameters which control handling of out-of-frame samples: # Number of iterations to iterate without write after generating a new
# Number of iterations to iterate without write after new point # point, including the number of bad
fuse = 10 fuse = 128
# Maximum consecutive out-of-bounds points before picking new point
max_oob = 10
# Height of the texture pallete which gets uploaded to the GPU (assuming # Height of the texture pallete which gets uploaded to the GPU (assuming
# that palette-from-texture is enabled). For most genomes, this doesn't # that palette-from-texture is enabled). For most genomes, this doesn't

View File

@ -14,6 +14,7 @@ from fr0stlib import pyflam3
from fr0stlib.pyflam3._flam3 import * from fr0stlib.pyflam3._flam3 import *
from fr0stlib.pyflam3.constants import * from fr0stlib.pyflam3.constants import *
import pycuda.autoinit
import pycuda.compiler import pycuda.compiler
import pycuda.driver as cuda import pycuda.driver as cuda
import pycuda.tools import pycuda.tools
@ -153,7 +154,15 @@ class Renderer(object):
np.concatenate(map(info.db.palettes.get, pals[1::2]))) np.concatenate(map(info.db.palettes.get, pals[1::2])))
d_palmem = cuda.mem_alloc(256 * info.palette_height * 4) d_palmem = cuda.mem_alloc(256 * info.palette_height * 4)
seeds = mwc.MWC.make_seeds(self._iter.NTHREADS * cps_per_block) # The '+1' avoids more situations where the 'smid' value is larger
# than the number of enabled SMs on a chip, which is warned against in
# the docs but not seen in the wild. Things could get nastier on
# subsequent silicon, but I doubt they'd ever kill more than 1 SM
nslots = pycuda.autoinit.device.max_threads_per_multiprocessor * \
(pycuda.autoinit.device.multiprocessor_count + 1)
d_points = cuda.mem_alloc(nslots * 16)
seeds = mwc.MWC.make_seeds(nslots)
d_seeds = cuda.to_device(seeds) d_seeds = cuda.to_device(seeds)
h_out = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4), h_out = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4),
@ -200,6 +209,10 @@ class Renderer(object):
block=(256,1,1), grid=(cps_per_block/256,1), block=(256,1,1), grid=(cps_per_block/256,1),
stream=iter_stream) stream=iter_stream)
# TODO: if we only do this once per anim, does quality improve?
util.BaseCode.fill_dptr(self.mod, d_points, 4 * nslots,
iter_stream, np.float32(np.nan))
# Get interpolated control points for debugging # Get interpolated control points for debugging
#iter_stream.synchronize() #iter_stream.synchronize()
#d_temp = cuda.from_device(d_infos, #d_temp = cuda.from_device(d_infos,
@ -208,7 +221,8 @@ class Renderer(object):
#print '%60s %g' % ('_'.join(n), i) #print '%60s %g' % ('_'.join(n), i)
nsamps = info.density * info.width * info.height / cps_per_block nsamps = info.density * info.width * info.height / cps_per_block
iter_fun(np.uint64(d_accum), d_seeds, d_infos, np.int32(nsamps), iter_fun(np.uint64(d_accum), d_seeds, d_points,
d_infos, np.int32(nsamps),
block=(32, self._iter.NTHREADS/32, 1), block=(32, self._iter.NTHREADS/32, 1),
grid=(cps_per_block, 1), grid=(cps_per_block, 1),
texrefs=[tref], stream=iter_stream) texrefs=[tref], stream=iter_stream)