mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 11:40:04 -05:00
New badvals mechanism.
This commit is contained in:
parent
cac9b691a8
commit
f3a79b200c
@ -242,10 +242,10 @@ void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
|
|||||||
def _iterbody(self):
|
def _iterbody(self):
|
||||||
tmpl = Template(r'''
|
tmpl = Template(r'''
|
||||||
__global__
|
__global__
|
||||||
void iter(uint64_t accbuf_ptr, mwc_st *msts, iter_params *all_params,
|
void iter(uint64_t accbuf_ptr, mwc_st *msts, float4 *points,
|
||||||
int nsamps_to_generate) {
|
const iter_params *all_params, int nsamps_to_generate) {
|
||||||
mwc_st rctx = msts[gtid()];
|
mwc_st rctx = msts[devtid()];
|
||||||
iter_params *global_params = &(all_params[blockIdx.x]);
|
const iter_params *global_params = &(all_params[blockIdx.x]);
|
||||||
|
|
||||||
__shared__ int nsamps;
|
__shared__ int nsamps;
|
||||||
nsamps = nsamps_to_generate;
|
nsamps = nsamps_to_generate;
|
||||||
@ -257,7 +257,7 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, iter_params *all_params,
|
|||||||
for (int i = threadIdx.y * blockDim.x + threadIdx.x;
|
for (int i = threadIdx.y * blockDim.x + threadIdx.x;
|
||||||
i * 4 < sizeof(iter_params); i += blockDim.x * blockDim.y)
|
i * 4 < sizeof(iter_params); i += blockDim.x * blockDim.y)
|
||||||
reinterpret_cast<float*>(¶ms)[i] =
|
reinterpret_cast<float*>(¶ms)[i] =
|
||||||
reinterpret_cast<float*>(global_params)[i];
|
reinterpret_cast<const float*>(global_params)[i];
|
||||||
|
|
||||||
{{if info.chaos_used}}
|
{{if info.chaos_used}}
|
||||||
int last_xf_used = 0;
|
int last_xf_used = 0;
|
||||||
@ -273,120 +273,118 @@ void iter(uint64_t accbuf_ptr, mwc_st *msts, iter_params *all_params,
|
|||||||
{{endif}}
|
{{endif}}
|
||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
int consec_bad = -{{info.fuse}};
|
float4 old_point = points[devtid()];
|
||||||
|
float x = old_point.x, y = old_point.y,
|
||||||
float x, y, color;
|
color = old_point.z, fuse_rounds = old_point.w;
|
||||||
x = mwc_next_11(rctx);
|
|
||||||
y = mwc_next_11(rctx);
|
|
||||||
color = mwc_next_01(rctx);
|
|
||||||
|
|
||||||
while (1) {
|
while (1) {
|
||||||
|
// This condition checks for large numbers, Infs, and NaNs.
|
||||||
|
if (!(-(fabsf(x) + fabsf(y) > -1.0e6f))) {
|
||||||
|
x = mwc_next_11(rctx);
|
||||||
|
y = mwc_next_11(rctx);
|
||||||
|
color = mwc_next_01(rctx);
|
||||||
|
fuse_rounds = {{info.fuse / 32}};
|
||||||
|
}
|
||||||
|
|
||||||
|
// 32 rounds is somewhat arbitrary, but it has a pleasing 32-ness
|
||||||
|
for (int i = 0; i < 32; i++) {
|
||||||
|
|
||||||
{{if info.chaos_used}}
|
{{if info.chaos_used}}
|
||||||
|
|
||||||
{{precalc_chaos(pcp, std_xforms)}}
|
{{precalc_chaos(pcp, std_xforms)}}
|
||||||
|
|
||||||
// For now, we don't attempt to use the swap buffer when chaos is used
|
// For now, we don't attempt to use the swap buffer when chaos is used
|
||||||
float xfsel = mwc_next_01(rctx);
|
float xfsel = mwc_next_01(rctx);
|
||||||
|
|
||||||
{{for prior_xform_idx, prior_xform_name in enumerate(std_xforms)}}
|
{{for prior_xform_idx, prior_xform_name in enumerate(std_xforms)}}
|
||||||
if (last_xf_used == {{prior_xform_idx}}) {
|
if (last_xf_used == {{prior_xform_idx}}) {
|
||||||
{{for xform_idx, xform_name in enumerate(std_xforms[:-1])}}
|
{{for xform_idx, xform_name in enumerate(std_xforms[:-1])}}
|
||||||
if (xfsel <= {{pcp['chaos_'+prior_xform_name+'_'+xform_name]}}) {
|
if (xfsel <= {{pcp['chaos_'+prior_xform_name+'_'+xform_name]}}) {
|
||||||
apply_xf_{{xform_name}}(x, y, color, rctx);
|
apply_xf_{{xform_name}}(x, y, color, rctx);
|
||||||
last_xf_used = {{xform_idx}};
|
last_xf_used = {{xform_idx}};
|
||||||
|
} else
|
||||||
|
{{endfor}}
|
||||||
|
{
|
||||||
|
apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);
|
||||||
|
last_xf_used = {{len(std_xforms)-1}};
|
||||||
|
}
|
||||||
} else
|
} else
|
||||||
{{endfor}}
|
{{endfor}}
|
||||||
{
|
{
|
||||||
|
printf("Something went *very* wrong.\n");
|
||||||
|
asm("trap;");
|
||||||
|
}
|
||||||
|
|
||||||
|
{{else}}
|
||||||
|
{{precalc_densities(pcp, std_xforms)}}
|
||||||
|
float xfsel = cosel[threadIdx.y];
|
||||||
|
|
||||||
|
{{for xform_name in std_xforms[:-1]}}
|
||||||
|
if (xfsel <= {{pcp['den_'+xform_name]}}) {
|
||||||
|
apply_xf_{{xform_name}}(x, y, color, rctx);
|
||||||
|
} else
|
||||||
|
{{endfor}}
|
||||||
apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);
|
apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);
|
||||||
last_xf_used = {{len(std_xforms)-1}};
|
|
||||||
}
|
int sw = (threadIdx.y * 32 + threadIdx.x * 33) & {{NTHREADS-1}};
|
||||||
} else
|
int sr = threadIdx.y * 32 + threadIdx.x;
|
||||||
{{endfor}}
|
|
||||||
{
|
swap[sw] = fuse_rounds;
|
||||||
printf("Something went *very* wrong.\n");
|
swap[sw+{{NTHREADS}}] = x;
|
||||||
asm("trap;");
|
swap[sw+{{2*NTHREADS}}] = y;
|
||||||
|
swap[sw+{{3*NTHREADS}}] = color;
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
// We select the next xforms here, since we've just synced.
|
||||||
|
if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
|
||||||
|
cosel[threadIdx.x] = mwc_next_01(rctx);
|
||||||
|
|
||||||
|
fuse_rounds = swap[sr];
|
||||||
|
x = swap[sr+{{NTHREADS}}];
|
||||||
|
y = swap[sr+{{2*NTHREADS}}];
|
||||||
|
color = swap[sr+{{3*NTHREADS}}];
|
||||||
|
|
||||||
|
{{endif}}
|
||||||
|
|
||||||
|
if (fuse_rounds > 0.0f) continue;
|
||||||
|
|
||||||
|
{{if 'final' in cp.xforms}}
|
||||||
|
float fx = x, fy = y, fcolor = color;
|
||||||
|
apply_xf_final(fx, fy, fcolor, rctx);
|
||||||
|
{{endif}}
|
||||||
|
|
||||||
|
float cx, cy, cc;
|
||||||
|
|
||||||
|
{{precalc_camera(info, pcp.camera)}}
|
||||||
|
|
||||||
|
{{if 'final' in cp.xforms}}
|
||||||
|
{{apply_affine('fx', 'fy', 'cx', 'cy', pcp.camera)}}
|
||||||
|
cc = fcolor;
|
||||||
|
{{else}}
|
||||||
|
{{apply_affine('x', 'y', 'cx', 'cy', pcp.camera)}}
|
||||||
|
cc = color;
|
||||||
|
{{endif}}
|
||||||
|
|
||||||
|
uint32_t ix = trunca(cx), iy = trunca(cy);
|
||||||
|
|
||||||
|
if (ix >= {{info.acc_width}} || iy >= {{info.acc_height}})
|
||||||
|
continue;
|
||||||
|
|
||||||
|
uint32_t i = iy * {{info.acc_stride}} + ix;
|
||||||
|
|
||||||
|
float4 outcol = tex2D(palTex, cc, time_frac);
|
||||||
|
update_pix(accbuf_ptr, i, outcol);
|
||||||
}
|
}
|
||||||
|
|
||||||
{{else}}
|
int num_okay = __popc(__ballot(fuse_rounds == 0.0f));
|
||||||
{{precalc_densities(pcp, std_xforms)}}
|
if (threadIdx.x == 0) atomicSub(&nsamps, num_okay * 32);
|
||||||
float xfsel = cosel[threadIdx.y];
|
fuse_rounds = fmaxf(0.0f, fuse_rounds - 1.0f);
|
||||||
|
|
||||||
{{for xform_name in std_xforms[:-1]}}
|
|
||||||
if (xfsel <= {{pcp['den_'+xform_name]}}) {
|
|
||||||
apply_xf_{{xform_name}}(x, y, color, rctx);
|
|
||||||
} else
|
|
||||||
{{endfor}}
|
|
||||||
apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);
|
|
||||||
|
|
||||||
// Swap thread states here so that writeback skipping logic doesn't die
|
|
||||||
int sw = (threadIdx.y * 32 + threadIdx.x * 33) & {{NTHREADS-1}};
|
|
||||||
int sr = threadIdx.y * 32 + threadIdx.x;
|
|
||||||
|
|
||||||
swap[sw] = consec_bad;
|
|
||||||
swap[sw+{{NTHREADS}}] = x;
|
|
||||||
swap[sw+{{2*NTHREADS}}] = y;
|
|
||||||
swap[sw+{{3*NTHREADS}}] = color;
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
// This is in the middle of the function so that only one sync is
|
if (nsamps <= 0) break;
|
||||||
// required per loop.
|
|
||||||
if (nsamps < 0) break;
|
|
||||||
|
|
||||||
// Similarly, we select the next xforms here.
|
|
||||||
if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
|
|
||||||
cosel[threadIdx.x] = mwc_next_01(rctx);
|
|
||||||
|
|
||||||
consec_bad = swap[sr];
|
|
||||||
x = swap[sr+{{NTHREADS}}];
|
|
||||||
y = swap[sr+{{2*NTHREADS}}];
|
|
||||||
color = swap[sr+{{3*NTHREADS}}];
|
|
||||||
|
|
||||||
{{endif}}
|
|
||||||
|
|
||||||
if (consec_bad < 0) {
|
|
||||||
consec_bad++;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
int remain = __popc(__ballot(1));
|
|
||||||
if (threadIdx.x == 0) atomicSub(&nsamps, remain);
|
|
||||||
|
|
||||||
{{if 'final' in cp.xforms}}
|
|
||||||
float fx = x, fy = y, fcolor = color;
|
|
||||||
apply_xf_final(fx, fy, fcolor, rctx);
|
|
||||||
{{endif}}
|
|
||||||
|
|
||||||
float cx, cy, cc;
|
|
||||||
|
|
||||||
{{precalc_camera(info, pcp.camera)}}
|
|
||||||
|
|
||||||
{{if 'final' in cp.xforms}}
|
|
||||||
{{apply_affine('fx', 'fy', 'cx', 'cy', pcp.camera)}}
|
|
||||||
cc = fcolor;
|
|
||||||
{{else}}
|
|
||||||
{{apply_affine('x', 'y', 'cx', 'cy', pcp.camera)}}
|
|
||||||
cc = color;
|
|
||||||
{{endif}}
|
|
||||||
|
|
||||||
uint32_t ix = trunca(cx), iy = trunca(cy);
|
|
||||||
|
|
||||||
if (ix >= {{info.acc_width}} || iy >= {{info.acc_height}} ) {
|
|
||||||
consec_bad++;
|
|
||||||
if (consec_bad > {{info.max_oob}}) {
|
|
||||||
x = mwc_next_11(rctx);
|
|
||||||
y = mwc_next_11(rctx);
|
|
||||||
color = mwc_next_01(rctx);
|
|
||||||
consec_bad = -{{info.fuse}};
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
uint32_t i = iy * {{info.acc_stride}} + ix;
|
|
||||||
|
|
||||||
float4 outcol = tex2D(palTex, cc, time_frac);
|
|
||||||
update_pix(accbuf_ptr, i, outcol);
|
|
||||||
}
|
}
|
||||||
msts[gtid()] = rctx;
|
points[devtid()] = make_float4(x, y, color, fuse_rounds);
|
||||||
|
msts[devtid()] = rctx;
|
||||||
}
|
}
|
||||||
''')
|
''')
|
||||||
return tmpl.substitute(
|
return tmpl.substitute(
|
||||||
|
@ -42,7 +42,7 @@ float3 rgb2hsv(float3 rgb);
|
|||||||
float3 hsv2rgb(float3 hsv);
|
float3 hsv2rgb(float3 hsv);
|
||||||
"""
|
"""
|
||||||
|
|
||||||
defs = r"""
|
defs = Template(r"""
|
||||||
#undef M_E
|
#undef M_E
|
||||||
#undef M_LOG2E
|
#undef M_LOG2E
|
||||||
#undef M_LOG10E
|
#undef M_LOG10E
|
||||||
@ -80,6 +80,28 @@ uint32_t gtid() {
|
|||||||
(blockIdx.x + (gridDim.x * blockIdx.y))));
|
(blockIdx.x + (gridDim.x * blockIdx.y))));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Returns the ID of this thread on the device. Note that this counter is
|
||||||
|
* volatile according to the PTX ISA. It should be used for loading and saving
|
||||||
|
* state that must be unique across running threads, not for accessing things
|
||||||
|
* in a known order. */
|
||||||
|
__device__
|
||||||
|
int devtid() {
|
||||||
|
int result;
|
||||||
|
asm({{crep('''
|
||||||
|
{
|
||||||
|
.reg .u32 tmp1, tmp2;
|
||||||
|
mov.u32 %0, %smid;
|
||||||
|
mov.u32 tmp1, %nsmid;
|
||||||
|
mov.u32 tmp2, %warpid;
|
||||||
|
mad.lo.u32 %0, %0, tmp1, tmp2;
|
||||||
|
mov.u32 tmp1, %nwarpid;
|
||||||
|
mov.u32 tmp2, %laneid;
|
||||||
|
mad.lo.u32 %0, %0, tmp1, tmp2;
|
||||||
|
}''')}} : "=r"(result) );
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
__device__
|
__device__
|
||||||
uint32_t trunca(float f) {
|
uint32_t trunca(float f) {
|
||||||
// truncate as used in address calculations. note the use of a signed
|
// truncate as used in address calculations. note the use of a signed
|
||||||
@ -182,7 +204,7 @@ float3 hsv2rgb(float3 hsv) {
|
|||||||
else { out.x = val; out.y = min; out.z = mid; }
|
else { out.x = val; out.y = min; out.z = mid; }
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
"""
|
""").substitute()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def fill_dptr(mod, dptr, size, stream=None, value=np.uint32(0)):
|
def fill_dptr(mod, dptr, size, stream=None, value=np.uint32(0)):
|
||||||
|
@ -82,11 +82,9 @@ class RenderInfo(object):
|
|||||||
Determine features and constants required to render a particular set of
|
Determine features and constants required to render a particular set of
|
||||||
genomes. The values of this class are fixed before compilation begins.
|
genomes. The values of this class are fixed before compilation begins.
|
||||||
"""
|
"""
|
||||||
# Constant parameters which control handling of out-of-frame samples:
|
# Number of iterations to iterate without write after generating a new
|
||||||
# Number of iterations to iterate without write after new point
|
# point, including the number of bad
|
||||||
fuse = 10
|
fuse = 128
|
||||||
# Maximum consecutive out-of-bounds points before picking new point
|
|
||||||
max_oob = 10
|
|
||||||
|
|
||||||
# Height of the texture pallete which gets uploaded to the GPU (assuming
|
# Height of the texture pallete which gets uploaded to the GPU (assuming
|
||||||
# that palette-from-texture is enabled). For most genomes, this doesn't
|
# that palette-from-texture is enabled). For most genomes, this doesn't
|
||||||
|
@ -14,6 +14,7 @@ from fr0stlib import pyflam3
|
|||||||
from fr0stlib.pyflam3._flam3 import *
|
from fr0stlib.pyflam3._flam3 import *
|
||||||
from fr0stlib.pyflam3.constants import *
|
from fr0stlib.pyflam3.constants import *
|
||||||
|
|
||||||
|
import pycuda.autoinit
|
||||||
import pycuda.compiler
|
import pycuda.compiler
|
||||||
import pycuda.driver as cuda
|
import pycuda.driver as cuda
|
||||||
import pycuda.tools
|
import pycuda.tools
|
||||||
@ -153,7 +154,15 @@ class Renderer(object):
|
|||||||
np.concatenate(map(info.db.palettes.get, pals[1::2])))
|
np.concatenate(map(info.db.palettes.get, pals[1::2])))
|
||||||
d_palmem = cuda.mem_alloc(256 * info.palette_height * 4)
|
d_palmem = cuda.mem_alloc(256 * info.palette_height * 4)
|
||||||
|
|
||||||
seeds = mwc.MWC.make_seeds(self._iter.NTHREADS * cps_per_block)
|
# The '+1' avoids more situations where the 'smid' value is larger
|
||||||
|
# than the number of enabled SMs on a chip, which is warned against in
|
||||||
|
# the docs but not seen in the wild. Things could get nastier on
|
||||||
|
# subsequent silicon, but I doubt they'd ever kill more than 1 SM
|
||||||
|
nslots = pycuda.autoinit.device.max_threads_per_multiprocessor * \
|
||||||
|
(pycuda.autoinit.device.multiprocessor_count + 1)
|
||||||
|
|
||||||
|
d_points = cuda.mem_alloc(nslots * 16)
|
||||||
|
seeds = mwc.MWC.make_seeds(nslots)
|
||||||
d_seeds = cuda.to_device(seeds)
|
d_seeds = cuda.to_device(seeds)
|
||||||
|
|
||||||
h_out = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4),
|
h_out = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4),
|
||||||
@ -200,6 +209,10 @@ class Renderer(object):
|
|||||||
block=(256,1,1), grid=(cps_per_block/256,1),
|
block=(256,1,1), grid=(cps_per_block/256,1),
|
||||||
stream=iter_stream)
|
stream=iter_stream)
|
||||||
|
|
||||||
|
# TODO: if we only do this once per anim, does quality improve?
|
||||||
|
util.BaseCode.fill_dptr(self.mod, d_points, 4 * nslots,
|
||||||
|
iter_stream, np.float32(np.nan))
|
||||||
|
|
||||||
# Get interpolated control points for debugging
|
# Get interpolated control points for debugging
|
||||||
#iter_stream.synchronize()
|
#iter_stream.synchronize()
|
||||||
#d_temp = cuda.from_device(d_infos,
|
#d_temp = cuda.from_device(d_infos,
|
||||||
@ -208,7 +221,8 @@ class Renderer(object):
|
|||||||
#print '%60s %g' % ('_'.join(n), i)
|
#print '%60s %g' % ('_'.join(n), i)
|
||||||
|
|
||||||
nsamps = info.density * info.width * info.height / cps_per_block
|
nsamps = info.density * info.width * info.height / cps_per_block
|
||||||
iter_fun(np.uint64(d_accum), d_seeds, d_infos, np.int32(nsamps),
|
iter_fun(np.uint64(d_accum), d_seeds, d_points,
|
||||||
|
d_infos, np.int32(nsamps),
|
||||||
block=(32, self._iter.NTHREADS/32, 1),
|
block=(32, self._iter.NTHREADS/32, 1),
|
||||||
grid=(cps_per_block, 1),
|
grid=(cps_per_block, 1),
|
||||||
texrefs=[tref], stream=iter_stream)
|
texrefs=[tref], stream=iter_stream)
|
||||||
|
Loading…
Reference in New Issue
Block a user