New badvals mechanism.

This commit is contained in:
Steven Robertson
2011-10-27 12:59:58 -04:00
parent cac9b691a8
commit f3a79b200c
4 changed files with 144 additions and 112 deletions

View File

@ -14,6 +14,7 @@ from fr0stlib import pyflam3
from fr0stlib.pyflam3._flam3 import *
from fr0stlib.pyflam3.constants import *
import pycuda.autoinit
import pycuda.compiler
import pycuda.driver as cuda
import pycuda.tools
@ -153,7 +154,15 @@ class Renderer(object):
np.concatenate(map(info.db.palettes.get, pals[1::2])))
d_palmem = cuda.mem_alloc(256 * info.palette_height * 4)
seeds = mwc.MWC.make_seeds(self._iter.NTHREADS * cps_per_block)
# The '+1' avoids more situations where the 'smid' value is larger
# than the number of enabled SMs on a chip, which is warned against in
# the docs but not seen in the wild. Things could get nastier on
# subsequent silicon, but I doubt they'd ever kill more than 1 SM
nslots = pycuda.autoinit.device.max_threads_per_multiprocessor * \
(pycuda.autoinit.device.multiprocessor_count + 1)
d_points = cuda.mem_alloc(nslots * 16)
seeds = mwc.MWC.make_seeds(nslots)
d_seeds = cuda.to_device(seeds)
h_out = cuda.pagelocked_empty((info.acc_height, info.acc_stride, 4),
@ -200,6 +209,10 @@ class Renderer(object):
block=(256,1,1), grid=(cps_per_block/256,1),
stream=iter_stream)
# TODO: if we only do this once per anim, does quality improve?
util.BaseCode.fill_dptr(self.mod, d_points, 4 * nslots,
iter_stream, np.float32(np.nan))
# Get interpolated control points for debugging
#iter_stream.synchronize()
#d_temp = cuda.from_device(d_infos,
@ -208,7 +221,8 @@ class Renderer(object):
#print '%60s %g' % ('_'.join(n), i)
nsamps = info.density * info.width * info.height / cps_per_block
iter_fun(np.uint64(d_accum), d_seeds, d_infos, np.int32(nsamps),
iter_fun(np.uint64(d_accum), d_seeds, d_points,
d_infos, np.int32(nsamps),
block=(32, self._iter.NTHREADS/32, 1),
grid=(cps_per_block, 1),
texrefs=[tref], stream=iter_stream)