Use 3*256 instead of 2*512 blocks; faster on GF104

This commit is contained in:
Steven Robertson 2011-10-15 00:33:37 -04:00
parent c7728d3507
commit 3be14547ea
2 changed files with 15 additions and 13 deletions

View File

@@ -7,7 +7,7 @@ from cuburn.code.util import *
class IterCode(HunkOCode):
# The number of threads per block
NTHREADS = 512
NTHREADS = 256
def __init__(self, features):
self.features = features
@@ -158,9 +158,9 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) {
int last_xf_used = 0;
{{else}}
// Size can be reduced by a factor of four using a slower 4-stage reduce
__shared__ float swap[2048];
__shared__ float cosel[16];
if (threadIdx.y == 0 && threadIdx.x < 16)
__shared__ float swap[{{4*NTHREADS}}];
__shared__ float cosel[{{NWARPS}}];
if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
cosel[threadIdx.x] = mwc_next_01(rctx);
{{endif}}
@@ -215,13 +215,13 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) {
{{if not features.chaos_used}}
// Swap thread states here so that writeback skipping logic doesn't die
int sw = (threadIdx.y * 32 + threadIdx.x * 33) & 0x1ff;
int sw = (threadIdx.y * 32 + threadIdx.x * 33) & {{NTHREADS-1}};
int sr = threadIdx.y * 32 + threadIdx.x;
swap[sw] = consec_bad;
swap[sw+512] = x;
swap[sw+1024] = y;
swap[sw+1536] = color;
swap[sw+{{NTHREADS}}] = x;
swap[sw+{{2*NTHREADS}}] = y;
swap[sw+{{3*NTHREADS}}] = color;
__syncthreads();
// This is in the middle of the function so that only one sync is
// required per loop.
@@ -229,14 +229,14 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) {
{{if not features.chaos_used}}
// Similarly, we select the next xforms here.
if (threadIdx.y == 0 && threadIdx.x < 16)
if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
cosel[threadIdx.x] = mwc_next_01(rctx);
{{endif}}
consec_bad = swap[sr];
x = swap[sr+512];
y = swap[sr+1024];
color = swap[sr+1536];
x = swap[sr+{{NTHREADS}}];
y = swap[sr+{{2*NTHREADS}}];
color = swap[sr+{{3*NTHREADS}}];
{{endif}}
if (consec_bad < 0) {
@@ -286,5 +286,7 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) {
return tmpl.substitute(
features = self.features,
packer = self.packer.view('info'),
NTHREADS = self.NTHREADS,
NWARPS = self.NTHREADS / 32,
**globals())

View File

@@ -107,7 +107,7 @@ class Animation(object):
In other words, it's best to use exactly one Animation for each
interpolated sequence between one or two genomes.
"""
cmp_options = ('-use_fast_math', '-maxrregcount', '32')
cmp_options = ('-use_fast_math', '-maxrregcount', '42')
keep = False
def __init__(self, ctypes_genome_array):