Use 3 blocks of 256 threads instead of 2 blocks of 512 threads; faster on GF104

This commit is contained in:
Steven Robertson 2011-10-15 00:33:37 -04:00
parent c7728d3507
commit 3be14547ea
2 changed files with 15 additions and 13 deletions

View File

@ -7,7 +7,7 @@ from cuburn.code.util import *
class IterCode(HunkOCode): class IterCode(HunkOCode):
# The number of threads per block # The number of threads per block
NTHREADS = 512 NTHREADS = 256
def __init__(self, features): def __init__(self, features):
self.features = features self.features = features
@ -158,9 +158,9 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) {
int last_xf_used = 0; int last_xf_used = 0;
{{else}} {{else}}
// Size can be reduced by a factor of four using a slower 4-stage reduce // Size can be reduced by a factor of four using a slower 4-stage reduce
__shared__ float swap[2048]; __shared__ float swap[{{4*NTHREADS}}];
__shared__ float cosel[16]; __shared__ float cosel[{{NWARPS}}];
if (threadIdx.y == 0 && threadIdx.x < 16) if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
cosel[threadIdx.x] = mwc_next_01(rctx); cosel[threadIdx.x] = mwc_next_01(rctx);
{{endif}} {{endif}}
@ -215,13 +215,13 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) {
{{if not features.chaos_used}} {{if not features.chaos_used}}
// Swap thread states here so that writeback skipping logic doesn't die // Swap thread states here so that writeback skipping logic doesn't die
int sw = (threadIdx.y * 32 + threadIdx.x * 33) & 0x1ff; int sw = (threadIdx.y * 32 + threadIdx.x * 33) & {{NTHREADS-1}};
int sr = threadIdx.y * 32 + threadIdx.x; int sr = threadIdx.y * 32 + threadIdx.x;
swap[sw] = consec_bad; swap[sw] = consec_bad;
swap[sw+512] = x; swap[sw+{{NTHREADS}}] = x;
swap[sw+1024] = y; swap[sw+{{2*NTHREADS}}] = y;
swap[sw+1536] = color; swap[sw+{{3*NTHREADS}}] = color;
__syncthreads(); __syncthreads();
// This is in the middle of the function so that only one sync is // This is in the middle of the function so that only one sync is
// required per loop. // required per loop.
@ -229,14 +229,14 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) {
{{if not features.chaos_used}} {{if not features.chaos_used}}
// Similarly, we select the next xforms here. // Similarly, we select the next xforms here.
if (threadIdx.y == 0 && threadIdx.x < 16) if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
cosel[threadIdx.x] = mwc_next_01(rctx); cosel[threadIdx.x] = mwc_next_01(rctx);
{{endif}} {{endif}}
consec_bad = swap[sr]; consec_bad = swap[sr];
x = swap[sr+512]; x = swap[sr+{{NTHREADS}}];
y = swap[sr+1024]; y = swap[sr+{{2*NTHREADS}}];
color = swap[sr+1536]; color = swap[sr+{{3*NTHREADS}}];
{{endif}} {{endif}}
if (consec_bad < 0) { if (consec_bad < 0) {
@ -286,5 +286,7 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) {
return tmpl.substitute( return tmpl.substitute(
features = self.features, features = self.features,
packer = self.packer.view('info'), packer = self.packer.view('info'),
NTHREADS = self.NTHREADS,
NWARPS = self.NTHREADS / 32,
**globals()) **globals())

View File

@ -107,7 +107,7 @@ class Animation(object):
In other words, it's best to use exactly one Animation for each In other words, it's best to use exactly one Animation for each
interpolated sequence between one or two genomes. interpolated sequence between one or two genomes.
""" """
cmp_options = ('-use_fast_math', '-maxrregcount', '32') cmp_options = ('-use_fast_math', '-maxrregcount', '42')
keep = False keep = False
def __init__(self, ctypes_genome_array): def __init__(self, ctypes_genome_array):