From 3be14547ea03b4778fd1f56e6e24b7eb66faee8d Mon Sep 17 00:00:00 2001 From: Steven Robertson Date: Sat, 15 Oct 2011 00:33:37 -0400 Subject: [PATCH] Use 3*256 instead of 2*512 blocks; faster on GF104 --- cuburn/code/iter.py | 26 ++++++++++++++------------ cuburn/render.py | 2 +- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py index 99c3645..757c302 100644 --- a/cuburn/code/iter.py +++ b/cuburn/code/iter.py @@ -7,7 +7,7 @@ from cuburn.code.util import * class IterCode(HunkOCode): # The number of threads per block - NTHREADS = 512 + NTHREADS = 256 def __init__(self, features): self.features = features @@ -158,9 +158,9 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) { int last_xf_used = 0; {{else}} // Size can be reduced by a factor of four using a slower 4-stage reduce - __shared__ float swap[2048]; - __shared__ float cosel[16]; - if (threadIdx.y == 0 && threadIdx.x < 16) + __shared__ float swap[{{4*NTHREADS}}]; + __shared__ float cosel[{{NWARPS}}]; + if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}}) cosel[threadIdx.x] = mwc_next_01(rctx); {{endif}} @@ -215,13 +215,13 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) { {{if not features.chaos_used}} // Swap thread states here so that writeback skipping logic doesn't die - int sw = (threadIdx.y * 32 + threadIdx.x * 33) & 0x1ff; + int sw = (threadIdx.y * 32 + threadIdx.x * 33) & {{NTHREADS-1}}; int sr = threadIdx.y * 32 + threadIdx.x; swap[sw] = consec_bad; - swap[sw+512] = x; - swap[sw+1024] = y; - swap[sw+1536] = color; + swap[sw+{{NTHREADS}}] = x; + swap[sw+{{2*NTHREADS}}] = y; + swap[sw+{{3*NTHREADS}}] = color; __syncthreads(); // This is in the middle of the function so that only one sync is // required per loop. @@ -229,14 +229,14 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) { {{if not features.chaos_used}} // Similarly, we select the next xforms here. - if (threadIdx.y == 0 && threadIdx.x < 16) + if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}}) cosel[threadIdx.x] = mwc_next_01(rctx); {{endif}} consec_bad = swap[sr]; - x = swap[sr+512]; - y = swap[sr+1024]; - color = swap[sr+1536]; + x = swap[sr+{{NTHREADS}}]; + y = swap[sr+{{2*NTHREADS}}]; + color = swap[sr+{{3*NTHREADS}}]; {{endif}} if (consec_bad < 0) { @@ -286,5 +286,7 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) { return tmpl.substitute( features = self.features, packer = self.packer.view('info'), + NTHREADS = self.NTHREADS, + NWARPS = self.NTHREADS / 32, **globals()) diff --git a/cuburn/render.py b/cuburn/render.py index 87a1acd..f1a311e 100644 --- a/cuburn/render.py +++ b/cuburn/render.py @@ -107,7 +107,7 @@ class Animation(object): In other words, it's best to use exactly one Animation for each interpolated sequence between one or two genomes. """ - cmp_options = ('-use_fast_math', '-maxrregcount', '32') + cmp_options = ('-use_fast_math', '-maxrregcount', '42') keep = False def __init__(self, ctypes_genome_array):