mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 11:40:04 -05:00
Use 3*256 instead of 2*512 blocks; faster on GF104
This commit is contained in:
parent
c7728d3507
commit
3be14547ea
@ -7,7 +7,7 @@ from cuburn.code.util import *
|
||||
|
||||
class IterCode(HunkOCode):
|
||||
# The number of threads per block
|
||||
NTHREADS = 512
|
||||
NTHREADS = 256
|
||||
|
||||
def __init__(self, features):
|
||||
self.features = features
|
||||
@ -158,9 +158,9 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) {
|
||||
int last_xf_used = 0;
|
||||
{{else}}
|
||||
// Size can be reduced by a factor of four using a slower 4-stage reduce
|
||||
__shared__ float swap[2048];
|
||||
__shared__ float cosel[16];
|
||||
if (threadIdx.y == 0 && threadIdx.x < 16)
|
||||
__shared__ float swap[{{4*NTHREADS}}];
|
||||
__shared__ float cosel[{{NWARPS}}];
|
||||
if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
|
||||
cosel[threadIdx.x] = mwc_next_01(rctx);
|
||||
{{endif}}
|
||||
|
||||
@ -215,13 +215,13 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) {
|
||||
|
||||
{{if not features.chaos_used}}
|
||||
// Swap thread states here so that writeback skipping logic doesn't die
|
||||
int sw = (threadIdx.y * 32 + threadIdx.x * 33) & 0x1ff;
|
||||
int sw = (threadIdx.y * 32 + threadIdx.x * 33) & {{NTHREADS-1}};
|
||||
int sr = threadIdx.y * 32 + threadIdx.x;
|
||||
|
||||
swap[sw] = consec_bad;
|
||||
swap[sw+512] = x;
|
||||
swap[sw+1024] = y;
|
||||
swap[sw+1536] = color;
|
||||
swap[sw+{{NTHREADS}}] = x;
|
||||
swap[sw+{{2*NTHREADS}}] = y;
|
||||
swap[sw+{{3*NTHREADS}}] = color;
|
||||
__syncthreads();
|
||||
// This is in the middle of the function so that only one sync is
|
||||
// required per loop.
|
||||
@ -229,14 +229,14 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) {
|
||||
|
||||
{{if not features.chaos_used}}
|
||||
// Similarly, we select the next xforms here.
|
||||
if (threadIdx.y == 0 && threadIdx.x < 16)
|
||||
if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
|
||||
cosel[threadIdx.x] = mwc_next_01(rctx);
|
||||
{{endif}}
|
||||
|
||||
consec_bad = swap[sr];
|
||||
x = swap[sr+512];
|
||||
y = swap[sr+1024];
|
||||
color = swap[sr+1536];
|
||||
x = swap[sr+{{NTHREADS}}];
|
||||
y = swap[sr+{{2*NTHREADS}}];
|
||||
color = swap[sr+{{3*NTHREADS}}];
|
||||
{{endif}}
|
||||
|
||||
if (consec_bad < 0) {
|
||||
@ -286,5 +286,7 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) {
|
||||
return tmpl.substitute(
|
||||
features = self.features,
|
||||
packer = self.packer.view('info'),
|
||||
NTHREADS = self.NTHREADS,
|
||||
NWARPS = self.NTHREADS / 32,
|
||||
**globals())
|
||||
|
||||
|
@ -107,7 +107,7 @@ class Animation(object):
|
||||
In other words, it's best to use exactly one Animation for each
|
||||
interpolated sequence between one or two genomes.
|
||||
"""
|
||||
cmp_options = ('-use_fast_math', '-maxrregcount', '32')
|
||||
cmp_options = ('-use_fast_math', '-maxrregcount', '42')
|
||||
keep = False
|
||||
|
||||
def __init__(self, ctypes_genome_array):
|
||||
|
Loading…
Reference in New Issue
Block a user