mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 11:40:04 -05:00
Use 3*256 instead of 2*512 blocks; faster on GF104
This commit is contained in:
parent
c7728d3507
commit
3be14547ea
@ -7,7 +7,7 @@ from cuburn.code.util import *
|
|||||||
|
|
||||||
class IterCode(HunkOCode):
|
class IterCode(HunkOCode):
|
||||||
# The number of threads per block
|
# The number of threads per block
|
||||||
NTHREADS = 512
|
NTHREADS = 256
|
||||||
|
|
||||||
def __init__(self, features):
|
def __init__(self, features):
|
||||||
self.features = features
|
self.features = features
|
||||||
@ -158,9 +158,9 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) {
|
|||||||
int last_xf_used = 0;
|
int last_xf_used = 0;
|
||||||
{{else}}
|
{{else}}
|
||||||
// Size can be reduced by a factor of four using a slower 4-stage reduce
|
// Size can be reduced by a factor of four using a slower 4-stage reduce
|
||||||
__shared__ float swap[2048];
|
__shared__ float swap[{{4*NTHREADS}}];
|
||||||
__shared__ float cosel[16];
|
__shared__ float cosel[{{NWARPS}}];
|
||||||
if (threadIdx.y == 0 && threadIdx.x < 16)
|
if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
|
||||||
cosel[threadIdx.x] = mwc_next_01(rctx);
|
cosel[threadIdx.x] = mwc_next_01(rctx);
|
||||||
{{endif}}
|
{{endif}}
|
||||||
|
|
||||||
@ -215,13 +215,13 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) {
|
|||||||
|
|
||||||
{{if not features.chaos_used}}
|
{{if not features.chaos_used}}
|
||||||
// Swap thread states here so that writeback skipping logic doesn't die
|
// Swap thread states here so that writeback skipping logic doesn't die
|
||||||
int sw = (threadIdx.y * 32 + threadIdx.x * 33) & 0x1ff;
|
int sw = (threadIdx.y * 32 + threadIdx.x * 33) & {{NTHREADS-1}};
|
||||||
int sr = threadIdx.y * 32 + threadIdx.x;
|
int sr = threadIdx.y * 32 + threadIdx.x;
|
||||||
|
|
||||||
swap[sw] = consec_bad;
|
swap[sw] = consec_bad;
|
||||||
swap[sw+512] = x;
|
swap[sw+{{NTHREADS}}] = x;
|
||||||
swap[sw+1024] = y;
|
swap[sw+{{2*NTHREADS}}] = y;
|
||||||
swap[sw+1536] = color;
|
swap[sw+{{3*NTHREADS}}] = color;
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
// This is in the middle of the function so that only one sync is
|
// This is in the middle of the function so that only one sync is
|
||||||
// required per loop.
|
// required per loop.
|
||||||
@ -229,14 +229,14 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) {
|
|||||||
|
|
||||||
{{if not features.chaos_used}}
|
{{if not features.chaos_used}}
|
||||||
// Similarly, we select the next xforms here.
|
// Similarly, we select the next xforms here.
|
||||||
if (threadIdx.y == 0 && threadIdx.x < 16)
|
if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
|
||||||
cosel[threadIdx.x] = mwc_next_01(rctx);
|
cosel[threadIdx.x] = mwc_next_01(rctx);
|
||||||
{{endif}}
|
{{endif}}
|
||||||
|
|
||||||
consec_bad = swap[sr];
|
consec_bad = swap[sr];
|
||||||
x = swap[sr+512];
|
x = swap[sr+{{NTHREADS}}];
|
||||||
y = swap[sr+1024];
|
y = swap[sr+{{2*NTHREADS}}];
|
||||||
color = swap[sr+1536];
|
color = swap[sr+{{3*NTHREADS}}];
|
||||||
{{endif}}
|
{{endif}}
|
||||||
|
|
||||||
if (consec_bad < 0) {
|
if (consec_bad < 0) {
|
||||||
@ -286,5 +286,7 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) {
|
|||||||
return tmpl.substitute(
|
return tmpl.substitute(
|
||||||
features = self.features,
|
features = self.features,
|
||||||
packer = self.packer.view('info'),
|
packer = self.packer.view('info'),
|
||||||
|
NTHREADS = self.NTHREADS,
|
||||||
|
NWARPS = self.NTHREADS / 32,
|
||||||
**globals())
|
**globals())
|
||||||
|
|
||||||
|
@ -107,7 +107,7 @@ class Animation(object):
|
|||||||
In other words, it's best to use exactly one Animation for each
|
In other words, it's best to use exactly one Animation for each
|
||||||
interpolated sequence between one or two genomes.
|
interpolated sequence between one or two genomes.
|
||||||
"""
|
"""
|
||||||
cmp_options = ('-use_fast_math', '-maxrregcount', '32')
|
cmp_options = ('-use_fast_math', '-maxrregcount', '42')
|
||||||
keep = False
|
keep = False
|
||||||
|
|
||||||
def __init__(self, ctypes_genome_array):
|
def __init__(self, ctypes_genome_array):
|
||||||
|
Loading…
Reference in New Issue
Block a user