Use 3*256 instead of 2*512 blocks; faster on GF104

2025-06-30 13:26:27 -04:00 · 2011-10-15 00:33:37 -04:00
parent c7728d3507
commit 3be14547ea
2 changed files with 15 additions and 13 deletions
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@ -7,7 +7,7 @@ from cuburn.code.util import *
 class IterCode(HunkOCode):
    # The number of threads per block
-    NTHREADS = 512
+    NTHREADS = 256
    def __init__(self, features):
        self.features = features
@ -158,9 +158,9 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) {
    int last_xf_used = 0;
    {{else}}
    // Size can be reduced by a factor of four using a slower 4-stage reduce
-    __shared__ float swap[2048];
+    __shared__ float swap[{{4*NTHREADS}}];
-    __shared__ float cosel[16];
+    __shared__ float cosel[{{NWARPS}}];
-    if (threadIdx.y == 0 && threadIdx.x < 16)
+    if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
        cosel[threadIdx.x] = mwc_next_01(rctx);
    {{endif}}
@ -215,13 +215,13 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) {
        {{if not features.chaos_used}}
        // Swap thread states here so that writeback skipping logic doesn't die
-        int sw = (threadIdx.y * 32 + threadIdx.x * 33) & 0x1ff;
+        int sw = (threadIdx.y * 32 + threadIdx.x * 33) & {{NTHREADS-1}};
        int sr = threadIdx.y * 32 + threadIdx.x;
        swap[sw] = consec_bad;
-        swap[sw+512] = x;
+        swap[sw+{{NTHREADS}}] = x;
-        swap[sw+1024] = y;
+        swap[sw+{{2*NTHREADS}}] = y;
-        swap[sw+1536] = color;
+        swap[sw+{{3*NTHREADS}}] = color;
        __syncthreads();
        // This is in the middle of the function so that only one sync is
        // required per loop.
@ -229,14 +229,14 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) {
        {{if not features.chaos_used}}
        // Similarly, we select the next xforms here.
-        if (threadIdx.y == 0 && threadIdx.x < 16)
+        if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
            cosel[threadIdx.x] = mwc_next_01(rctx);
        {{endif}}
        consec_bad = swap[sr];
-        x = swap[sr+512];
+        x = swap[sr+{{NTHREADS}}];
-        y = swap[sr+1024];
+        y = swap[sr+{{2*NTHREADS}}];
-        color = swap[sr+1536];
+        color = swap[sr+{{3*NTHREADS}}];
        {{endif}}
        if (consec_bad < 0) {
@ -286,5 +286,7 @@ void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) {
        return tmpl.substitute(
                features = self.features,
                packer = self.packer.view('info'),
                NTHREADS = self.NTHREADS,
                NWARPS = self.NTHREADS / 32,
                **globals())
--- a/cuburn/render.py
+++ b/cuburn/render.py
@ -107,7 +107,7 @@ class Animation(object):
    In other words, it's best to use exactly one Animation for each
    interpolated sequence between one or two genomes.
    """
-    cmp_options = ('-use_fast_math', '-maxrregcount', '32')
+    cmp_options = ('-use_fast_math', '-maxrregcount', '42')
    keep = False
    def __init__(self, ctypes_genome_array):