From daf56ffc5306408d1543b7ac2ef1bf7e1b1ad0ef Mon Sep 17 00:00:00 2001 From: Steven Robertson Date: Sun, 29 May 2011 15:15:06 -0400 Subject: [PATCH] Split thread group up along warp boundary (this is useful later) --- cuburn/code/iter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py index 851ec6f..15f7ce4 100644 --- a/cuburn/code/iter.py +++ b/cuburn/code/iter.py @@ -68,7 +68,8 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) { iter_info *info_glob = &(infos[blockIdx.x]); // load info to shared memory cooperatively - for (int i = threadIdx.x; i * 4 < sizeof(iter_info); i += blockDim.x) + for (int i = threadIdx.y * 32 + threadIdx.x; + i * 4 < sizeof(iter_info); i += blockDim.x * blockDim.y) reinterpret_cast(&info)[i] = reinterpret_cast(info_glob)[i]; @@ -215,7 +216,7 @@ def render(features, cps): fun = mod.get_function("iter") fun.set_cache_config(cuda.func_cache.PREFER_L1) t = fun(InOut(seeds), InOut(infos), abufd, dbufd, - block=(512,1,1), grid=(nsteps,1), time_kernel=True) + block=(32,16,1), grid=(nsteps,1), time_kernel=True) print "Completed render in %g seconds" % t f = np.float32