diff --git a/sortbench.cu b/sortbench.cu
index 5abc0f9..c41abbf 100644
--- a/sortbench.cu
+++ b/sortbench.cu
@@ -1,5 +1,5 @@
 #include
-
+#include
 
 __global__
 void prefix_scan_8_0_shmem(unsigned char *keys, int nitems, int *pfxs) {
@@ -27,6 +27,131 @@ void prefix_scan_8_0_shmem(unsigned char *keys, int nitems, int *pfxs) {
     }
 }
 
+
+
+__global__
+void prefix_scan_8_0_shmem_shortseg(unsigned char *keys, int *pfxs) {
+    const int blksz = 256;
+    const int grpsz = 8192;
+    const int tid = threadIdx.y * 32 + threadIdx.x;
+    __shared__ int shr_pfxs[blksz];
+
+    shr_pfxs[tid] = 0;
+    __syncthreads();
+
+    // TODO: this introduces a hard upper limit of 512M keys (3GB) sorted in a
+    // pass. It'll be a while before we get the 8GB cards needed to do this.
+    int i = tid + grpsz * blockIdx.x;
+
+    for (int j = 0; j < 32; j++) {
+        int value = keys[i];
+        atomicAdd(shr_pfxs + value, 1);
+        i += blksz;
+    }
+
+    __syncthreads();
+    pfxs[tid + blksz * blockIdx.x] = shr_pfxs[tid];
+}
+
+__global__
+void crappy_split(int *pfxs, int *pfxs_out) {
+    const int blksz = 256;
+    const int tid = threadIdx.y * 32 + threadIdx.x;
+    int i = blksz * (tid + blockIdx.x * blksz);
+    int i_bound = i + blksz;
+    int val = 0;
+    for (; i < i_bound; i++) {
+        pfxs_out[i] = val;
+        val += pfxs[i];
+    }
+}
+
+__global__
+void prefix_sum(int *pfxs, int nitems, int *out_pfxs, int *out_sums) {
+    const int blksz = 256;
+    const int tid = threadIdx.y * 32 + threadIdx.x;
+    int val = 0;
+    for (int i = tid; i < nitems; i += blksz) val += pfxs[i];
+
+    out_pfxs[tid] = val;
+
+    // I know there's a better way to implement this summing network,
+    // but it's not a time-critical piece of code.
+    __shared__ int sh_pfxs[blksz];
+    sh_pfxs[tid] = val;
+    val = 0;
+    __syncthreads();
+    // Intentionally exclusive indexing here, val{0} should be 0
+    for (int i = 0; i < tid; i++) val += sh_pfxs[i];
+    out_sums[tid] = val;
+
+    // Here we shift things over by 1, to make retrieving the
+    // indices and differences easier in the sorting step.
+    int i;
+    for (i = tid; i < nitems; i += blksz) {
+        int t = pfxs[i];
+        pfxs[i] = val;
+        val += t;
+    }
+    // Now write the last column and we're done.
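+    // (annotation) pfxs is allocated with one spare 256-entry row beyond
+    // nitems (incr_pfxs is (grids + 1, 256) in sortbench.py), so the write
+    // below lands in that spare row rather than out of bounds.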
+    pfxs[i] = val;
+}
+
+__global__
+void sort_8(unsigned char *keys, int *sorted_keys, int *pfxs) {
+    const int grpsz = 8192;
+    const int blksz = 256;
+    const int tid = threadIdx.y * 32 + threadIdx.x;
+    const int blk_offset = grpsz * blockIdx.x;
+    __shared__ int shr_pfxs[blksz];
+
+    if (threadIdx.y < 8) {
+        int pfx_i = blksz * blockIdx.x + tid;
+        shr_pfxs[tid] = pfxs[pfx_i];
+    }
+    __syncthreads();
+
+    int i = tid;
+    for (int j = 0; j < 32; j++) {
+        int value = keys[i+blk_offset];
+        int offset = atomicAdd(shr_pfxs + value, 1);
+        sorted_keys[offset] = value;
+        i += blksz;
+    }
+}
+
+__global__
+void sort_8_a(unsigned char *keys, int *sorted_keys, int *pfxs, int *split) {
+    const int grpsz = 8192;
+    const int blksz = 256;
+    const int tid = threadIdx.y * 32 + threadIdx.x;
+    const int blk_offset = grpsz * blockIdx.x;
+    __shared__ int shr_pfxs[blksz];
+    __shared__ int shr_offs[blksz];
+    __shared__ int defer[grpsz];
+
+    const int pfx_i = blksz * blockIdx.x + tid;
+    shr_pfxs[tid] = pfxs[pfx_i];
+    shr_offs[tid] = split[pfx_i];
+    __syncthreads();
+
+    for (int i = tid; i < grpsz; i += blksz) {
+        int value = keys[i+blk_offset];
+        int offset = atomicAdd(shr_offs + value, 1);
+        defer[offset] = value;
+    }
+    //shr_pfxs[tid] = pfxs[pfx_i];
+    __syncthreads();
+
+    for (int i = tid; i < grpsz; i += blksz) {
+        int value = defer[i];
+        int offset = shr_pfxs[value] + i - (shr_offs[value] - shr_offs[0]);
+        sorted_keys[offset] = value;
+    }
+}
+
+
+
 __global__
 void prefix_scan_8_0_shmem_lessconf(unsigned char *keys, int nitems, int *pfxs) {
     __shared__ int sh_pfxs_banked[256][32];
diff --git a/sortbench.py b/sortbench.py
index ea2574d..70fd6bc 100644
--- a/sortbench.py
+++ b/sortbench.py
@@ -7,7 +7,7 @@ import pycuda.driver as cuda
 import numpy as np
 
 import os
-os.environ['PATH'] = ('/usr/x86_64-pc-linux-gnu/gcc-bin/4.4.5:'
+os.environ['PATH'] = ('/usr/x86_64-pc-linux-gnu/gcc-bin/4.4.6:'
                       + os.environ['PATH'])
 
 def go(scale, block, test_cpu):
@@ -22,7 +22,7 @@ def go(scale, block, test_cpu):
         print 'took %g secs on CPU' % (b - a)
 
     with open('sortbench.cu') as f: src = f.read()
-    mod = pycuda.compiler.SourceModule(src)
+    mod = pycuda.compiler.SourceModule(src, keep=True)
     fun = mod.get_function('prefix_scan_8_0_shmem')
     shmem_pfxs = np.zeros(256, dtype=np.int32)
     t = fun(cuda.In(data), np.int32(block), cuda.InOut(shmem_pfxs),
@@ -53,11 +53,65 @@ def go(scale, block, test_cpu):
 
     print popc5_pfxs
 
+    grids = scale * block / 8192
+    print grids
+    incr_pfxs = np.zeros((grids + 1, 256), dtype=np.int32)
+    shortseg_pfxs = np.zeros(256, dtype=np.int32)
+    shortseg_sums = np.zeros(256, dtype=np.int32)
+    fun = mod.get_function('prefix_scan_8_0_shmem_shortseg')
+    fun.set_cache_config(cuda.func_cache.PREFER_L1)
+    t = fun(cuda.In(data), cuda.Out(incr_pfxs),
+            block=(32, 8, 1), grid=(grids, 1), time_kernel=True)
+    print 'shortseg took %g secs.' % t
+    print incr_pfxs[0]
+    print incr_pfxs[1]
+
+    split = np.zeros((grids, 256), dtype=np.int32)
+    fun = mod.get_function('crappy_split')
+    fun.set_cache_config(cuda.func_cache.PREFER_L1)
+    t = fun(cuda.In(incr_pfxs), cuda.Out(split),
+            block=(32, 8, 1), grid=(grids / 256, 1), time_kernel=True)
+    print 'crappy_split took %g secs.' % t
+    print split
+
+    fun = mod.get_function('prefix_sum')
+    fun.set_cache_config(cuda.func_cache.PREFER_L1)
+    t = fun(cuda.InOut(incr_pfxs), np.int32(grids * 256),
+            cuda.Out(shortseg_pfxs), cuda.Out(shortseg_sums),
+            block=(32, 8, 1), grid=(1, 1), time_kernel=True)
+    print 'shortseg_sum took %g secs.' % t
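+    # (annotation) shortseg_pfxs (out_pfxs from prefix_sum) should reproduce
+    # the per-value histogram computed by the earlier scan (popc_pfxs); the
+    # check below verifies that.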
+    print 'it worked? %s' % (np.all(shortseg_pfxs == popc_pfxs))
+    print shortseg_pfxs
+    print shortseg_sums
+    print incr_pfxs[1] - incr_pfxs[0]
+
+    sorted = np.zeros(scale * block, dtype=np.int32)
+    fun = mod.get_function('sort_8')
+    fun.set_cache_config(cuda.func_cache.PREFER_L1)
+    t = fun(cuda.In(data), cuda.Out(sorted), cuda.In(incr_pfxs),
+            block=(32, 8, 1), grid=(grids, 1), time_kernel=True)
+    print 'shortseg_sort took %g secs.' % t
+    print 'incr0', incr_pfxs[0]
+    print sorted[:100]
+    print sorted[-100:]
+
+    sorted = np.zeros(scale * block, dtype=np.int32)
+    fun = mod.get_function('sort_8_a')
+    t = fun(cuda.In(data), cuda.Out(sorted), cuda.In(incr_pfxs), cuda.In(split),
+            block=(32, 8, 1), grid=(grids, 1), time_kernel=True)
+    print 'shortseg_sort took %g secs.' % t
+    print 'incr0', incr_pfxs[0]
+    print sorted[:100]
+    print sorted[-100:]
+
+
+
 def main():
     # shmem is known good; disable the CPU run to get better info from cuprof
     #go(8, 512<<10, True)
-    go(1024, 512<<10, False)
+    go(1024, 512<<8, False)
+    #go(32768, 8192, False)
 
 main()
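
For reference, the pass assembled above (per-group histogram, digit-major exclusive scan, then scatter) can be mimicked on the host to sanity-check the GPU output. The sketch below is not part of the patch: the helper name ref_sort_pass is made up, np.bincount stands in for the shared-memory histogram, and the plain Python loops are only practical for small inputs.

import numpy as np

def ref_sort_pass(keys, grpsz=8192):
    # Per-group 256-bin histograms, as prefix_scan_8_0_shmem_shortseg builds
    # in shared memory (one grpsz-key group per block). Assumes uint8 keys
    # and len(keys) being a multiple of grpsz.
    ngrps = len(keys) // grpsz
    hists = np.array([np.bincount(keys[g*grpsz:(g+1)*grpsz], minlength=256)
                      for g in range(ngrps)], dtype=np.int32)
    # Exclusive scan with the digit varying slowest and the group fastest,
    # mirroring the column-wise walk prefix_sum does over pfxs.
    offs = np.zeros_like(hists)
    run = 0
    for d in range(256):
        for g in range(ngrps):
            offs[g, d] = run
            run += hists[g, d]
    # Scatter each group's keys to their global offsets, as sort_8 does with
    # atomicAdd on shr_pfxs. Equal keys are indistinguishable, so the
    # nondeterministic intra-digit order on the GPU cannot change the result.
    out = np.empty_like(keys)
    cur = offs.copy()
    for g in range(ngrps):
        for k in keys[g*grpsz:(g+1)*grpsz]:
            out[cur[g, k]] = k
            cur[g, k] += 1
    return out

# For a small uint8 array, ref_sort_pass(small) should equal np.sort(small),
# and hists/offs should match incr_pfxs before/after the prefix_sum kernel.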