From 638d068a00d13492c7627ac0bfea9aa29f12158a Mon Sep 17 00:00:00 2001
From: Steven Robertson
Date: Sat, 27 Aug 2011 12:56:06 -0400
Subject: [PATCH] Promising performance here.

---
 sortbench.cu |  160 ++++++++++++++++++++++++++++++++++++++++-----------
 sortbench.py |  152 ++++++++++++++++++++++++++----------------------
 2 files changed, 210 insertions(+), 102 deletions(-)

diff --git a/sortbench.cu b/sortbench.cu
index c41abbf..91f1ce9 100644
--- a/sortbench.cu
+++ b/sortbench.cu
@@ -27,30 +27,51 @@ void prefix_scan_8_0_shmem(unsigned char *keys, int nitems, int *pfxs) {
     }
 }
 
+#define GRP_RDX_FACTOR (GRPSZ / RDXSZ)
+#define GRP_BLK_FACTOR (GRPSZ / BLKSZ)
+#define GRPSZ 8192
+#define RDXSZ 256
+#define BLKSZ 512
+__global__
+void prefix_scan(unsigned short *keys, int *pfxs, const int shift) {
+    const int tid = threadIdx.y * 32 + threadIdx.x;
+    __shared__ int shr_pfxs[BLKSZ];
+
+    shr_pfxs[tid] = 0;
+    __syncthreads();
+    int i = tid + GRPSZ * blockIdx.x;
+
+    for (int j = 0; j < GRP_BLK_FACTOR; j++) {
+        int value = (keys[i] >> shift) & 0xff;
+        atomicAdd(shr_pfxs + value, 1);
+        i += BLKSZ;
+    }
+
+    __syncthreads();
+    pfxs[tid + BLKSZ * blockIdx.x] = shr_pfxs[tid];
+}
+
 __global__
 void prefix_scan_8_0_shmem_shortseg(unsigned char *keys, int *pfxs) {
-    const int blksz = 256;
-    const int grpsz = 8192;
     const int tid = threadIdx.y * 32 + threadIdx.x;
-    __shared__ int shr_pfxs[blksz];
+    __shared__ int shr_pfxs[RDXSZ];
 
-    shr_pfxs[tid] = 0;
+    if (tid < RDXSZ) shr_pfxs[tid] = 0;
     __syncthreads();
 
     // TODO: this introduces a hard upper limit of 512M keys (3GB) sorted in a
     // pass. It'll be a while before we get the 8GB cards needed to do this.
-    int i = tid + grpsz * blockIdx.x;
+    int i = tid + GRPSZ * blockIdx.x;
 
-    for (int j = 0; j < 32; j++) {
+    for (int j = 0; j < GRP_BLK_FACTOR; j++) {
         int value = keys[i];
         atomicAdd(shr_pfxs + value, 1);
-        i += blksz;
+        i += BLKSZ;
     }
 
     __syncthreads();
-    pfxs[tid + blksz * blockIdx.x] = shr_pfxs[tid];
+    if (tid < RDXSZ) pfxs[tid + RDXSZ * blockIdx.x] = shr_pfxs[tid];
 }
 
 __global__
@@ -66,10 +87,64 @@ void crappy_split(int *pfxs, int *pfxs_out) {
     }
 }
 
+__global__
+void better_split(int *pfxs_out, const int *pfxs) {
+    // This one must be launched as 32x1, regardless of BLKSZ.
+    const int tid = threadIdx.x;
+    const int tid5 = tid << 5;
+    __shared__ int swap[1024];
+
+    int base = RDXSZ * 32 * blockIdx.x;
+
+    int value = 0;
+
+    // Performs a fast "split" (don't know why I called it that, will rename
+    // soon). For each entry in pfxs (corresponding to the number of elements
+    // per radix in a group), this writes the exclusive prefix sum for that
+    // group. This is in fact a bunch of serial prefix sums in parallel, and
+    // not a parallel prefix sum.
+    //
+    // The contents of 32 group radix counts are loaded in 32-element chunks
+    // into shared memory, rotated by 1 unit each group to avoid bank
+    // conflicts. Each thread in the warp sums across each group serially,
+    // updating the values as it goes, then the results are written coherently
+    // to global memory.
+    //
+    // This leaves the processor extremely compute-starved, as this only allows
+    // 12 warps per SM. It might be better to halve the chunk size and lose
+    // some coalescing efficiency; need to benchmark. It's a relatively cheap
+    // step overall though.
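+    //
+    // Concretely: in the load loop, iteration i fetches group i's count for
+    // radix (jj + tid) from pfxs[base + (i << 8) + jj + tid] (consecutive
+    // tids read consecutive words, so the loads coalesce) and parks it at
+    // shared row i, column ((i + tid) & 0x1f). In the sum loop, thread tid
+    // walks its own row (group tid); at step i the rotated column
+    // ((i + tid) & 0x1f) holds that group's count for radix (jj + i), and
+    // since the bank of element tid*32 + ((i + tid) & 0x1f) is
+    // (i + tid) mod 32, all 32 lanes hit distinct banks on every step.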
+
+    for (int j = 0; j < 8; j++) {
+        int jj = j << 5;
+        for (int i = 0; i < 32; i++) {
+            int base_offset = (i << 8) + jj + base + tid;
+            int swap_offset = (i << 5) + ((i + tid) & 0x1f);
+            swap[swap_offset] = pfxs[base_offset];
+        }
+
+#pragma unroll
+        for (int i = 0; i < 32; i++) {
+            int swap_offset = tid5 + ((i + tid) & 0x1f);
+            int tmp = swap[swap_offset];
+            swap[swap_offset] = value;
+            value += tmp;
+        }
+
+        for (int i = 0; i < 32; i++) {
+            int base_offset = (i << 8) + jj + base + tid;
+            int swap_offset = (i << 5) + ((i + tid) & 0x1f);
+            pfxs_out[base_offset] = swap[swap_offset];
+        }
+    }
+}
+
 __global__
 void prefix_sum(int *pfxs, int nitems, int *out_pfxs, int *out_sums) {
-    const int blksz = 256;
+    // Needs optimizing (later). Should be rolled into split.
+    // Must launch 32x8.
     const int tid = threadIdx.y * 32 + threadIdx.x;
+    const int blksz = 256;
     int val = 0;
     for (int i = tid; i < nitems; i += blksz) val += pfxs[i];
 
@@ -99,54 +174,73 @@ void prefix_sum(int *pfxs, int nitems, int *out_pfxs, int *out_sums) {
 
 __global__
 void sort_8(unsigned char *keys, int *sorted_keys, int *pfxs) {
-    const int grpsz = 8192;
-    const int blksz = 256;
     const int tid = threadIdx.y * 32 + threadIdx.x;
-    const int blk_offset = grpsz * blockIdx.x;
-    __shared__ int shr_pfxs[blksz];
+    const int blk_offset = GRPSZ * blockIdx.x;
+    __shared__ int shr_pfxs[RDXSZ];
 
-    if (threadIdx.y < 8) {
-        int pfx_i = blksz * blockIdx.x + tid;
-        shr_pfxs[tid] = pfxs[pfx_i];
-    }
+    if (tid < RDXSZ) shr_pfxs[tid] = pfxs[RDXSZ * blockIdx.x + tid];
     __syncthreads();
 
     int i = tid;
-    for (int j = 0; j < 32; j++) {
+    for (int j = 0; j < GRP_BLK_FACTOR; j++) {
         int value = keys[i+blk_offset];
         int offset = atomicAdd(shr_pfxs + value, 1);
         sorted_keys[offset] = value;
-        i += blksz;
+        i += BLKSZ;
     }
 }
 
+#undef BLKSZ
+#define BLKSZ 1024
 __global__
-void sort_8_a(unsigned char *keys, int *sorted_keys, int *pfxs, int *split) {
-    const int grpsz = 8192;
-    const int blksz = 256;
+void sort_8_a(unsigned char *keys, int *sorted_keys,
+              const int *pfxs, const int *split) {
     const int tid = threadIdx.y * 32 + threadIdx.x;
-    const int blk_offset = grpsz * blockIdx.x;
-    __shared__ int shr_pfxs[blksz];
-    __shared__ int shr_offs[blksz];
-    __shared__ int defer[grpsz];
+    const int blk_offset = GRPSZ * blockIdx.x;
+    __shared__ int shr_offs[RDXSZ];
+    __shared__ int defer[GRPSZ];
 
-    const int pfx_i = blksz * blockIdx.x + tid;
-    shr_pfxs[tid] = pfxs[pfx_i];
-    shr_offs[tid] = split[pfx_i];
+    const int pfx_i = RDXSZ * blockIdx.x + tid;
+    if (tid < RDXSZ) shr_offs[tid] = split[pfx_i];
     __syncthreads();
 
-    for (int i = tid; i < grpsz; i += blksz) {
+    for (int i = tid; i < GRPSZ; i += BLKSZ) {
         int value = keys[i+blk_offset];
         int offset = atomicAdd(shr_offs + value, 1);
         defer[offset] = value;
     }
 
-    //shr_pfxs[tid] = pfxs[pfx_i];
     __syncthreads();
 
-    for (int i = tid; i < grpsz; i += blksz) {
+    // This calculation is a bit odd.
+    //
+    // For a given radix value 'r', shr_offs[r] currently holds the first index
+    // of the *next* radix in defer[] (i.e. if there are 28 '0'-radix values
+    // in defer[], shr_offs[0]==28). We want to get back to a normal exclusive
+    // prefix, so we subtract shr_offs[0] from everything.
+    //
+    // In the next block, we want to be able to find the correct position for a
+    // value in defer[], given that value's index 'i' and its radix 'r'. This
+    // requires two values: the destination index in sorted_keys[] of the first
+    // value in the group with radix 'r' (given by pfxs[BASE + r]), and the
+    // number of radix-'r' values before this one in defer[]. So, ultimately,
+    // we want an equation in the inner loop below that looks like this:
+    //
+    //     int dst_offset = pfxs[r] + i - (shr_offs[r] - shr_offs[0]);
+    //     sorted_keys[dst_offset] = defer[i];
+    //
+    // Of course, this generates tons of memory lookups and bank conflicts so
+    // we precombine some of this here.
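+    //
+    // Folding the two lookups together: let
+    //
+    //     off[r] = pfxs[BASE + r] - (shr_offs[r] - shr_offs[0])
+    //
+    // be computed once per radix just below and stored back into shr_offs[].
+    // The scatter then collapses to sorted_keys[off[r] + i] = defer[i], which
+    // is all the final loop has to do per key.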
+    int off0 = shr_offs[0];
+    if (tid < RDXSZ) shr_offs[tid] = pfxs[pfx_i] - (shr_offs[tid] - off0);
+    __syncthreads();
+
+    int i = tid;
+#pragma unroll
+    for (int j = 0; j < GRP_BLK_FACTOR; j++) {
         int value = defer[i];
-        int offset = shr_pfxs[value] + i - (shr_offs[value] - shr_offs[0]);
+        int offset = shr_offs[value] + i;
         sorted_keys[offset] = value;
+        i += BLKSZ;
     }
 }
diff --git a/sortbench.py b/sortbench.py
index 70fd6bc..8382b07 100644
--- a/sortbench.py
+++ b/sortbench.py
@@ -6,10 +6,28 @@
 import pycuda.driver as cuda
 import numpy as np
 
-import os
+import sys, os
 os.environ['PATH'] = ('/usr/x86_64-pc-linux-gnu/gcc-bin/4.4.6:'
                       + os.environ['PATH'])
 
+with open('sortbench.cu') as f: src = f.read()
+mod = pycuda.compiler.SourceModule(src, keep=True)
+
+def launch(name, *args, **kwargs):
+    fun = mod.get_function(name)
+    if kwargs.pop('l1', False):
+        fun.set_cache_config(cuda.func_cache.PREFER_L1)
+    if not kwargs.get('stream'):
+        kwargs['time_kernel'] = True
+    print 'launching %s with %sx%s... ' % (name, kwargs['block'],
+                                           kwargs['grid']),
+    t = fun(*args, **kwargs)
+    if t:
+        print 'done (%g secs).' % t
+    else:
+        print 'done.'
+
+
 def go(scale, block, test_cpu):
     data = np.fromstring(np.random.bytes(scale*block), dtype=np.uint8)
     print 'Done seeding'
@@ -21,98 +39,94 @@ def go(scale, block, test_cpu):
         print cpu_pfxs
         print 'took %g secs on CPU' % (b - a)
 
-    with open('sortbench.cu') as f: src = f.read()
-    mod = pycuda.compiler.SourceModule(src, keep=True)
-
-    fun = mod.get_function('prefix_scan_8_0_shmem')
     shmem_pfxs = np.zeros(256, dtype=np.int32)
-    t = fun(cuda.In(data), np.int32(block), cuda.InOut(shmem_pfxs),
-            block=(32, 16, 1), grid=(scale, 1), time_kernel=True)
-    print 'shmem took %g secs.' % t
+    launch('prefix_scan_8_0_shmem',
+            cuda.In(data), np.int32(block), cuda.InOut(shmem_pfxs),
+            block=(32, 16, 1), grid=(scale, 1), l1=1)
     if test_cpu:
         print 'it worked? %s' % (np.all(shmem_pfxs == cpu_pfxs))
 
-    fun = mod.get_function('prefix_scan_8_0_shmem_lessconf')
     shmeml_pfxs = np.zeros(256, dtype=np.int32)
-    t = fun(cuda.In(data), np.int32(block), cuda.InOut(shmeml_pfxs),
-            block=(32, 32, 1), grid=(scale, 1), time_kernel=True)
-    print 'shmeml took %g secs.' % t
+    launch('prefix_scan_8_0_shmem_lessconf',
+            cuda.In(data), np.int32(block), cuda.InOut(shmeml_pfxs),
+            block=(32, 32, 1), grid=(scale, 1), l1=1)
     print 'it worked? %s' % (np.all(shmeml_pfxs == shmem_pfxs))
 
-    fun = mod.get_function('prefix_scan_8_0_popc')
     popc_pfxs = np.zeros(256, dtype=np.int32)
-    t = fun(cuda.In(data), np.int32(block), cuda.InOut(popc_pfxs),
-            block=(32, 16, 1), grid=(scale, 1), time_kernel=True)
-    print 'popc took %g secs.' % t
-    print 'it worked? %s' % (np.all(shmem_pfxs == popc_pfxs))
+    launch('prefix_scan_8_0_popc',
+            cuda.In(data), np.int32(block), cuda.InOut(popc_pfxs),
+            block=(32, 16, 1), grid=(scale, 1), l1=1)
 
-    fun = mod.get_function('prefix_scan_5_0_popc')
     popc5_pfxs = np.zeros(32, dtype=np.int32)
-    t = fun(cuda.In(data), np.int32(block), cuda.InOut(popc5_pfxs),
-            block=(32, 16, 1), grid=(scale, 1), time_kernel=True)
-    print 'popc5 took %g secs.' % t
-    print popc5_pfxs
+    launch('prefix_scan_5_0_popc',
+            cuda.In(data), np.int32(block), cuda.InOut(popc5_pfxs),
+            block=(32, 16, 1), grid=(scale, 1), l1=1)
 
+def rle(a):
+    pos, = np.where(np.diff(a))
+    lens = np.diff(np.concatenate((pos, [len(a)])))
+    return [(a[p], p, l) for p, l in zip(pos, lens)[:5000]]
 
-    grids = scale * block / 8192
-    print grids
-    incr_pfxs = np.zeros((grids + 1, 256), dtype=np.int32)
-    shortseg_pfxs = np.zeros(256, dtype=np.int32)
-    shortseg_sums = np.zeros(256, dtype=np.int32)
-    fun = mod.get_function('prefix_scan_8_0_shmem_shortseg')
-    fun.set_cache_config(cuda.func_cache.PREFER_L1)
-    t = fun(cuda.In(data), cuda.Out(incr_pfxs),
-            block=(32, 8, 1), grid=(grids, 1), time_kernel=True)
-    print 'shortseg took %g secs.' % t
-    print incr_pfxs[0]
-    print incr_pfxs[1]
+def go_sort(count, stream=None):
+    data = np.fromstring(np.random.bytes(count), dtype=np.uint8)
+    ddata = cuda.to_device(data)
+    print 'Done seeding'
 
-    split = np.zeros((grids, 256), dtype=np.int32)
-    fun = mod.get_function('crappy_split')
-    fun.set_cache_config(cuda.func_cache.PREFER_L1)
-    t = fun(cuda.In(incr_pfxs), cuda.Out(split),
-            block=(32, 8, 1), grid=(grids / 256, 1), time_kernel=True)
-    print 'crappy_split took %g secs.' % t
-    print split
+    grids = count / 8192
+    pfxs = np.zeros((grids + 1, 256), dtype=np.int32)
+    dpfxs = cuda.to_device(pfxs)
 
-    fun = mod.get_function('prefix_sum')
-    fun.set_cache_config(cuda.func_cache.PREFER_L1)
-    t = fun(cuda.InOut(incr_pfxs), np.int32(grids * 256),
-            cuda.Out(shortseg_pfxs), cuda.Out(shortseg_sums),
-            block=(32, 8, 1), grid=(1, 1), time_kernel=True)
-    print 'shortseg_sum took %g secs.' % t
-    print 'it worked? %s' % (np.all(shortseg_pfxs == popc_pfxs))
-    print shortseg_pfxs
-    print shortseg_sums
-    print incr_pfxs[1] - incr_pfxs[0]
+    launch('prefix_scan_8_0_shmem_shortseg', ddata, dpfxs,
+            block=(32, 16, 1), grid=(grids, 1), stream=stream, l1=1)
 
-    sorted = np.zeros(scale * block, dtype=np.int32)
-    fun = mod.get_function('sort_8')
-    fun.set_cache_config(cuda.func_cache.PREFER_L1)
-    t = fun(cuda.In(data), cuda.Out(sorted), cuda.In(incr_pfxs),
            block=(32, 8, 1), grid=(grids, 1), time_kernel=True)
-    print 'shortseg_sort took %g secs.' % t
-    print 'incr0', incr_pfxs[0]
-    print sorted[:100]
-    print sorted[-100:]
+    #dsplit = cuda.to_device(pfxs)
+    #launch('crappy_split', dpfxs, dsplit,
+            #block=(32, 8, 1), grid=(grids / 256, 1), stream=stream, l1=1)
 
-    sorted = np.zeros(scale * block, dtype=np.int32)
-    fun = mod.get_function('sort_8_a')
-    t = fun(cuda.In(data), cuda.Out(sorted), cuda.In(incr_pfxs), cuda.In(split),
-            block=(32, 8, 1), grid=(grids, 1), time_kernel=True)
-    print 'shortseg_sort took %g secs.' % t
-    print 'incr0', incr_pfxs[0]
-    print sorted[:100]
-    print sorted[-100:]
+    dsplit = cuda.mem_alloc(grids * 256 * 4)
+    launch('better_split', dsplit, dpfxs,
+            block=(32, 1, 1), grid=(grids / 32, 1), stream=stream)
+    #if not stream:
+        #split = cuda.from_device_like(dsplit, pfxs)
+        #split_ = cuda.from_device_like(dsplit_, pfxs)
+        #print np.all(split == split_)
+
+    dshortseg_pfxs = cuda.mem_alloc(256 * 4)
+    dshortseg_sums = cuda.mem_alloc(256 * 4)
+    launch('prefix_sum', dpfxs, np.int32(grids * 256),
+            dshortseg_pfxs, dshortseg_sums,
+            block=(32, 8, 1), grid=(1, 1), stream=stream, l1=1)
+
+    dsorted = cuda.mem_alloc(count * 4)
+    launch('sort_8', ddata, dsorted, dpfxs,
+            block=(32, 16, 1), grid=(grids, 1), stream=stream, l1=1)
+
+    launch('sort_8_a', ddata, dsorted, dpfxs, dsplit,
+            block=(32, 32, 1), grid=(grids, 1), stream=stream)
+
+    if not stream:
+        sorted = cuda.from_device(dsorted, (count,), np.int32)
+        f = lambda r: ''.join(['\n\t%3d %4d %4d' % v for v in r])
+        sort_stat = f(rle(sorted))
+        with open('dev.txt', 'w') as fp: fp.write(sort_stat)
+
+        sorted_np = np.sort(data)
+        np_stat = f(rle(sorted_np))
+        with open('cpu.txt', 'w') as fp: fp.write(np_stat)
+
+        print 'is_sorted?', np.all(sorted == sorted_np)
+
+    #data = np.fromstring(np.random.bytes(scale*block), dtype=np.uint16)
 
 def main():
     # shmem is known good; disable the CPU run to get better info from cuprof
     #go(8, 512<<10, True)
-    go(1024, 512<<8, False)
+    #go(1024, 512<<8, False)
     #go(32768, 8192, False)
-
+    stream = cuda.Stream() if '-s' in sys.argv else None
+    go_sort(128<<20, stream)
+    if stream:
+        stream.synchronize()
 
 main()
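
For reference, a minimal host-side sketch (NumPy, Python 2 style to match the script; not part of the patch, and the helper names cpu_group_counts and cpu_split are illustrative only) of what the counting and split stages are expected to compute for each 8192-key group. The first should correspond to the per-group counts prefix_scan_8_0_shmem_shortseg writes into dpfxs, the second to the local offsets better_split writes into dsplit, so reading those buffers back and comparing is a cheap sanity check:

    import numpy as np

    GRPSZ, RDXSZ = 8192, 256

    def cpu_group_counts(data):
        # one row of 256 radix counts per 8192-key group, in the same layout
        # the shortseg counting kernel writes to pfxs
        groups = data.reshape(-1, GRPSZ)
        return np.array([np.bincount(g, minlength=RDXSZ) for g in groups],
                        dtype=np.int32)

    def cpu_split(counts):
        # per-group exclusive prefix sum over the 256 radix bins, i.e. the
        # local scatter offsets the split kernel is expected to produce
        return (np.cumsum(counts, axis=1) - counts).astype(np.int32)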