From 83704dd303593aa3e90a48a897c8857eb6bf57f8 Mon Sep 17 00:00:00 2001 From: Steven Robertson Date: Wed, 31 Aug 2011 10:39:01 -0400 Subject: [PATCH] Fine performance, but the scan's mis-ordering is worse than I thought. --- sortbench.cu | 149 ++++++++++++++++++++++++++++++++++++++-------- sortbench.py | 163 ++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 282 insertions(+), 30 deletions(-) diff --git a/sortbench.cu b/sortbench.cu index 91f1ce9..f501902 100644 --- a/sortbench.cu +++ b/sortbench.cu @@ -1,6 +1,8 @@ #include #include +#define s(x) #x + __global__ void prefix_scan_8_0_shmem(unsigned char *keys, int nitems, int *pfxs) { __shared__ int sh_pfxs[256]; @@ -34,22 +36,26 @@ void prefix_scan_8_0_shmem(unsigned char *keys, int nitems, int *pfxs) { #define BLKSZ 512 __global__ -void prefix_scan(unsigned short *keys, int *pfxs, const int shift) { - const int tid = threadIdx.y * 32 + threadIdx.x; - __shared__ int shr_pfxs[BLKSZ]; +void prefix_scan(unsigned short *offsets, int *pfxs, + const unsigned short *keys, const int shift) { + const int tid = threadIdx.x; + __shared__ int shr_pfxs[RDXSZ]; - shr_pfxs[tid] = 0; + if (tid < RDXSZ) shr_pfxs[tid] = 0; __syncthreads(); int i = tid + GRPSZ * blockIdx.x; for (int j = 0; j < GRP_BLK_FACTOR; j++) { - int value = (keys[i] >> shift) && 0xff; - atomicAdd(shr_pfxs + value, 1); + // TODO: compiler smart enough to turn this into a BFE? + // TODO: should this just be two functions with fixed shifts? + // TODO: separate or integrated loop vars? unrolling? + int value = (keys[i] >> shift) & 0xff; + offsets[i] = atomicAdd(shr_pfxs + value, 1); i += BLKSZ; } __syncthreads(); - pfxs[tid + BLKSZ * blockIdx.x] = shr_pfxs[tid]; + if (tid < RDXSZ) pfxs[tid + RDXSZ * blockIdx.x] = shr_pfxs[tid]; } __global__ @@ -110,10 +116,9 @@ void better_split(int *pfxs_out, const int *pfxs) { // updating the values as it goes, then the results are written coherently // to global memory. // - // This leaves the processor extremely compute-starved, as this only allows - // 12 warps per SM. It might be better to halve the chunk size and lose - // some coalescing efficiency; need to benchmark. It's a relatively cheap - // step overall though. + // This leaves the SM underloaded, as this only allows 12 warps per SM. It + // might be better to halve the chunk size and lose some coalescing + // efficiency; need to benchmark. It's a relatively cheap step, though. for (int j = 0; j < 8; j++) { int jj = j << 5; @@ -139,17 +144,16 @@ void better_split(int *pfxs_out, const int *pfxs) { } } + __global__ -void prefix_sum(int *pfxs, int nitems, int *out_pfxs, int *out_sums) { +void prefix_sum(int *pfxs, const int nitems) { // Needs optimizing (later). Should be rolled into split. - // Must launch 32x8. - const int tid = threadIdx.y * 32 + threadIdx.x; + // Must launch 256 threads. + const int tid = threadIdx.x; const int blksz = 256; int val = 0; for (int i = tid; i < nitems; i += blksz) val += pfxs[i]; - out_pfxs[tid] = val; - // I know there's a better way to implement this summing network, // but it's not a time-critical piece of code. __shared__ int sh_pfxs[blksz]; @@ -158,23 +162,18 @@ void prefix_sum(int *pfxs, int nitems, int *out_pfxs, int *out_sums) { __syncthreads(); // Intentionally exclusive indexing here, val{0} should be 0 for (int i = 0; i < tid; i++) val += sh_pfxs[i]; - out_sums[tid] = val; - // Here we shift things over by 1, to make retrieving the - // indices and differences easier in the sorting step. 
int i; for (i = tid; i < nitems; i += blksz) { int t = pfxs[i]; pfxs[i] = val; val += t; } - // Now write the last column and we're done. - pfxs[i] = val; } __global__ void sort_8(unsigned char *keys, int *sorted_keys, int *pfxs) { - const int tid = threadIdx.y * 32 + threadIdx.x; + const int tid = threadIdx.x; const int blk_offset = GRPSZ * blockIdx.x; __shared__ int shr_pfxs[RDXSZ]; @@ -190,12 +189,13 @@ void sort_8(unsigned char *keys, int *sorted_keys, int *pfxs) { } } + #undef BLKSZ #define BLKSZ 1024 __global__ void sort_8_a(unsigned char *keys, int *sorted_keys, const int *pfxs, const int *split) { - const int tid = threadIdx.y * 32 + threadIdx.x; + const int tid = threadIdx.x; const int blk_offset = GRPSZ * blockIdx.x; __shared__ int shr_offs[RDXSZ]; __shared__ int defer[GRPSZ]; @@ -244,6 +244,109 @@ void sort_8_a(unsigned char *keys, int *sorted_keys, } } +__global__ +void convert_offsets( + unsigned short *offsets, // input and output + const int *split, + const unsigned short *keys, + const int shift + ) { + const int tid = threadIdx.x; + const int blk_offset = GRPSZ * blockIdx.x; + const int rdx_offset = RDXSZ * blockIdx.x; + __shared__ int shr_offsets[GRPSZ]; + __shared__ int shr_split[RDXSZ]; + + if (tid < RDXSZ) shr_split[tid] = split[rdx_offset + tid]; + __syncthreads(); + + for (int i = tid; i < GRPSZ; i += BLKSZ) { + int r = (keys[blk_offset + i] >> shift) & 0xff; + int o = shr_split[r] + offsets[blk_offset + i]; + shr_offsets[o] = i; + } + __syncthreads(); + + for (int i = tid; i < GRPSZ; i += BLKSZ) + offsets[blk_offset + i] = shr_offsets[i]; +} + +__global__ +void radix_sort_maybe( + unsigned short *sorted_keys, + int *sorted_values, + const unsigned short *keys, + const unsigned int *values, + const unsigned short *offsets, + const int *pfxs, + const int *split, + const int shift + ) { + const int tid = threadIdx.x; + const int blk_offset = GRPSZ * blockIdx.x; + const int rdx_offset = RDXSZ * blockIdx.x; + __shared__ int shr_offs[RDXSZ]; + + if (tid < RDXSZ) + shr_offs[tid] = pfxs[rdx_offset + tid] - split[rdx_offset + tid]; + __syncthreads(); + + int i = tid; + for (int j = 0; j < GRP_BLK_FACTOR; j++) { + int offset = offsets[blk_offset + i]; + int key = keys[blk_offset + offset]; + int radix = (key >> shift) & 0xff; + int glob_offset = shr_offs[radix] + i; + /*if (sorted_values[glob_offset] != 0xffffffff) + printf("\nbad offset pos:%6x off:%4x gloff:%6x key:%4x " + "okey:%4x val:%8x oval:%8x", + i+blk_offset, offset, glob_offset, key, + sorted_keys[glob_offset], sorted_values[glob_offset]);*/ + sorted_keys[glob_offset] = key; + sorted_values[glob_offset] = values[blk_offset + offset]; + i += BLKSZ; + } +} + +__global__ +void radix_sort(unsigned short *sorted_keys, int *sorted_values, + const unsigned short *keys, const unsigned int *values, + const int *pfxs, const int *offsets, const int *split, + const int shift) { + const int tid = threadIdx.x; + const int blk_offset = GRPSZ * blockIdx.x; + __shared__ int shr_offs[RDXSZ]; + __shared__ int defer[GRPSZ]; + __shared__ unsigned char radishes[GRPSZ]; + + const int pfx_i = RDXSZ * blockIdx.x + tid; + if (tid < RDXSZ) shr_offs[tid] = split[pfx_i]; + __syncthreads(); + + for (int i = tid; i < GRPSZ; i += BLKSZ) { + int idx = i + blk_offset; + int value = keys[idx]; + int radix = radishes[i] = (value >> shift) & 0xff; + int offset = offsets[idx] + split[radix]; + defer[offset] = value; + } + __syncthreads(); + + if (tid < RDXSZ) shr_offs[tid] = pfxs[tid] - shr_offs[tid]; + __syncthreads(); + + // Faster to reload these or 
to recompute them in shmem? Need to see if we + // can safely stash both + + int i = tid; +#pragma unroll + for (int j = 0; j < GRP_BLK_FACTOR; j++) { + int value = defer[i]; + int offset = shr_offs[value] + i; + sorted_keys[offset] = value; + i += BLKSZ; + } +} __global__ diff --git a/sortbench.py b/sortbench.py index 8382b07..b3c6e6d 100644 --- a/sortbench.py +++ b/sortbench.py @@ -5,11 +5,14 @@ import pycuda.compiler import pycuda.driver as cuda import numpy as np +np.set_printoptions(precision=5, edgeitems=20, linewidth=100, threshold=9000) import sys, os os.environ['PATH'] = ('/usr/x86_64-pc-linux-gnu/gcc-bin/4.4.6:' + os.environ['PATH']) +i32 = np.int32 + with open('sortbench.cu') as f: src = f.read() mod = pycuda.compiler.SourceModule(src, keep=True) @@ -62,12 +65,161 @@ def go(scale, block, test_cpu): cuda.In(data), np.int32(block), cuda.InOut(popc5_pfxs), block=(32, 16, 1), grid=(scale, 1), l1=1) -def rle(a): +def rle(a, n=512): pos, = np.where(np.diff(a)) - lens = np.diff(np.concatenate((pos, [len(a)]))) - return [(a[p], p, l) for p, l in zip(pos, lens)[:5000]] + pos = np.concatenate(([0], pos+1, [len(a)])) + lens = np.diff(pos) + return [(a[p], p, l) for p, l in zip(pos, lens)[:n]] + + +def frle(a, n=512): + return ''.join(['\n\t%4x %6x %6x' % v for v in rle(a, n)]) + +# Some reference implementations follow for debugging. +def py_convert_offsets(offsets, split, keys, shift): + grids = len(offsets) + new_offs = np.empty((grids, 8192), dtype=np.int32) + for i in range(grids): + rdxs = (keys[i] >> shift) & 0xff + o = split[i][rdxs] + offsets[i] + new_offs[i][o] = np.arange(8192, dtype=np.int32) + return new_offs + +def py_radix_sort_maybe(keys, offsets, pfxs, split, shift): + grids = len(offsets) + idxs = np.arange(8192) + + okeys = np.empty(grids*8192, dtype=np.int32) + okeys.fill(-1) + + for i in range(grids): + offs = pfxs[i] - split[i] + lkeys = keys[i][offsets[i]] + rdxs = (lkeys >> shift) & 0xff + glob_offsets = offs[rdxs] + idxs + okeys[glob_offsets] = lkeys + return okeys def go_sort(count, stream=None): + grids = count / 8192 + + #keys = np.fromstring(np.random.bytes(count*2), dtype=np.uint16) + keys = np.arange(count, dtype=np.uint16) + np.random.shuffle(keys) + mkeys = np.reshape(keys, (grids, 8192)) + vals = np.arange(count, dtype=np.uint32) + dkeys = cuda.to_device(keys) + dvals = cuda.to_device(vals) + print 'Done seeding' + + dpfxs = cuda.mem_alloc(grids * 256 * 4) + doffsets = cuda.mem_alloc(count * 2) + launch('prefix_scan', doffsets, dpfxs, dkeys, i32(0), + block=(512, 1, 1), grid=(grids, 1), stream=stream, l1=1) + print cuda.from_device(dpfxs, (2, 256), np.uint32) + + dsplit = cuda.mem_alloc(grids * 256 * 4) + launch('better_split', dsplit, dpfxs, + block=(32, 1, 1), grid=(grids / 32, 1), stream=stream) + + # This stage will be rejiggered along with the split + launch('prefix_sum', dpfxs, np.int32(grids * 256), + block=(256, 1, 1), grid=(1, 1), stream=stream, l1=1) + print cuda.from_device(dpfxs, (2, 256), np.uint32) + + launch('convert_offsets', doffsets, dsplit, dkeys, i32(0), + block=(1024, 1, 1), grid=(grids, 1), stream=stream) + if not stream: + offsets = cuda.from_device(doffsets, (grids, 8192), np.uint16) + split = cuda.from_device(dsplit, (grids, 256), np.uint32) + pfxs = cuda.from_device(dpfxs, (grids, 256), np.uint32) + tkeys = py_radix_sort_maybe(mkeys, offsets, pfxs, split, 0) + print frle(tkeys & 0xff) + + d_skeys = cuda.mem_alloc(count * 2) + d_svals = cuda.mem_alloc(count * 4) + if not stream: + cuda.memset_d32(d_skeys, 0, count/2) + 
cuda.memset_d32(d_svals, 0xffffffff, count) + launch('radix_sort_maybe', d_skeys, d_svals, + dkeys, dvals, doffsets, dpfxs, dsplit, i32(0), + block=(1024, 1, 1), grid=(grids, 1), stream=stream, l1=1) + + if not stream: + skeys = cuda.from_device_like(d_skeys, keys) + svals = cuda.from_device_like(d_svals, vals) + + # Test integrity of sort (keys and values kept together): + # skeys[i] = keys[svals[i]] for all i + print 'Integrity: ', + if np.all(svals < len(keys)) and np.all(skeys == keys[svals]): + print 'pass' + else: + print 'FAIL' + + print frle(skeys & 0xff) + + dkeys, d_skeys = d_skeys, dkeys + dvals, d_svals = d_svals, dvals + + if not stream: + cuda.memset_d32(d_skeys, 0, count/2) + cuda.memset_d32(d_svals, 0xffffffff, count) + + launch('prefix_scan', doffsets, dpfxs, dkeys, i32(8), + block=(512, 1, 1), grid=(grids, 1), stream=stream, l1=1) + launch('better_split', dsplit, dpfxs, + block=(32, 1, 1), grid=(grids / 32, 1), stream=stream) + launch('prefix_sum', dpfxs, np.int32(grids * 256), + block=(256, 1, 1), grid=(1, 1), stream=stream, l1=1) + pre_offsets = cuda.from_device(doffsets, (grids, 8192), np.uint16) + launch('convert_offsets', doffsets, dsplit, dkeys, i32(8), + block=(1024, 1, 1), grid=(grids, 1), stream=stream) + if not stream: + offsets = cuda.from_device(doffsets, (grids, 8192), np.uint16) + split = cuda.from_device(dsplit, (grids, 256), np.uint32) + pfxs = cuda.from_device(dpfxs, (grids, 256), np.uint32) + tkeys = np.reshape(tkeys, (grids, 8192)) + + new_offs = py_convert_offsets(pre_offsets, split, tkeys, 8) + print new_offs[:3] + print offsets[:3] + print np.nonzero(new_offs != offsets) + + fkeys = py_radix_sort_maybe(tkeys, new_offs, pfxs, split, 8) + print frle(fkeys) + + + launch('radix_sort_maybe', d_skeys, d_svals, + dkeys, dvals, doffsets, dpfxs, dsplit, i32(8), + block=(1024, 1, 1), grid=(grids, 1), stream=stream, l1=1) + + if not stream: + #print cuda.from_device(doffsets, (4, 8192), np.uint16) + #print cuda.from_device(dkeys, (4, 8192), np.uint16) + #print cuda.from_device(d_skeys, (4, 8192), np.uint16) + + skeys = cuda.from_device_like(d_skeys, keys) + svals = cuda.from_device_like(d_svals, vals) + + print 'Integrity: ', + if np.all(svals < len(keys)) and np.all(skeys == keys[svals]): + print 'pass' + else: + print 'FAIL' + + sorted_keys = np.sort(keys) + # Test that ordering is correct. (Note that we don't need 100% + # correctness, so this test should be made "soft".) + print 'Order: ', 'pass' if np.all(skeys == sorted_keys) else 'FAIL' + + print frle(skeys) + print fkeys + print skeys + print np.nonzero(fkeys != skeys)[0] + + +def go_sort_old(count, stream=None): data = np.fromstring(np.random.bytes(count), dtype=np.uint8) ddata = cuda.to_device(data) print 'Done seeding' @@ -115,16 +267,13 @@ def go_sort(count, stream=None): print 'is_sorted?', np.all(sorted == sorted_np) - #data = np.fromstring(np.random.bytes(scale*block), dtype=np.uint16) - - def main(): # shmem is known good; disable the CPU run to get better info from cuprof #go(8, 512<<10, True) #go(1024, 512<<8, False) #go(32768, 8192, False) stream = cuda.Stream() if '-s' in sys.argv else None - go_sort(128<<20, stream) + go_sort(1<<20, stream) if stream: stream.synchronize()
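
Note: for debugging the first stage, a host-side reference in the same spirit as py_convert_offsets and py_radix_sort_maybe can mirror what the prefix_scan kernel writes into doffsets and dpfxs. This is only a sketch; py_prefix_scan is a hypothetical helper that is not part of the patch, and because the kernel assigns intra-radix ranks with shared-memory atomicAdd, the GPU ordering within a radix bin is nondeterministic, whereas this serial version ranks keys in memory order (one valid outcome, sufficient for checking histograms and offset uniqueness).

import numpy as np

def py_prefix_scan(keys, shift):
    # keys: (grids, 8192) uint16 blocks, as produced by np.reshape in go_sort.
    # Returns (offsets, pfxs): offsets[i][j] is key j's rank among keys with the
    # same radix digit inside block i; pfxs[i][r] is block i's count of digit r.
    grids = len(keys)
    pfxs = np.zeros((grids, 256), dtype=np.int32)
    offsets = np.empty((grids, 8192), dtype=np.uint16)
    for i in range(grids):
        rdxs = (keys[i] >> shift) & 0xff
        for j in range(8192):
            r = rdxs[j]
            offsets[i][j] = pfxs[i][r]
            pfxs[i][r] += 1
    return offsets, pfxs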
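
Similarly, the global scatter bases that prefix_sum leaves in pfxs (and that py_radix_sort_maybe consumes as pfxs[i] - split[i]) can be computed directly with NumPy. This sketch assumes the intended result is: pfxs_out[b][r] = count of all smaller digits across every block, plus the count of digit r in blocks before b, which is how the scatter in radix_sort_maybe uses it; py_prefix_sum is a hypothetical name and it expects numpy imported as np, as in sortbench.py. Usage would be something like: offsets, pfxs = py_prefix_scan(mkeys, 0); bases = py_prefix_sum(pfxs).

def py_prefix_sum(pfxs):
    # pfxs: (grids, 256) per-block digit counts, e.g. from py_prefix_scan.
    counts = pfxs.astype(np.int64)
    digit_totals = counts.sum(axis=0)
    digit_base = np.cumsum(digit_totals) - digit_totals   # exclusive scan over digits
    block_base = np.cumsum(counts, axis=0) - counts       # exclusive scan over blocks, per digit
    return (digit_base[np.newaxis, :] + block_base).astype(np.int32)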