From 0813bacebea9b6c5a03f5b8c129e85983a509c92 Mon Sep 17 00:00:00 2001 From: Steven Robertson Date: Sun, 19 Jun 2011 18:13:39 -0400 Subject: [PATCH] Added first prefix-scan tests. --- sortbench.cu | 123 +++++++++++++++++++++++++++++++++++++++++++++++++++ sortbench.py | 47 ++++++++++++++++++++ 2 files changed, 170 insertions(+) create mode 100644 sortbench.cu create mode 100644 sortbench.py diff --git a/sortbench.cu b/sortbench.cu new file mode 100644 index 0000000..1ad1d59 --- /dev/null +++ b/sortbench.cu @@ -0,0 +1,123 @@ +#include + + +__global__ +void prefix_scan_8_0_shmem(unsigned char *keys, int nitems, int *pfxs) { + __shared__ int sh_pfxs[256]; + + if (threadIdx.y < 8) + sh_pfxs[threadIdx.y * 32 + threadIdx.x] = 0; + + __syncthreads(); + + int blksz = blockDim.x * blockDim.y; + int cap = nitems * (blockIdx.x + 1); + + for (int i = threadIdx.y * 32 + threadIdx.x + nitems * blockIdx.x; + i < cap; i += blksz) { + int value = keys[i]; + atomicAdd(sh_pfxs + value, 1); + } + + __syncthreads(); + + if (threadIdx.y < 8) { + int off = threadIdx.y * 32 + threadIdx.x; + atomicAdd(pfxs + off, sh_pfxs[off]); + } +} + +__global__ +void prefix_scan_8_0_popc(unsigned char *keys, int nitems, int *pfxs) { + __shared__ int sh_pfxs[256]; + + if (threadIdx.y < 8) + sh_pfxs[threadIdx.y * 32 + threadIdx.x] = 0; + + __syncthreads(); + + int blksz = blockDim.x * blockDim.y; + int cap = nitems * (blockIdx.x + 1); + + int sum_000 = 0; + int sum_001 = 0; + int sum_010 = 0; + int sum_011 = 0; + int sum_100 = 0; + int sum_101 = 0; + int sum_110 = 0; + int sum_111 = 0; + + for (int i = threadIdx.y * 32 + threadIdx.x + nitems * blockIdx.x; + i < cap; i += blksz) { + + int value = keys[i]; + int test_000 = __ballot(value & 1); + if (!(threadIdx.x & 1)) test_000 = ~test_000; + + int popc_res = __ballot(value & 2); + if (!(threadIdx.x & 2)) popc_res = ~popc_res; + test_000 &= popc_res; + + popc_res = __ballot(value & 4); + if (!(threadIdx.x & 4)) popc_res = ~popc_res; + test_000 &= popc_res; + + popc_res = __ballot(value & 8); + if (!(threadIdx.x & 8)) popc_res = ~popc_res; + test_000 &= popc_res; + + popc_res = __ballot(value & 16); + if (!(threadIdx.x & 16)) popc_res = ~popc_res; + test_000 &= popc_res; + + popc_res = __ballot(value & 32); + int test_001 = test_000 & popc_res; + popc_res = ~popc_res; + test_000 &= popc_res; + + popc_res = __ballot(value & 64); + int test_010 = test_000 & popc_res; + int test_011 = test_001 & popc_res; + popc_res = ~popc_res; + test_000 &= popc_res; + test_001 &= popc_res; + + popc_res = __ballot(value & 128); + int test_100 = test_000 & popc_res; + int test_101 = test_001 & popc_res; + int test_110 = test_010 & popc_res; + int test_111 = test_011 & popc_res; + popc_res = ~popc_res; + test_000 &= popc_res; + test_001 &= popc_res; + test_010 &= popc_res; + test_011 &= popc_res; + + sum_000 += __popc(test_000); + sum_001 += __popc(test_001); + sum_010 += __popc(test_010); + sum_011 += __popc(test_011); + sum_100 += __popc(test_100); + sum_101 += __popc(test_101); + sum_110 += __popc(test_110); + sum_111 += __popc(test_111); + } + + atomicAdd(sh_pfxs + (threadIdx.x + 0), sum_000); + atomicAdd(sh_pfxs + (threadIdx.x + 32), sum_001); + atomicAdd(sh_pfxs + (threadIdx.x + 64), sum_010); + atomicAdd(sh_pfxs + (threadIdx.x + 96), sum_011); + atomicAdd(sh_pfxs + (threadIdx.x + 128), sum_100); + atomicAdd(sh_pfxs + (threadIdx.x + 160), sum_101); + atomicAdd(sh_pfxs + (threadIdx.x + 192), sum_110); + atomicAdd(sh_pfxs + (threadIdx.x + 224), sum_111); + + __syncthreads(); + + if (threadIdx.y < 8) { + int off = threadIdx.y * 32 + threadIdx.x; + atomicAdd(pfxs + off, sh_pfxs[off]); + } +} + diff --git a/sortbench.py b/sortbench.py new file mode 100644 index 0000000..d23a7fc --- /dev/null +++ b/sortbench.py @@ -0,0 +1,47 @@ +import time + +import pycuda.autoinit +import pycuda.compiler +import pycuda.driver as cuda + +import numpy as np + +import os +os.environ['PATH'] = ('/usr/x86_64-pc-linux-gnu/gcc-bin/4.4.5:' + + os.environ['PATH']) + +def go(scale, block, test_cpu): + data = np.fromstring(np.random.bytes(scale*block), dtype=np.uint8) + print 'Done seeding' + + if test_cpu: + a = time.time() + cpu_pfxs = np.array([np.sum(data == v) for v in range(256)]) + b = time.time() + print cpu_pfxs + print 'took %g secs on CPU' % (b - a) + + with open('sortbench.cu') as f: src = f.read() + mod = pycuda.compiler.SourceModule(src) + fun = mod.get_function('prefix_scan_8_0_shmem') + shmem_pfxs = np.zeros(256, dtype=np.int32) + t = fun(cuda.In(data), np.int32(block), cuda.InOut(shmem_pfxs), + block=(32, 16, 1), grid=(scale, 1), time_kernel=True) + print 'shmem took %g secs.' % t + if test_cpu: + print 'it worked? %s' % (np.all(shmem_pfxs == cpu_pfxs)) + + fun = mod.get_function('prefix_scan_8_0_popc') + popc_pfxs = np.zeros(256, dtype=np.int32) + t = fun(cuda.In(data), np.int32(block), cuda.InOut(popc_pfxs), + block=(32, 16, 1), grid=(scale, 1), time_kernel=True) + print 'popc took %g secs.' % t + print 'it worked? %s' % (np.all(shmem_pfxs == popc_pfxs)) + +def main(): + go(8, 512<<10, True) + go(1024, 512<<10, False) + + +main() +