From 0813bacebea9b6c5a03f5b8c129e85983a509c92 Mon Sep 17 00:00:00 2001
From: Steven Robertson <steven@strobe.cc>
Date: Sun, 19 Jun 2011 18:13:39 -0400
Subject: [PATCH] Added first prefix-scan tests.

---
 sortbench.cu | 123 +++++++++++++++++++++++++++++++++++++++++++++++++++
 sortbench.py |  47 ++++++++++++++++++++
 2 files changed, 170 insertions(+)
 create mode 100644 sortbench.cu
 create mode 100644 sortbench.py

diff --git a/sortbench.cu b/sortbench.cu
new file mode 100644
index 0000000..1ad1d59
--- /dev/null
+++ b/sortbench.cu
@@ -0,0 +1,123 @@
+#include <cuda.h>
+
+
+__global__
+void prefix_scan_8_0_shmem(unsigned char *keys, int nitems, int *pfxs) {
+    __shared__ int sh_pfxs[256];
+
+    if (threadIdx.y < 8)
+        sh_pfxs[threadIdx.y * 32 + threadIdx.x] = 0;
+
+    __syncthreads();
+
+    int blksz = blockDim.x * blockDim.y;
+    int cap = nitems * (blockIdx.x + 1);
+
+    for (int i = threadIdx.y * 32 + threadIdx.x + nitems * blockIdx.x;
+         i < cap; i += blksz) {
+        int value = keys[i];
+        atomicAdd(sh_pfxs + value, 1);
+    }
+
+    __syncthreads();
+
+    if (threadIdx.y < 8) {
+        int off = threadIdx.y * 32 + threadIdx.x;
+        atomicAdd(pfxs + off, sh_pfxs[off]);
+    }
+}
+
+__global__
+void prefix_scan_8_0_popc(unsigned char *keys, int nitems, int *pfxs) {
+    __shared__ int sh_pfxs[256];
+
+    if (threadIdx.y < 8)
+        sh_pfxs[threadIdx.y * 32 + threadIdx.x] = 0;
+
+    __syncthreads();
+
+    int blksz = blockDim.x * blockDim.y;
+    int cap = nitems * (blockIdx.x + 1);
+
+    int sum_000 = 0;
+    int sum_001 = 0;
+    int sum_010 = 0;
+    int sum_011 = 0;
+    int sum_100 = 0;
+    int sum_101 = 0;
+    int sum_110 = 0;
+    int sum_111 = 0;
+
+    for (int i = threadIdx.y * 32 + threadIdx.x + nitems * blockIdx.x;
+         i < cap; i += blksz) {
+
+        int value = keys[i];
+        int test_000 = __ballot(value & 1);
+        if (!(threadIdx.x & 1)) test_000 = ~test_000;
+
+        int popc_res = __ballot(value & 2);
+        if (!(threadIdx.x & 2)) popc_res = ~popc_res;
+        test_000 &= popc_res;
+
+        popc_res = __ballot(value & 4);
+        if (!(threadIdx.x & 4)) popc_res = ~popc_res;
+        test_000 &= popc_res;
+
+        popc_res = __ballot(value & 8);
+        if (!(threadIdx.x & 8)) popc_res = ~popc_res;
+        test_000 &= popc_res;
+
+        popc_res = __ballot(value & 16);
+        if (!(threadIdx.x & 16)) popc_res = ~popc_res;
+        test_000 &= popc_res;
+
+        popc_res = __ballot(value & 32);
+        int test_001 = test_000 & popc_res;
+        popc_res = ~popc_res;
+        test_000 &= popc_res;
+
+        popc_res = __ballot(value & 64);
+        int test_010 = test_000 & popc_res;
+        int test_011 = test_001 & popc_res;
+        popc_res = ~popc_res;
+        test_000 &= popc_res;
+        test_001 &= popc_res;
+
+        popc_res = __ballot(value & 128);
+        int test_100 = test_000 & popc_res;
+        int test_101 = test_001 & popc_res;
+        int test_110 = test_010 & popc_res;
+        int test_111 = test_011 & popc_res;
+        popc_res = ~popc_res;
+        test_000 &= popc_res;
+        test_001 &= popc_res;
+        test_010 &= popc_res;
+        test_011 &= popc_res;
+
+        sum_000 += __popc(test_000);
+        sum_001 += __popc(test_001);
+        sum_010 += __popc(test_010);
+        sum_011 += __popc(test_011);
+        sum_100 += __popc(test_100);
+        sum_101 += __popc(test_101);
+        sum_110 += __popc(test_110);
+        sum_111 += __popc(test_111);
+    }
+
+    atomicAdd(sh_pfxs + (threadIdx.x + 0),   sum_000);
+    atomicAdd(sh_pfxs + (threadIdx.x + 32),  sum_001);
+    atomicAdd(sh_pfxs + (threadIdx.x + 64),  sum_010);
+    atomicAdd(sh_pfxs + (threadIdx.x + 96),  sum_011);
+    atomicAdd(sh_pfxs + (threadIdx.x + 128), sum_100);
+    atomicAdd(sh_pfxs + (threadIdx.x + 160), sum_101);
+    atomicAdd(sh_pfxs + (threadIdx.x + 192), sum_110);
+    atomicAdd(sh_pfxs + (threadIdx.x + 224), sum_111);
+
+    __syncthreads();
+
+    if (threadIdx.y < 8) {
+        int off = threadIdx.y * 32 + threadIdx.x;
+        atomicAdd(pfxs + off, sh_pfxs[off]);
+    }
+}
+
diff --git a/sortbench.py b/sortbench.py
new file mode 100644
index 0000000..d23a7fc
--- /dev/null
+++ b/sortbench.py
@@ -0,0 +1,47 @@
+import time
+
+import pycuda.autoinit
+import pycuda.compiler
+import pycuda.driver as cuda
+
+import numpy as np
+
+import os
+os.environ['PATH'] = ('/usr/x86_64-pc-linux-gnu/gcc-bin/4.4.5:'
+                     + os.environ['PATH'])
+
+def go(scale, block, test_cpu):
+    data = np.fromstring(np.random.bytes(scale*block), dtype=np.uint8)
+    print 'Done seeding'
+
+    if test_cpu:
+        a = time.time()
+        cpu_pfxs = np.array([np.sum(data == v) for v in range(256)])
+        b = time.time()
+        print cpu_pfxs
+        print 'took %g secs on CPU' % (b - a)
+
+    with open('sortbench.cu') as f: src = f.read()
+    mod = pycuda.compiler.SourceModule(src)
+    fun = mod.get_function('prefix_scan_8_0_shmem')
+    shmem_pfxs = np.zeros(256, dtype=np.int32)
+    t = fun(cuda.In(data), np.int32(block), cuda.InOut(shmem_pfxs),
+            block=(32, 16, 1), grid=(scale, 1), time_kernel=True)
+    print 'shmem took %g secs.' % t
+    if test_cpu:
+        print 'it worked? %s' % (np.all(shmem_pfxs == cpu_pfxs))
+
+    fun = mod.get_function('prefix_scan_8_0_popc')
+    popc_pfxs = np.zeros(256, dtype=np.int32)
+    t = fun(cuda.In(data), np.int32(block), cuda.InOut(popc_pfxs),
+            block=(32, 16, 1), grid=(scale, 1), time_kernel=True)
+    print 'popc took %g secs.' % t
+    print 'it worked? %s' % (np.all(shmem_pfxs == popc_pfxs))
+
+def main():
+    go(8, 512<<10, True)
+    go(1024, 512<<10, False)
+
+
+main()
+