"""
Provides tools and miscellaneous functions for building device code.
"""
import os
import tempfile
from collections import namedtuple
try:
import pycuda.driver as cuda
import pycuda.compiler
except ImportError, e:
import traceback
traceback.print_exc()
print 'Continuing without CUDA. Things will break.'
import numpy as np
import tempita
fst = lambda (a,b): a
snd = lambda (a,b): b
def argset(obj, **kwargs):
"""
Allow an object with many properties to be set using one call.
>>> x = argset(X(), a=1, b=2)
>>> x.a
1
"""
for k, v in kwargs.items():
setattr(obj, k, v)
return obj
def launch(name, mod, stream, block, grid, *args, **kwargs):
"""
Oft-used kernel launch helper. Provides a nice boost in readability of
densely-packed asynchronous code launches.
"""
fun = mod.get_function(name)
if isinstance(block, (int, np.number)):
block = (int(block), 1, 1)
if isinstance(grid, (int, np.number)):
grid = (int(grid), 1)
fun(*args, block=block, grid=grid, stream=stream, **kwargs)
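# A minimal usage sketch (the kernel name, module, and arguments here are
# hypothetical):
#
#   launch('my_kern', mod, stream, 256, nblocks, d_out, np.int32(n))
#
# is shorthand for fetching mod.get_function('my_kern') and calling it with
# block=(256, 1, 1) and grid=(nblocks, 1) on the given stream.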
def launch2(name, mod, stream, dim, *args, **kwargs):
"""
Launch using a standardized 2D grid: blocks in the shape (32, 8, 1), and
grid size set to fully cover the image. The GET_IDX and GET_IDX_2 macros
assume this launch pattern.
"""
    # The block width of 32 is tied to the constant in the GET_IDX_2 macro below
block, grid = (32, 8, 1), (dim.astride / 32, dim.ah / 8)
launch(name, mod, stream, block, grid, *args, **kwargs)
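# For a hypothetical dimension object with astride=1024 and ah=640, launch2
# produces block=(32, 8, 1) and grid=(32, 80): one thread per element of the
# padded (astride x ah) buffer.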
def crep(s):
"""Multiline literal escape for inline PTX assembly."""
if isinstance(s, unicode):
s = s.encode('utf-8')
return '"%s"' % s.encode("string_escape")
class Template(tempita.Template):
"""
A Tempita template with extra stuff in the default namespace.
"""
default_namespace = tempita.Template.default_namespace.copy()
Template.default_namespace.update({'np': np, 'crep': crep})
# Passive container for device code.
DevLib = namedtuple('DevLib', 'deps headers decls defs')
def devlib(deps=(), headers='', decls='', defs=''):
"""Create a library of device code."""
# This exists because namedtuple doesn't support default args
return DevLib(deps, headers, decls, defs)
def assemble_code(*libs):
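    """
    Assemble the given devlibs (plus the stdlib, which is always included)
    into a single source string. Dependencies are walked depth-first, each
    distinct code block is emitted only once, and the output is ordered as
    all headers, then all decls, then all defs.
    """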
seen = set()
out = []
def go(lib):
map(go, lib.deps)
code = lib[1:]
if code not in seen:
seen.add(code)
out.append(code)
go(stdlib)
map(go, libs)
return ''.join(sum(zip(*out), ()))
DEFAULT_CMP_OPTIONS = ('-use_fast_math', '-maxrregcount', '42')
DEFAULT_SAVE_KERNEL = True
def compile(name, src, opts=DEFAULT_CMP_OPTIONS, save=DEFAULT_SAVE_KERNEL,
arch=None):
"""
    Compile a module and return the compiled cubin. When `save` is set, the
    source and cubin are also written to the system temp dir for inspection.
"""
dir = tempfile.gettempdir()
if save:
with open(os.path.join(dir, name + '_kern.cu'), 'w') as fp:
fp.write(src)
cubin = pycuda.compiler.compile(src, options=list(opts), arch=arch)
if save:
with open(os.path.join(dir, name + '_kern.cubin'), 'w') as fp:
fp.write(cubin)
return cubin
class ClsMod(object):
"""
Convenience class or mixin that automatically compiles and loads a module
once per class, saving potentially expensive code regeneration. Only use
if your class does not employ run-time code generation.
"""
mod = None
# Supply your own DevLib on this property
lib = None
def __init__(self):
super(ClsMod, self).__init__()
self.load()
@classmethod
def load(cls, name=None):
if cls.mod is None:
if name is None:
name = cls.__name__.lower()
cubin = compile(name, assemble_code(cls.lib))
cls.mod = cuda.module_from_buffer(cubin)
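# A minimal usage sketch (the class name is hypothetical; 'filldptrlib' and
# 'fill_dptr' are defined later in this module):
#
#   class Filler(ClsMod):
#       lib = filldptrlib
#       def fill(self, dptr, nwords, stream=None):
#           fill_dptr(self.mod, dptr, nwords, stream)
#
# Every Filler instance shares one module, compiled and cached on the class
# the first time an instance is constructed.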
# This lib is included with *every* assembled module. It contains no global
# functions, so it shouldn't add much to compilation time.
stdlib = devlib(headers="""
#include<cuda.h>
#include<stdint.h>
#include<stdio.h>
""", decls=r"""
#undef M_E
#undef M_LOG2E
#undef M_LOG10E
#undef M_LN2
#undef M_LN10
#undef M_PI
#undef M_PI_2
#undef M_PI_4
#undef M_1_PI
#undef M_2_PI
#undef M_2_SQRTPI
#undef M_SQRT2
#undef M_SQRT1_2
#define M_E 2.71828174591064f
#define M_LOG2E 1.44269502162933f
#define M_LOG10E 0.43429449200630f
#define M_LN2 0.69314718246460f
#define M_LN10 2.30258512496948f
#define M_PI 3.14159274101257f
#define M_PI_2 1.57079637050629f
#define M_PI_4 0.78539818525314f
#define M_1_PI 0.31830987334251f
#define M_2_PI 0.63661974668503f
#define M_2_SQRTPI 1.12837922573090f
#define M_SQRT2 1.41421353816986f
#define M_SQRT1_2 0.70710676908493f
#define bfe(d, s, o, w) \
asm("bfe.u32 %0, %1, %2, %3;" : "=r"(d) : "r"(s), "r"(o), "r"(w))
#define bfe_decl(d, s, o, w) \
int d; \
bfe(d, s, o, w)
#define GET_IDX_2(xi, yi, gi) \
int xi = blockIdx.x * blockDim.x + threadIdx.x; \
int yi = blockIdx.y * blockDim.y + threadIdx.y; \
int gi = yi * (32 * gridDim.x) + xi
#define GET_IDX(i) GET_IDX_2(x___, y___, i)
""", defs=r'''
__device__ uint32_t gtid() {
return threadIdx.x + blockDim.x *
(threadIdx.y + blockDim.y *
(threadIdx.z + blockDim.z *
(blockIdx.x + (gridDim.x * blockIdx.y))));
}
__device__ uint32_t trunca(float f) {
    // Truncate to an integer index as used in address calculations. Note that
    // the signed conversion is intentional (it simplifies image bounds checks).
uint32_t ret;
asm("cvt.rni.s32.f32 %0, %1;" : "=r"(ret) : "f"(f));
return ret;
}
__device__ void scale_float4(float4& pix, float scale) {
pix.x *= scale;
pix.y *= scale;
pix.z *= scale;
pix.w *= scale;
}
''')
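# A sketch of how the GET_IDX_2 macro pairs with launch2() (the kernel name
# and its arguments are hypothetical):
#
#   __global__ void my_kern(float4 *img) {
#       GET_IDX_2(xi, yi, gi);
#       scale_float4(img[gi], 0.5f);   // gi indexes the astride-wide buffer
#   }
#
# launched with launch2('my_kern', mod, stream, dim, d_img).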
def mkbinsearchlib(rounds):
"""
Search through the fixed-size list 'hay' to find the rightmost index which
contains a value strictly smaller than the input 'needle'. The list must
be exactly '2^rounds' long, although padding at the top with very large
numbers or even +inf effectively shortens it.
"""
# TODO: this doesn't optimize well on a 64-bit arch, not that it's a
# performance-critical chunk of code or anything
src = Template(r'''
__device__ int bitwise_binsearch(const float *hay, float needle) {
int lo = 0;
// TODO: improve efficiency on 64-bit arches
{{for i in range(search_rounds-1, -1, -1)}}
if (needle > hay[lo + {{1 << i}}])
lo += {{1 << i}};
{{endfor}}
return lo;
}
''', 'bitwise_binsearch')
return devlib(defs=src.substitute(search_rounds=rounds))
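# For a hypothetical rounds=2, the template above unrolls to:
#
#   if (needle > hay[lo + 2]) lo += 2;
#   if (needle > hay[lo + 1]) lo += 1;
#
# so searching {0.0, 0.25, 0.5, 0.75} for 0.6 takes the first branch only and
# returns 2, the rightmost index holding a value strictly smaller than 0.6.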
# 2^search_rounds is the maximum number of knots allowed in a single spline.
# This includes the four required knots, so a 5-round search supports 28
# interior knots in the domain (0, 1). 2^5 floats also fit nicely in a single
# cache line.
DEFAULT_SEARCH_ROUNDS = 5
binsearchlib = mkbinsearchlib(DEFAULT_SEARCH_ROUNDS)
filldptrlib = devlib(defs=r'''
__global__ void
fill_dptr(uint32_t* dptr, int size, uint32_t value) {
int i = (gridDim.x * blockIdx.y + blockIdx.x) * blockDim.x + threadIdx.x;
if (i < size) {
dptr[i] = value;
}
}
''')
def fill_dptr(mod, dptr, size, stream=None, value=np.uint32(0)):
"""
    A memory filler (zero by default) which can be embedded in a stream, unlike
    the various memset routines. Size is the number of 4-byte words in the buffer;
value is the word to fill it with. If value is not an np.uint32, it
will be coerced to a buffer and the first four bytes taken.
"""
if not isinstance(value, np.uint32):
if isinstance(value, int):
value = np.uint32(value)
else:
value = np.frombuffer(buffer(value), np.uint32)[0]
blocks = int(np.ceil(np.sqrt(size / 1024.)))
launch('fill_dptr', mod, stream, (1024, 1, 1), (blocks, blocks),
dptr, np.int32(size), value)
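# For example (buffer name and size are illustrative):
#
#   fill_dptr(mod, d_accum, nwords, stream, np.uint32(0xffffffff))
#
# queues a fill of every word with all-ones on 'stream' rather than forcing a
# synchronous memset.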
writehalflib = devlib(defs=r'''
/* read_half and write_half decode and encode, respectively, two
* floating-point values from a 32-bit value (typed as a 'float' for
* convenience but not really). The values are packed into u16s as a
* proportion of a third value, as in 'ux = u16( x / d * (2^16-1) )'.
* This is used during accumulation.
*
* TODO: also write a function that will efficiently add a value to the packed
* values while incrementing the density, to improve the speed of this
* approach when the alpha channel is present.
*/
__device__ void
read_half(float &x, float &y, float xy, float den) {
asm("\n\t{"
"\n\t .reg .u16 x, y;"
"\n\t .reg .f32 rc;"
"\n\t mov.b32 {x, y}, %2;"
"\n\t mul.f32 rc, %3, 0f37800080;" // 1/65535.
"\n\t cvt.rn.f32.u16 %0, x;"
"\n\t cvt.rn.f32.u16 %1, y;"
"\n\t mul.f32 %0, %0, rc;"
"\n\t mul.f32 %1, %1, rc;"
"\n\t}"
: "=f"(x), "=f"(y) : "f"(xy), "f"(den));
}
__device__ void
write_half(float &xy, float x, float y, float den) {
asm("\n\t{"
"\n\t .reg .u16 x, y;"
"\n\t .reg .f32 rc, xf, yf;"
"\n\t rcp.approx.f32 rc, %3;"
"\n\t mul.f32 rc, rc, 65535.0;"
"\n\t mul.f32 xf, %1, rc;"
"\n\t mul.f32 yf, %2, rc;"
"\n\t cvt.rni.u16.f32 x, xf;"
"\n\t cvt.rni.u16.f32 y, yf;"
"\n\t mov.b32 %0, {x, y};"
"\n\t}"
: "=f"(xy) : "f"(x), "f"(y), "f"(den));
}
''')
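# A rough Python equivalent of the packing scheme above (illustrative only;
# the device code does the conversions and packing with PTX u16 operations):
#
#   def write_half_py(x, y, den):
#       scale = 65535.0 / den
#       return int(round(x * scale)) | (int(round(y * scale)) << 16)
#
#   def read_half_py(xy, den):
#       rc = den / 65535.0
#       return (xy & 0xffff) * rc, (xy >> 16) * rc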
def mkringbuflib(rb_size):
"""
A ringbuffer for access to shared resources.
Some components, such as the MWC contexts, are expensive to generate, and
have no affinity to a particular block. Rather than maintain a separate
copy of each of these objects for every thread block in a launch, we want
to only keep enough copies of these resources around to service every
thread block that could possibly be active simultaneously on one card,
    which is often considerably smaller. This library provides a simple
    ringbuffer type and an increment function, used in a couple of places to
implement this kind of sharing.
"""
return devlib(headers="#define RB_SIZE_MASK %d" % (rb_size - 1), decls='''
typedef struct {
uint32_t head;
uint32_t tail;
} ringbuf;
''', defs=r'''
__shared__ uint32_t rb_idx;
__device__ uint32_t rb_incr(uint32_t &rb_base, int tidx) {
if (threadIdx.y == 0 && threadIdx.x == 0)
rb_idx = 256 * (atomicAdd(&rb_base, 1) & RB_SIZE_MASK);
__syncthreads();
return rb_idx + tidx;
}
''')
# For now, the number of entries is fixed to a value known to work on all
# Fermi cards. Autodetection, or perhaps just a global limit increase, will be
# done when I get my hands on a Kepler device. The fixed size assumes blocks
# of 256 threads, although even at that size there are pathological cases that
# could break the assumption.
DEFAULT_RB_SIZE = 64
ringbuflib = mkringbuflib(DEFAULT_RB_SIZE)
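# A rough device-side usage sketch (the kernel and the state type and array
# are hypothetical; the pattern is check out at the head, check back in at the
# tail, with every thread in the block calling rb_incr):
#
#   __global__ void iter_kern(ringbuf *rb, some_state *states) {
#       int tidx = blockDim.x * threadIdx.y + threadIdx.x;
#       some_state st = states[rb_incr(rb->head, tidx)];
#       /* ... use and update 'st' ... */
#       states[rb_incr(rb->tail, tidx)] = st;
#   }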