"""
|
|
Provides tools and miscellaneous functions for building device code.
|
|
"""
|
|
import os
|
|
import tempfile
|
|
from collections import namedtuple
|
|
|
|
try:
|
|
import pycuda.driver as cuda
|
|
import pycuda.compiler
|
|
except ImportError, e:
|
|
import traceback
|
|
traceback.print_exc()
|
|
print 'Continuing without CUDA. Things will break.'
|
|
import numpy as np
|
|
import tempita
|
|
|
|
fst = lambda (a,b): a
|
|
snd = lambda (a,b): b
|
|
|
|
def argset(obj, **kwargs):
    """
    Allow an object with many properties to be set using one call.

    >>> x = argset(X(), a=1, b=2)
    >>> x.a
    1
    """
    for k, v in kwargs.items():
        setattr(obj, k, v)
    return obj

def launch(name, mod, stream, block, grid, *args, **kwargs):
    """
    Oft-used kernel launch helper. Provides a nice boost in readability of
    densely-packed asynchronous code launches. Scalar 'block' and 'grid'
    arguments are promoted to 3D and 2D tuples, respectively.
    """
    fun = mod.get_function(name)
    if isinstance(block, (int, np.number)):
        block = (int(block), 1, 1)
    if isinstance(grid, (int, np.number)):
        grid = (int(grid), 1)
    fun(*args, block=block, grid=grid, stream=stream, **kwargs)

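# Usage sketch (hypothetical kernel and arguments; note that scalars are
# accepted for both 'block' and 'grid'):
#
#   launch('fill_dptr', mod, stream, 1024, (blocks, blocks),
#          dptr, np.int32(size), np.uint32(0))
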
def launch2(name, mod, stream, dim, *args, **kwargs):
    """
    Launch using a standardized 2D grid: blocks in the shape (32, 8, 1), and
    grid size set to fully cover the image. The GET_IDX and GET_IDX_2 macros
    assume this launch pattern.
    """
    # The '32' here is tied to a matching constant in the GET_IDX_2 macro
    # defined in stdlib below
    block, grid = (32, 8, 1), (dim.astride / 32, dim.ah / 8)
    launch(name, mod, stream, block, grid, *args, **kwargs)

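# Usage sketch (hypothetical kernel; 'dim' is assumed to expose the aligned
# image dimensions 'astride' and 'ah' used above):
#
#   launch2('logscale', mod, stream, dim, src_dptr, dst_dptr)
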
def crep(s):
    """Multiline literal escape for inline PTX assembly."""
    if isinstance(s, unicode):
        s = s.encode('utf-8')
    return '"%s"' % s.encode("string_escape")

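# For example (illustrative; the exact escaping follows Python 2's
# 'string_escape' codec):
#
#   >>> crep('mov.b32 %0, %1;\n')
#   '"mov.b32 %0, %1;\\n"'
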
class Template(tempita.Template):
    """
    A Tempita template with extra stuff in the default namespace.
    """
    default_namespace = tempita.Template.default_namespace.copy()
Template.default_namespace.update({'np': np, 'crep': crep})

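# A minimal sketch of the extended namespace (hypothetical template text):
#
#   Template('#define W {{np.int32(w)}}').substitute(w=512)
#
# evaluates 'np.int32(w)' during substitution because 'np' is present in the
# default namespace above.
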
# Passive container for device code.
DevLib = namedtuple('DevLib', 'deps headers decls defs')

def devlib(deps=(), headers='', decls='', defs=''):
    """Create a library of device code."""
    # This exists because namedtuple doesn't support default args
    return DevLib(deps, headers, decls, defs)

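# A sketch of composing libraries via 'deps' (hypothetical device code):
#
#   mathlib = devlib(defs='__device__ float dbl(float x) { return 2.0f*x; }')
#   mylib = devlib(deps=[mathlib],
#                  defs='__global__ void k(float *v) { v[0] = dbl(v[0]); }')
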
def assemble_code(*libs):
    seen = set()
    out = []
    def go(lib):
        # Walk dependencies depth-first so they are emitted before dependents
        map(go, lib.deps)
        # Drop 'deps' when deduplicating, so shared code is emitted only once
        code = lib[1:]
        if code not in seen:
            seen.add(code)
            out.append(code)
    go(stdlib)
    map(go, libs)
    # Transpose so all headers precede all decls, which precede all defs
    return ''.join(sum(zip(*out), ()))

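# Usage sketch: stdlib is always included first, and shared dependencies
# appear once no matter how many libs pull them in ('mylib' is hypothetical):
#
#   src = assemble_code(ringbuflib, mylib)
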
DEFAULT_CMP_OPTIONS = ('-use_fast_math', '-maxrregcount', '42')
DEFAULT_SAVE_KERNEL = True
def compile(name, src, opts=DEFAULT_CMP_OPTIONS, save=DEFAULT_SAVE_KERNEL,
            arch=None):
    """
    Compile a module. When 'save' is set, copies of the source and the
    compiled cubin are written to the system temp directory for inspection
    or display. Returns the compiled cubin.
    """
    dir = tempfile.gettempdir()
    if save:
        with open(os.path.join(dir, name + '_kern.cu'), 'w') as fp:
            fp.write(src)
    cubin = pycuda.compiler.compile(src, options=list(opts), arch=arch)
    if save:
        with open(os.path.join(dir, name + '_kern.cubin'), 'w') as fp:
            fp.write(cubin)
    return cubin

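# Usage sketch, mirroring ClsMod.load() below ('mylib' is hypothetical;
# 'arch' may be left as None to target the current context's device):
#
#   cubin = compile('mymod', assemble_code(mylib))
#   mod = cuda.module_from_buffer(cubin)
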
class ClsMod(object):
    """
    Convenience class or mixin that automatically compiles and loads a module
    once per class, saving potentially expensive code regeneration. Only use
    if your class does not employ run-time code generation.
    """
    mod = None
    # Supply your own DevLib on this property
    lib = None

    def __init__(self):
        super(ClsMod, self).__init__()
        self.load()

    @classmethod
    def load(cls, name=None):
        if cls.mod is None:
            if name is None:
                name = cls.__name__.lower()
            cubin = compile(name, assemble_code(cls.lib))
            cls.mod = cuda.module_from_buffer(cubin)

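# A minimal subclass sketch (hypothetical kernel; gtid() comes from stdlib):
#
#   class Doubler(ClsMod):
#       lib = devlib(defs='''
#   __global__ void double_it(float *v) { v[gtid()] *= 2.0f; }
#   ''')
#
#   d = Doubler()  # compiles once per class; then launch('double_it', d.mod, ...)
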
# This lib is included with *every* assembled module. It contains no global
# functions, so it shouldn't slow down compilation time too much.
stdlib = devlib(headers="""
#include<cuda.h>
#include<stdint.h>
#include<stdio.h>
""", decls=r"""
#undef M_E
#undef M_LOG2E
#undef M_LOG10E
#undef M_LN2
#undef M_LN10
#undef M_PI
#undef M_PI_2
#undef M_PI_4
#undef M_1_PI
#undef M_2_PI
#undef M_2_SQRTPI
#undef M_SQRT2
#undef M_SQRT1_2

#define M_E         2.71828174591064f
#define M_LOG2E     1.44269502162933f
#define M_LOG10E    0.43429449200630f
#define M_LN2       0.69314718246460f
#define M_LN10      2.30258512496948f
#define M_PI        3.14159274101257f
#define M_PI_2      1.57079637050629f
#define M_PI_4      0.78539818525314f
#define M_1_PI      0.31830987334251f
#define M_2_PI      0.63661974668503f
#define M_2_SQRTPI  1.12837922573090f
#define M_SQRT2     1.41421353816986f
#define M_SQRT1_2   0.70710676908493f

#define bfe(d, s, o, w) \
    asm("bfe.u32 %0, %1, %2, %3;" : "=r"(d) : "r"(s), "r"(o), "r"(w))

#define bfe_decl(d, s, o, w) \
    int d; \
    bfe(d, s, o, w)

#define GET_IDX_2(xi, yi, gi) \
    int xi = blockIdx.x * blockDim.x + threadIdx.x; \
    int yi = blockIdx.y * blockDim.y + threadIdx.y; \
    int gi = yi * (32 * gridDim.x) + xi

#define GET_IDX(i) GET_IDX_2(x___, y___, i)

""", defs=r'''
__device__ uint32_t gtid() {
    return threadIdx.x + blockDim.x *
            (threadIdx.y + blockDim.y *
             (threadIdx.z + blockDim.z *
              (blockIdx.x + (gridDim.x * blockIdx.y))));
}

__device__ uint32_t trunca(float f) {
    // truncate as used in address calculations. note the use of a signed
    // conversion is intentional here (simplifies image bounds checking).
    uint32_t ret;
    asm("cvt.rni.s32.f32 %0, %1;" : "=r"(ret) : "f"(f));
    return ret;
}

__device__ void scale_float4(float4& pix, float scale) {
    pix.x *= scale;
    pix.y *= scale;
    pix.z *= scale;
    pix.w *= scale;
}
''')

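# Sketch of the GET_IDX contract in a kernel (hypothetical; assumes the
# block/grid shape produced by launch2() above):
#
#   __global__ void copy(float *src, float *dst) {
#       GET_IDX(i);        // expands to the 2D index math in GET_IDX_2
#       dst[i] = src[i];
#   }
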
def mkbinsearchlib(rounds):
    """
    Search through the fixed-size list 'hay' to find the rightmost index which
    contains a value strictly smaller than the input 'needle'. The list must
    be exactly '2^rounds' long, although padding at the top with very large
    numbers or even +inf effectively shortens it.
    """
    # TODO: this doesn't optimize well on a 64-bit arch, not that it's a
    # performance-critical chunk of code or anything
    src = Template(r'''
__device__ int bitwise_binsearch(const float *hay, float needle) {
    int lo = 0;

    // TODO: improve efficiency on 64-bit arches
    {{for i in range(search_rounds-1, -1, -1)}}
    if (needle > hay[lo + {{1 << i}}])
        lo += {{1 << i}};
    {{endfor}}
    return lo;
}
''', 'bitwise_binsearch')
    return devlib(defs=src.substitute(search_rounds=rounds))

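# For rounds=3, the template loop above unrolls to (whitespace aside):
#
#   if (needle > hay[lo + 4]) lo += 4;
#   if (needle > hay[lo + 2]) lo += 2;
#   if (needle > hay[lo + 1]) lo += 1;
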
# 2^search_rounds is the maximum number of knots allowed in a single spline.
# This includes the four required knots, so a 5-round search supports 28
# interior knots in the domain (0, 1). 2^5 floats (128 bytes) fit nicely on a
# single cache line.
DEFAULT_SEARCH_ROUNDS = 5
binsearchlib = mkbinsearchlib(DEFAULT_SEARCH_ROUNDS)


filldptrlib = devlib(defs=r'''
__global__ void
fill_dptr(uint32_t* dptr, int size, uint32_t value) {
    int i = (gridDim.x * blockIdx.y + blockIdx.x) * blockDim.x + threadIdx.x;
    if (i < size) {
        dptr[i] = value;
    }
}
''')
def fill_dptr(mod, dptr, size, stream=None, value=np.uint32(0)):
    """
    A memory filler which can be embedded in a stream, unlike the various
    memset routines. 'size' is the number of 4-byte words to fill, starting
    at 'dptr'; 'value' is the word to fill them with. If 'value' is not an
    np.uint32, it will be coerced to a buffer and the first four bytes taken.
    """
    if not isinstance(value, np.uint32):
        if isinstance(value, int):
            value = np.uint32(value)
        else:
            value = np.frombuffer(buffer(value), np.uint32)[0]
    # Launch a square grid of 1024-thread blocks just large enough to cover
    # 'size'; the kernel bounds-checks the excess
    blocks = int(np.ceil(np.sqrt(size / 1024.)))
    launch('fill_dptr', mod, stream, (1024, 1, 1), (blocks, blocks),
           dptr, np.int32(size), value)

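# Usage sketch (assumes 'mod' was assembled with filldptrlib; 'n' words):
#
#   dptr = cuda.mem_alloc(4 * n)
#   fill_dptr(mod, dptr, n, stream)                         # zero the buffer
#   fill_dptr(mod, dptr, n, stream, np.uint32(0xffffffff))  # or fill a pattern
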
writehalflib = devlib(defs=r'''
/* read_half and write_half decode and encode, respectively, two
 * floating-point values from a 32-bit value (typed as a 'float' for
 * convenience, but not really a float). The values are packed into u16s as a
 * proportion of a third value, as in 'ux = u16( x / d * (2^16-1) )'.
 * This is used during accumulation.
 *
 * TODO: also write a function that will efficiently add a value to the packed
 * values while incrementing the density, to improve the speed of this
 * approach when the alpha channel is present.
 */

__device__ void
read_half(float &x, float &y, float xy, float den) {
    asm("\n\t{"
        "\n\t   .reg .u16 x, y;"
        "\n\t   .reg .f32 rc;"
        "\n\t   mov.b32 {x, y}, %2;"
        "\n\t   mul.f32 rc, %3, 0f37800080;" // 1/65535.
        "\n\t   cvt.rn.f32.u16 %0, x;"
        "\n\t   cvt.rn.f32.u16 %1, y;"
        "\n\t   mul.f32 %0, %0, rc;"
        "\n\t   mul.f32 %1, %1, rc;"
        "\n\t}"
        : "=f"(x), "=f"(y) : "f"(xy), "f"(den));
}

__device__ void
write_half(float &xy, float x, float y, float den) {
    asm("\n\t{"
        "\n\t   .reg .u16 x, y;"
        "\n\t   .reg .f32 rc, xf, yf;"
        "\n\t   rcp.approx.f32 rc, %3;"
        "\n\t   mul.f32 rc, rc, 65535.0;"
        "\n\t   mul.f32 xf, %1, rc;"
        "\n\t   mul.f32 yf, %2, rc;"
        "\n\t   cvt.rni.u16.f32 x, xf;"
        "\n\t   cvt.rni.u16.f32 y, yf;"
        "\n\t   mov.b32 %0, {x, y};"
        "\n\t}"
        : "=f"(xy) : "f"(x), "f"(y), "f"(den));
}
''')

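# A host-side reference sketch of the packing above (illustrative only; the
# device code rounds via cvt.rni and uses an approximate reciprocal):
#
#   def write_half_ref(x, y, den):
#       scale = 65535.0 / den
#       return int(round(x * scale)) | (int(round(y * scale)) << 16)
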
def mkringbuflib(rb_size):
    """
    A ringbuffer for access to shared resources.

    Some components, such as the MWC contexts, are expensive to generate, and
    have no affinity to a particular block. Rather than maintain a separate
    copy of each of these objects for every thread block in a launch, we want
    to keep only enough copies of these resources around to service every
    thread block that could possibly be active simultaneously on one card,
    which is often considerably smaller. This library provides a simple
    ringbuffer type and an increment function, used in a couple of places to
    implement this kind of sharing.
    """
    return devlib(headers="#define RB_SIZE_MASK %d" % (rb_size - 1), decls='''
typedef struct {
    uint32_t head;
    uint32_t tail;
} ringbuf;
''', defs=r'''
__shared__ uint32_t rb_idx;
__device__ uint32_t rb_incr(uint32_t &rb_base, int tidx) {
    if (threadIdx.y == 0 && threadIdx.x == 0)
        rb_idx = 256 * (atomicAdd(&rb_base, 1) & RB_SIZE_MASK);
    __syncthreads();
    return rb_idx + tidx;
}
''')

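# Usage sketch (hypothetical kernel; 'rb' is a device-resident ringbuf and
# 'states' a pool of per-slot resources, with the 256-thread blocks that
# rb_incr assumes):
#
#   __global__ void kern(ringbuf *rb, mwc_st *states) {
#       mwc_st st = states[rb_incr(rb->head, threadIdx.x)];
#       /* ... use st ... */
#       states[rb_incr(rb->tail, threadIdx.x)] = st;
#   }
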
# For now, the number of entries is fixed to a value known to work on all
# Fermi cards. Autodetection, or perhaps just a global limit increase, will be
# done when I get my hands on a Kepler device. The fixed size assumes blocks
# of 256 threads, although even at that size there are pathological cases that
# could break the assumption.
DEFAULT_RB_SIZE = 64
ringbuflib = mkringbuflib(DEFAULT_RB_SIZE)