"""
 | 
						|
Provides tools and miscellaneous functions for building device code.
 | 
						|
"""
 | 
						|
import os
 | 
						|
import tempfile
 | 
						|
from collections import namedtuple
 | 
						|
 | 
						|
try:
 | 
						|
    import pycuda.driver as cuda
 | 
						|
    import pycuda.compiler
 | 
						|
except ImportError, e:
 | 
						|
    import traceback
 | 
						|
    traceback.print_exc()
 | 
						|
    print 'Continuing without CUDA. Things will break.'
 | 
						|
import numpy as np
 | 
						|
import tempita
 | 
						|
 | 
						|
fst = lambda (a,b): a
 | 
						|
snd = lambda (a,b): b
 | 
						|
 | 
						|
def argset(obj, **kwargs):
    """
    Allow an object with many properties to be set using one call.

    >>> x = argset(X(), a=1, b=2)
    >>> x.a
    1
    """
    for k, v in kwargs.items():
        setattr(obj, k, v)
    return obj

def launch(name, mod, stream, block, grid, *args, **kwargs):
    """
    Oft-used kernel launch helper. Provides a nice boost in readability of
    densely-packed asynchronous code launches.
    """
    fun = mod.get_function(name)
    if isinstance(block, (int, np.number)):
        block = (int(block), 1, 1)
    if isinstance(grid, (int, np.number)):
        grid = (int(grid), 1)
    fun(*args, block=block, grid=grid, stream=stream, **kwargs)

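# A hedged usage sketch (the names 'mod', 'stream', 'd_out', and 'n' are
# illustrative, not part of this module). Scalar block/grid arguments are
# promoted to the tuples PyCUDA expects, so this launches 16 blocks of 256
# threads:
#
#   launch('fill_dptr', mod, stream, 256, 16,
#          d_out, np.int32(n), np.uint32(0))
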
def launch2(name, mod, stream, dim, *args, **kwargs):
    """
    Launch using a standardized 2D grid: blocks in the shape (32, 8, 1), and
    grid size set to fully cover the image. The GET_IDX and GET_IDX_2 macros
    assume this launch pattern.
    """
    # 32 has a tied constant in the GET_IDX_2 macro definition below
    block, grid = (32, 8, 1), (dim.astride / 32, dim.ah / 8)
    launch(name, mod, stream, block, grid, *args, **kwargs)

def crep(s):
    """Multiline literal escape for inline PTX assembly."""
    if isinstance(s, unicode):
        s = s.encode('utf-8')
    return '"%s"' % s.encode("string_escape")

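# For example, crep('x\ny') returns the C literal '"x\\ny"', ready to be
# spliced into generated device source.
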
class Template(tempita.Template):
    """
    A Tempita template with extra stuff in the default namespace.
    """
    default_namespace = tempita.Template.default_namespace.copy()
Template.default_namespace.update({'np': np, 'crep': crep})

# Passive container for device code.
DevLib = namedtuple('DevLib', 'deps headers decls defs')

def devlib(deps=(), headers='', decls='', defs=''):
    """Create a library of device code."""
    # This exists because namedtuple doesn't support default args
    return DevLib(deps, headers, decls, defs)

def assemble_code(*libs):
    """
    Flatten a set of DevLibs and their dependencies into compilable source.
    Dependencies are visited depth-first, each lib's code is emitted exactly
    once, and the output groups all headers, then all decls, then all defs.
    stdlib is always included first.
    """
    seen = set()
    out = []
    def go(lib):
        map(go, lib.deps)
        code = lib[1:]
        if code not in seen:
            seen.add(code)
            out.append(code)
    go(stdlib)
    map(go, libs)
    return ''.join(sum(zip(*out), ()))

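# A hedged illustration ('foolib' is invented): given
#
#   foolib = devlib(deps=(binsearchlib,), defs='... kernels using it ...')
#
# assemble_code(foolib) emits stdlib, then binsearchlib, then foolib,
# deduplicating any dependency shared by several libs.
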
DEFAULT_CMP_OPTIONS = ('-use_fast_math', '-maxrregcount', '42')
DEFAULT_SAVE_KERNEL = True
def compile(name, src, opts=DEFAULT_CMP_OPTIONS, save=DEFAULT_SAVE_KERNEL,
            arch=None):
    """
    Compile a module and return the cubin. If 'save' is set, copies of the
    source and the compiled cubin are written to the system temp directory
    for inspection or display.
    """
    dir = tempfile.gettempdir()
    if save:
        with open(os.path.join(dir, name + '_kern.cu'), 'w') as fp:
            fp.write(src)
    cubin = pycuda.compiler.compile(src, options=list(opts), arch=arch)
    if save:
        with open(os.path.join(dir, name + '_kern.cubin'), 'w') as fp:
            fp.write(cubin)
    return cubin

class ClsMod(object):
    """
    Convenience class or mixin that automatically compiles and loads a module
    once per class, saving potentially expensive code regeneration. Only use
    if your class does not employ run-time code generation.
    """
    mod = None
    # Supply your own DevLib on this property
    lib = None

    def __init__(self):
        super(ClsMod, self).__init__()
        self.load()

    @classmethod
    def load(cls, name=None):
        if cls.mod is None:
            if name is None:
                name = cls.__name__.lower()
            cubin = compile(name, assemble_code(cls.lib))
            cls.mod = cuda.module_from_buffer(cubin)

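# A hedged usage sketch ('Filler' is an invented example class):
#
#   class Filler(ClsMod):
#       lib = filldptrlib
#       def zero(self, dptr, size, stream=None):
#           fill_dptr(self.mod, dptr, size, stream)
#
# The first instantiation compiles and loads the module; every later
# instance of the same class reuses it.
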
# This lib is included with *every* assembled module. It contains no global
# functions, so it shouldn't slow down compilation time too much.
stdlib = devlib(headers="""
#include<cuda.h>
#include<stdint.h>
#include<stdio.h>
""", decls=r"""
#undef M_E
#undef M_LOG2E
#undef M_LOG10E
#undef M_LN2
#undef M_LN10
#undef M_PI
#undef M_PI_2
#undef M_PI_4
#undef M_1_PI
#undef M_2_PI
#undef M_2_SQRTPI
#undef M_SQRT2
#undef M_SQRT1_2

#define  M_E          2.71828174591064f
#define  M_LOG2E      1.44269502162933f
#define  M_LOG10E     0.43429449200630f
#define  M_LN2        0.69314718246460f
#define  M_LN10       2.30258512496948f
#define  M_PI         3.14159274101257f
#define  M_PI_2       1.57079637050629f
#define  M_PI_4       0.78539818525314f
#define  M_1_PI       0.31830987334251f
#define  M_2_PI       0.63661974668503f
#define  M_2_SQRTPI   1.12837922573090f
#define  M_SQRT2      1.41421353816986f
#define  M_SQRT1_2    0.70710676908493f

#define bfe(d, s, o, w) \
        asm("bfe.u32 %0, %1, %2, %3;" : "=r"(d) : "r"(s), "r"(o), "r"(w))

#define bfe_decl(d, s, o, w) \
        int d; \
        bfe(d, s, o, w)

#define GET_IDX_2(xi, yi, gi) \
    int xi = blockIdx.x * blockDim.x + threadIdx.x; \
    int yi = blockIdx.y * blockDim.y + threadIdx.y; \
    int gi = yi * (32 * gridDim.x) + xi

#define GET_IDX(i) GET_IDX_2(x___, y___, i)

""", defs=r'''
__device__ uint32_t gtid() {
    return threadIdx.x + blockDim.x *
            (threadIdx.y + blockDim.y *
                (threadIdx.z + blockDim.z *
                    (blockIdx.x + (gridDim.x * blockIdx.y))));
}

__device__ uint32_t trunca(float f) {
    // truncate as used in address calculations. note the use of a signed
    // conversion is intentional here (simplifies image bounds checking).
    uint32_t ret;
    asm("cvt.rni.s32.f32    %0,     %1;" : "=r"(ret) : "f"(f));
    return ret;
}

__device__ void scale_float4(float4& pix, float scale) {
    pix.x *= scale;
    pix.y *= scale;
    pix.z *= scale;
    pix.w *= scale;
}
''')

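# A hedged sketch of how GET_IDX pairs with launch2() ('copy', 'd_dst', and
# 'd_src' are invented names): a kernel written as
#
#   __global__ void copy(float4 *dst, const float4 *src) {
#       GET_IDX(i);
#       dst[i] = src[i];
#   }
#
# visits every pixel when started with launch2('copy', mod, stream, dim,
# d_dst, d_src), because gi = yi * (32 * gridDim.x) + xi is exactly the
# row-major offset for an image whose row stride is dim.astride.
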
def mkbinsearchlib(rounds):
    """
    Search through the fixed-size list 'hay' to find the rightmost index which
    contains a value strictly smaller than the input 'needle'. The list must
    be exactly '2^rounds' long, although padding at the top with very large
    numbers or even +inf effectively shortens it.
    """
    # TODO: this doesn't optimize well on a 64-bit arch, not that it's a
    # performance-critical chunk of code or anything
    src = Template(r'''
__device__ int bitwise_binsearch(const float *hay, float needle) {
    int lo = 0;

    // TODO: improve efficiency on 64-bit arches
    {{for i in range(search_rounds-1, -1, -1)}}
    if (needle > hay[lo + {{1 << i}}])
        lo += {{1 << i}};
    {{endfor}}
    return lo;
}
''', 'bitwise_binsearch')
    return devlib(defs=src.substitute(search_rounds=rounds))

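# For illustration, substituting search_rounds=2 unrolls the template into:
#
#   __device__ int bitwise_binsearch(const float *hay, float needle) {
#       int lo = 0;
#       if (needle > hay[lo + 2])
#           lo += 2;
#       if (needle > hay[lo + 1])
#           lo += 1;
#       return lo;
#   }
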
# 2^search_rounds is the maximum number of knots allowed in a single spline.
# This includes the four required knots, so a 5 round search supports 28
# interior knots in the domain (0, 1). 2^5 fits nicely on a single cache line.
DEFAULT_SEARCH_ROUNDS = 5
binsearchlib = mkbinsearchlib(DEFAULT_SEARCH_ROUNDS)

filldptrlib = devlib(defs=r'''
__global__ void
fill_dptr(uint32_t* dptr, int size, uint32_t value) {
    int i = (gridDim.x * blockIdx.y + blockIdx.x) * blockDim.x + threadIdx.x;
    if (i < size) {
        dptr[i] = value;
    }
}
''')

def fill_dptr(mod, dptr, size, stream=None, value=np.uint32(0)):
    """
    A memory fill which can be embedded in a stream, unlike the various
    memset routines. 'size' is the number of 4-byte words in the pointer;
    'value' is the word to fill it with. If 'value' is not an np.uint32, it
    will be coerced to a buffer and the first four bytes taken.
    """
    if not isinstance(value, np.uint32):
        if isinstance(value, int):
            value = np.uint32(value)
        else:
            value = np.frombuffer(buffer(value), np.uint32)[0]
    blocks = int(np.ceil(np.sqrt(size / 1024.)))
    launch('fill_dptr', mod, stream, (1024, 1, 1), (blocks, blocks),
            dptr, np.int32(size), value)

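# A hedged usage sketch ('mod' must contain filldptrlib, e.g. via ClsMod;
# 'd_accum' and 'nwords' are illustrative):
#
#   fill_dptr(mod, d_accum, nwords, stream)                  # zero it
#   fill_dptr(mod, d_accum, nwords, stream, np.float32(1))   # 1.0f bit pattern
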
writehalflib = devlib(defs=r'''
/* read_half and write_half decode and encode, respectively, two
 * floating-point values from a 32-bit value (typed as a 'float' for
 * convenience but not really). The values are packed into u16s as a
 * proportion of a third value, as in 'ux = u16( x / d * (2^16-1) )'.
 * This is used during accumulation.
 *
 * TODO: also write a function that will efficiently add a value to the packed
 * values while incrementing the density, to improve the speed of this
 * approach when the alpha channel is present.
 */

__device__ void
read_half(float &x, float &y, float xy, float den) {
    asm("\n\t{"
        "\n\t   .reg .u16       x, y;"
        "\n\t   .reg .f32       rc;"
        "\n\t   mov.b32         {x, y},     %2;"
        "\n\t   mul.f32         rc,         %3,     0f37800080;" // 1/65535.
        "\n\t   cvt.rn.f32.u16     %0,         x;"
        "\n\t   cvt.rn.f32.u16     %1,         y;"
        "\n\t   mul.f32         %0,         %0,     rc;"
        "\n\t   mul.f32         %1,         %1,     rc;"
        "\n\t}"
        : "=f"(x), "=f"(y) : "f"(xy), "f"(den));
}

__device__ void
write_half(float &xy, float x, float y, float den) {
    asm("\n\t{"
        "\n\t   .reg .u16       x, y;"
        "\n\t   .reg .f32       rc, xf, yf;"
        "\n\t   rcp.approx.f32  rc,         %3;"
        "\n\t   mul.f32         rc,         rc,     65535.0;"
        "\n\t   mul.f32         xf,         %1,     rc;"
        "\n\t   mul.f32         yf,         %2,     rc;"
        "\n\t   cvt.rni.u16.f32 x,  xf;"
        "\n\t   cvt.rni.u16.f32 y,  yf;"
        "\n\t   mov.b32         %0,         {x, y};"
        "\n\t}"
        : "=f"(xy) : "f"(x), "f"(y), "f"(den));
}
''')

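# For reference, a host-side numpy sketch of the same packing scheme (purely
# illustrative and unused; the PTX above is the authoritative version, and
# this assumes a little-endian host):
#
#   def write_half_host(x, y, den):
#       u = np.round(np.float32([x, y]) / den * 65535.).astype(np.uint16)
#       return u.view(np.uint32)[0]   # x lands in the low half, y in the high
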
def mkringbuflib(rb_size):
    """
    A ringbuffer for access to shared resources.

    Some components, such as the MWC contexts, are expensive to generate, and
    have no affinity to a particular block. Rather than maintain a separate
    copy of each of these objects for every thread block in a launch, we want
    to only keep enough copies of these resources around to service every
    thread block that could possibly be active simultaneously on one card,
    which is often considerably smaller. This function provides a simple
    ringbuffer type and an increment function, used in a couple places to
    implement this kind of sharing.
    """
    return devlib(headers="#define RB_SIZE_MASK %d" % (rb_size - 1), decls='''
typedef struct {
    uint32_t head;
    uint32_t tail;
} ringbuf;
''', defs=r'''
__shared__ uint32_t rb_idx;
__device__ uint32_t rb_incr(uint32_t &rb_base, int tidx) {
    if (threadIdx.y == 0 && threadIdx.x == 0)
        rb_idx = 256 * (atomicAdd(&rb_base, 1) & RB_SIZE_MASK);
    __syncthreads();
    return rb_idx + tidx;
}
''')

# For now, the number of entries is fixed to a value known to work on all
# Fermi cards. Autodetection, or perhaps just a global limit increase, will be
# done when I get my hands on a Kepler device. The fixed size assumes blocks
# of 256 threads, although even at that size there are pathological cases that
# could break the assumption.
DEFAULT_RB_SIZE = 64
ringbuflib = mkringbuflib(DEFAULT_RB_SIZE)
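
# A hedged sketch of kernel-side use ('example' and 'mwc_st' stand in for
# real consumers such as the MWC RNG contexts; assumes 256-thread blocks):
#
#   __global__ void example(ringbuf *rb, mwc_st *msts) {
#       int tidx = threadIdx.y * blockDim.x + threadIdx.x;
#       mwc_st rctx = msts[rb_incr(rb->head, tidx)];   // borrow a context
#       /* ... use rctx ... */
#       msts[rb_incr(rb->tail, tidx)] = rctx;          // hand it back
#   }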