cuburn/cuburn/code/util.py

"""
Provides tools and miscellaneous functions for building device code.
"""

import numpy as np
import tempita

def crep(s):
    """Return ``s`` as a double-quoted, backslash-escaped string literal.

    Intended for embedding text in generated device code (the original
    docstring names PTX assembly as the target).

    ``s`` may be a byte string or a text string; text is encoded as UTF-8
    first and the escaping is then applied byte by byte:

    * backslash, single quote and double quote are backslash-escaped,
    * tab, newline and carriage return become ``\\t``, ``\\n`` and ``\\r``,
    * any other byte outside printable ASCII becomes ``\\xNN`` (two
      lowercase hex digits).

    Apart from the double quote, this is the output of Python 2's
    ``string_escape`` codec, which the function used previously. That codec
    leaves ``"`` untouched, so an embedded double quote used to terminate
    the emitted literal early.
    """
    if not isinstance(s, bytes):
        s = s.encode('utf-8')
    simple = {
        '\\': '\\\\',
        '"': '\\"',
        "'": "\\'",
        '\t': '\\t',
        '\n': '\\n',
        '\r': '\\r',
    }
    pieces = []
    # bytearray yields integer byte values on both Python 2 and Python 3.
    for code in bytearray(s):
        char = chr(code)
        if char in simple:
            pieces.append(simple[char])
        elif 0x20 <= code < 0x7f:
            pieces.append(char)
        else:
            pieces.append('\\x%02x' % code)
    return '"%s"' % ''.join(pieces)

class Template(tempita.Template):
    """A ``tempita.Template`` whose default namespace also provides ``np``
    (numpy) and ``crep``."""
    # Build a new dict from the parent's defaults plus the extra names, so
    # the additions never touch tempita.Template.default_namespace itself.
    default_namespace = dict(tempita.Template.default_namespace,
                             np=np, crep=crep)

class HunkOCode(object):
    """Container for one section of device code.

    The source text is split across three string attributes, ``headers``,
    ``decls`` and ``defs``, each of which defaults to the empty string.
    """
    # A subclass can replace any of these with a property object to
    # compute the text dynamically.
    headers = decls = defs = ''

def assemble_code(*sections):
    """Concatenate the code held by ``sections`` into a single string.

    The output contains the ``headers`` attribute of every section (in
    argument order), followed by every ``decls``, followed by every ``defs``.
    """
    pieces = []
    for kind in ('headers', 'decls', 'defs'):
        for sect in sections:
            pieces.append(getattr(sect, kind))
    return ''.join(pieces)

def apply_affine(x, y, xo, yo, packer):
    """Render device code that applies a 2D affine transform.

    The result is two assignment statements: ``xo`` receives
    ``packer.xx * x + packer.xy * y + packer.xo`` and ``yo`` receives
    ``packer.yx * x + packer.yy * y + packer.yo``. The four coordinate
    arguments are substituted into the template as given.
    """
    namespace = dict(x=x, y=y, xo=xo, yo=yo, packer=packer)
    return Template("""
    {{xo}} = {{packer.xx}} * {{x}} + {{packer.xy}} * {{y}} + {{packer.xo}};
    {{yo}} = {{packer.yx}} * {{x}} + {{packer.yy}} * {{y}} + {{packer.yo}};
    """).substitute(namespace)

class BaseCode(HunkOCode):
    """Common device-code section: includes, math constants and helpers.

    ``defs`` supplies single-precision ``M_*`` constants, the ``bfe`` /
    ``bfe_decl`` bit-field-extract macros, ``gtid``, ``trunca``, the
    ``fill_dptr`` kernel, the ``read_half`` / ``write_half`` packing helpers
    and the ``rgb2hsv`` / ``hsv2rgb`` conversions. The ``fill_dptr`` static
    method is the host-side launcher for the kernel of the same name.
    """

    # Headers included by the generated source.
    headers = """
#include<cuda.h>
#include<stdint.h>
#include<stdio.h>
"""

    # Forward declarations of the colour-space helpers defined in ``defs``.
    decls = """
float3 rgb2hsv(float3 rgb);
float3 hsv2rgb(float3 hsv);
"""

    # Rendered through Template with no arguments; the text contains no
    # ``{{ }}`` placeholders.
    # NOTE(review): the comment inside ``trunca`` says "truncate", but the
    # instruction uses the ``.rni`` modifier (round to nearest integer) --
    # confirm which behaviour is intended.
    defs = Template(r"""
#undef M_E
#undef M_LOG2E
#undef M_LOG10E
#undef M_LN2
#undef M_LN10
#undef M_PI
#undef M_PI_2
#undef M_PI_4
#undef M_1_PI
#undef M_2_PI
#undef M_2_SQRTPI
#undef M_SQRT2
#undef M_SQRT1_2

#define  M_E          2.71828174591064f
#define  M_LOG2E      1.44269502162933f
#define  M_LOG10E     0.43429449200630f
#define  M_LN2        0.69314718246460f
#define  M_LN10       2.30258512496948f
#define  M_PI         3.14159274101257f
#define  M_PI_2       1.57079637050629f
#define  M_PI_4       0.78539818525314f
#define  M_1_PI       0.31830987334251f
#define  M_2_PI       0.63661974668503f
#define  M_2_SQRTPI   1.12837922573090f
#define  M_SQRT2      1.41421353816986f
#define  M_SQRT1_2    0.70710676908493f

#define bfe(d, s, o, w) \
        asm("bfe.u32 %0, %1, %2, %3;" : "=r"(d) : "r"(s), "r"(o), "r"(w))

#define bfe_decl(d, s, o, w) \
        int d; \
        bfe(d, s, o, w)

// TODO: use launch parameter preconfig to eliminate unnecessary parts
__device__
uint32_t gtid() {
    return threadIdx.x + blockDim.x *
            (threadIdx.y + blockDim.y *
                (threadIdx.z + blockDim.z *
                    (blockIdx.x + (gridDim.x * blockIdx.y))));
}

__device__
uint32_t trunca(float f) {
    // truncate as used in address calculations. note the use of a signed
    // conversion is intentional here (simplifies image bounds checking).
    uint32_t ret;
    asm("cvt.rni.s32.f32    %0,     %1;" : "=r"(ret) : "f"(f));
    return ret;
}

__global__
void fill_dptr(uint32_t* dptr, int size, uint32_t value) {
    int i = (gridDim.x * blockIdx.y + blockIdx.x) * blockDim.x + threadIdx.x;
    if (i < size) {
        dptr[i] = value;
    }
}

/* read_half and write_half decode and encode, respectively, two
 * floating-point values from a 32-bit value (typed as a 'float' for
 * convenience but not really). The values are packed into u16s as a
 * proportion of a third value, as in 'ux = u16( x / d * (2^16-1) )'.
 * This is used during accumulation.
 *
 * TODO: also write a function that will efficiently add a value to the packed
 * values while incrementing the density, to improve the speed of this
 * approach when the alpha channel is present.
 */

__device__
void read_half(float &x, float &y, float xy, float den) {
    asm("\n\t{"
        "\n\t   .reg .u16       x, y;"
        "\n\t   .reg .f32       rc;"
        "\n\t   mov.b32         {x, y},     %2;"
        "\n\t   mul.f32         rc,         %3,     0f37800080;" // 1/65535.
        "\n\t   cvt.rn.f32.u16     %0,         x;"
        "\n\t   cvt.rn.f32.u16     %1,         y;"
        "\n\t   mul.f32         %0,         %0,     rc;"
        "\n\t   mul.f32         %1,         %1,     rc;"
        "\n\t}"
        : "=f"(x), "=f"(y) : "f"(xy), "f"(den));
}

__device__
void write_half(float &xy, float x, float y, float den) {
    asm("\n\t{"
        "\n\t   .reg .u16       x, y;"
        "\n\t   .reg .f32       rc, xf, yf;"
        "\n\t   rcp.approx.f32  rc,         %3;"
        "\n\t   mul.f32         rc,         rc,     65535.0;"
        "\n\t   mul.f32         xf,         %1,     rc;"
        "\n\t   mul.f32         yf,         %2,     rc;"
        "\n\t   cvt.rni.u16.f32 x,  xf;"
        "\n\t   cvt.rni.u16.f32 y,  yf;"
        "\n\t   mov.b32         %0,         {x, y};"
        "\n\t}"
        : "=f"(xy) : "f"(x), "f"(y), "f"(den));
}

__device__
float3 rgb2hsv(float3 rgb) {
    float M = fmaxf(fmaxf(rgb.x, rgb.y), rgb.z);
    float m = fminf(fminf(rgb.x, rgb.y), rgb.z);
    float C = M - m;

    float s = M > 0.0f ? C / M : 0.0f;

    float h = 0.0f;
    if (s != 0.0f) {
        C = 1.0f / C;
        float rc = (M - rgb.x) * C;
        float gc = (M - rgb.y) * C;
        float bc = (M - rgb.z) * C;

        if      (rgb.x == M)  h = bc - gc;
        else if (rgb.y == M)  h = 2 + rc - bc;
        else                  h = 4 + gc - rc;

        if (h < 0) h += 6.0f;
    }
    return make_float3(h, s, M);
}

__device__
float3 hsv2rgb(float3 hsv) {

    float whole = floorf(hsv.x);
    float frac = hsv.x - whole;
    float val = hsv.z;
    float min = val * (1 - hsv.y);
    float mid = val * (1 - (hsv.y * frac));
    float alt = val * (1 - (hsv.y * (1 - frac)));

    float3 out;
         if (whole == 0.0f) { out.x = val; out.y = alt; out.z = min; }
    else if (whole == 1.0f) { out.x = mid; out.y = val; out.z = min; }
    else if (whole == 2.0f) { out.x = min; out.y = val; out.z = alt; }
    else if (whole == 3.0f) { out.x = min; out.y = mid; out.z = val; }
    else if (whole == 4.0f) { out.x = alt; out.y = min; out.z = val; }
    else                    { out.x = val; out.y = min; out.z = mid; }
    return out;
}
""").substitute()

    @staticmethod
    def fill_dptr(mod, dptr, size, stream=None, value=np.uint32(0)):
        """
        Fill device memory with a repeated 32-bit word by launching the
        ``fill_dptr`` kernel looked up in ``mod``. Because this is a kernel
        launch, it can be embedded in a stream, unlike the memset routines.

        ``dptr`` is handed to the kernel unchanged and ``size`` is the number
        of 4-byte words to write. ``stream`` is forwarded to the launch as
        the ``stream`` keyword. ``value`` is the word to store: an
        ``np.uint32`` is used as is, an ``int`` is converted with
        ``np.uint32()``, and anything else is viewed through ``buffer()``
        with its first four bytes reinterpreted as a uint32.
        """
        kernel = mod.get_function("fill_dptr")
        if isinstance(value, np.uint32):
            word = value
        elif isinstance(value, int):
            word = np.uint32(value)
        else:
            # ``buffer`` is the Python 2 builtin; only the leading uint32 of
            # the object's raw bytes is kept.
            word = np.frombuffer(buffer(value), np.uint32)[0]
        # Square grid of 1024-thread blocks sized to provide at least
        # ``size`` threads; the kernel skips indices >= size.
        side = int(np.ceil(np.sqrt(size / 1024 + 1)))
        kernel(dptr, np.int32(size), word, stream=stream,
               block=(1024, 1, 1), grid=(side, side, 1))