mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 11:40:04 -05:00
Use a unified block and grid addressing scheme.
This commit is contained in:
parent
eca8a8e1d3
commit
c57917abe6
@ -16,7 +16,7 @@ __constant__ float2 addressing_patterns[16] = {
|
|||||||
{ 1.0f, -0.333333f}, { 0.333333f, 1.0f}, // 14, 15: -15, 75
|
{ 1.0f, -0.333333f}, { 0.333333f, 1.0f}, // 14, 15: -15, 75
|
||||||
};
|
};
|
||||||
|
|
||||||
// Mon dieu! A C++ feature? Gotta to close the "extern C" added by the compiler.
|
// Mon dieu! A C++ feature? Gotta close the "extern C" added by the compiler.
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T> __device__ T
|
template <typename T> __device__ T
|
||||||
@ -40,7 +40,7 @@ extern "C" {
|
|||||||
logscalelib = devlib(defs=r'''
|
logscalelib = devlib(defs=r'''
|
||||||
__global__ void
|
__global__ void
|
||||||
logscale(float4 *outbuf, const float4 *pixbuf, float k1, float k2) {
|
logscale(float4 *outbuf, const float4 *pixbuf, float k1, float k2) {
|
||||||
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
GET_IDX(i);
|
||||||
float4 pix = pixbuf[i];
|
float4 pix = pixbuf[i];
|
||||||
|
|
||||||
float ls = fmaxf(0, k1 * logf(1.0f + pix.w * k2) / pix.w);
|
float ls = fmaxf(0, k1 * logf(1.0f + pix.w * k2) / pix.w);
|
||||||
@ -56,10 +56,8 @@ logscale(float4 *outbuf, const float4 *pixbuf, float k1, float k2) {
|
|||||||
fmabuflib = devlib(defs=r'''
|
fmabuflib = devlib(defs=r'''
|
||||||
// Element-wise computation of ``dst[i]=dst[i]+src[i]*scale``.
|
// Element-wise computation of ``dst[i]=dst[i]+src[i]*scale``.
|
||||||
__global__ void
|
__global__ void
|
||||||
fma_buf(float4 *dst, const float4 *src, int astride, float scale) {
|
fma_buf(float4 *dst, const float4 *src, float scale) {
|
||||||
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
GET_IDX(i);
|
||||||
int y = blockIdx.y * blockDim.y + threadIdx.y;
|
|
||||||
int i = y * astride + x;
|
|
||||||
float4 d = dst[i], s = src[i];
|
float4 d = dst[i], s = src[i];
|
||||||
d.x += s.x * scale;
|
d.x += s.x * scale;
|
||||||
d.y += s.y * scale;
|
d.y += s.y * scale;
|
||||||
@ -82,8 +80,7 @@ __constant__ float gauss_coefs[7] = {
|
|||||||
// ``chan4_src`` in the horizontal direction, and write it to ``dst``, a
|
// ``chan4_src`` in the horizontal direction, and write it to ``dst``, a
|
||||||
// one-channel buffer.
|
// one-channel buffer.
|
||||||
__global__ void den_blur(float *dst, int pattern, int upsample) {
|
__global__ void den_blur(float *dst, int pattern, int upsample) {
|
||||||
int xi = blockIdx.x * blockDim.x + threadIdx.x;
|
GET_IDX_2(xi, yi, gi);
|
||||||
int yi = blockIdx.y * blockDim.y + threadIdx.y;
|
|
||||||
float x = xi, y = yi;
|
float x = xi, y = yi;
|
||||||
|
|
||||||
float den = 0.0f;
|
float den = 0.0f;
|
||||||
@ -92,13 +89,12 @@ __global__ void den_blur(float *dst, int pattern, int upsample) {
|
|||||||
for (int i = 0; i < 7; i++)
|
for (int i = 0; i < 7; i++)
|
||||||
den += tex_shear(chan4_src, pattern, x, y, (i - 3) << upsample).w
|
den += tex_shear(chan4_src, pattern, x, y, (i - 3) << upsample).w
|
||||||
* gauss_coefs[i];
|
* gauss_coefs[i];
|
||||||
dst[yi * (blockDim.x * gridDim.x) + xi] = den;
|
dst[gi] = den;
|
||||||
}
|
}
|
||||||
|
|
||||||
// As above, but with the one-channel texture as source
|
// As den_blur, but with the one-channel texture as source
|
||||||
__global__ void den_blur_1c(float *dst, int pattern, int upsample) {
|
__global__ void den_blur_1c(float *dst, int pattern, int upsample) {
|
||||||
int xi = blockIdx.x * blockDim.x + threadIdx.x;
|
GET_IDX_2(xi, yi, gi);
|
||||||
int yi = blockIdx.y * blockDim.y + threadIdx.y;
|
|
||||||
float x = xi, y = yi;
|
float x = xi, y = yi;
|
||||||
|
|
||||||
float den = 0.0f;
|
float den = 0.0f;
|
||||||
@ -107,7 +103,7 @@ __global__ void den_blur_1c(float *dst, int pattern, int upsample) {
|
|||||||
for (int i = 0; i < 7; i++)
|
for (int i = 0; i < 7; i++)
|
||||||
den += tex_shear(chan1_src, pattern, x, y, (i - 3) << upsample)
|
den += tex_shear(chan1_src, pattern, x, y, (i - 3) << upsample)
|
||||||
* gauss_coefs[i];
|
* gauss_coefs[i];
|
||||||
dst[yi * (blockDim.x * gridDim.x) + xi] = den;
|
dst[gi] = den;
|
||||||
}
|
}
|
||||||
''')
|
''')
|
||||||
|
|
||||||
@ -127,8 +123,7 @@ __global__ void
|
|||||||
bilateral(float4 *dst, int pattern, int radius,
|
bilateral(float4 *dst, int pattern, int radius,
|
||||||
float sstd, float cstd, float dstd, float dpow, float gspeed)
|
float sstd, float cstd, float dstd, float dpow, float gspeed)
|
||||||
{
|
{
|
||||||
int xi = blockIdx.x * blockDim.x + threadIdx.x;
|
GET_IDX_2(xi, yi, gi);
|
||||||
int yi = blockIdx.y * blockDim.y + threadIdx.y;
|
|
||||||
float x = xi, y = yi;
|
float x = xi, y = yi;
|
||||||
|
|
||||||
// Precalculate the spatial coeffecients.
|
// Precalculate the spatial coeffecients.
|
||||||
@ -221,19 +216,16 @@ bilateral(float4 *dst, int pattern, int radius,
|
|||||||
out.z *= weightrcp;
|
out.z *= weightrcp;
|
||||||
out.w *= weightrcp;
|
out.w *= weightrcp;
|
||||||
|
|
||||||
const int astride = blockDim.x * gridDim.x;
|
dst[gi] = out;
|
||||||
dst[yi * astride + xi] = out;
|
|
||||||
}
|
}
|
||||||
''')
|
''')
|
||||||
|
|
||||||
colorcliplib = devlib(deps=[yuvlib], defs=r'''
|
colorcliplib = devlib(deps=[yuvlib], defs=r'''
|
||||||
__global__ void
|
__global__ void
|
||||||
colorclip(float4 *pixbuf, float gamma, float vibrance, float highpow,
|
colorclip(float4 *pixbuf, float gamma, float vibrance, float highpow,
|
||||||
float linrange, float lingam, float3 bkgd, int fbsize)
|
float linrange, float lingam, float3 bkgd)
|
||||||
{
|
{
|
||||||
int i = threadIdx.x + blockDim.x * (blockIdx.x + gridDim.x * blockIdx.y);
|
GET_IDX(i);
|
||||||
if (i >= fbsize) return;
|
|
||||||
|
|
||||||
float4 pix = pixbuf[i];
|
float4 pix = pixbuf[i];
|
||||||
|
|
||||||
if (pix.w <= 0) {
|
if (pix.w <= 0) {
|
||||||
|
@ -34,6 +34,16 @@ def launch(name, mod, stream, block, grid, *args, **kwargs):
|
|||||||
grid = (int(grid), 1)
|
grid = (int(grid), 1)
|
||||||
fun(*args, block=block, grid=grid, stream=stream, **kwargs)
|
fun(*args, block=block, grid=grid, stream=stream, **kwargs)
|
||||||
|
|
||||||
|
def launch2(name, mod, stream, dim, *args, **kwargs):
|
||||||
|
"""
|
||||||
|
Launch using a standardized 2D grid: blocks in the shape (32, 8, 1), and
|
||||||
|
grid size set to fully cover the image. The GET_IDX and GET_IDX_2 macros
|
||||||
|
assume this launch pattern.
|
||||||
|
"""
|
||||||
|
# 32 has a tied constant in the GET_IDX_2 macro definition below
|
||||||
|
block, grid = (32, 8, 1), (dim.astride / 32, dim.ah / 8)
|
||||||
|
launch(name, mod, stream, block, grid, *args, **kwargs)
|
||||||
|
|
||||||
def crep(s):
|
def crep(s):
|
||||||
"""Multiline literal escape for inline PTX assembly."""
|
"""Multiline literal escape for inline PTX assembly."""
|
||||||
if isinstance(s, unicode):
|
if isinstance(s, unicode):
|
||||||
@ -148,6 +158,14 @@ stdlib = devlib(headers="""
|
|||||||
#define bfe_decl(d, s, o, w) \
|
#define bfe_decl(d, s, o, w) \
|
||||||
int d; \
|
int d; \
|
||||||
bfe(d, s, o, w)
|
bfe(d, s, o, w)
|
||||||
|
|
||||||
|
#define GET_IDX_2(xi, yi, gi) \
|
||||||
|
int xi = blockIdx.x * blockDim.x + threadIdx.x; \
|
||||||
|
int yi = blockIdx.y * blockDim.y + threadIdx.y; \
|
||||||
|
int gi = yi * (32 * gridDim.x) + xi
|
||||||
|
|
||||||
|
#define GET_IDX(i) GET_IDX_2(x___, y___, i)
|
||||||
|
|
||||||
""", defs=r'''
|
""", defs=r'''
|
||||||
__device__ uint32_t gtid() {
|
__device__ uint32_t gtid() {
|
||||||
return threadIdx.x + blockDim.x *
|
return threadIdx.x + blockDim.x *
|
||||||
|
@ -6,7 +6,7 @@ import pycuda.compiler
|
|||||||
from pycuda.gpuarray import vec
|
from pycuda.gpuarray import vec
|
||||||
|
|
||||||
import code.filters
|
import code.filters
|
||||||
from code.util import ClsMod, argset, launch
|
from code.util import ClsMod, argset, launch2
|
||||||
|
|
||||||
def mktref(mod, n):
|
def mktref(mod, n):
|
||||||
tref = mod.get_texref(n)
|
tref = mod.get_texref(n)
|
||||||
@ -44,7 +44,6 @@ class Bilateral(Filter, ClsMod):
|
|||||||
# Helper variables and functions to keep it clean
|
# Helper variables and functions to keep it clean
|
||||||
sb = 16 * dim.astride
|
sb = 16 * dim.astride
|
||||||
bs = sb * dim.ah
|
bs = sb * dim.ah
|
||||||
bl, gr = (32, 8, 1), (dim.astride / 32, dim.ah / 8)
|
|
||||||
|
|
||||||
dsc = mkdsc(dim, 4)
|
dsc = mkdsc(dim, 4)
|
||||||
tref = mktref(self.mod, 'chan4_src')
|
tref = mktref(self.mod, 'chan4_src')
|
||||||
@ -60,14 +59,14 @@ class Bilateral(Filter, ClsMod):
|
|||||||
|
|
||||||
# Blur density two octaves along sampling vector, ultimately
|
# Blur density two octaves along sampling vector, ultimately
|
||||||
# storing in the side buffer
|
# storing in the side buffer
|
||||||
launch('den_blur', self.mod, stream, bl, gr,
|
launch2('den_blur', self.mod, stream, dim,
|
||||||
fb.d_back, i32(pattern), i32(0), texrefs=[tref])
|
fb.d_back, i32(pattern), i32(0), texrefs=[tref])
|
||||||
grad_tref.set_address_2d(fb.d_back, grad_dsc, sb / 4)
|
grad_tref.set_address_2d(fb.d_back, grad_dsc, sb / 4)
|
||||||
launch('den_blur_1c', self.mod, stream, bl, gr,
|
launch2('den_blur_1c', self.mod, stream, dim,
|
||||||
fb.d_side, i32(pattern), i32(1), texrefs=[grad_tref])
|
fb.d_side, i32(pattern), i32(1), texrefs=[grad_tref])
|
||||||
grad_tref.set_address_2d(fb.d_side, grad_dsc, sb / 4)
|
grad_tref.set_address_2d(fb.d_side, grad_dsc, sb / 4)
|
||||||
|
|
||||||
launch('bilateral', self.mod, stream, bl, gr,
|
launch2('bilateral', self.mod, stream, dim,
|
||||||
fb.d_back, i32(pattern), i32(self.r),
|
fb.d_back, i32(pattern), i32(self.r),
|
||||||
f32(sstd), f32(self.cstd), f32(self.dstd),
|
f32(sstd), f32(self.cstd), f32(self.dstd),
|
||||||
f32(self.dpow), f32(self.gspeed),
|
f32(self.dpow), f32(self.gspeed),
|
||||||
@ -83,8 +82,7 @@ class Logscale(Filter, ClsMod):
|
|||||||
# s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
|
# s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
|
||||||
area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w)
|
area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w)
|
||||||
k2 = f32(1.0 / (area * gnm.spp(tc)))
|
k2 = f32(1.0 / (area * gnm.spp(tc)))
|
||||||
nbins = dim.ah * dim.astride
|
launch2('logscale', self.mod, stream, dim,
|
||||||
launch('logscale', self.mod, stream, 256, nbins/256,
|
|
||||||
fb.d_front, fb.d_front, k1, k2)
|
fb.d_front, fb.d_front, k1, k2)
|
||||||
|
|
||||||
class ColorClip(Filter, ClsMod):
|
class ColorClip(Filter, ClsMod):
|
||||||
@ -101,7 +99,5 @@ class ColorClip(Filter, ClsMod):
|
|||||||
gnm.color.background.g(tc),
|
gnm.color.background.g(tc),
|
||||||
gnm.color.background.b(tc))
|
gnm.color.background.b(tc))
|
||||||
|
|
||||||
nbins = dim.ah * dim.astride
|
launch2('colorclip', self.mod, stream, dim,
|
||||||
blocks = int(np.ceil(np.sqrt(nbins / 256.)))
|
fb.d_front, gam, vib, hipow, lin, lingam, bkgd)
|
||||||
launch('colorclip', self.mod, stream, 256, (blocks, blocks),
|
|
||||||
fb.d_front, gam, vib, hipow, lin, lingam, bkgd, i32(nbins))
|
|
||||||
|
Loading…
Reference in New Issue
Block a user