From c57917abe6bdb8bc9ac51e763ef2a018c102aa3e Mon Sep 17 00:00:00 2001 From: Steven Robertson Date: Fri, 6 Apr 2012 21:24:25 -0700 Subject: [PATCH] Use a unified block and grid addressing scheme. --- cuburn/code/filters.py | 34 +++++++++++++--------------------- cuburn/code/util.py | 18 ++++++++++++++++++ cuburn/filters.py | 18 +++++++----------- 3 files changed, 38 insertions(+), 32 deletions(-) diff --git a/cuburn/code/filters.py b/cuburn/code/filters.py index 3167237..4d03309 100644 --- a/cuburn/code/filters.py +++ b/cuburn/code/filters.py @@ -16,7 +16,7 @@ __constant__ float2 addressing_patterns[16] = { { 1.0f, -0.333333f}, { 0.333333f, 1.0f}, // 14, 15: -15, 75 }; -// Mon dieu! A C++ feature? Gotta to close the "extern C" added by the compiler. +// Mon dieu! A C++ feature? Gotta close the "extern C" added by the compiler. } template __device__ T @@ -40,7 +40,7 @@ extern "C" { logscalelib = devlib(defs=r''' __global__ void logscale(float4 *outbuf, const float4 *pixbuf, float k1, float k2) { - int i = blockDim.x * blockIdx.x + threadIdx.x; + GET_IDX(i); float4 pix = pixbuf[i]; float ls = fmaxf(0, k1 * logf(1.0f + pix.w * k2) / pix.w); @@ -56,10 +56,8 @@ logscale(float4 *outbuf, const float4 *pixbuf, float k1, float k2) { fmabuflib = devlib(defs=r''' // Element-wise computation of ``dst[i]=dst[i]+src[i]*scale``. __global__ void -fma_buf(float4 *dst, const float4 *src, int astride, float scale) { - int x = blockIdx.x * blockDim.x + threadIdx.x; - int y = blockIdx.y * blockDim.y + threadIdx.y; - int i = y * astride + x; +fma_buf(float4 *dst, const float4 *src, float scale) { + GET_IDX(i); float4 d = dst[i], s = src[i]; d.x += s.x * scale; d.y += s.y * scale; @@ -82,8 +80,7 @@ __constant__ float gauss_coefs[7] = { // ``chan4_src`` in the horizontal direction, and write it to ``dst``, a // one-channel buffer. __global__ void den_blur(float *dst, int pattern, int upsample) { - int xi = blockIdx.x * blockDim.x + threadIdx.x; - int yi = blockIdx.y * blockDim.y + threadIdx.y; + GET_IDX_2(xi, yi, gi); float x = xi, y = yi; float den = 0.0f; @@ -92,13 +89,12 @@ __global__ void den_blur(float *dst, int pattern, int upsample) { for (int i = 0; i < 7; i++) den += tex_shear(chan4_src, pattern, x, y, (i - 3) << upsample).w * gauss_coefs[i]; - dst[yi * (blockDim.x * gridDim.x) + xi] = den; + dst[gi] = den; } -// As above, but with the one-channel texture as source +// As den_blur, but with the one-channel texture as source __global__ void den_blur_1c(float *dst, int pattern, int upsample) { - int xi = blockIdx.x * blockDim.x + threadIdx.x; - int yi = blockIdx.y * blockDim.y + threadIdx.y; + GET_IDX_2(xi, yi, gi); float x = xi, y = yi; float den = 0.0f; @@ -107,7 +103,7 @@ __global__ void den_blur_1c(float *dst, int pattern, int upsample) { for (int i = 0; i < 7; i++) den += tex_shear(chan1_src, pattern, x, y, (i - 3) << upsample) * gauss_coefs[i]; - dst[yi * (blockDim.x * gridDim.x) + xi] = den; + dst[gi] = den; } ''') @@ -127,8 +123,7 @@ __global__ void bilateral(float4 *dst, int pattern, int radius, float sstd, float cstd, float dstd, float dpow, float gspeed) { - int xi = blockIdx.x * blockDim.x + threadIdx.x; - int yi = blockIdx.y * blockDim.y + threadIdx.y; + GET_IDX_2(xi, yi, gi); float x = xi, y = yi; // Precalculate the spatial coeffecients. @@ -221,19 +216,16 @@ bilateral(float4 *dst, int pattern, int radius, out.z *= weightrcp; out.w *= weightrcp; - const int astride = blockDim.x * gridDim.x; - dst[yi * astride + xi] = out; + dst[gi] = out; } ''') colorcliplib = devlib(deps=[yuvlib], defs=r''' __global__ void colorclip(float4 *pixbuf, float gamma, float vibrance, float highpow, - float linrange, float lingam, float3 bkgd, int fbsize) + float linrange, float lingam, float3 bkgd) { - int i = threadIdx.x + blockDim.x * (blockIdx.x + gridDim.x * blockIdx.y); - if (i >= fbsize) return; - + GET_IDX(i); float4 pix = pixbuf[i]; if (pix.w <= 0) { diff --git a/cuburn/code/util.py b/cuburn/code/util.py index e3501da..30bbd29 100644 --- a/cuburn/code/util.py +++ b/cuburn/code/util.py @@ -34,6 +34,16 @@ def launch(name, mod, stream, block, grid, *args, **kwargs): grid = (int(grid), 1) fun(*args, block=block, grid=grid, stream=stream, **kwargs) +def launch2(name, mod, stream, dim, *args, **kwargs): + """ + Launch using a standardized 2D grid: blocks in the shape (32, 8, 1), and + grid size set to fully cover the image. The GET_IDX and GET_IDX_2 macros + assume this launch pattern. + """ + # 32 has a tied constant in the GET_IDX_2 macro definition below + block, grid = (32, 8, 1), (dim.astride / 32, dim.ah / 8) + launch(name, mod, stream, block, grid, *args, **kwargs) + def crep(s): """Multiline literal escape for inline PTX assembly.""" if isinstance(s, unicode): @@ -148,6 +158,14 @@ stdlib = devlib(headers=""" #define bfe_decl(d, s, o, w) \ int d; \ bfe(d, s, o, w) + +#define GET_IDX_2(xi, yi, gi) \ + int xi = blockIdx.x * blockDim.x + threadIdx.x; \ + int yi = blockIdx.y * blockDim.y + threadIdx.y; \ + int gi = yi * (32 * gridDim.x) + xi + +#define GET_IDX(i) GET_IDX_2(x___, y___, i) + """, defs=r''' __device__ uint32_t gtid() { return threadIdx.x + blockDim.x * diff --git a/cuburn/filters.py b/cuburn/filters.py index 1621a75..f45b5df 100644 --- a/cuburn/filters.py +++ b/cuburn/filters.py @@ -6,7 +6,7 @@ import pycuda.compiler from pycuda.gpuarray import vec import code.filters -from code.util import ClsMod, argset, launch +from code.util import ClsMod, argset, launch2 def mktref(mod, n): tref = mod.get_texref(n) @@ -44,7 +44,6 @@ class Bilateral(Filter, ClsMod): # Helper variables and functions to keep it clean sb = 16 * dim.astride bs = sb * dim.ah - bl, gr = (32, 8, 1), (dim.astride / 32, dim.ah / 8) dsc = mkdsc(dim, 4) tref = mktref(self.mod, 'chan4_src') @@ -60,14 +59,14 @@ class Bilateral(Filter, ClsMod): # Blur density two octaves along sampling vector, ultimately # storing in the side buffer - launch('den_blur', self.mod, stream, bl, gr, + launch2('den_blur', self.mod, stream, dim, fb.d_back, i32(pattern), i32(0), texrefs=[tref]) grad_tref.set_address_2d(fb.d_back, grad_dsc, sb / 4) - launch('den_blur_1c', self.mod, stream, bl, gr, + launch2('den_blur_1c', self.mod, stream, dim, fb.d_side, i32(pattern), i32(1), texrefs=[grad_tref]) grad_tref.set_address_2d(fb.d_side, grad_dsc, sb / 4) - launch('bilateral', self.mod, stream, bl, gr, + launch2('bilateral', self.mod, stream, dim, fb.d_back, i32(pattern), i32(self.r), f32(sstd), f32(self.cstd), f32(self.dstd), f32(self.dpow), f32(self.gspeed), @@ -83,8 +82,7 @@ class Logscale(Filter, ClsMod): # s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w)) area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w) k2 = f32(1.0 / (area * gnm.spp(tc))) - nbins = dim.ah * dim.astride - launch('logscale', self.mod, stream, 256, nbins/256, + launch2('logscale', self.mod, stream, dim, fb.d_front, fb.d_front, k1, k2) class ColorClip(Filter, ClsMod): @@ -101,7 +99,5 @@ class ColorClip(Filter, ClsMod): gnm.color.background.g(tc), gnm.color.background.b(tc)) - nbins = dim.ah * dim.astride - blocks = int(np.ceil(np.sqrt(nbins / 256.))) - launch('colorclip', self.mod, stream, 256, (blocks, blocks), - fb.d_front, gam, vib, hipow, lin, lingam, bkgd, i32(nbins)) + launch2('colorclip', self.mod, stream, dim, + fb.d_front, gam, vib, hipow, lin, lingam, bkgd)