Wavelet experiments

This commit is contained in:
Steven Robertson 2012-01-20 11:27:21 -05:00
parent acbde65b9f
commit 964b11efdf

View File

@ -2,6 +2,7 @@
import numpy as np import numpy as np
from numpy import float32 as f32, int32 as i32 from numpy import float32 as f32, int32 as i32
import pycuda.driver as cuda
import pycuda.compiler import pycuda.compiler
from pycuda.gpuarray import vec from pycuda.gpuarray import vec
@ -106,45 +107,142 @@ void logscale(float4 *pixbuf, float4 *outbuf, float k1, float k2) {
outbuf[i] = pix; outbuf[i] = pix;
} }
// SS must be no greater than 4x __device__ void fmav(float4 &dst, float4 src, float scale) {
dst.x += src.x * scale;
dst.y += src.y * scale;
dst.z += src.z * scale;
dst.w += src.w * scale;
}
// The 7-tap filters are missing the leading zero to compensate for delay. I
// have no frigging clue what the theory behind that is, but it seems to work.
__constant__ float daub97_lo[9] = {
0.03782845550726404f, -0.023849465019556843f, -0.11062440441843718f,
0.37740285561283066f, 0.8526986790088938f, 0.37740285561283066f,
-0.11062440441843718f, -0.023849465019556843f, 0.03782845550726404f
};
__constant__ float daub97_hi[9] = {
-0.06453888262869706f, 0.04068941760916406f,
0.41809227322161724f, -0.7884856164055829f, 0.41809227322161724f,
0.04068941760916406f, -0.06453888262869706f, 0.0f,
};
__constant__ float daub97_ilo[9] = {
-0.06453888262869706f, -0.04068941760916406f,
0.41809227322161724f, 0.7884856164055829f, 0.41809227322161724f,
-0.04068941760916406f, -0.06453888262869706f, 0.0f
};
__constant__ float daub97_ihi[9] = {
-0.03782845550726404f, -0.023849465019556843f, 0.11062440441843718f,
0.37740285561283066f, -0.8526986790088938f, 0.37740285561283066f,
0.11062440441843718f, -0.023849465019556843f, -0.03782845550726404f
};
/*
#define S 0.7071067811f
__constant__ float daub97[4][9] = {
{ 0, 0, 0, S, S}, {0, 0, 0, -S, S}, {0, 0, 0, S, S}, {0, 0, 0, S, -S}};
*/
texture<float4, cudaTextureType2D> conv_down_src;
__global__ __global__
void blur_density_h(float *scratch, const float4 *pixbuf, int astride) { void conv_down(float4 *dst, int astride, int as_eff, int ah_eff,
// TODO: respect supersampling? configurable blur width? int vert, float xo, float yo) {
int x = blockIdx.x * blockDim.x + threadIdx.x; int xi = blockIdx.x * blockDim.x + threadIdx.x;
if (x > astride) return; int yi = blockIdx.y;
int y = blockIdx.y;
const float *dsrc =
reinterpret_cast<const float*>(&pixbuf[astride * y]) + 3;
float den = 0.0f; float4 lo = make_float4(0, 0, 0, 0);
float4 hi = make_float4(0, 0, 0, 0);
den += dsrc[4*max(0, x-2)] * 0.00135f; if (vert) {
den += dsrc[4*max(0, x-1)] * 0.1573f; if (xi >= as_eff) return;
den += dsrc[4*x] * 0.6827f; float x = xi - xo, y = yi * 2 - yo;
den += dsrc[4*min(astride-1, x+1)] * 0.1573f;
den += dsrc[4*min(astride-1, x+2)] * 0.00135f; #pragma unroll
scratch[astride * y + x] = den; for (int i = 0; i < 9; i++) {
float4 src = tex2D(conv_down_src, x, y);
fmav(lo, src, daub97_lo[i]);
fmav(hi, src, daub97_hi[i]);
y -= 1.0f;
if (y < 0) y += ah_eff;
}
dst[(yi + ah_eff / 2) * astride + xi] = hi;
} else {
if (xi >= as_eff / 2) return;
float x = xi * 2 - xo, y = yi - yo;
#pragma unroll
for (int i = 0; i < 9; i++) {
float4 src = tex2D(conv_down_src, x, y);
fmav(lo, src, daub97_lo[i]);
fmav(hi, src, daub97_hi[i]);
x -= 1.0f;
if (x < 0) x += as_eff;
}
dst[yi * astride + xi + as_eff / 2] = hi;
}
dst[yi * astride + xi] = lo;
}
texture<float4, cudaTextureType2D> conv_up_src_lo;
texture<float4, cudaTextureType2D> conv_up_src_hi;
__global__
void conv_up(float4 *dst, int astride, int as_eff,
int vert, int xo, int yo) {
int xi = blockIdx.x * blockDim.x + threadIdx.x;
int yi = blockIdx.y;
if (xi >= as_eff) return;
float4 out = make_float4(0, 0, 0, 0);
if (vert) {
float x = xi, y = yi / 2;
for (int i = ~yi & 1; i < 9; i+=2) {
fmav(out, tex2D(conv_up_src_lo, x, y), daub97_ilo[8-i]);
fmav(out, tex2D(conv_up_src_hi, x, y), daub97_ihi[8-i]);
y += 1.0f;
if (y >= gridDim.y / 2) y = 0.0f;
}
} else {
float x = xi / 2, y = yi;
for (int i = ~xi & 1; i < 9; i+=2) {
fmav(out, tex2D(conv_up_src_lo, x, y), daub97_ilo[8-i]);
fmav(out, tex2D(conv_up_src_hi, x, y), daub97_ihi[8-i]);
x += 1.0f;
if (x >= as_eff / 2) x = 0.0f;
}
}
xi += xo;
yi += yo;
if (xi >= as_eff) xi -= as_eff;
if (yi >= gridDim.y) yi -= gridDim.y;
dst[yi * astride + xi] = out;
} }
__global__ __global__
void blur_density_v(float4 *pixbuf, const float *scratch, void simple_thresh(float4 *buf, int astride, float thr, int min_x, int min_y) {
int astride, int aheight) {
// TODO: respect supersampling? configurable blur width?
int x = blockIdx.x * blockDim.x + threadIdx.x; int x = blockIdx.x * blockDim.x + threadIdx.x;
if (x > astride) return;
int y = blockIdx.y; int y = blockIdx.y;
const float *dsrc = scratch + x; if (x >= astride || (x < min_x && y < min_y)) return;
float den = 0.0f; float4 val = buf[y * astride + x];
float fact = 1.0f - expf(powf(fabsf(val.w), 1.2) * -0.2f);
val.x *= fact;
val.y *= fact;
val.z *= fact;
val.w *= fact;
buf[y * astride + x] = val;
}
den += dsrc[astride*max(0, y-2)] * 0.00135f; __global__
den += dsrc[astride*max(0, y-1)] * 0.1573f; void fma_buf(float4 *dst, const float4 *sub, int astride, float scale) {
den += dsrc[astride*y] * 0.6827f; int x = blockIdx.x * blockDim.x + threadIdx.x;
den += dsrc[astride*min(astride-1, y+1)] * 0.1573f; int i = blockIdx.y * astride + x;
den += dsrc[astride*min(astride-1, y+2)] * 0.00135f; if (x >= astride) return;
float4 d = dst[i], s = sub[i];
float *ddst = reinterpret_cast<float*>(&pixbuf[astride * y + x]) + 3; d.x += s.x * scale;
*ddst = min(*ddst, den); d.y += s.y * scale;
d.z += s.z * scale;
d.w += s.w * scale;
dst[i] = d;
} }
#define W 21 // Filter width (regardless of standard deviation chosen) #define W 21 // Filter width (regardless of standard deviation chosen)
@ -280,23 +378,136 @@ class Filtering(object):
def __init__(self): def __init__(self):
self.init_mod() self.init_mod()
self.scratch = None
def blur_density(self, ddst, dscratch, dim, stream=None): def blur_density(self, ddst, dscratch, dim, stream=None):
blur_h = self.mod.get_function("blur_density_h") blur_h = self.mod.get_function("blur_density_h")
blur_h(dscratch, ddst, i32(dim.astride), block=(192, 1, 1), blur_h(dscratch, ddst, i32(dim.astride), block=(192, 1, 1),
grid=(dim.astride / 192, dim.ah), stream=stream) grid=(1 + dim.astride / 192, dim.ah), stream=stream)
blur_v = self.mod.get_function("blur_density_v") blur_v = self.mod.get_function("blur_density_v")
blur_v(ddst, dscratch, i32(dim.astride), i32(dim.ah), block=(192, 1, 1), blur_v(ddst, dscratch, i32(dim.astride), i32(dim.ah), block=(192, 1, 1),
grid=(dim.astride / 192, dim.ah), stream=stream) grid=(1 + dim.astride / 192, dim.ah), stream=stream)
def de(self, ddst, dsrc, gnm, dim, tc, stream=None): def de(self, ddst, dsrc, gnm, dim, tc, stream=None):
from cuburn.render import argset
print dim.ah
np.set_printoptions(linewidth=160, precision=4)
k1 = f32(gnm.color.brightness(tc) * 268 / 256) k1 = f32(gnm.color.brightness(tc) * 268 / 256)
# Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now # Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
# s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w)) # s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w) area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w)
k2 = f32(1.0 / (area * gnm.spp(tc))) k2 = f32(1.0 / (area * gnm.spp(tc)))
if gnm.de.radius == 0: # Stride in bytes, buffer size
sb = 16 * dim.astride
bs = sb * dim.ah
if self.scratch is None:
self.scratch = cuda.mem_alloc(bs)
self.hi = cuda.mem_alloc(bs)
q = np.zeros((dim.ah, dim.astride * 4), dtype=np.float32)
q[100:102,128:1024] = 1
#cuda.memcpy_htod(dsrc, q)
dsc = argset(cuda.ArrayDescriptor(), height=dim.ah,
width=dim.astride, format=cuda.array_format.FLOAT,
num_channels=4)
bl = (192, 1, 1)
gr = lambda x, y: (1 + dim.astride / (192 << x), dim.ah >> y)
def get_tref(name):
tref = self.mod.get_texref(name)
tref.set_filter_mode(cuda.filter_mode.POINT)
tref.set_address_mode(0, cuda.address_mode.WRAP)
tref.set_address_mode(1, cuda.address_mode.WRAP)
print tref.get_address_mode(0)
print tref.get_address_mode(1)
return tref
conv_down_src, conv_up_src_lo, conv_up_src_hi = map(get_tref,
['conv_down_src', 'conv_up_src_lo', 'conv_up_src_hi'])
conv_down = self.mod.get_function('conv_down')
conv_up = self.mod.get_function('conv_up')
memcpy = cuda.Memcpy2D()
memcpy.src_pitch = sb
memcpy.src_height = memcpy.dst_height = memcpy.height = dim.ah
memcpy.set_src_device(self.scratch)
memcpy.set_dst_device(self.hi)
STEPS=3
print dim
def th(x):
x = np.int64(x*1e6)
v = np.nonzero(x)[0]
print np.array((v, x[v]))
for xo, yo in [(0, 0), (1, 1), (3, 3), (1, 3), (3, 1), (2, 2), (0, 0)]:
for i in range(STEPS):
xon, yon = (xo, yo) if i == 0 else (0, 0)
as_eff, ah_eff = dim.astride >> i, dim.ah >> i
dsc.width, dsc.height = as_eff, ah_eff
conv_down_src.set_address_2d(dsrc, dsc, sb)
conv_down(self.scratch, i32(dim.astride),
i32(as_eff), i32(ah_eff), i32(1), f32(xon), f32(yon),
block=bl, grid=gr(i, i+1),
texrefs=[conv_down_src], stream=stream)
#cuda.memcpy_dtod_async(self.scratch, dsrc, bs, stream)
conv_down_src.set_address_2d(self.scratch, dsc, sb)
conv_down(dsrc, i32(dim.astride),
i32(as_eff), i32(ah_eff), i32(0), f32(xon), f32(yon),
block=bl, grid=gr(i+1, i),
texrefs=[conv_down_src], stream=stream)
#cuda.memcpy_dtod_async(dsrc, self.scratch, bs, stream)
#th(cuda.from_device_like(self.scratch, q).T[128])
cuda.memcpy_dtod_async(ddst, dsrc, bs, stream)
thresh = self.mod.get_function('simple_thresh')
thresh(dsrc, i32(dim.astride), f32(1),
i32(dim.astride >> STEPS), i32(dim.ah >> STEPS),
block=bl, grid=gr(0, 0), stream=stream)
for i in list(reversed(range(STEPS))):
xon, yon = (xo, yo) if i == 0 else (0, 0)
dsc.width, dsc.height = dim.astride >> i, dim.ah >> (i+1)
conv_up_src_lo.set_address_2d(dsrc, dsc, sb)
hi_addr = int(dsrc) + sb * (dim.ah >> (i+1))
conv_up_src_hi.set_address_2d(hi_addr, dsc, sb)
conv_up(self.scratch, i32(dim.astride), i32(dim.astride >> i),
i32(1), i32(xon), i32(yon),
block=bl, grid=gr(i, i), stream=stream,
texrefs=[conv_up_src_lo, conv_up_src_hi])
sb_2 = sb >> (i+1)
memcpy.dst_pitch = sb_2
memcpy.width_in_bytes = sb_2
memcpy.src_x_in_bytes = sb_2
memcpy(stream)
dsc.width, dsc.height = dim.astride >> (i+1), dim.ah >> i
conv_up_src_lo.set_address_2d(self.scratch, dsc, sb)
conv_up_src_hi.set_address_2d(self.hi, dsc, sb>>(i+1))
conv_up(dsrc, i32(dim.astride), i32(dim.astride >> i),
i32(0), i32(xon), i32(yon),
block=bl, grid=gr(i, i), stream=stream,
texrefs=[conv_up_src_lo, conv_up_src_hi])
#cuda.memcpy_dtod_async(dsrc, self.scratch, bs, stream)
#th(cuda.from_device_like(self.scratch, q).T[128])
cuda.memcpy_dtod_async(ddst, dsrc, bs, stream)
#y = cuda.from_device_like(self.scratch, q)
#print y[93:110,128].T
#print x[93:110,128].T + y[93:110,128].T
#fun = self.mod.get_function('fma_buf')
#fun(self.scratch, self.lo, i32(dim.astride), f32(1),
#block=bl, grid=gr, stream=stream)
#cuda.memcpy_dtod_async(ddst, self.scratch, bs, stream=stream)
#return
if gnm.de.radius(tc) == 0 or True:
nbins = dim.ah * dim.astride nbins = dim.ah * dim.astride
fun = self.mod.get_function("logscale") fun = self.mod.get_function("logscale")
t = fun(dsrc, ddst, k1, k2, t = fun(dsrc, ddst, k1, k2,