mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 03:30:05 -05:00
Experiments in cycle spinning
This commit is contained in:
parent
964b11efdf
commit
7f6332cc73
@ -114,6 +114,32 @@ __device__ void fmav(float4 &dst, float4 src, float scale) {
|
|||||||
dst.w += src.w * scale;
|
dst.w += src.w * scale;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
// The 7-tap filters are missing the leading zero to compensate for delay. I
|
||||||
|
// have no frigging clue what the theory behind that is, but it seems to work.
|
||||||
|
__constant__ float daub97_lo[9] = {
|
||||||
|
0.03782845550726404f, -0.023849465019556843f, -0.11062440441843718f,
|
||||||
|
0.37740285561283066f, 0.8526986790088938f, 0.37740285561283066f,
|
||||||
|
-0.11062440441843718f, -0.023849465019556843f, 0.03782845550726404f
|
||||||
|
};
|
||||||
|
__constant__ float daub97_hi[9] = {
|
||||||
|
-0.06453888262869706f, 0.04068941760916406f,
|
||||||
|
0.41809227322161724f, -0.7884856164055829f, 0.41809227322161724f,
|
||||||
|
0.04068941760916406f, -0.06453888262869706f, 0.0f,
|
||||||
|
};
|
||||||
|
__constant__ float daub97_ilo[9] = {
|
||||||
|
-0.06453888262869706f, -0.04068941760916406f,
|
||||||
|
0.41809227322161724f, 0.7884856164055829f, 0.41809227322161724f,
|
||||||
|
-0.04068941760916406f, -0.06453888262869706f, 0.0f
|
||||||
|
};
|
||||||
|
__constant__ float daub97_ihi[9] = {
|
||||||
|
-0.03782845550726404f, -0.023849465019556843f, 0.11062440441843718f,
|
||||||
|
0.37740285561283066f, -0.8526986790088938f, 0.37740285561283066f,
|
||||||
|
0.11062440441843718f, -0.023849465019556843f, -0.03782845550726404f
|
||||||
|
};
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
// The 7-tap filters are missing the leading zero to compensate for delay. I
|
// The 7-tap filters are missing the leading zero to compensate for delay. I
|
||||||
// have no frigging clue what the theory behind that is, but it seems to work.
|
// have no frigging clue what the theory behind that is, but it seems to work.
|
||||||
__constant__ float daub97_lo[9] = {
|
__constant__ float daub97_lo[9] = {
|
||||||
@ -137,10 +163,13 @@ __constant__ float daub97_ihi[9] = {
|
|||||||
0.11062440441843718f, -0.023849465019556843f, -0.03782845550726404f
|
0.11062440441843718f, -0.023849465019556843f, -0.03782845550726404f
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
#define S 0.7071067811f
|
#define S 0.7071067811f
|
||||||
__constant__ float daub97[4][9] = {
|
__constant__ float daub97_lo[9] = { S, S };
|
||||||
{ 0, 0, 0, S, S}, {0, 0, 0, -S, S}, {0, 0, 0, S, S}, {0, 0, 0, S, -S}};
|
__constant__ float daub97_hi[9] = { -S, S };
|
||||||
|
__constant__ float daub97_ilo[9] = { 0, 0, 0, 0, 0, 0, S, S };
|
||||||
|
__constant__ float daub97_ihi[9] = { 0, 0, 0, 0, 0, 0, S, -S };
|
||||||
*/
|
*/
|
||||||
|
|
||||||
texture<float4, cudaTextureType2D> conv_down_src;
|
texture<float4, cudaTextureType2D> conv_down_src;
|
||||||
@ -155,7 +184,7 @@ void conv_down(float4 *dst, int astride, int as_eff, int ah_eff,
|
|||||||
|
|
||||||
if (vert) {
|
if (vert) {
|
||||||
if (xi >= as_eff) return;
|
if (xi >= as_eff) return;
|
||||||
float x = xi - xo, y = yi * 2 - yo;
|
float x = xi, y = yi * 2 - yo;
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i = 0; i < 9; i++) {
|
for (int i = 0; i < 9; i++) {
|
||||||
@ -168,7 +197,7 @@ void conv_down(float4 *dst, int astride, int as_eff, int ah_eff,
|
|||||||
dst[(yi + ah_eff / 2) * astride + xi] = hi;
|
dst[(yi + ah_eff / 2) * astride + xi] = hi;
|
||||||
} else {
|
} else {
|
||||||
if (xi >= as_eff / 2) return;
|
if (xi >= as_eff / 2) return;
|
||||||
float x = xi * 2 - xo, y = yi - yo;
|
float x = xi * 2 - xo, y = yi;
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i = 0; i < 9; i++) {
|
for (int i = 0; i < 9; i++) {
|
||||||
@ -201,6 +230,7 @@ void conv_up(float4 *dst, int astride, int as_eff,
|
|||||||
y += 1.0f;
|
y += 1.0f;
|
||||||
if (y >= gridDim.y / 2) y = 0.0f;
|
if (y >= gridDim.y / 2) y = 0.0f;
|
||||||
}
|
}
|
||||||
|
yi -= yo;
|
||||||
} else {
|
} else {
|
||||||
float x = xi / 2, y = yi;
|
float x = xi / 2, y = yi;
|
||||||
for (int i = ~xi & 1; i < 9; i+=2) {
|
for (int i = ~xi & 1; i < 9; i+=2) {
|
||||||
@ -209,11 +239,10 @@ void conv_up(float4 *dst, int astride, int as_eff,
|
|||||||
x += 1.0f;
|
x += 1.0f;
|
||||||
if (x >= as_eff / 2) x = 0.0f;
|
if (x >= as_eff / 2) x = 0.0f;
|
||||||
}
|
}
|
||||||
|
xi -= xo;
|
||||||
}
|
}
|
||||||
xi += xo;
|
if (xi < 0) xi += as_eff;
|
||||||
yi += yo;
|
if (yi < 0) yi += gridDim.y;
|
||||||
if (xi >= as_eff) xi -= as_eff;
|
|
||||||
if (yi >= gridDim.y) yi -= gridDim.y;
|
|
||||||
dst[yi * astride + xi] = out;
|
dst[yi * astride + xi] = out;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -224,7 +253,7 @@ void simple_thresh(float4 *buf, int astride, float thr, int min_x, int min_y) {
|
|||||||
if (x >= astride || (x < min_x && y < min_y)) return;
|
if (x >= astride || (x < min_x && y < min_y)) return;
|
||||||
|
|
||||||
float4 val = buf[y * astride + x];
|
float4 val = buf[y * astride + x];
|
||||||
float fact = 1.0f - expf(powf(fabsf(val.w), 1.2) * -0.2f);
|
float fact = expf(val.w * val.w * thr) < 0.2f ? 1.0f : 0.0f;
|
||||||
val.x *= fact;
|
val.x *= fact;
|
||||||
val.y *= fact;
|
val.y *= fact;
|
||||||
val.z *= fact;
|
val.z *= fact;
|
||||||
@ -232,6 +261,20 @@ void simple_thresh(float4 *buf, int astride, float thr, int min_x, int min_y) {
|
|||||||
buf[y * astride + x] = val;
|
buf[y * astride + x] = val;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__global__
|
||||||
|
void buf_abs(float4 *buf, int astride) {
|
||||||
|
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
int y = blockIdx.y;
|
||||||
|
if (x >= astride) return;
|
||||||
|
|
||||||
|
float4 val = buf[y * astride + x];
|
||||||
|
val.x = fabsf(val.x);
|
||||||
|
val.y = fabsf(val.y);
|
||||||
|
val.z = fabsf(val.z);
|
||||||
|
val.w = fabsf(val.w);
|
||||||
|
buf[y * astride + x] = val;
|
||||||
|
}
|
||||||
|
|
||||||
__global__
|
__global__
|
||||||
void fma_buf(float4 *dst, const float4 *sub, int astride, float scale) {
|
void fma_buf(float4 *dst, const float4 *sub, int astride, float scale) {
|
||||||
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
@ -390,7 +433,6 @@ class Filtering(object):
|
|||||||
|
|
||||||
def de(self, ddst, dsrc, gnm, dim, tc, stream=None):
|
def de(self, ddst, dsrc, gnm, dim, tc, stream=None):
|
||||||
from cuburn.render import argset
|
from cuburn.render import argset
|
||||||
print dim.ah
|
|
||||||
np.set_printoptions(linewidth=160, precision=4)
|
np.set_printoptions(linewidth=160, precision=4)
|
||||||
|
|
||||||
k1 = f32(gnm.color.brightness(tc) * 268 / 256)
|
k1 = f32(gnm.color.brightness(tc) * 268 / 256)
|
||||||
@ -406,6 +448,7 @@ class Filtering(object):
|
|||||||
if self.scratch is None:
|
if self.scratch is None:
|
||||||
self.scratch = cuda.mem_alloc(bs)
|
self.scratch = cuda.mem_alloc(bs)
|
||||||
self.hi = cuda.mem_alloc(bs)
|
self.hi = cuda.mem_alloc(bs)
|
||||||
|
self.aux = cuda.mem_alloc(bs)
|
||||||
q = np.zeros((dim.ah, dim.astride * 4), dtype=np.float32)
|
q = np.zeros((dim.ah, dim.astride * 4), dtype=np.float32)
|
||||||
q[100:102,128:1024] = 1
|
q[100:102,128:1024] = 1
|
||||||
#cuda.memcpy_htod(dsrc, q)
|
#cuda.memcpy_htod(dsrc, q)
|
||||||
@ -421,14 +464,15 @@ class Filtering(object):
|
|||||||
tref.set_filter_mode(cuda.filter_mode.POINT)
|
tref.set_filter_mode(cuda.filter_mode.POINT)
|
||||||
tref.set_address_mode(0, cuda.address_mode.WRAP)
|
tref.set_address_mode(0, cuda.address_mode.WRAP)
|
||||||
tref.set_address_mode(1, cuda.address_mode.WRAP)
|
tref.set_address_mode(1, cuda.address_mode.WRAP)
|
||||||
print tref.get_address_mode(0)
|
|
||||||
print tref.get_address_mode(1)
|
|
||||||
return tref
|
return tref
|
||||||
conv_down_src, conv_up_src_lo, conv_up_src_hi = map(get_tref,
|
conv_down_src, conv_up_src_lo, conv_up_src_hi = map(get_tref,
|
||||||
['conv_down_src', 'conv_up_src_lo', 'conv_up_src_hi'])
|
['conv_down_src', 'conv_up_src_lo', 'conv_up_src_hi'])
|
||||||
|
|
||||||
conv_down = self.mod.get_function('conv_down')
|
conv_down = self.mod.get_function('conv_down')
|
||||||
conv_up = self.mod.get_function('conv_up')
|
conv_up = self.mod.get_function('conv_up')
|
||||||
|
fma_buf = self.mod.get_function('fma_buf')
|
||||||
|
thresh = self.mod.get_function('simple_thresh')
|
||||||
|
buf_abs = self.mod.get_function('buf_abs')
|
||||||
|
|
||||||
memcpy = cuda.Memcpy2D()
|
memcpy = cuda.Memcpy2D()
|
||||||
memcpy.src_pitch = sb
|
memcpy.src_pitch = sb
|
||||||
@ -436,20 +480,34 @@ class Filtering(object):
|
|||||||
memcpy.set_src_device(self.scratch)
|
memcpy.set_src_device(self.scratch)
|
||||||
memcpy.set_dst_device(self.hi)
|
memcpy.set_dst_device(self.hi)
|
||||||
|
|
||||||
STEPS=3
|
STEPS=4
|
||||||
|
SHIFTS = [(0, 0), (0, 1), (1, 1), (1, 0),
|
||||||
|
(3, 0), (5, 0), (7, 0), (15, 0),
|
||||||
|
(0, 3), (0, 5), (0, 7), (0, 15),
|
||||||
|
(3, 3), (5, 5), (7, 7), (15, 15)]
|
||||||
|
|
||||||
|
#SHIFTS = [(0, 0)]
|
||||||
|
#SHIFTS = [(0, 0), (3, 0)]
|
||||||
|
|
||||||
print dim
|
|
||||||
def th(x):
|
def th(x):
|
||||||
x = np.int64(x*1e6)
|
x = np.int64(x*1e6)
|
||||||
v = np.nonzero(x)[0]
|
v = np.nonzero(x)[0]
|
||||||
print np.array((v, x[v]))
|
print np.array((v, x[v]))
|
||||||
|
|
||||||
for xo, yo in [(0, 0), (1, 1), (3, 3), (1, 3), (3, 1), (2, 2), (0, 0)]:
|
stream.synchronize()
|
||||||
|
cuda.memset_d32(ddst, int(0), bs / 4)
|
||||||
|
|
||||||
|
cuda.memcpy_dtod_async(self.aux, dsrc, bs, stream)
|
||||||
|
|
||||||
|
for xo, yo in SHIFTS:
|
||||||
for i in range(STEPS):
|
for i in range(STEPS):
|
||||||
xon, yon = (xo, yo) if i == 0 else (0, 0)
|
xon, yon = (xo, yo) if i == 0 else (0, 0)
|
||||||
as_eff, ah_eff = dim.astride >> i, dim.ah >> i
|
as_eff, ah_eff = dim.astride >> i, dim.ah >> i
|
||||||
dsc.width, dsc.height = as_eff, ah_eff
|
dsc.width, dsc.height = as_eff, ah_eff
|
||||||
conv_down_src.set_address_2d(dsrc, dsc, sb)
|
if i == 0:
|
||||||
|
conv_down_src.set_address_2d(self.aux, dsc, sb)
|
||||||
|
else:
|
||||||
|
conv_down_src.set_address_2d(dsrc, dsc, sb)
|
||||||
conv_down(self.scratch, i32(dim.astride),
|
conv_down(self.scratch, i32(dim.astride),
|
||||||
i32(as_eff), i32(ah_eff), i32(1), f32(xon), f32(yon),
|
i32(as_eff), i32(ah_eff), i32(1), f32(xon), f32(yon),
|
||||||
block=bl, grid=gr(i, i+1),
|
block=bl, grid=gr(i, i+1),
|
||||||
@ -462,14 +520,15 @@ class Filtering(object):
|
|||||||
texrefs=[conv_down_src], stream=stream)
|
texrefs=[conv_down_src], stream=stream)
|
||||||
#cuda.memcpy_dtod_async(dsrc, self.scratch, bs, stream)
|
#cuda.memcpy_dtod_async(dsrc, self.scratch, bs, stream)
|
||||||
#th(cuda.from_device_like(self.scratch, q).T[128])
|
#th(cuda.from_device_like(self.scratch, q).T[128])
|
||||||
cuda.memcpy_dtod_async(ddst, dsrc, bs, stream)
|
|
||||||
|
|
||||||
thresh = self.mod.get_function('simple_thresh')
|
for i, t in enumerate([-0.05, -0.1, -0.3, -0.5]):
|
||||||
thresh(dsrc, i32(dim.astride), f32(1),
|
thresh(dsrc, i32(dim.astride), f32(t),
|
||||||
i32(dim.astride >> STEPS), i32(dim.ah >> STEPS),
|
i32(dim.astride >> (i+1)), i32(dim.ah >> (i+1)),
|
||||||
block=bl, grid=gr(0, 0), stream=stream)
|
block=bl, grid=gr(i, i), stream=stream)
|
||||||
|
#buf_abs(dsrc, i32(dim.astride),
|
||||||
|
#block=bl, grid=gr(0, 0), stream=stream)
|
||||||
|
|
||||||
for i in list(reversed(range(STEPS))):
|
for i in reversed(range(STEPS)):
|
||||||
xon, yon = (xo, yo) if i == 0 else (0, 0)
|
xon, yon = (xo, yo) if i == 0 else (0, 0)
|
||||||
dsc.width, dsc.height = dim.astride >> i, dim.ah >> (i+1)
|
dsc.width, dsc.height = dim.astride >> i, dim.ah >> (i+1)
|
||||||
conv_up_src_lo.set_address_2d(dsrc, dsc, sb)
|
conv_up_src_lo.set_address_2d(dsrc, dsc, sb)
|
||||||
@ -494,7 +553,10 @@ class Filtering(object):
|
|||||||
#cuda.memcpy_dtod_async(dsrc, self.scratch, bs, stream)
|
#cuda.memcpy_dtod_async(dsrc, self.scratch, bs, stream)
|
||||||
|
|
||||||
#th(cuda.from_device_like(self.scratch, q).T[128])
|
#th(cuda.from_device_like(self.scratch, q).T[128])
|
||||||
cuda.memcpy_dtod_async(ddst, dsrc, bs, stream)
|
fma_buf(ddst, dsrc, i32(dim.astride), f32(1.0 / len(SHIFTS)),
|
||||||
|
block=bl, grid=gr(0, 0), stream=stream)
|
||||||
|
|
||||||
|
cuda.memcpy_dtod_async(dsrc, ddst, bs, stream)
|
||||||
|
|
||||||
#y = cuda.from_device_like(self.scratch, q)
|
#y = cuda.from_device_like(self.scratch, q)
|
||||||
#print y[93:110,128].T
|
#print y[93:110,128].T
|
||||||
|
Loading…
Reference in New Issue
Block a user