mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 11:40:04 -05:00
Experimental bilateral filtering.
This commit is contained in:
parent
0aa0106f51
commit
45b75d3fa5
@ -2,6 +2,7 @@
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from numpy import float32 as f32, int32 as i32
|
from numpy import float32 as f32, int32 as i32
|
||||||
|
|
||||||
|
import pycuda.driver as cuda
|
||||||
import pycuda.compiler
|
import pycuda.compiler
|
||||||
from pycuda.gpuarray import vec
|
from pycuda.gpuarray import vec
|
||||||
|
|
||||||
@ -266,6 +267,57 @@ void density_est(float4 *outbuf, const float4 *pixbuf,
|
|||||||
__syncthreads();
|
__syncthreads();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__global__
|
||||||
|
void fma_buf(float4 *dst, const float4 *sub, int astride, float scale) {
|
||||||
|
int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||||
|
int i = y * astride + x;
|
||||||
|
float4 d = dst[i], s = sub[i];
|
||||||
|
d.x += s.x * scale;
|
||||||
|
d.y += s.y * scale;
|
||||||
|
d.z += s.z * scale;
|
||||||
|
d.w += s.w * scale;
|
||||||
|
dst[i] = d;
|
||||||
|
}
|
||||||
|
|
||||||
|
texture<float4, cudaTextureType2D> bilateral_src;
|
||||||
|
|
||||||
|
__global__
|
||||||
|
void bilateral(float4 *dst, int astride, float dstd, float rstd) {
|
||||||
|
int xi = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
int yi = blockIdx.y * blockDim.y + threadIdx.y;
|
||||||
|
float x = xi, y = yi;
|
||||||
|
|
||||||
|
const float r = 9.0f;
|
||||||
|
|
||||||
|
float cen_den = tex2D(bilateral_src, x, y).w;
|
||||||
|
float4 out = make_float4(0, 0, 0, 0);
|
||||||
|
float weightsum = 0.0f;
|
||||||
|
|
||||||
|
for (float i = -r; i <= r; i++) {
|
||||||
|
for (float j = -r; j <= r; j++) {
|
||||||
|
float4 pix = tex2D(bilateral_src, x + i, y + j);
|
||||||
|
float den_diff = log2f(fabsf(pix.w - cen_den) + 1.0f);
|
||||||
|
float factor = expf(i * i * dstd)
|
||||||
|
* expf(j * j * dstd)
|
||||||
|
* expf(den_diff * den_diff * rstd);
|
||||||
|
out.x += factor * pix.x;
|
||||||
|
out.y += factor * pix.y;
|
||||||
|
out.z += factor * pix.z;
|
||||||
|
out.w += factor * pix.w;
|
||||||
|
weightsum += factor;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
float weightrcp = 1.0f / weightsum;
|
||||||
|
out.x *= weightrcp;
|
||||||
|
out.y *= weightrcp;
|
||||||
|
out.z *= weightrcp;
|
||||||
|
out.w *= weightrcp;
|
||||||
|
|
||||||
|
dst[yi * astride + xi] = out;
|
||||||
|
}
|
||||||
|
|
||||||
'''
|
'''
|
||||||
|
|
||||||
class Filtering(object):
|
class Filtering(object):
|
||||||
@ -289,17 +341,56 @@ class Filtering(object):
|
|||||||
blur_v(ddst, dscratch, i32(dim.astride), i32(dim.ah), block=(192, 1, 1),
|
blur_v(ddst, dscratch, i32(dim.astride), i32(dim.ah), block=(192, 1, 1),
|
||||||
grid=(dim.astride / 192, dim.ah), stream=stream)
|
grid=(dim.astride / 192, dim.ah), stream=stream)
|
||||||
|
|
||||||
def de(self, ddst, dsrc, gnm, dim, tc, stream=None):
|
def de(self, ddst, dsrc, gnm, dim, tc, nxf, stream=None):
|
||||||
k1 = f32(gnm.color.brightness(tc) * 268 / 256)
|
k1 = f32(gnm.color.brightness(tc) * 268 / 256)
|
||||||
# Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
|
# Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
|
||||||
# s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
|
# s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
|
||||||
area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w)
|
area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w)
|
||||||
k2 = f32(1.0 / (area * gnm.spp(tc)))
|
k2 = f32(1.0 / (area * gnm.spp(tc)))
|
||||||
|
|
||||||
if gnm.de.radius == 0:
|
if gnm.de.radius(tc) == 0 or True:
|
||||||
|
# Stride in bytes, and buffer size
|
||||||
|
sb = 16 * dim.astride
|
||||||
|
bs = sb * dim.ah
|
||||||
|
|
||||||
|
|
||||||
|
dsc = argset(cuda.ArrayDescriptor(), height=dim.ah,
|
||||||
|
width=dim.astride, format=cuda.array_format.FLOAT,
|
||||||
|
num_channels=4)
|
||||||
|
tref = self.mod.get_texref('bilateral_src')
|
||||||
|
tref.set_filter_mode(cuda.filter_mode.POINT)
|
||||||
|
tref.set_address_mode(0, cuda.address_mode.WRAP)
|
||||||
|
tref.set_address_mode(1, cuda.address_mode.WRAP)
|
||||||
|
|
||||||
|
bilateral = self.mod.get_function('bilateral')
|
||||||
|
fma_buf = self.mod.get_function('fma_buf')
|
||||||
|
for i in range(nxf):
|
||||||
|
tref.set_address_2d(int(dsrc) + bs * i, dsc, sb)
|
||||||
|
bilateral(ddst, i32(dim.astride), f32(-0.1), f32(-0.1),
|
||||||
|
block=(32, 8, 1), grid=(dim.astride / 32, dim.ah / 8),
|
||||||
|
texrefs=[tref], stream=stream)
|
||||||
|
if i == 0:
|
||||||
|
cuda.memcpy_dtod_async(dsrc, ddst, bs, stream)
|
||||||
|
else:
|
||||||
|
fma_buf(dsrc, ddst, i32(dim.astride), f32(1.0),
|
||||||
|
block=(32, 8, 1), grid=(dim.astride / 32, dim.ah / 8),
|
||||||
|
stream=stream)
|
||||||
|
|
||||||
|
tref.set_address_2d(dsrc, dsc, sb)
|
||||||
|
ROUNDS = 3
|
||||||
|
|
||||||
|
a, b = dsrc, ddst
|
||||||
|
for i in range(ROUNDS):
|
||||||
|
bilateral(b, i32(dim.astride), f32(-0.1), f32(-0.2),
|
||||||
|
block=(32, 8, 1), grid=(dim.astride / 32, dim.ah / 8),
|
||||||
|
texrefs=[tref], stream=stream)
|
||||||
|
a, b = b, a
|
||||||
|
|
||||||
nbins = dim.ah * dim.astride
|
nbins = dim.ah * dim.astride
|
||||||
fun = self.mod.get_function("logscale")
|
fun = self.mod.get_function("logscale")
|
||||||
t = fun(dsrc, ddst, k1, k2,
|
# This function only looks at one pixel, so using the same
|
||||||
|
# parameter for input and output is OK (if e.g. ROUNDS is odd)
|
||||||
|
t = fun(a, ddst, k1, k2,
|
||||||
block=(512, 1, 1), grid=(nbins/512, 1), stream=stream)
|
block=(512, 1, 1), grid=(nbins/512, 1), stream=stream)
|
||||||
else:
|
else:
|
||||||
scale_coeff = f32(1.0 / gnm.de.radius(tc))
|
scale_coeff = f32(1.0 / gnm.de.radius(tc))
|
||||||
|
@ -240,6 +240,7 @@ void iter(
|
|||||||
if (threadIdx.y == 0 && threadIdx.x < {{NWARPS*2}})
|
if (threadIdx.y == 0 && threadIdx.x < {{NWARPS*2}})
|
||||||
cosel[threadIdx.x] = mwc_next_01(rctx);
|
cosel[threadIdx.x] = mwc_next_01(rctx);
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
int last_xf_used = 0;
|
||||||
{{endif}}
|
{{endif}}
|
||||||
|
|
||||||
bool fuse = false;
|
bool fuse = false;
|
||||||
@ -285,12 +286,16 @@ void iter(
|
|||||||
{{precalc_densities(pcp, std_xforms)}}
|
{{precalc_densities(pcp, std_xforms)}}
|
||||||
float xfsel = cosel[threadIdx.y];
|
float xfsel = cosel[threadIdx.y];
|
||||||
|
|
||||||
{{for xform_name in std_xforms[:-1]}}
|
{{for xform_idx, xform_name in enumerate(std_xforms[:-1])}}
|
||||||
if (xfsel <= {{pcp['den_'+xform_name]}}) {
|
if (xfsel <= {{pcp['den_'+xform_name]}}) {
|
||||||
apply_xf_{{xform_name}}(x, y, color, rctx);
|
apply_xf_{{xform_name}}(x, y, color, rctx);
|
||||||
|
last_xf_used = {{xform_idx}};
|
||||||
} else
|
} else
|
||||||
{{endfor}}
|
{{endfor}}
|
||||||
|
{
|
||||||
apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);
|
apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);
|
||||||
|
last_xf_used = {{len(std_xforms)-1}};
|
||||||
|
}
|
||||||
|
|
||||||
// Rotate points between threads.
|
// Rotate points between threads.
|
||||||
int swr = threadIdx.y + threadIdx.x
|
int swr = threadIdx.y + threadIdx.x
|
||||||
@ -298,7 +303,7 @@ void iter(
|
|||||||
int sw = (swr * 32 + threadIdx.x) & {{NTHREADS-1}};
|
int sw = (swr * 32 + threadIdx.x) & {{NTHREADS-1}};
|
||||||
int sr = threadIdx.y * 32 + threadIdx.x;
|
int sr = threadIdx.y * 32 + threadIdx.x;
|
||||||
|
|
||||||
swap[sw] = fuse ? 1.0f : 0.0f;
|
swap[sw] = fuse ? -1.0f : last_xf_used;
|
||||||
swap[sw+{{NTHREADS}}] = x;
|
swap[sw+{{NTHREADS}}] = x;
|
||||||
swap[sw+{{2*NTHREADS}}] = y;
|
swap[sw+{{2*NTHREADS}}] = y;
|
||||||
swap[sw+{{3*NTHREADS}}] = color;
|
swap[sw+{{3*NTHREADS}}] = color;
|
||||||
@ -308,7 +313,8 @@ void iter(
|
|||||||
if (threadIdx.y == 0 && threadIdx.x < {{NWARPS*2}})
|
if (threadIdx.y == 0 && threadIdx.x < {{NWARPS*2}})
|
||||||
cosel[threadIdx.x] = mwc_next_01(rctx);
|
cosel[threadIdx.x] = mwc_next_01(rctx);
|
||||||
|
|
||||||
fuse = swap[sr];
|
last_xf_used = swap[sr];
|
||||||
|
fuse = (last_xf_used < 0);
|
||||||
x = swap[sr+{{NTHREADS}}];
|
x = swap[sr+{{NTHREADS}}];
|
||||||
y = swap[sr+{{2*NTHREADS}}];
|
y = swap[sr+{{2*NTHREADS}}];
|
||||||
color = swap[sr+{{3*NTHREADS}}];
|
color = swap[sr+{{3*NTHREADS}}];
|
||||||
@ -352,7 +358,8 @@ void iter(
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t i = iy * acc_size.astride + ix;
|
uint32_t i = (last_xf_used * acc_size.aheight + iy)
|
||||||
|
* acc_size.astride + ix;
|
||||||
{{if info.acc_mode == 'atomic'}}
|
{{if info.acc_mode == 'atomic'}}
|
||||||
asm volatile ({{crep("""
|
asm volatile ({{crep("""
|
||||||
{
|
{
|
||||||
|
@ -185,11 +185,11 @@ class Renderer(object):
|
|||||||
cuda.memcpy_htod_async(d_acc_size, u32(list(dim)), write_stream)
|
cuda.memcpy_htod_async(d_acc_size, u32(list(dim)), write_stream)
|
||||||
|
|
||||||
nbins = astride * aheight
|
nbins = astride * aheight
|
||||||
# Extra padding in accum helps with write_shmem overruns
|
nxf = len(filter(lambda g: g != 'final', genome.xforms))
|
||||||
d_accum = cuda.mem_alloc(16 * nbins + (1<<16))
|
d_accum = cuda.mem_alloc(16 * nbins * nxf)
|
||||||
d_out = cuda.mem_alloc(16 * aheight * astride)
|
d_out = cuda.mem_alloc(16 * nbins)
|
||||||
if self.acc_mode == 'atomic':
|
if self.acc_mode == 'atomic':
|
||||||
d_atom = cuda.mem_alloc(8 * nbins)
|
d_atom = cuda.mem_alloc(8 * nbins * nxf)
|
||||||
flush_fun = self.mod.get_function("flush_atom")
|
flush_fun = self.mod.get_function("flush_atom")
|
||||||
|
|
||||||
obuf_copy = util.argset(cuda.Memcpy2D(),
|
obuf_copy = util.argset(cuda.Memcpy2D(),
|
||||||
@ -310,9 +310,11 @@ class Renderer(object):
|
|||||||
#for i, n in zip(d_temp[5], self._iter.packer.packed):
|
#for i, n in zip(d_temp[5], self._iter.packer.packed):
|
||||||
#print '%60s %g' % ('_'.join(n), i)
|
#print '%60s %g' % ('_'.join(n), i)
|
||||||
|
|
||||||
util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, write_stream)
|
util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins * nxf,
|
||||||
|
write_stream)
|
||||||
if self.acc_mode == 'atomic':
|
if self.acc_mode == 'atomic':
|
||||||
util.BaseCode.fill_dptr(self.mod, d_atom, 2 * nbins, write_stream)
|
util.BaseCode.fill_dptr(self.mod, d_atom, 2 * nbins * nxf,
|
||||||
|
write_stream)
|
||||||
nrounds = int( (genome.spp(tc) * width * height)
|
nrounds = int( (genome.spp(tc) * width * height)
|
||||||
/ (ntemporal_samples * 256 * 256) ) + 1
|
/ (ntemporal_samples * 256 * 256) ) + 1
|
||||||
if self.acc_mode == 'deferred':
|
if self.acc_mode == 'deferred':
|
||||||
@ -334,14 +336,14 @@ class Renderer(object):
|
|||||||
iter_fun(*args, block=(32, self._iter.NTHREADS/32, 1),
|
iter_fun(*args, block=(32, self._iter.NTHREADS/32, 1),
|
||||||
grid=(ntemporal_samples, nrounds), stream=iter_stream)
|
grid=(ntemporal_samples, nrounds), stream=iter_stream)
|
||||||
if self.acc_mode == 'atomic':
|
if self.acc_mode == 'atomic':
|
||||||
nblocks = int(np.ceil(np.sqrt(nbins/float(512))))
|
nblocks = int(np.ceil(np.sqrt(nbins*nxf/float(512))))
|
||||||
flush_fun(u64(d_accum), u64(d_atom), i32(nbins),
|
flush_fun(u64(d_accum), u64(d_atom), i32(nbins*nxf),
|
||||||
block=(512, 1, 1), grid=(nblocks, nblocks),
|
block=(512, 1, 1), grid=(nblocks, nblocks),
|
||||||
stream=iter_stream)
|
stream=iter_stream)
|
||||||
|
|
||||||
util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
|
util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
|
||||||
_sync_stream(filt_stream, write_stream)
|
_sync_stream(filt_stream, write_stream)
|
||||||
filt.de(d_out, d_accum, genome, dim, tc, stream=filt_stream)
|
filt.de(d_out, d_accum, genome, dim, tc, nxf, stream=filt_stream)
|
||||||
_sync_stream(write_stream, filt_stream)
|
_sync_stream(write_stream, filt_stream)
|
||||||
filt.colorclip(d_out, genome, dim, tc, blend, stream=filt_stream)
|
filt.colorclip(d_out, genome, dim, tc, blend, stream=filt_stream)
|
||||||
obuf_copy.set_dst_host(h_out_a)
|
obuf_copy.set_dst_host(h_out_a)
|
||||||
|
Loading…
Reference in New Issue
Block a user