Experimental bilateral filtering.

This commit is contained in:
Steven Robertson 2012-01-21 00:06:15 -05:00
parent 0aa0106f51
commit 45b75d3fa5
3 changed files with 116 additions and 16 deletions

View File

@ -2,6 +2,7 @@
import numpy as np import numpy as np
from numpy import float32 as f32, int32 as i32 from numpy import float32 as f32, int32 as i32
import pycuda.driver as cuda
import pycuda.compiler import pycuda.compiler
from pycuda.gpuarray import vec from pycuda.gpuarray import vec
@ -266,6 +267,57 @@ void density_est(float4 *outbuf, const float4 *pixbuf,
__syncthreads(); __syncthreads();
} }
} }
__global__
void fma_buf(float4 *dst, const float4 *sub, int astride, float scale) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int i = y * astride + x;
float4 d = dst[i], s = sub[i];
d.x += s.x * scale;
d.y += s.y * scale;
d.z += s.z * scale;
d.w += s.w * scale;
dst[i] = d;
}
texture<float4, cudaTextureType2D> bilateral_src;
__global__
void bilateral(float4 *dst, int astride, float dstd, float rstd) {
int xi = blockIdx.x * blockDim.x + threadIdx.x;
int yi = blockIdx.y * blockDim.y + threadIdx.y;
float x = xi, y = yi;
const float r = 9.0f;
float cen_den = tex2D(bilateral_src, x, y).w;
float4 out = make_float4(0, 0, 0, 0);
float weightsum = 0.0f;
for (float i = -r; i <= r; i++) {
for (float j = -r; j <= r; j++) {
float4 pix = tex2D(bilateral_src, x + i, y + j);
float den_diff = log2f(fabsf(pix.w - cen_den) + 1.0f);
float factor = expf(i * i * dstd)
* expf(j * j * dstd)
* expf(den_diff * den_diff * rstd);
out.x += factor * pix.x;
out.y += factor * pix.y;
out.z += factor * pix.z;
out.w += factor * pix.w;
weightsum += factor;
}
}
float weightrcp = 1.0f / weightsum;
out.x *= weightrcp;
out.y *= weightrcp;
out.z *= weightrcp;
out.w *= weightrcp;
dst[yi * astride + xi] = out;
}
''' '''
class Filtering(object): class Filtering(object):
@ -289,17 +341,56 @@ class Filtering(object):
blur_v(ddst, dscratch, i32(dim.astride), i32(dim.ah), block=(192, 1, 1), blur_v(ddst, dscratch, i32(dim.astride), i32(dim.ah), block=(192, 1, 1),
grid=(dim.astride / 192, dim.ah), stream=stream) grid=(dim.astride / 192, dim.ah), stream=stream)
def de(self, ddst, dsrc, gnm, dim, tc, stream=None): def de(self, ddst, dsrc, gnm, dim, tc, nxf, stream=None):
k1 = f32(gnm.color.brightness(tc) * 268 / 256) k1 = f32(gnm.color.brightness(tc) * 268 / 256)
# Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now # Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
# s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w)) # s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w) area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w)
k2 = f32(1.0 / (area * gnm.spp(tc))) k2 = f32(1.0 / (area * gnm.spp(tc)))
if gnm.de.radius == 0: if gnm.de.radius(tc) == 0 or True:
# Stride in bytes, and buffer size
sb = 16 * dim.astride
bs = sb * dim.ah
dsc = argset(cuda.ArrayDescriptor(), height=dim.ah,
width=dim.astride, format=cuda.array_format.FLOAT,
num_channels=4)
tref = self.mod.get_texref('bilateral_src')
tref.set_filter_mode(cuda.filter_mode.POINT)
tref.set_address_mode(0, cuda.address_mode.WRAP)
tref.set_address_mode(1, cuda.address_mode.WRAP)
bilateral = self.mod.get_function('bilateral')
fma_buf = self.mod.get_function('fma_buf')
for i in range(nxf):
tref.set_address_2d(int(dsrc) + bs * i, dsc, sb)
bilateral(ddst, i32(dim.astride), f32(-0.1), f32(-0.1),
block=(32, 8, 1), grid=(dim.astride / 32, dim.ah / 8),
texrefs=[tref], stream=stream)
if i == 0:
cuda.memcpy_dtod_async(dsrc, ddst, bs, stream)
else:
fma_buf(dsrc, ddst, i32(dim.astride), f32(1.0),
block=(32, 8, 1), grid=(dim.astride / 32, dim.ah / 8),
stream=stream)
tref.set_address_2d(dsrc, dsc, sb)
ROUNDS = 3
a, b = dsrc, ddst
for i in range(ROUNDS):
bilateral(b, i32(dim.astride), f32(-0.1), f32(-0.2),
block=(32, 8, 1), grid=(dim.astride / 32, dim.ah / 8),
texrefs=[tref], stream=stream)
a, b = b, a
nbins = dim.ah * dim.astride nbins = dim.ah * dim.astride
fun = self.mod.get_function("logscale") fun = self.mod.get_function("logscale")
t = fun(dsrc, ddst, k1, k2, # This function only looks at one pixel, so using the same
# parameter for input and output is OK (if e.g. ROUNDS is odd)
t = fun(a, ddst, k1, k2,
block=(512, 1, 1), grid=(nbins/512, 1), stream=stream) block=(512, 1, 1), grid=(nbins/512, 1), stream=stream)
else: else:
scale_coeff = f32(1.0 / gnm.de.radius(tc)) scale_coeff = f32(1.0 / gnm.de.radius(tc))

View File

@ -240,6 +240,7 @@ void iter(
if (threadIdx.y == 0 && threadIdx.x < {{NWARPS*2}}) if (threadIdx.y == 0 && threadIdx.x < {{NWARPS*2}})
cosel[threadIdx.x] = mwc_next_01(rctx); cosel[threadIdx.x] = mwc_next_01(rctx);
__syncthreads(); __syncthreads();
int last_xf_used = 0;
{{endif}} {{endif}}
bool fuse = false; bool fuse = false;
@ -285,12 +286,16 @@ void iter(
{{precalc_densities(pcp, std_xforms)}} {{precalc_densities(pcp, std_xforms)}}
float xfsel = cosel[threadIdx.y]; float xfsel = cosel[threadIdx.y];
{{for xform_name in std_xforms[:-1]}} {{for xform_idx, xform_name in enumerate(std_xforms[:-1])}}
if (xfsel <= {{pcp['den_'+xform_name]}}) { if (xfsel <= {{pcp['den_'+xform_name]}}) {
apply_xf_{{xform_name}}(x, y, color, rctx); apply_xf_{{xform_name}}(x, y, color, rctx);
last_xf_used = {{xform_idx}};
} else } else
{{endfor}} {{endfor}}
{
apply_xf_{{std_xforms[-1]}}(x, y, color, rctx); apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);
last_xf_used = {{len(std_xforms)-1}};
}
// Rotate points between threads. // Rotate points between threads.
int swr = threadIdx.y + threadIdx.x int swr = threadIdx.y + threadIdx.x
@ -298,7 +303,7 @@ void iter(
int sw = (swr * 32 + threadIdx.x) & {{NTHREADS-1}}; int sw = (swr * 32 + threadIdx.x) & {{NTHREADS-1}};
int sr = threadIdx.y * 32 + threadIdx.x; int sr = threadIdx.y * 32 + threadIdx.x;
swap[sw] = fuse ? 1.0f : 0.0f; swap[sw] = fuse ? -1.0f : last_xf_used;
swap[sw+{{NTHREADS}}] = x; swap[sw+{{NTHREADS}}] = x;
swap[sw+{{2*NTHREADS}}] = y; swap[sw+{{2*NTHREADS}}] = y;
swap[sw+{{3*NTHREADS}}] = color; swap[sw+{{3*NTHREADS}}] = color;
@ -308,7 +313,8 @@ void iter(
if (threadIdx.y == 0 && threadIdx.x < {{NWARPS*2}}) if (threadIdx.y == 0 && threadIdx.x < {{NWARPS*2}})
cosel[threadIdx.x] = mwc_next_01(rctx); cosel[threadIdx.x] = mwc_next_01(rctx);
fuse = swap[sr]; last_xf_used = swap[sr];
fuse = (last_xf_used < 0);
x = swap[sr+{{NTHREADS}}]; x = swap[sr+{{NTHREADS}}];
y = swap[sr+{{2*NTHREADS}}]; y = swap[sr+{{2*NTHREADS}}];
color = swap[sr+{{3*NTHREADS}}]; color = swap[sr+{{3*NTHREADS}}];
@ -352,7 +358,8 @@ void iter(
continue; continue;
} }
uint32_t i = iy * acc_size.astride + ix; uint32_t i = (last_xf_used * acc_size.aheight + iy)
* acc_size.astride + ix;
{{if info.acc_mode == 'atomic'}} {{if info.acc_mode == 'atomic'}}
asm volatile ({{crep(""" asm volatile ({{crep("""
{ {

View File

@ -185,11 +185,11 @@ class Renderer(object):
cuda.memcpy_htod_async(d_acc_size, u32(list(dim)), write_stream) cuda.memcpy_htod_async(d_acc_size, u32(list(dim)), write_stream)
nbins = astride * aheight nbins = astride * aheight
# Extra padding in accum helps with write_shmem overruns nxf = len(filter(lambda g: g != 'final', genome.xforms))
d_accum = cuda.mem_alloc(16 * nbins + (1<<16)) d_accum = cuda.mem_alloc(16 * nbins * nxf)
d_out = cuda.mem_alloc(16 * aheight * astride) d_out = cuda.mem_alloc(16 * nbins)
if self.acc_mode == 'atomic': if self.acc_mode == 'atomic':
d_atom = cuda.mem_alloc(8 * nbins) d_atom = cuda.mem_alloc(8 * nbins * nxf)
flush_fun = self.mod.get_function("flush_atom") flush_fun = self.mod.get_function("flush_atom")
obuf_copy = util.argset(cuda.Memcpy2D(), obuf_copy = util.argset(cuda.Memcpy2D(),
@ -310,9 +310,11 @@ class Renderer(object):
#for i, n in zip(d_temp[5], self._iter.packer.packed): #for i, n in zip(d_temp[5], self._iter.packer.packed):
#print '%60s %g' % ('_'.join(n), i) #print '%60s %g' % ('_'.join(n), i)
util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, write_stream) util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins * nxf,
write_stream)
if self.acc_mode == 'atomic': if self.acc_mode == 'atomic':
util.BaseCode.fill_dptr(self.mod, d_atom, 2 * nbins, write_stream) util.BaseCode.fill_dptr(self.mod, d_atom, 2 * nbins * nxf,
write_stream)
nrounds = int( (genome.spp(tc) * width * height) nrounds = int( (genome.spp(tc) * width * height)
/ (ntemporal_samples * 256 * 256) ) + 1 / (ntemporal_samples * 256 * 256) ) + 1
if self.acc_mode == 'deferred': if self.acc_mode == 'deferred':
@ -334,14 +336,14 @@ class Renderer(object):
iter_fun(*args, block=(32, self._iter.NTHREADS/32, 1), iter_fun(*args, block=(32, self._iter.NTHREADS/32, 1),
grid=(ntemporal_samples, nrounds), stream=iter_stream) grid=(ntemporal_samples, nrounds), stream=iter_stream)
if self.acc_mode == 'atomic': if self.acc_mode == 'atomic':
nblocks = int(np.ceil(np.sqrt(nbins/float(512)))) nblocks = int(np.ceil(np.sqrt(nbins*nxf/float(512))))
flush_fun(u64(d_accum), u64(d_atom), i32(nbins), flush_fun(u64(d_accum), u64(d_atom), i32(nbins*nxf),
block=(512, 1, 1), grid=(nblocks, nblocks), block=(512, 1, 1), grid=(nblocks, nblocks),
stream=iter_stream) stream=iter_stream)
util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream) util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
_sync_stream(filt_stream, write_stream) _sync_stream(filt_stream, write_stream)
filt.de(d_out, d_accum, genome, dim, tc, stream=filt_stream) filt.de(d_out, d_accum, genome, dim, tc, nxf, stream=filt_stream)
_sync_stream(write_stream, filt_stream) _sync_stream(write_stream, filt_stream)
filt.colorclip(d_out, genome, dim, tc, blend, stream=filt_stream) filt.colorclip(d_out, genome, dim, tc, blend, stream=filt_stream)
obuf_copy.set_dst_host(h_out_a) obuf_copy.set_dst_host(h_out_a)