diff --git a/cuburn/code/filters.py b/cuburn/code/filters.py index 4e6eb3d..d96dbc7 100644 --- a/cuburn/code/filters.py +++ b/cuburn/code/filters.py @@ -108,6 +108,26 @@ __global__ void den_blur_1c(float *dst, int pattern, int upsample) { } ''') + +fullblurlib = devlib(deps=[denblurlib], defs=r''' +__global__ void full_blur(float4 *dst, int pattern, int upsample) { + GET_IDX_2(xi, yi, gi); + float x = xi, y = yi; + + float4 val = make_float4(0, 0, 0, 0); + + #pragma unroll + for (int i = 0; i < 7; i++) { + float4 pix = tex_shear(chan4_src, pattern, x, y, (i - 3) << upsample); + val.x += pix.x * gauss_coefs[i]; + val.y += pix.y * gauss_coefs[i]; + val.z += pix.z * gauss_coefs[i]; + val.w += pix.w * gauss_coefs[i]; + } + dst[gi] = val; +} +''') + bilaterallib = devlib(deps=[logscalelib, texshearlib, denblurlib], defs=r''' /* sstd: spatial standard deviation (Gaussian filter) * cstd: color standard deviation (Gaussian on the range [0, 1], where 1 @@ -225,12 +245,11 @@ halocliplib = devlib(deps=[yuvlib, denblurlib], defs=r''' __global__ void apply_gamma(float *dst, float4 *src, float gamma) { GET_IDX(i); float4 pix = src[i]; - float ls = powf(fmaxf(0.0f, src[i].z), gamma); - dst[i] = ls * pix.x; + dst[i] = powf(pix.x, gamma); } __global__ void -haloclip(float4 *pixbuf, const float *denbuf, float gamma) { +haloclip(float4 *pixbuf, const float *denbuf, float gamma_m_1) { GET_IDX(i); float4 pix = pixbuf[i]; float areaval = denbuf[i]; @@ -240,12 +259,45 @@ haloclip(float4 *pixbuf, const float *denbuf, float gamma) { return; } - float ls = powf(pix.z, gamma) / fmaxf(1.0f, areaval); - + float ls = powf(pix.w, gamma_m_1) / fmaxf(1.0f, areaval); scale_float4(pix, ls); - yuvo2rgb(pix); + pixbuf[i] = pix; +} +''') +smearcliplib = devlib(deps=[yuvlib, fullblurlib], defs=r''' +// Apply gamma to all four pixels. Subtract one from the result, and clamp at +// a minimum of 0. +__global__ void apply_gamma_full_hi(float4 *dst, float4 *src, float gamma_m_1) { + GET_IDX(i); + float4 pix = src[i]; + float ls = 0.0f; + if (pix.w > 0.0f) + ls = fmaxf(0.0f, pix.w - 1.0f) / pix.w; + scale_float4(pix, ls); + dst[i] = pix; +} + +__global__ void +smearclip(float4 *pixbuf, const float4 *smearbuf, float gamma_m_1) { + GET_IDX(i); + float4 pix = pixbuf[i]; + float4 areaval = smearbuf[i]; + + pix.x += areaval.x; + pix.y += areaval.y; + pix.z += areaval.z; + pix.w += areaval.w; + + if (pix.w <= 0) { + pixbuf[i] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + return; + } + + float ls = powf(pix.w, gamma_m_1); + scale_float4(pix, ls); + yuvo2rgb(pix); pixbuf[i] = pix; } ''') diff --git a/cuburn/filters.py b/cuburn/filters.py index 93c7d91..322a2bc 100644 --- a/cuburn/filters.py +++ b/cuburn/filters.py @@ -28,6 +28,8 @@ def mkdsc(dim, ch): format=cuda.array_format.FLOAT) class Filter(object): + # Set to True if the filter requires a full 4-channel side buffer + full_side = False def apply(self, fb, gprof, params, dim, tc, stream=None): """ Queue the application of this filter. When the live stream finishes @@ -92,23 +94,43 @@ class Logscale(Filter, ClsMod): class HaloClip(Filter, ClsMod): lib = code.filters.halocliplib def apply(self, fb, gprof, params, dim, tc, stream=None): - gam = f32(1 / params.gamma(tc) - 1) + gam = f32(1 / gprof.filters.colorclip.gamma(tc) - 1) dsc = mkdsc(dim, 1) tref = mktref(self.mod, 'chan1_src') set_blur_width(self.mod, fb.pool, stream=stream) launch2('apply_gamma', self.mod, stream, dim, - fb.d_side, fb.d_front, gam) - tref.set_address_2d(fb.d_side, dsc, 4 * params.astride) + fb.d_side, fb.d_front, f32(0.1)) + tref.set_address_2d(fb.d_side, dsc, 4 * dim.astride) launch2('den_blur_1c', self.mod, stream, dim, - fb.d_back, i32(0), i32(0), texrefs=[tref]) - tref.set_address_2d(fb.d_back, dsc, 4 * params.astride) + fb.d_back, i32(2), i32(0), texrefs=[tref]) + tref.set_address_2d(fb.d_back, dsc, 4 * dim.astride) launch2('den_blur_1c', self.mod, stream, dim, - fb.d_side, i32(1), i32(0), texrefs=[tref]) + fb.d_side, i32(3), i32(0), texrefs=[tref]) launch2('haloclip', self.mod, stream, dim, - fb.d_front, fb.d_side) + fb.d_front, fb.d_side, gam) + +class SmearClip(Filter, ClsMod): + full_side = True + lib = code.filters.smearcliplib + def apply(self, fb, gprof, params, dim, tc, stream=None): + gam = f32(1 / gprof.filters.colorclip.gamma(tc) - 1) + dsc = mkdsc(dim, 4) + tref = mktref(self.mod, 'chan4_src') + + set_blur_width(self.mod, fb.pool, params.width(tc), stream) + launch2('apply_gamma_full_hi', self.mod, stream, dim, + fb.d_side, fb.d_front, gam) + tref.set_address_2d(fb.d_side, dsc, 16 * dim.astride) + launch2('full_blur', self.mod, stream, dim, + fb.d_back, i32(2), i32(0), texrefs=[tref]) + tref.set_address_2d(fb.d_back, dsc, 16 * dim.astride) + launch2('full_blur', self.mod, stream, dim, + fb.d_side, i32(3), i32(0), texrefs=[tref]) + launch2('smearclip', self.mod, stream, dim, + fb.d_front, fb.d_side, gam) class ColorClip(Filter, ClsMod): lib = code.filters.colorcliplib @@ -124,6 +146,6 @@ class ColorClip(Filter, ClsMod): # Ungainly but practical. filter_map = dict(bilateral=Bilateral, logscale=Logscale, haloclip=HaloClip, - colorclip=ColorClip) + colorclip=ColorClip, smearclip=SmearClip) def create(gprof): return [filter_map[f]() for f in gprof.filter_order] diff --git a/cuburn/genome/specs.py b/cuburn/genome/specs.py index 9f420c4..d9332d3 100644 --- a/cuburn/genome/specs.py +++ b/cuburn/genome/specs.py @@ -51,7 +51,8 @@ filters = ( , 'minimum': scalespline(0, max=1, d='Proportional min radius') , 'curve': scalespline(0.6, d='Power of filter radius with density') } - , 'haloclip': {'gamma': scalespline(4)} + , 'haloclip': {} + , 'smearclip': {'width': scalespline(0.75, d='Spatial stdev of filter')} , 'logscale': {'brightness': scalespline(4, d='Log-scale brightness')} }) @@ -96,7 +97,7 @@ edge.update(type='edge', author=author, blend=blend, link=link, time=time, anim = dict(base) anim.update(type='animation', authors=list_(author), link=link, time=time) -default_filters = ['bilateral', 'logscale', 'colorclip'] +default_filters = ['bilateral', 'logscale', 'smearclip'] # Yeah, now I'm just messing around. prof_filters = dict([(fk, dict([(k, refscalar(1, '.'.join(['filters', fk, k]))) for k in fv])) for fk, fv in filters.items()]) diff --git a/cuburn/render.py b/cuburn/render.py index b272a8e..8ffc5cc 100644 --- a/cuburn/render.py +++ b/cuburn/render.py @@ -113,7 +113,7 @@ class Framebuffers(object): try: self.d_front = cuda.mem_alloc(16 * nbins) self.d_back = cuda.mem_alloc(16 * nbins) - self.d_side = cuda.mem_alloc(8 * nbins) + self.d_side = cuda.mem_alloc(16 * nbins) self.nbins = nbins except cuda.MemoryError, e: # If a frame that's too large sneaks by the task distributor, we