From c572f62d7d949afa6538c7488fc6ab2241352c56 Mon Sep 17 00:00:00 2001
From: Steven Robertson
Date: Sun, 22 Jan 2012 23:56:16 -0500
Subject: [PATCH] Use YUV during accumulation

---
 cuburn/code/filtering.py |  6 +++
 cuburn/code/interp.py    | 37 +++++++++--------
 cuburn/code/iter.py      | 87 ++++++++++++++++++++++++++--------------
 cuburn/code/util.py      |  4 +-
 4 files changed, 83 insertions(+), 51 deletions(-)

diff --git a/cuburn/code/filtering.py b/cuburn/code/filtering.py
index 0c184cf..cea349c 100644
--- a/cuburn/code/filtering.py
+++ b/cuburn/code/filtering.py
@@ -25,6 +25,12 @@ void colorclip(float4 *pixbuf, float gamma, float vibrance, float highpow,
         pixbuf[i] = make_float4(bkgd.x, bkgd.y, bkgd.z, 0.0f);
         return;
     }
+    pix.y -= 0.5f * pix.w;
+    pix.z -= 0.5f * pix.w;
+    float3 tmp = yuv2rgb(make_float3(pix.x, pix.y, pix.z));
+    pix.x = tmp.x;
+    pix.y = tmp.y;
+    pix.z = tmp.z;
 
     pix.x = fmaxf(0.0f, pix.x);
     pix.y = fmaxf(0.0f, pix.y);
diff --git a/cuburn/code/interp.py b/cuburn/code/interp.py
index 843abaa..e3a29a7 100644
--- a/cuburn/code/interp.py
+++ b/cuburn/code/interp.py
@@ -370,7 +370,7 @@ float4 interp_color(const float *times, const float4 *sources, float time) {
     float4 left  = sources[blockDim.x * (idx - 1) + threadIdx.x];
     float4 right = sources[blockDim.x * (idx) + threadIdx.x];
 
-    float3 rgb;
+    float3 yuv;
     float3 l3 = make_float3(left.x, left.y, left.z);
     float3 r3 = make_float3(right.x, right.y, right.z);
 
@@ -395,10 +395,8 @@ float4 interp_color(const float *times, const float4 *sources, float time) {
         if (hsv.x < 0.0f)
             hsv.x += 6.0f;
 
-        rgb = hsv2rgb(hsv);
+        yuv = rgb2yuv(hsv2rgb(hsv));
     {{elif mode.startswith('yuv')}}
-        float3 yuv;
-
     {{if mode == 'yuv'}}
         float3 lyuv = rgb2yuv(l3);
         float3 ryuv = rgb2yuv(r3);
@@ -412,11 +410,12 @@ float4 interp_color(const float *times, const float4 *sources, float time) {
         yuv.y = radius * cosf(angle);
         yuv.z = radius * sinf(angle);
     {{endif}}
-
-    rgb = yuv2rgb(yuv);
     {{endif}}
 
-    return make_float4(rgb.x, rgb.y, rgb.z, left.w * lf + right.w * rf);
+    yuv.y += 0.5f;
+    yuv.z += 0.5f;
+
+    return make_float4(yuv.x, yuv.y, yuv.z, left.w * lf + right.w * rf);
 }
 
 __global__
@@ -424,13 +423,13 @@ void interp_palette(uchar4 *outs, const float *times, const float4 *sources,
                     float tstart, float tstep) {
     float time = tstart + blockIdx.x * tstep;
 
-    float4 rgba = interp_color(times, sources, time);
+    float4 yuva = interp_color(times, sources, time);
 
     uchar4 out;
-    out.x = rgba.x * 255.0f;
-    out.y = rgba.y * 255.0f;
-    out.z = rgba.z * 255.0f;
-    out.w = rgba.w * 255.0f;
+    out.x = yuva.x * 255.0f;
+    out.y = yuva.y * 255.0f;
+    out.z = yuva.z * 255.0f;
+    out.w = yuva.w * 255.0f;
     outs[blockDim.x * blockIdx.x + threadIdx.x] = out;
 }
 
@@ -443,16 +442,16 @@ void interp_palette_flat(mwc_st *rctxs,
     mwc_st rctx = rctxs[gid];
 
     float time = tstart + blockIdx.x * tstep;
-    float4 rgba = interp_color(times, sources, time);
+    float4 yuva = interp_color(times, sources, time);
 
-    // TODO: use YUV; pack Y at full precision, UV at quarter
+    // TODO: pack Y at full precision, UV at quarter
     uint2 out;
-    uint32_t r = min(255, (uint32_t) (rgba.x * 255.0f + 0.49f * mwc_next_11(rctx)));
-    uint32_t g = min(255, (uint32_t) (rgba.y * 255.0f + 0.49f * mwc_next_11(rctx)));
-    uint32_t b = min(255, (uint32_t) (rgba.z * 255.0f + 0.49f * mwc_next_11(rctx)));
-    out.y = (1 << 22) | (r << 4);
-    out.x = (g << 18) | b;
+    uint32_t y = min(255, (uint32_t) (yuva.x * 255.0f + 0.49f * mwc_next_11(rctx)));
+    uint32_t u = min(255, (uint32_t) (yuva.y * 255.0f + 0.49f * mwc_next_11(rctx)));
+    uint32_t v = min(255, (uint32_t) (yuva.z * 255.0f + 0.49f * mwc_next_11(rctx)));
+    out.y = (1 << 22) | (y << 4);
+    out.x = (u << 18) | v;
     surf2Dwrite(out, flatpal, 8 * threadIdx.x, blockIdx.x);
     rctxs[gid] = rctx;
 }
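
Editor's note on the bias round trip: interp_color now leaves U and V shifted
by +0.5 so they pack as unsigned bytes, and colorclip (in the filtering.py hunk
above) subtracts 0.5f * pix.w because each of the pix.w accumulated samples
carries that bias. A minimal host-side C sketch of the round trip — the
conversion coefficients are the standard JPEG full-range ones and are an
assumption here, since the bodies of util.py's rgb2yuv/yuv2rgb are not shown in
this patch, and the helper names are local to the example:

```c
#include <stdio.h>

typedef struct { float x, y, z; } f3;

/* Assumed JPEG full-range RGB -> YUV (see the util.py comment below). */
static f3 rgb2yuv(f3 c) {
    return (f3){  0.299f*c.x    + 0.587f*c.y    + 0.114f*c.z,
                 -0.168736f*c.x - 0.331264f*c.y + 0.5f*c.z,
                  0.5f*c.x      - 0.418688f*c.y - 0.081312f*c.z };
}

/* Matching inverse. Both maps are linear, so they commute with scaling. */
static f3 yuv2rgb(f3 c) {
    return (f3){ c.x + 1.402f*c.z,
                 c.x - 0.344136f*c.y - 0.714136f*c.z,
                 c.x + 1.772f*c.y };
}

int main(void) {
    /* One palette entry: bias U/V by +0.5, as interp_color now does. */
    f3 yuv = rgb2yuv((f3){ 0.8f, 0.2f, 0.1f });
    yuv.y += 0.5f;
    yuv.z += 0.5f;

    /* Accumulating w such samples accumulates w * 0.5 of bias in U and V. */
    float w = 64.0f;
    f3 acc = { yuv.x * w, yuv.y * w, yuv.z * w };

    /* colorclip's un-bias: subtract 0.5 * density, then convert back.
     * Because the conversion is linear, un-biasing the density-weighted
     * sum is equivalent to un-biasing each sample individually. */
    acc.y -= 0.5f * w;
    acc.z -= 0.5f * w;
    f3 rgb = yuv2rgb((f3){ acc.x / w, acc.y / w, acc.z / w });
    printf("%.3f %.3f %.3f\n", rgb.x, rgb.y, rgb.z); /* ~0.800 0.200 0.100 */
    return 0;
}
```
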
diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py
index 7cacc81..7cb2514 100644
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@@ -361,9 +361,15 @@ void iter(
 {{if info.acc_mode == 'atomic'}}
     asm volatile ({{crep("""
 {
+    // To prevent overflow, we need to flush each pixel before the density
+    // wraps at 1024 points. This atomic segment performs writes to the
+    // integer buffer, occasionally checking the results. If they exceed a
+    // threshold, it zeros that cell in the integer buffer, converts the
+    // former contents to floats, and adds them to the float4 buffer.
+
     .reg .pred p;
-    .reg .u32 off, color, hi, lo, d, r, g, b;
-    .reg .f32 colorf, rf, gf, bf, df;
+    .reg .u32 off, color, hi, lo, d, y, u, v;
+    .reg .f32 colorf, yf, uf, vf, df;
     .reg .u64 ptr, val;
 
     // TODO: coord dithering better, or pre-supersampled palette?
@@ -371,38 +377,59 @@ void iter(
     cvt.rni.u32.f32 color, colorf;
     shl.b32         color,  color,  3;
 
+    // Load the pre-packed 64-bit uint from the palette surf
     suld.b.2d.v2.b32.clamp {lo, hi}, [flatpal, {color, %2}];
     mov.b64         val,    {lo, hi};
+
+    // Calculate the output address in the atomic integer accumulator
     shl.b32         off,    %3,     3;
     cvt.u64.u32     ptr,    off;
     add.u64         ptr,    ptr,    %4;
+
+    // 97% of the time, do an atomic add, then jump to the end without
+    // stalling the thread waiting for the data value
     setp.le.f32     p,      %5,     0.97;
@p  red.global.add.u64 [ptr],  val;
@p  bra             oflow_end;
+
+    // 3% of the time, do the atomic add, and wait for the results
     atom.global.add.u64 val,   [ptr],  val;
     mov.b64         {lo, hi},   val;
+
+    // If the density is less than 256, jump to the end
     setp.lo.u32     p,      hi,     (256 << 22);
@p  bra             oflow_end;
+
+    // Atomically swap the integer cell with 0 and read its current value
     atom.global.exch.b64 val,  [ptr],  0;
     mov.b64         {lo, hi},   val;
+
+    // If the integer cell is zero, another thread has captured the full
+    // value between the first atomic read and the second, so we can skip
+    // to the end again.
+    setp.eq.u32     p,      hi,     0;
+@p  bra             oflow_end;
+
+    // Extract the values from the packed integer, convert to floats, and add
+    // them to the floating-point buffer.
     shr.u32         d,      hi,     22;
-    bfe.u32         r,      hi,     4,  18;
-    bfe.u32         g,      lo,     18, 14;
-    bfi.b32         g,      hi,     g,  14, 4;
-    and.b32         b,      lo,     ((1<<18)-1);
-    cvt.rn.f32.u32  rf,     r;
-    cvt.rn.f32.u32  gf,     g;
-    cvt.rn.f32.u32  bf,     b;
+    bfe.u32         y,      hi,     4,  18;
+    bfe.u32         u,      lo,     18, 14;
+    bfi.b32         u,      hi,     u,  14, 4;
+    and.b32         v,      lo,     ((1<<18)-1);
+    cvt.rn.f32.u32  yf,     y;
+    cvt.rn.f32.u32  uf,     u;
+    cvt.rn.f32.u32  vf,     v;
     cvt.rn.f32.u32  df,     d;
-    mul.rn.ftz.f32  rf,     rf,     (1.0/255.0);
-    mul.rn.ftz.f32  gf,     gf,     (1.0/255.0);
-    mul.rn.ftz.f32  bf,     bf,     (1.0/255.0);
+    mul.rn.ftz.f32  yf,     yf,     (1.0/255.0);
+    mul.rn.ftz.f32  uf,     uf,     (1.0/255.0);
+    mul.rn.ftz.f32  vf,     vf,     (1.0/255.0);
     shl.b32         off,    %3,     4;
     cvt.u64.u32     ptr,    off;
     add.u64         ptr,    ptr,    %6;
-    red.global.add.f32 [ptr],      rf;
-    red.global.add.f32 [ptr+4],    gf;
-    red.global.add.f32 [ptr+8],    bf;
+    red.global.add.f32 [ptr],      yf;
+    red.global.add.f32 [ptr+4],    uf;
+    red.global.add.f32 [ptr+8],    vf;
     red.global.add.f32 [ptr+12],   df;
 oflow_end:
 }
@@ -445,9 +472,9 @@ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
     if (i >= nbins) return;
     asm volatile ({{crep("""
 {
-    .reg .u32 off, hi, lo, d, r, g, b;
+    .reg .u32 off, hi, lo, d, y, u, v;
     .reg .u64 val, ptr;
-    .reg .f32 rf, gf, bf, df, rg, gg, bg, dg;
+    .reg .f32 yf, uf, vf, df, yg, ug, vg, dg;
 
     // TODO: use explicit movs to handle this
     shl.b32     off,    %0,     3;
@@ -457,21 +484,22 @@ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
     shl.b32     off,    %0,     4;
     cvt.u64.u32 ptr,    off;
     add.u64     ptr,    ptr,    %2;
-    ld.global.v4.f32 {rg,gg,bg,dg},    [ptr];
+    ld.global.v4.f32 {yg,ug,vg,dg},    [ptr];
 
     shr.u32     d,  hi, 22;
-    bfe.u32     r,  hi, 4,  18;
-    bfe.u32     g,  lo, 18, 14;
-    bfi.b32     g,  hi, g,  14, 4;
-    and.b32     b,  lo, ((1<<18)-1);
-    cvt.rn.f32.u32  rf, r;
-    cvt.rn.f32.u32  gf, g;
-    cvt.rn.f32.u32  bf, b;
+    bfe.u32     y,  hi, 4,  18;
+    bfe.u32     u,  lo, 18, 14;
+    bfi.b32     u,  hi, u,  14, 4;
+    and.b32     v,  lo, ((1<<18)-1);
+    cvt.rn.f32.u32  yf, y;
+    cvt.rn.f32.u32  uf, u;
+    cvt.rn.f32.u32  vf, v;
     cvt.rn.f32.u32  df, d;
-    fma.rn.ftz.f32  rg, rf, (1.0/255.0), rg;
-    fma.rn.ftz.f32  gg, gf, (1.0/255.0), gg;
-    fma.rn.ftz.f32  bg, bf, (1.0/255.0), bg;
+    fma.rn.ftz.f32  yg, yf, (1.0/255.0), yg;
+    fma.rn.ftz.f32  ug, uf, (1.0/255.0), ug;
+    fma.rn.ftz.f32  vg, vf, (1.0/255.0), vg;
+
     add.rn.ftz.f32  dg, df, dg;
-    st.global.v4.f32 [ptr],    {rg,gg,bg,dg};
+    st.global.v4.f32 [ptr],    {yg,ug,vg,dg};
 }
 """)}} :: "r"(i), "l"(atom_ptr), "l"(out_ptr));
 }
@@ -640,4 +668,3 @@
             NWARPS = self.NTHREADS / 32,
             std_xforms = [n for n in sorted(genome.xforms) if n != 'final'],
             **globals())
-
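
For reference, the cell layout implied by the packing in interp_palette_flat
and the unpacking PTX above: density in hi[31:22], the Y sum in hi[21:4], the
U sum split across hi[3:0] (high bits) and lo[31:18], and the V sum in
lo[17:0]. Each colour field is 18 bits — an 8-bit sample plus 10 bits of
headroom — matching the 10-bit density that would wrap at 1024 points, which
is why cells are flushed once density reaches 256. A plain-C sketch of the
scheme (the helper names are mine, not part of the patch):

```c
#include <stdint.h>
#include <stdio.h>

/* Pack one 8-bit YUV sample the way interp_palette_flat does:
 * hi = density 1 at bit 22, Y at bits 4-11; lo = U at bits 18-25, V at 0-7.
 * A single 64-bit add then carries correctly across field boundaries, which
 * is what lets red.global.add.u64 accumulate all four fields at once. */
static uint64_t pack(uint32_t y, uint32_t u, uint32_t v) {
    uint32_t hi = (1u << 22) | (y << 4);
    uint32_t lo = (u << 18) | v;
    return ((uint64_t)hi << 32) | lo;
}

/* Unpack an accumulated cell the way the PTX above does. */
static void unpack(uint64_t cell, uint32_t *d, uint32_t *y,
                   uint32_t *u, uint32_t *v) {
    uint32_t hi = (uint32_t)(cell >> 32), lo = (uint32_t)cell;
    *d = hi >> 22;                          /* shr.u32 d, hi, 22          */
    *y = (hi >> 4) & ((1u << 18) - 1);      /* bfe.u32 y, hi, 4, 18       */
    *u = ((hi & 0xFu) << 14) | (lo >> 18);  /* bfe.u32 + bfi.b32          */
    *v = lo & ((1u << 18) - 1);             /* and.b32 v, lo, ((1<<18)-1) */
}

int main(void) {
    uint64_t cell = 0;
    for (int i = 0; i < 200; i++)   /* stays under the flush threshold of 256 */
        cell += pack(200, 130, 90);
    uint32_t d, y, u, v;
    unpack(cell, &d, &y, &u, &v);
    printf("d=%u y=%u u=%u v=%u\n", d, y, u, v); /* d=200 y=40000 u=26000 v=18000 */
    return 0;
}
```
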
diff --git a/cuburn/code/util.py b/cuburn/code/util.py
index 1dfa908..1bd1614 100644
--- a/cuburn/code/util.py
+++ b/cuburn/code/util.py
@@ -154,8 +154,8 @@ void write_half(float &xy, float x, float y, float den) {
 }
 
-/* This conversion uses the JPEG full-range standard, though it does *not* add
- * an offset to UV to bias them into the positive regime. */
+/* This conversion uses the JPEG full-range standard. Note that UV have range
+ * [-0.5, 0.5], so consider biasing the results. */
 __device__
 float3 rgb2yuv(float3 rgb) {
     return make_float3(
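
The revised comment's range claim is easy to verify: U and V are zero-sum
linear combinations of R, G, and B, so over the RGB unit cube their extremes
land exactly at ±0.5, which is why the +0.5 bias applied during accumulation
is sufficient to keep packed values non-negative. A quick C check, using the
fact that a linear map over a box takes its extremes at the corners
(coefficients again assumed to be the JPEG full-range ones, since the hunk
ends before the function body):

```c
#include <stdio.h>

int main(void) {
    float umin = 0.f, umax = 0.f, vmin = 0.f, vmax = 0.f;
    /* Enumerate the eight corners of the RGB unit cube. */
    for (int i = 0; i < 8; i++) {
        float r = (float)(i & 1);
        float g = (float)((i >> 1) & 1);
        float b = (float)((i >> 2) & 1);
        float u = -0.168736f*r - 0.331264f*g + 0.5f*b;
        float v =  0.5f*r - 0.418688f*g - 0.081312f*b;
        if (u < umin) umin = u;
        if (u > umax) umax = u;
        if (v < vmin) vmin = v;
        if (v > vmax) vmax = v;
    }
    printf("U in [%g, %g], V in [%g, %g]\n", umin, umax, vmin, vmax);
    /* prints: U in [-0.5, 0.5], V in [-0.5, 0.5] */
    return 0;
}
```
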