From c572f62d7d949afa6538c7488fc6ab2241352c56 Mon Sep 17 00:00:00 2001
From: Steven Robertson
Date: Sun, 22 Jan 2012 23:56:16 -0500
Subject: [PATCH] Use YUV during accumulation

---
 cuburn/code/filtering.py |  6 +++
 cuburn/code/interp.py    | 37 +++++++++--------
 cuburn/code/iter.py      | 87 ++++++++++++++++++++++++++--------------
 cuburn/code/util.py      |  4 +-
 4 files changed, 83 insertions(+), 51 deletions(-)

diff --git a/cuburn/code/filtering.py b/cuburn/code/filtering.py
index 0c184cf..cea349c 100644
--- a/cuburn/code/filtering.py
+++ b/cuburn/code/filtering.py
@@ -25,6 +25,12 @@ void colorclip(float4 *pixbuf, float gamma, float vibrance, float highpow,
         pixbuf[i] = make_float4(bkgd.x, bkgd.y, bkgd.z, 0.0f);
         return;
     }
+    pix.y -= 0.5f * pix.w;
+    pix.z -= 0.5f * pix.w;
+    float3 tmp = yuv2rgb(make_float3(pix.x, pix.y, pix.z));
+    pix.x = tmp.x;
+    pix.y = tmp.y;
+    pix.z = tmp.z;
 
     pix.x = fmaxf(0.0f, pix.x);
     pix.y = fmaxf(0.0f, pix.y);
diff --git a/cuburn/code/interp.py b/cuburn/code/interp.py
index 843abaa..e3a29a7 100644
--- a/cuburn/code/interp.py
+++ b/cuburn/code/interp.py
@@ -370,7 +370,7 @@ float4 interp_color(const float *times, const float4 *sources, float time) {
     float4 left  = sources[blockDim.x * (idx - 1) + threadIdx.x];
     float4 right = sources[blockDim.x * (idx) + threadIdx.x];
 
-    float3 rgb;
+    float3 yuv;
     float3 l3 = make_float3(left.x, left.y, left.z);
     float3 r3 = make_float3(right.x, right.y, right.z);
 
@@ -395,10 +395,8 @@ float4 interp_color(const float *times, const float4 *sources, float time) {
         if (hsv.x < 0.0f)
             hsv.x += 6.0f;
 
-        rgb = hsv2rgb(hsv);
+        yuv = rgb2yuv(hsv2rgb(hsv));
     {{elif mode.startswith('yuv')}}
-        float3 yuv;
-
     {{if mode == 'yuv'}}
         float3 lyuv = rgb2yuv(l3);
         float3 ryuv = rgb2yuv(r3);
@@ -412,11 +410,12 @@ float4 interp_color(const float *times, const float4 *sources, float time) {
         yuv.y = radius * cosf(angle);
         yuv.z = radius * sinf(angle);
     {{endif}}
-
-    rgb = yuv2rgb(yuv);
     {{endif}}
 
-    return make_float4(rgb.x, rgb.y, rgb.z, left.w * lf + right.w * rf);
+    yuv.y += 0.5f;
+    yuv.z += 0.5f;
+
+    return make_float4(yuv.x, yuv.y, yuv.z, left.w * lf + right.w * rf);
 }
 
 __global__
@@ -424,13 +423,13 @@ void interp_palette(uchar4 *outs, const float *times, const float4 *sources,
                     float tstart, float tstep) {
     float time = tstart + blockIdx.x * tstep;
 
-    float4 rgba = interp_color(times, sources, time);
+    float4 yuva = interp_color(times, sources, time);
 
     uchar4 out;
-    out.x = rgba.x * 255.0f;
-    out.y = rgba.y * 255.0f;
-    out.z = rgba.z * 255.0f;
-    out.w = rgba.w * 255.0f;
+    out.x = yuva.x * 255.0f;
+    out.y = yuva.y * 255.0f;
+    out.z = yuva.z * 255.0f;
+    out.w = yuva.w * 255.0f;
     outs[blockDim.x * blockIdx.x + threadIdx.x] = out;
 }
 
@@ -443,16 +442,16 @@ void interp_palette_flat(mwc_st *rctxs,
     mwc_st rctx = rctxs[gid];
 
     float time = tstart + blockIdx.x * tstep;
-    float4 rgba = interp_color(times, sources, time);
+    float4 yuva = interp_color(times, sources, time);
 
-    // TODO: use YUV; pack Y at full precision, UV at quarter
+    // TODO: pack Y at full precision, UV at quarter
     uint2 out;
-    uint32_t r = min(255, (uint32_t) (rgba.x * 255.0f + 0.49f * mwc_next_11(rctx)));
-    uint32_t g = min(255, (uint32_t) (rgba.y * 255.0f + 0.49f * mwc_next_11(rctx)));
-    uint32_t b = min(255, (uint32_t) (rgba.z * 255.0f + 0.49f * mwc_next_11(rctx)));
-    out.y = (1 << 22) | (r << 4);
-    out.x = (g << 18) | b;
+    uint32_t y = min(255, (uint32_t) (yuva.x * 255.0f + 0.49f * mwc_next_11(rctx)));
+    uint32_t u = min(255, (uint32_t) (yuva.y * 255.0f + 0.49f * mwc_next_11(rctx)));
+    uint32_t v = min(255, (uint32_t) (yuva.z * 255.0f + 0.49f * mwc_next_11(rctx)));
+    out.y = (1 << 22) | (y << 4);
+    out.x = (u << 18) | v;
     surf2Dwrite(out, flatpal, 8 * threadIdx.x, blockIdx.x);
     rctxs[gid] = rctx;
 }
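
Editor's note on the bias round trip: interp_color now leaves U and V shifted
by +0.5 so they pack as unsigned bytes, and colorclip (in the filtering.py hunk
above) subtracts 0.5f * pix.w because each of the pix.w accumulated samples
carries that bias. A minimal host-side C sketch of the round trip — the
conversion coefficients are the standard JPEG full-range ones and are an
assumption here, since the bodies of util.py's rgb2yuv/yuv2rgb are not shown in
this patch, and the helper names are local to the example:

```c
#include <stdio.h>

typedef struct { float x, y, z; } f3;

/* Assumed JPEG full-range RGB -> YUV (see the util.py comment below). */
static f3 rgb2yuv(f3 c) {
    return (f3){  0.299f*c.x    + 0.587f*c.y    + 0.114f*c.z,
                 -0.168736f*c.x - 0.331264f*c.y + 0.5f*c.z,
                  0.5f*c.x      - 0.418688f*c.y - 0.081312f*c.z };
}

/* Matching inverse. Both maps are linear, so they commute with scaling. */
static f3 yuv2rgb(f3 c) {
    return (f3){ c.x + 1.402f*c.z,
                 c.x - 0.344136f*c.y - 0.714136f*c.z,
                 c.x + 1.772f*c.y };
}

int main(void) {
    /* One palette entry: bias U/V by +0.5, as interp_color now does. */
    f3 yuv = rgb2yuv((f3){ 0.8f, 0.2f, 0.1f });
    yuv.y += 0.5f;
    yuv.z += 0.5f;

    /* Accumulating w such samples accumulates w * 0.5 of bias in U and V. */
    float w = 64.0f;
    f3 acc = { yuv.x * w, yuv.y * w, yuv.z * w };

    /* colorclip's un-bias: subtract 0.5 * density, then convert back.
     * Because the conversion is linear, un-biasing the density-weighted
     * sum is equivalent to un-biasing each sample individually. */
    acc.y -= 0.5f * w;
    acc.z -= 0.5f * w;
    f3 rgb = yuv2rgb((f3){ acc.x / w, acc.y / w, acc.z / w });
    printf("%.3f %.3f %.3f\n", rgb.x, rgb.y, rgb.z); /* ~0.800 0.200 0.100 */
    return 0;
}
```
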
diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py
index 7cacc81..7cb2514 100644
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@@ -361,9 +361,15 @@ void iter(
 {{if info.acc_mode == 'atomic'}}
     asm volatile ({{crep("""
 {
+    // To prevent overflow, we need to flush each pixel before the density
+    // wraps at 1024 points. This atomic segment performs writes to the
+    // integer buffer, occasionally checking the results. If they exceed a
+    // threshold, it zeros that cell in the integer buffer, converts the
+    // former contents to floats, and adds them to the float4 buffer.
+
     .reg .pred p;
-    .reg .u32 off, color, hi, lo, d, r, g, b;
-    .reg .f32 colorf, rf, gf, bf, df;
+    .reg .u32 off, color, hi, lo, d, y, u, v;
+    .reg .f32 colorf, yf, uf, vf, df;
     .reg .u64 ptr, val;
 
     // TODO: coord dithering better, or pre-supersampled palette?
@@ -371,38 +377,59 @@ void iter(
     cvt.rni.u32.f32 color, colorf;
     shl.b32         color,  color,  3;
 
+    // Load the pre-packed 64-bit uint from the palette surf
     suld.b.2d.v2.b32.clamp {lo, hi}, [flatpal, {color, %2}];
     mov.b64         val,    {lo, hi};
+
+    // Calculate the output address in the atomic integer accumulator
     shl.b32         off,    %3,     3;
     cvt.u64.u32     ptr,    off;
     add.u64         ptr,    ptr,    %4;
+
+    // 97% of the time, do an atomic add, then jump to the end without
+    // stalling the thread waiting for the data value
     setp.le.f32     p,      %5,     0.97;
@p  red.global.add.u64 [ptr],  val;
@p  bra             oflow_end;
+
+    // 3% of the time, do the atomic add, and wait for the results
     atom.global.add.u64 val,   [ptr],  val;
     mov.b64         {lo, hi},   val;
+
+    // If the density is less than 256, jump to the end
     setp.lo.u32     p,      hi,     (256 << 22);
@p  bra             oflow_end;
+
+    // Atomically swap the integer cell with 0 and read its current value
     atom.global.exch.b64 val,  [ptr],  0;
     mov.b64         {lo, hi},   val;
+
+    // If the integer cell is zero, another thread has captured the full
+    // value between the first atomic read and the second, so we can skip
+    // to the end again.
+    setp.eq.u32     p,      hi,     0;
+@p  bra             oflow_end;
+
+    // Extract the values from the packed integer, convert to floats, and add
+    // them to the floating-point buffer.
     shr.u32         d,      hi,     22;
-    bfe.u32         r,      hi,     4,  18;
-    bfe.u32         g,      lo,     18, 14;
-    bfi.b32         g,      hi,     g,  14, 4;
-    and.b32         b,      lo,     ((1<<18)-1);
-    cvt.rn.f32.u32  rf,     r;
-    cvt.rn.f32.u32  gf,     g;
-    cvt.rn.f32.u32  bf,     b;
+    bfe.u32         y,      hi,     4,  18;
+    bfe.u32         u,      lo,     18, 14;
+    bfi.b32         u,      hi,     u,  14, 4;
+    and.b32         v,      lo,     ((1<<18)-1);
+    cvt.rn.f32.u32  yf,     y;
+    cvt.rn.f32.u32  uf,     u;
+    cvt.rn.f32.u32  vf,     v;
     cvt.rn.f32.u32  df,     d;
-    mul.rn.ftz.f32  rf,     rf,     (1.0/255.0);
-    mul.rn.ftz.f32  gf,     gf,     (1.0/255.0);
-    mul.rn.ftz.f32  bf,     bf,     (1.0/255.0);
+    mul.rn.ftz.f32  yf,     yf,     (1.0/255.0);
+    mul.rn.ftz.f32  uf,     uf,     (1.0/255.0);
+    mul.rn.ftz.f32  vf,     vf,     (1.0/255.0);
     shl.b32         off,    %3,     4;
     cvt.u64.u32     ptr,    off;
     add.u64         ptr,    ptr,    %6;
-    red.global.add.f32 [ptr],      rf;
-    red.global.add.f32 [ptr+4],    gf;
-    red.global.add.f32 [ptr+8],    bf;
+    red.global.add.f32 [ptr],      yf;
+    red.global.add.f32 [ptr+4],    uf;
+    red.global.add.f32 [ptr+8],    vf;
     red.global.add.f32 [ptr+12],   df;
 oflow_end:
 }
@@ -445,9 +472,9 @@ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
     if (i >= nbins) return;
     asm volatile ({{crep("""
 {
-    .reg .u32 off, hi, lo, d, r, g, b;
+    .reg .u32 off, hi, lo, d, y, u, v;
     .reg .u64 val, ptr;
-    .reg .f32 rf, gf, bf, df, rg, gg, bg, dg;
+    .reg .f32 yf, uf, vf, df, yg, ug, vg, dg;
 
     // TODO: use explicit movs to handle this
     shl.b32     off,    %0,     3;
@@ -457,21 +484,22 @@ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
     shl.b32     off,    %0,     4;
     cvt.u64.u32 ptr,    off;
     add.u64     ptr,    ptr,    %2;
-    ld.global.v4.f32 {rg,gg,bg,dg},    [ptr];
+    ld.global.v4.f32 {yg,ug,vg,dg},    [ptr];
 
     shr.u32     d,  hi, 22;
-    bfe.u32     r,  hi, 4,  18;
-    bfe.u32     g,  lo, 18, 14;
-    bfi.b32     g,  hi, g,  14, 4;
-    and.b32     b,  lo, ((1<<18)-1);
-    cvt.rn.f32.u32  rf, r;
-    cvt.rn.f32.u32  gf, g;
-    cvt.rn.f32.u32  bf, b;
+    bfe.u32     y,  hi, 4,  18;
+    bfe.u32     u,  lo, 18, 14;
+    bfi.b32     u,  hi, u,  14, 4;
+    and.b32     v,  lo, ((1<<18)-1);
+    cvt.rn.f32.u32  yf, y;
+    cvt.rn.f32.u32  uf, u;
+    cvt.rn.f32.u32  vf, v;
     cvt.rn.f32.u32  df, d;
-    fma.rn.ftz.f32  rg, rf, (1.0/255.0), rg;
-    fma.rn.ftz.f32  gg, gf, (1.0/255.0), gg;
-    fma.rn.ftz.f32  bg, bf, (1.0/255.0), bg;
+    fma.rn.ftz.f32  yg, yf, (1.0/255.0), yg;
+    fma.rn.ftz.f32  ug, uf, (1.0/255.0), ug;
+    fma.rn.ftz.f32  vg, vf, (1.0/255.0), vg;
+
     add.rn.ftz.f32  dg, df, dg;
-    st.global.v4.f32 [ptr],    {rg,gg,bg,dg};
+    st.global.v4.f32 [ptr],    {yg,ug,vg,dg};
 }
 """)}} :: "r"(i), "l"(atom_ptr), "l"(out_ptr));
 }
@@ -640,4 +668,3 @@
             NWARPS = self.NTHREADS / 32,
             std_xforms = [n for n in sorted(genome.xforms) if n != 'final'],
             **globals())
-
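
For reference, the cell layout implied by the packing in interp_palette_flat
and the unpacking PTX above: density in hi[31:22], the Y sum in hi[21:4], the
U sum split across hi[3:0] (high bits) and lo[31:18], and the V sum in
lo[17:0]. Each colour field is 18 bits — an 8-bit sample plus 10 bits of
headroom — matching the 10-bit density that would wrap at 1024 points, which
is why cells are flushed once density reaches 256. A plain-C sketch of the
scheme (the helper names are mine, not part of the patch):

```c
#include <stdint.h>
#include <stdio.h>

/* Pack one 8-bit YUV sample the way interp_palette_flat does:
 * hi = density 1 at bit 22, Y at bits 4-11; lo = U at bits 18-25, V at 0-7.
 * A single 64-bit add then carries correctly across field boundaries, which
 * is what lets red.global.add.u64 accumulate all four fields at once. */
static uint64_t pack(uint32_t y, uint32_t u, uint32_t v) {
    uint32_t hi = (1u << 22) | (y << 4);
    uint32_t lo = (u << 18) | v;
    return ((uint64_t)hi << 32) | lo;
}

/* Unpack an accumulated cell the way the PTX above does. */
static void unpack(uint64_t cell, uint32_t *d, uint32_t *y,
                   uint32_t *u, uint32_t *v) {
    uint32_t hi = (uint32_t)(cell >> 32), lo = (uint32_t)cell;
    *d = hi >> 22;                          /* shr.u32 d, hi, 22          */
    *y = (hi >> 4) & ((1u << 18) - 1);      /* bfe.u32 y, hi, 4, 18       */
    *u = ((hi & 0xFu) << 14) | (lo >> 18);  /* bfe.u32 + bfi.b32          */
    *v = lo & ((1u << 18) - 1);             /* and.b32 v, lo, ((1<<18)-1) */
}

int main(void) {
    uint64_t cell = 0;
    for (int i = 0; i < 200; i++)   /* stays under the flush threshold of 256 */
        cell += pack(200, 130, 90);
    uint32_t d, y, u, v;
    unpack(cell, &d, &y, &u, &v);
    printf("d=%u y=%u u=%u v=%u\n", d, y, u, v); /* d=200 y=40000 u=26000 v=18000 */
    return 0;
}
```
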
diff --git a/cuburn/code/util.py b/cuburn/code/util.py
index 1dfa908..1bd1614 100644
--- a/cuburn/code/util.py
+++ b/cuburn/code/util.py
@@ -154,8 +154,8 @@ void write_half(float &xy, float x, float y, float den) {
 }
 
-/* This conversion uses the JPEG full-range standard, though it does *not* add
- * an offset to UV to bias them into the positive regime. */
+/* This conversion uses the JPEG full-range standard. Note that UV have range
+ * [-0.5, 0.5], so consider biasing the results. */
 __device__
 float3 rgb2yuv(float3 rgb) {
     return make_float3(
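
The revised comment's range claim is easy to verify: U and V are zero-sum
linear combinations of R, G, and B, so over the RGB unit cube their extremes
land exactly at ±0.5, which is why the +0.5 bias applied during accumulation
is sufficient to keep packed values non-negative. A quick C check, using the
fact that a linear map over a box takes its extremes at the corners
(coefficients again assumed to be the JPEG full-range ones, since the hunk
ends before the function body):

```c
#include <stdio.h>

int main(void) {
    float umin = 0.f, umax = 0.f, vmin = 0.f, vmax = 0.f;
    /* Enumerate the eight corners of the RGB unit cube. */
    for (int i = 0; i < 8; i++) {
        float r = (float)(i & 1);
        float g = (float)((i >> 1) & 1);
        float b = (float)((i >> 2) & 1);
        float u = -0.168736f*r - 0.331264f*g + 0.5f*b;
        float v =  0.5f*r - 0.418688f*g - 0.081312f*b;
        if (u < umin) umin = u;
        if (u > umax) umax = u;
        if (v < vmin) vmin = v;
        if (v > vmax) vmax = v;
    }
    printf("U in [%g, %g], V in [%g, %g]\n", umin, umax, vmin, vmax);
    /* prints: U in [-0.5, 0.5], V in [-0.5, 0.5] */
    return 0;
}
```
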