From 42f9ae282450cfa89b4e4add393917e414785df0 Mon Sep 17 00:00:00 2001
From: Steven Robertson <steven@strobe.cc>
Date: Thu, 25 Dec 2014 14:36:02 -0800
Subject: [PATCH] Pixlib fixes, a new yuv420p10 pix format, tests.

---
 cuburn/code/output.py            |  80 ++++++++++++++++----
 cuburn/code/tests/__init__.py    |   0
 cuburn/code/tests/test_output.py | 124 +++++++++++++++++++++++++++++++
 3 files changed, 188 insertions(+), 16 deletions(-)
 create mode 100644 cuburn/code/tests/__init__.py
 create mode 100644 cuburn/code/tests/test_output.py

diff --git a/cuburn/code/output.py b/cuburn/code/output.py
index 07cebbe..87a802b 100644
--- a/cuburn/code/output.py
+++ b/cuburn/code/output.py
@@ -1,19 +1,17 @@
 from util import devlib, ringbuflib
 from mwc import mwclib
 
-ditherlib = devlib(deps=[mwclib], defs=r'''
+pixfmtlib = devlib(deps=[ringbuflib, mwclib], defs=r'''
 // Clamp an input between 0 and a given peak (inclusive), dithering its output,
 // with full clamping for pixels that are true-black for compressibility.
 __device__ float dclampf(mwc_st &rctx, float peak, float in) {
   float ret = 0.0f;
   if (in > 0.0f) {
-    ret = fminf(peak, fmaxf(0.0f, in * peak + 0.49f * mwc_next_11(rctx)));
+    ret = fminf(peak, in * peak + 0.99f * mwc_next_01(rctx));  // no lower clamp: in > 0 here; assumes peak > 0 and mwc_next_01 >= 0 (TODO confirm)
   }
   return ret;
 }
-''')
 
-rgba8lib = devlib(deps=[ringbuflib, mwclib, ditherlib], defs=r'''
 // Perform a conversion from float32 values to uint8 ones, applying
 // pixel- and channel-independent dithering to reduce suprathreshold banding
 // artifacts. Clamps values larger than 1.0f.
@@ -44,9 +42,7 @@ __global__ void f32_to_rgba_u8(
     dst[idst] = out;
     rctxs[rb_incr(rb->tail, tid)] = rctx;
 }
-''')
 
-rgba16lib = devlib(deps=[ringbuflib, mwclib, ditherlib], defs=r'''
 // Perform a conversion from float32 values to uint16 ones, as above.
 __global__ void f32_to_rgba_u16(
     ushort4 *dst, const float4 *src,
@@ -73,9 +69,7 @@ __global__ void f32_to_rgba_u16(
     dst[idst] = out;
     rctxs[rb_incr(rb->tail, tid)] = rctx;
 }
-''')
 
-yuv444plib = devlib(deps=[ringbuflib, mwclib, ditherlib], defs=r'''
 // Convert from rgb444 to planar YUV with no chroma subsampling.
 // Uses JPEG full-range color primaries.
 __global__ void f32_to_yuv444p(
@@ -106,9 +100,7 @@ __global__ void f32_to_yuv444p(
     dst[idst] = out.z;
     rctxs[rb_incr(rb->tail, tid)] = rctx;
 }
-''')
 
-yuv444p10lib = devlib(deps=[ringbuflib, mwclib, ditherlib], defs=r'''
 // Convert from rgb444 to planar YUV 10-bit, using JPEG full-range primaries.
 // TODO(strobe): Decide how YouTube will handle Rec. 2020, and then do that here.
 __global__ void f32_to_yuv444p10(
@@ -125,20 +117,76 @@ __global__ void f32_to_yuv444p10(
     mwc_st rctx = rctxs[rb_incr(rb->head, tid)];
 
     float4 in = src[isrc];
-    uchar3 out = make_uchar3(
-        dclampf(rctx, 1024.0f, 0.299f      * in.x + 0.587f     * in.y + 0.114f     * in.z),
-        dclampf(rctx, 1024.0f, -0.168736f  * in.x - 0.331264f  * in.y + 0.5f       * in.z + 0.5f),
-        dclampf(rctx, 1024.0f, 0.5f        * in.x - 0.418688f  * in.y - 0.081312f  * in.z + 0.5f)
+    ushort3 out = make_ushort3(
+        dclampf(rctx, 1023.0f, 0.299f      * in.x + 0.587f     * in.y + 0.114f     * in.z),
+        dclampf(rctx, 1023.0f, -0.168736f  * in.x - 0.331264f  * in.y + 0.5f       * in.z + 0.5f),
+        dclampf(rctx, 1023.0f, 0.5f        * in.x - 0.418688f  * in.y - 0.081312f  * in.z + 0.5f)
     );
 
     int idst = dstride * y + x;
     dst[idst] = out.x;
     idst += dstride * height;
-    dst[idst] = out.y;
+    dst[idst] = out.y;  // Cb as dithered and clamped by dclampf into `out`
     idst += dstride * height;
     dst[idst] = out.z;
+
+    rctxs[rb_incr(rb->tail, tid)] = rctx;
+}
+
+// Convert from rgb444 to planar YUV 10-bit, using JPEG full-range primaries.
+// Every thread writes one luma sample; threads with 2x < dstride and
+// 2y < height also write one Cb and one Cr sample, each the average of a 2x2
+// block of source pixels weighted by every pixel's w component.
+__global__ void f32_to_yuv420p10(
+    uint16_t *dst, const float4 *src,
+    int gutter, int dstride, int sstride, int height,
+    ringbuf *rb, mwc_st *rctxs)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= dstride || y >= height) return;  // x and y are zero-based
+    int tid = blockDim.x * threadIdx.y + threadIdx.x;
+    mwc_st rctx = rctxs[rb_incr(rb->head, tid)];
+
+    // Perform luma using real addressing
+    int isrc = sstride * (y + gutter) + x + gutter;
+    int idst = dstride * y + x;
+    float4 in = src[isrc];
+    dst[idst] = dclampf(rctx, 1023.0f, 0.299f      * in.x + 0.587f     * in.y + 0.114f     * in.z);
+
+    // Drop into subsampling mode for chroma components. This is a branch, not
+    // an early return, so every thread still stores its RNG state at the end.
+    if (x * 2 < dstride && y * 2 < height) {
+        // Recompute addressing and collect weighted averages
+        // TODO(strobe): characterize overflow here
+        isrc = sstride * (y * 2 + gutter) + x * 2 + gutter;
+        in = src[isrc];
+        float sum = in.w + 1e-12;  // keeps cb / sum finite when every w is 0
+        float cb = in.w * (-0.168736f  * in.x - 0.331264f  * in.y + 0.5f       * in.z);
+        float cr = in.w * (0.5f        * in.x - 0.418688f  * in.y - 0.081312f  * in.z);
+        in = src[isrc + 1];  // right neighbor
+        sum += in.w;
+        cb += in.w * (-0.168736f  * in.x - 0.331264f  * in.y + 0.5f       * in.z);
+        cr += in.w * (0.5f        * in.x - 0.418688f  * in.y - 0.081312f  * in.z);
+        isrc += sstride;  // next source row
+        in = src[isrc];
+        sum += in.w;
+        cb += in.w * (-0.168736f  * in.x - 0.331264f  * in.y + 0.5f       * in.z);
+        cr += in.w * (0.5f        * in.x - 0.418688f  * in.y - 0.081312f  * in.z);
+        in = src[isrc + 1];  // right neighbor on that row
+        sum += in.w;
+        cb += in.w * (-0.168736f  * in.x - 0.331264f  * in.y + 0.5f       * in.z);
+        cr += in.w * (0.5f        * in.x - 0.418688f  * in.y - 0.081312f  * in.z);
+
+        // For this to work, dstride must equal the output frame width
+        // and be a multiple of four.
+        idst = dstride * height + dstride / 2 * y + x;
+        dst[idst] = dclampf(rctx, 1023.0f, cb / sum + 0.5f);
+        idst += dstride * height / 4;
+        dst[idst] = dclampf(rctx, 1023.0f, cr / sum + 0.5f);
+    }
+
     rctxs[rb_incr(rb->tail, tid)] = rctx;
 }
 ''')
 
-pixfmtlib = devlib(deps=[rgba8lib, rgba16lib, yuv444plib, yuv444p10lib])
diff --git a/cuburn/code/tests/__init__.py b/cuburn/code/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/cuburn/code/tests/test_output.py b/cuburn/code/tests/test_output.py
new file mode 100644
index 0000000..e82d4ee
--- /dev/null
+++ b/cuburn/code/tests/test_output.py
@@ -0,0 +1,124 @@
+import argparse
+import unittest
+import numpy as np
+
+import pycuda.driver as cuda
+import pycuda.autoinit
+
+from cuburn import render
+from cuburn.output import launchC
+from cuburn.code import output
+from cuburn.code import util
+
+class ProfileTest(unittest.TestCase, util.ClsMod):
+    """Run the pixel format kernels on known inputs and check their output."""
+    lib = util.devlib(deps=[output.pixfmtlib])
+
+    def __init__(self, *args, **kwargs):
+        super(ProfileTest, self).__init__(*args, **kwargs)
+        self.load()
+        self.fb = render.Framebuffers()
+        self.dim = self.fb.calc_dim(640, 360)
+        self.fb.alloc(self.dim)
+
+    def test_clamping_below_0(self):
+        """Negative input gives zero luma and mid-scale 8-bit chroma."""
+        ins = np.empty((self.dim.ah, self.dim.astride, 4), dtype='f4')
+        ins[:] = -1
+        cuda.memcpy_htod(self.fb.d_front, ins)
+
+        launchC('f32_to_yuv444p', self.mod, None, self.dim, self.fb,
+                self.fb.d_rb, self.fb.d_seeds)
+
+        outs = np.empty((3, self.dim.h, self.dim.w), dtype='u1')
+        cuda.memcpy_dtoh(outs, self.fb.d_back)
+        self.assertTrue(np.all(outs[0] == 0))
+        for chroma in outs[1:]:
+            self.assertTrue(np.all((chroma >= 127) & (chroma <= 128)))
+
+    def test_clamping_above_1(self):
+        """Input above 1.0 gives luma of 255 and mid-scale 8-bit chroma."""
+        ins = np.empty((self.dim.ah, self.dim.astride, 4), dtype='f4')
+        ins[:] = 5
+        cuda.memcpy_htod(self.fb.d_front, ins)
+
+        launchC('f32_to_yuv444p', self.mod, None, self.dim, self.fb,
+                self.fb.d_rb, self.fb.d_seeds)
+
+        outs = np.empty((3, self.dim.h, self.dim.w), dtype='u1')
+        cuda.memcpy_dtoh(outs, self.fb.d_back)
+        self.assertTrue(np.all(outs[0] == 255))
+        for chroma in outs[1:]:
+            self.assertTrue(np.all((chroma >= 127) & (chroma <= 128)))
+
+    def test_yuv444p10_zero_passthru(self):
+        """All-zero input gives zero luma and mid-scale 10-bit chroma."""
+        ins = np.zeros((self.dim.ah, self.dim.astride, 4), dtype='f4')
+        cuda.memcpy_htod(self.fb.d_front, ins)
+
+        launchC('f32_to_yuv444p10', self.mod, None, self.dim, self.fb,
+                self.fb.d_rb, self.fb.d_seeds)
+
+        outs = np.empty((3, self.dim.h, self.dim.w), dtype='u2')
+        cuda.memcpy_dtoh(outs, self.fb.d_back)
+        self.assertTrue(np.all(outs[0] == 0))
+        for chroma in outs[1:]:
+            self.assertTrue(np.all((510 < chroma) & (chroma < 513)))
+
+    def test_yuv444p10_chroma_address_preservation(self):
+        """Lit pixels land at the same coordinates in luma and first chroma."""
+        # Set everything to 0 except a few pixels
+        ins = np.zeros((self.dim.ah, self.dim.astride, 4), dtype='f4')
+        ins[self.fb.gutter,self.fb.gutter,:] = [0, 1, 0, 1]
+        ins[self.fb.gutter+1,self.fb.gutter+1,:] = [0, 1, 0, 1]
+        cuda.memcpy_htod(self.fb.d_front, ins)
+
+        launchC('f32_to_yuv444p10', self.mod, None, self.dim, self.fb,
+                self.fb.d_rb, self.fb.d_seeds)
+
+        outs = np.empty((3, self.dim.h, self.dim.w), dtype='u2')
+        cuda.memcpy_dtoh(outs, self.fb.d_back)
+        self.assertTrue(outs[0,0,0] > 0)
+        self.assertTrue(outs[0,1,1] > 0)
+        self.assertTrue(outs[1,0,0] < 500)
+        self.assertTrue(outs[1,1,1] < 500)
+
+    def test_yuv420p10_chroma_address_preservation(self):
+        """Check luma placement and 2x2 chroma averaging for yuv420p10."""
+        # Set everything to 0 except a few pixels
+        ins = np.zeros((self.dim.ah, self.dim.astride, 4), dtype='f4')
+
+        # chroma loc (0,0): one pixel on
+        ins[self.fb.gutter,self.fb.gutter,:] = [0, 1, 0, 1]
+
+        # chroma loc (1,1): average of two strong pixels
+        ins[self.fb.gutter+2,self.fb.gutter+2,:] = [0, 1, 0, 1]
+        ins[self.fb.gutter+3,self.fb.gutter+3,:] = [1, 0, 0, 1]
+        cuda.memcpy_htod(self.fb.d_front, ins)
+
+        launchC('f32_to_yuv420p10', self.mod, None, self.dim, self.fb,
+                self.fb.d_rb, self.fb.d_seeds)
+
+        w, h = self.dim.w, self.dim.h
+        outs = np.empty((3, h, w), dtype='u2')
+        cuda.memcpy_dtoh(outs, self.fb.d_back)
+        out_cb = outs[1,:h//4].reshape(h//2,w//2)  # kernel writes Cb first
+        out_cr = outs[1,h//4:h//2].reshape(h//2,w//2)  # Cr follows Cb
+
+        # chroma blocking doesn't affect luma blocking
+        self.assertTrue(outs[0,0,0] > 0)
+        self.assertTrue(outs[0,1,0] == 0)
+        self.assertTrue(outs[0,0,1] == 0)
+        self.assertTrue(outs[0,1,1] == 0)
+
+        # locations are preserved
+        self.assertTrue(outs[0,2,2] > 0)
+        self.assertTrue(outs[0,3,3] > 0)
+
+        # chroma from first pixel makes it through, neighbor is fine
+        self.assertTrue(172 <= out_cb[0,0] <= 174)
+        self.assertTrue(511 <= out_cb[0,1] <= 512)
+        self.assertTrue(511 <= out_cb[1,0] <= 512)
+        # chroma loc (1,1): green and red average to 0.25 (Cb) and 0.5407 (Cr)
+        self.assertTrue(255 <= out_cb[1,1] <= 257)
+        self.assertTrue(553 <= out_cr[1,1] <= 555)