Pixlib fixes, a new yuv420p10 pix format, tests.

2025-09-17 20:11:12 -04:00 · 2014-12-25 14:36:02 -08:00
parent 49ccf75c5f
commit 42f9ae2824
3 changed files with 188 additions and 16 deletions
--- a/cuburn/code/output.py
+++ b/cuburn/code/output.py
@ -1,19 +1,17 @@
 from util import devlib, ringbuflib
 from mwc import mwclib
-ditherlib = devlib(deps=[mwclib], defs=r'''
+pixfmtlib = devlib(deps=[ringbuflib, mwclib], defs=r'''
 // Clamp an input between 0 and a given peak (inclusive), dithering its output,
 // with full clamping for pixels that are true-black for compressibility.
 //
 // rctx: random-number state, taken by reference and handed to the mwc_next_*
 //       draw (presumably advanced by it -- confirm in mwclib).
 // peak: upper bound of the result, and the factor `in` is scaled by.
 // in:   sample to convert.
 // Returns the scaled, noise-offset value as a float no larger than peak;
 // the kernels in this file store it into integer pixel types.
 __device__ float dclampf(mwc_st &rctx, float peak, float in) {
  float ret = 0.0f;
  // Inputs <= 0 never enter the branch below: they return exactly 0 and do
  // not make a random draw.
  // NOTE(review): the "+" revision drops the fmaxf(0.0f, ...) floor that the
  // "-" revision had; that is only safe if mwc_next_01 never returns a
  // negative value -- confirm against mwclib.
  if (in > 0.0f) {
-    ret = fminf(peak, fmaxf(0.0f, in * peak + 0.49f * mwc_next_11(rctx)));
+    ret = fminf(peak, in * peak + 0.99f * mwc_next_01(rctx));
  }
  return ret;
 }
 ''')
 rgba8lib = devlib(deps=[ringbuflib, mwclib, ditherlib], defs=r'''
 // Perform a conversion from float32 values to uint8 ones, applying
 // pixel- and channel-independent dithering to reduce suprathreshold banding
 // artifacts. Clamps values larger than 1.0f.
@ -44,9 +42,7 @@ __global__ void f32_to_rgba_u8(
    dst[idst] = out;
    rctxs[rb_incr(rb->tail, tid)] = rctx;
 }
 ''')
 rgba16lib = devlib(deps=[ringbuflib, mwclib, ditherlib], defs=r'''
 // Perform a conversion from float32 values to uint16 ones, as above.
 __global__ void f32_to_rgba_u16(
    ushort4 *dst, const float4 *src,
@ -73,9 +69,7 @@ __global__ void f32_to_rgba_u16(
    dst[idst] = out;
    rctxs[rb_incr(rb->tail, tid)] = rctx;
 }
 ''')
 yuv444plib = devlib(deps=[ringbuflib, mwclib, ditherlib], defs=r'''
 // Convert from rgb444 to planar YUV with no chroma subsampling.
 // Uses JPEG full-range color primaries.
 __global__ void f32_to_yuv444p(
@ -106,9 +100,7 @@ __global__ void f32_to_yuv444p(
    dst[idst] = out.z;
    rctxs[rb_incr(rb->tail, tid)] = rctx;
 }
 ''')
 yuv444p10lib = devlib(deps=[ringbuflib, mwclib, ditherlib], defs=r'''
 // Convert from rgb444 to planar YUV 10-bit, using JPEG full-range primaries.
 // All three planes (Y, then Cb, then Cr) are written at full resolution,
 // dstride * height samples apart, and every sample goes through dclampf so
 // that it is dithered and limited to the 10-bit range [0, 1023].
 // TODO(strobe): Decide how YouTube will handle Rec. 2020, and then do that here.
 __global__ void f32_to_yuv444p10(
@ -125,20 +117,76 @@ __global__ void f32_to_yuv444p10(
    mwc_st rctx = rctxs[rb_incr(rb->head, tid)];
    float4 in = src[isrc];
-    uchar3 out = make_uchar3(
+    ushort3 out = make_ushort3(
-        dclampf(rctx, 1024.0f, 0.299f      * in.x + 0.587f     * in.y + 0.114f     * in.z),
+        dclampf(rctx, 1023.0f, 0.299f      * in.x + 0.587f     * in.y + 0.114f     * in.z),
-        dclampf(rctx, 1024.0f, -0.168736f  * in.x - 0.331264f  * in.y + 0.5f       * in.z + 0.5f),
+        dclampf(rctx, 1023.0f, -0.168736f  * in.x - 0.331264f  * in.y + 0.5f       * in.z + 0.5f),
-        dclampf(rctx, 1024.0f, 0.5f        * in.x - 0.418688f  * in.y - 0.081312f  * in.z + 0.5f)
+        dclampf(rctx, 1023.0f, 0.5f        * in.x - 0.418688f  * in.y - 0.081312f  * in.z + 0.5f)
    );
    // Luma plane first.
    int idst = dstride * y + x;
    dst[idst] = out.x;
    // Cb plane. Store the dithered, clamped sample computed above: writing the
    // raw 1023.0f * (...) product here would skip both the dither and the
    // [0, 1023] clamp, so out-of-range inputs would convert to invalid
    // 10-bit values, and out.y would be computed for nothing.
    idst += dstride * height;
    dst[idst] = out.y;
    // Cr plane.
    idst += dstride * height;
    dst[idst] = out.z;
    rctxs[rb_incr(rb->tail, tid)] = rctx;
 }
 // Convert from rgb444 to planar YUV 10-bit, using JPEG full-range primaries.
 // Perform subsampling of chroma using weighted averages.
 //
 // Output layout in dst: a full-resolution luma plane of dstride * height
 // samples, then a Cb plane and a Cr plane of (dstride / 2) * (height / 2)
 // samples each. Every in-frame thread writes one luma sample. Threads whose
 // (x, y) also falls inside the half-size chroma grid additionally write one
 // Cb/Cr pair, taken from the 2x2 source block at (2x, 2y) and averaged with
 // each pixel's w component as its weight.
 //
 // dst:     output samples, laid out as described above.
 // src:     source pixels; rows are sstride pixels apart and the image is
 //          inset by `gutter` pixels on the top and left.
 // dstride: output row length in samples. Must equal the output frame width
 //          and be a multiple of four (height presumably has to be even too,
 //          since the chroma planes are addressed as height / 2 rows).
 // rb, rctxs: ring buffer of per-thread random states used for dithering.
 __global__ void f32_to_yuv420p10(
    uint16_t *dst, const float4 *src,
    int gutter, int dstride, int sstride, int height,
    ringbuf *rb, mwc_st *rctxs)
 {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    // Valid pixels are [0, dstride) x [0, height). The comparison has to be
    // inclusive: a thread at x == dstride would alias pixel (0, y + 1), and
    // one at y == height would write into the start of the Cb plane.
    if (x >= dstride || y >= height) return;
    int tid = blockDim.x * threadIdx.y + threadIdx.x;
    mwc_st rctx = rctxs[rb_incr(rb->head, tid)];
    // Perform luma using real addressing
    int isrc = sstride * (y + gutter) + x + gutter;
    int idst = dstride * y + x;
    float4 in = src[isrc];
    dst[idst] = dclampf(rctx, 1023.0f, 0.299f      * in.x + 0.587f     * in.y + 0.114f     * in.z);
    // Drop into subsampling mode for chroma components. Only threads inside
    // the half-size chroma grid take part. The bounds are exclusive: a thread
    // at x * 2 == dstride would read past the right edge of the frame and
    // alias column 0 of the next chroma row, and one at y * 2 == height would
    // write Cb into the Cr plane and Cr past the end of the buffer.
    // This is a block rather than an early return so that threads which only
    // produce luma still reach the random-state write-back at the bottom,
    // matching the other pixel-format kernels.
    if (x * 2 < dstride && y * 2 < height) {
        // Recompute addressing and collect weighted averages
        // TODO(strobe): characterize overflow here
        isrc = sstride * (y * 2 + gutter) + x * 2 + gutter;
        // Top-left pixel of the 2x2 block. The tiny bias keeps the division
        // below finite when all four weights are zero.
        in = src[isrc];
        float sum = in.w + 1e-12;
        float cb = in.w * (-0.168736f  * in.x - 0.331264f  * in.y + 0.5f       * in.z);
        float cr = in.w * (0.5f        * in.x - 0.418688f  * in.y - 0.081312f  * in.z);
        // Top-right pixel.
        in = src[isrc + 1];
        sum += in.w;
        cb += in.w * (-0.168736f  * in.x - 0.331264f  * in.y + 0.5f       * in.z);
        cr += in.w * (0.5f        * in.x - 0.418688f  * in.y - 0.081312f  * in.z);
        // Bottom-left pixel.
        isrc += sstride;
        in = src[isrc];
        sum += in.w;
        cb += in.w * (-0.168736f  * in.x - 0.331264f  * in.y + 0.5f       * in.z);
        cr += in.w * (0.5f        * in.x - 0.418688f  * in.y - 0.081312f  * in.z);
        // Bottom-right pixel.
        in = src[isrc + 1];
        sum += in.w;
        cb += in.w * (-0.168736f  * in.x - 0.331264f  * in.y + 0.5f       * in.z);
        cr += in.w * (0.5f        * in.x - 0.418688f  * in.y - 0.081312f  * in.z);
        // For this to work, dstride must equal the output frame width
        // and be a multiple of four.
        // Cb plane starts right after luma; its rows are dstride / 2 long.
        idst = dstride * height + dstride / 2 * y + x;
        dst[idst] = dclampf(rctx, 1023.0f, cb / sum + 0.5f);
        // Cr plane follows the quarter-size Cb plane.
        idst += dstride * height / 4;
        dst[idst] = dclampf(rctx, 1023.0f, cr / sum + 0.5f);
    }
    rctxs[rb_incr(rb->tail, tid)] = rctx;
 }
 ''')
 pixfmtlib = devlib(deps=[rgba8lib, rgba16lib, yuv444plib, yuv444p10lib])
--- a/cuburn/code/tests/init.py
+++ b/cuburn/code/tests/init.py
--- a/cuburn/code/tests/test_output.py
+++ b/cuburn/code/tests/test_output.py
@ -0,0 +1,124 @@
 import argparse
 import unittest
 import numpy as np
 import pycuda.driver as cuda
 import pycuda.autoinit
 from cuburn import render
 from cuburn.output import launchC
 from cuburn.code import output
 from cuburn.code import util
 class ProfileTest(unittest.TestCase, util.ClsMod):
    lib = util.devlib(deps=[output.pixfmtlib])
    def __init__(self, *args, **kwargs):
        """Build the test case and allocate its framebuffers.

        All arguments are forwarded to the base initializer. After that the
        case calls ``self.load()`` and allocates a ``render.Framebuffers``
        sized for a 640x360 frame; the buffers and the computed dimensions
        are kept on ``self.fb`` and ``self.dim`` for the individual tests.
        """
        super(ProfileTest, self).__init__(*args, **kwargs)
        # presumably provided by util.ClsMod and responsible for building
        # self.mod, which the tests pass to launchC -- TODO confirm
        self.load()
        framebuffers = render.Framebuffers()
        self.fb = framebuffers
        self.dim = framebuffers.calc_dim(640, 360)
        framebuffers.alloc(self.dim)
    def test_clamping_below_0(self):
        """A frame of all -1 converts to zero luma and mid-range chroma.

        Runs ``f32_to_yuv444p`` over a source buffer filled with -1 and
        checks the three 8-bit output planes.
        """
        src = np.empty((self.dim.ah, self.dim.astride, 4), dtype='f4')
        src[:] = -1
        cuda.memcpy_htod(self.fb.d_front, src)
        launchC('f32_to_yuv444p', self.mod, None, self.dim, self.fb,
                self.fb.d_rb, self.fb.d_seeds)
        planes = np.empty((3, self.dim.h, self.dim.w), dtype='u1')
        cuda.memcpy_dtoh(planes, self.fb.d_back)
        # Plane 0 is luma: negative input must come out as exactly zero.
        self.assertTrue(np.all(planes[0] == 0))
        # Planes 1 and 2 are chroma: every sample must be 127 or 128.
        for chroma in planes[1:]:
            self.assertTrue(np.all(chroma >= 127))
            self.assertTrue(np.all(chroma <= 128))
    def test_clamping_above_1(self):
        """A frame of all 5 converts to full-scale luma and mid-range chroma.

        Runs ``f32_to_yuv444p`` over a source buffer filled with 5 and
        checks the three 8-bit output planes.
        """
        src = np.empty((self.dim.ah, self.dim.astride, 4), dtype='f4')
        src[:] = 5
        cuda.memcpy_htod(self.fb.d_front, src)
        launchC('f32_to_yuv444p', self.mod, None, self.dim, self.fb,
                self.fb.d_rb, self.fb.d_seeds)
        planes = np.empty((3, self.dim.h, self.dim.w), dtype='u1')
        cuda.memcpy_dtoh(planes, self.fb.d_back)
        # Plane 0 is luma: over-range input must saturate at 255.
        self.assertTrue(np.all(planes[0] == 255))
        # Planes 1 and 2 are chroma: every sample must be 127 or 128.
        for chroma in planes[1:]:
            self.assertTrue(np.all(chroma >= 127))
            self.assertTrue(np.all(chroma <= 128))
    def test_yuv444p10_zero_passthru(self):
        """An all-zero frame stays black with centered 10-bit chroma.

        Runs ``f32_to_yuv444p10`` over a zeroed source buffer and checks
        the three 16-bit output planes.
        """
        src = np.zeros((self.dim.ah, self.dim.astride, 4), dtype='f4')
        cuda.memcpy_htod(self.fb.d_front, src)
        launchC('f32_to_yuv444p10', self.mod, None, self.dim, self.fb,
                self.fb.d_rb, self.fb.d_seeds)
        planes = np.empty((3, self.dim.h, self.dim.w), dtype='u2')
        cuda.memcpy_dtoh(planes, self.fb.d_back)
        # Plane 0 is luma: zero in, zero out.
        self.assertTrue(np.all(planes[0] == 0))
        # Planes 1 and 2 are chroma: every sample must be 511 or 512.
        for chroma in planes[1:]:
            self.assertTrue(np.all(510 < chroma))
            self.assertTrue(np.all(chroma < 513))
    def test_yuv444p10_chroma_address_preservation(self):
        """Lit source pixels land at the same coordinates in Y and plane 1.

        Lights two pixels on the diagonal of an otherwise zero frame (just
        inside the gutter), runs ``f32_to_yuv444p10`` and checks that the
        matching output positions carry non-zero luma and a plane-1 value
        below 500.
        """
        gutter = self.fb.gutter
        lit_pixel = [0, 1, 0, 1]
        lit_coords = [(0, 0), (1, 1)]
        src = np.empty((self.dim.ah, self.dim.astride, 4), dtype='f4')
        # Set everything to 0 except a few pixels
        src[:] = 0
        for row, col in lit_coords:
            src[gutter + row, gutter + col, :] = lit_pixel
        cuda.memcpy_htod(self.fb.d_front, src)
        launchC('f32_to_yuv444p10', self.mod, None, self.dim, self.fb,
                self.fb.d_rb, self.fb.d_seeds)
        planes = np.empty((3, self.dim.h, self.dim.w), dtype='u2')
        cuda.memcpy_dtoh(planes, self.fb.d_back)
        # Output coordinates exclude the gutter, so the lit pixels are
        # expected at (0, 0) and (1, 1).
        for row, col in lit_coords:
            self.assertTrue(planes[0, row, col] > 0)
        for row, col in lit_coords:
            self.assertTrue(planes[1, row, col] < 500)
    def test_yuv420p10_chroma_address_preservation(self):
        ins = np.empty((self.dim.ah, self.dim.astride, 4), dtype='f4')
        # Set everything to 0 except a few pixels
        ins[:] = 0
        # chroma loc (0,0): one pixel on
        ins[self.fb.gutter,self.fb.gutter,:] = [0, 1, 0, 1]
        # chroma loc (1,1): average of two strong pixels
        ins[self.fb.gutter+2,self.fb.gutter+2,:] = [0, 1, 0, 1]
        ins[self.fb.gutter+3,self.fb.gutter+3,:] = [1, 0, 0, 1]
        cuda.memcpy_htod(self.fb.d_front, ins)
        launchC('f32_to_yuv420p10', self.mod, None, self.dim, self.fb,
                self.fb.d_rb, self.fb.d_seeds)
        w, h = self.dim.w, self.dim.h
        outs = np.empty((3, h, w), dtype='u2')
        cuda.memcpy_dtoh(outs, self.fb.d_back)
        out_cr = outs[1,:h/4].reshape(h/2,w/2)
        out_cb = outs[1,h/4:h/2].reshape(h/2,w/2)
        # chroma blocking doesn't affect luma blocking
        self.assertTrue(outs[0,0,0] > 0)
        self.assertTrue(outs[0,1,0] == 0)
        self.assertTrue(outs[0,0,1] == 0)
        self.assertTrue(outs[0,1,1] == 0)
        # locations are preserved
        self.assertTrue(outs[0,2,2] > 0)
        self.assertTrue(outs[0,3,3] > 0)
        # chroma from first pixel makes it through, neighbor is fine
        self.assertTrue(172 <= out_cr[0,0] <= 174)
        self.assertTrue(511 <= out_cr[0,1] <= 512)
        self.assertTrue(511 <= out_cr[1,0] <= 512)