From 42f9ae282450cfa89b4e4add393917e414785df0 Mon Sep 17 00:00:00 2001
From: Steven Robertson
Date: Thu, 25 Dec 2014 14:36:02 -0800
Subject: [PATCH] Pixlib fixes, a new yuv420p10 pix format, tests.

---
 cuburn/code/output.py            |  78 ++++++++++++++++----
 cuburn/code/tests/__init__.py    |   0
 cuburn/code/tests/test_output.py | 124 +++++++++++++++++++++++++++++++
 3 files changed, 187 insertions(+), 15 deletions(-)
 create mode 100644 cuburn/code/tests/__init__.py
 create mode 100644 cuburn/code/tests/test_output.py

diff --git a/cuburn/code/output.py b/cuburn/code/output.py
index 07cebbe..87a802b 100644
--- a/cuburn/code/output.py
+++ b/cuburn/code/output.py
@@ -1,19 +1,17 @@
 from util import devlib, ringbuflib
 from mwc import mwclib
 
-ditherlib = devlib(deps=[mwclib], defs=r'''
+pixfmtlib = devlib(deps=[ringbuflib, mwclib], defs=r'''
 // Clamp an input between 0 and a given peak (inclusive), dithering its output,
 // with full clamping for pixels that are true-black for compressibility.
 __device__ float dclampf(mwc_st &rctx, float peak, float in) {
     float ret = 0.0f;
     if (in > 0.0f) {
-        ret = fminf(peak, fmaxf(0.0f, in * peak + 0.49f * mwc_next_11(rctx)));
+        ret = fminf(peak, in * peak + 0.99f * mwc_next_01(rctx));
     }
     return ret;
 }
-''')
 
-rgba8lib = devlib(deps=[ringbuflib, mwclib, ditherlib], defs=r'''
 // Perform a conversion from float32 values to uint8 ones, applying
 // pixel- and channel-independent dithering to reduce suprathreshold banding
 // artifacts. Clamps values larger than 1.0f.
@@ -44,9 +42,7 @@ __global__ void f32_to_rgba_u8(
     dst[idst] = out;
     rctxs[rb_incr(rb->tail, tid)] = rctx;
 }
-''')
 
-rgba16lib = devlib(deps=[ringbuflib, mwclib, ditherlib], defs=r'''
 // Perform a conversion from float32 values to uint16 ones, as above.
 __global__ void f32_to_rgba_u16(
         ushort4 *dst, const float4 *src,
@@ -73,9 +69,7 @@ __global__ void f32_to_rgba_u16(
     dst[idst] = out;
     rctxs[rb_incr(rb->tail, tid)] = rctx;
 }
-''')
 
-yuv444plib = devlib(deps=[ringbuflib, mwclib, ditherlib], defs=r'''
 // Convert from rgb444 to planar YUV with no chroma subsampling.
 // Uses JPEG full-range color primaries.
 __global__ void f32_to_yuv444p(
@@ -106,9 +100,7 @@ __global__ void f32_to_yuv444p(
     dst[idst] = out.z;
     rctxs[rb_incr(rb->tail, tid)] = rctx;
 }
-''')
 
-yuv444p10lib = devlib(deps=[ringbuflib, mwclib, ditherlib], defs=r'''
 // Convert from rgb444 to planar YUV 10-bit, using JPEG full-range primaries.
 // TODO(strobe): Decide how YouTube will handle Rec. 2020, and then do that here.
 __global__ void f32_to_yuv444p10(
@@ -125,20 +117,76 @@ __global__ void f32_to_yuv444p10(
     mwc_st rctx = rctxs[rb_incr(rb->head, tid)];
 
     float4 in = src[isrc];
-    uchar3 out = make_uchar3(
-        dclampf(rctx, 1024.0f, 0.299f * in.x + 0.587f * in.y + 0.114f * in.z),
-        dclampf(rctx, 1024.0f, -0.168736f * in.x - 0.331264f * in.y + 0.5f * in.z + 0.5f),
-        dclampf(rctx, 1024.0f, 0.5f * in.x - 0.418688f * in.y - 0.081312f * in.z + 0.5f)
+    ushort3 out = make_ushort3(
+        dclampf(rctx, 1023.0f, 0.299f * in.x + 0.587f * in.y + 0.114f * in.z),
+        dclampf(rctx, 1023.0f, -0.168736f * in.x - 0.331264f * in.y + 0.5f * in.z + 0.5f),
+        dclampf(rctx, 1023.0f, 0.5f * in.x - 0.418688f * in.y - 0.081312f * in.z + 0.5f)
     );
 
     int idst = dstride * y + x;
     dst[idst] = out.x;
     idst += dstride * height;
     dst[idst] = out.y;
     idst += dstride * height;
     dst[idst] = out.z;
+
+    rctxs[rb_incr(rb->tail, tid)] = rctx;
+}
+
+// Convert from rgb444 to planar YUV 10-bit, using JPEG full-range primaries.
+// Perform 4:2:0 subsampling of chroma, averaging each 2x2 block weighted by in.w.
+__global__ void f32_to_yuv420p10(
+        uint16_t *dst, const float4 *src,
+        int gutter, int dstride, int sstride, int height,
+        ringbuf *rb, mwc_st *rctxs)
+{
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= dstride || y >= height) return;  // valid pixels are [0, dstride) x [0, height)
+    int tid = blockDim.x * threadIdx.y + threadIdx.x;
+    mwc_st rctx = rctxs[rb_incr(rb->head, tid)];
+
+    // Perform luma using real addressing
+    int isrc = sstride * (y + gutter) + x + gutter;
+    int idst = dstride * y + x;
+    float4 in = src[isrc];
+    dst[idst] = dclampf(rctx, 1023.0f, 0.299f * in.x + 0.587f * in.y + 0.114f * in.z);
+
+    // Drop into subsampling mode for chroma components (thread (x, y) owns block (2x, 2y))
+    if (x * 2 >= dstride || y * 2 >= height) return;  // NOTE(review): skips the rctx store below - confirm intended
+
+    // Recompute addressing and collect weighted averages
+    // TODO(strobe): characterize overflow here
+    isrc = sstride * (y * 2 + gutter) + x * 2 + gutter;
+    in = src[isrc];
+    float sum = in.w + 1e-12;
+    float cb = in.w * (-0.168736f * in.x - 0.331264f * in.y + 0.5f * in.z);
+    float cr = in.w * (0.5f * in.x - 0.418688f * in.y - 0.081312f * in.z);
+
+    in = src[isrc + 1];
+    sum += in.w;
+    cb += in.w * (-0.168736f * in.x - 0.331264f * in.y + 0.5f * in.z);
+    cr += in.w * (0.5f * in.x - 0.418688f * in.y - 0.081312f * in.z);
+
+    isrc += sstride;
+    in = src[isrc];
+    sum += in.w;
+    cb += in.w * (-0.168736f * in.x - 0.331264f * in.y + 0.5f * in.z);
+    cr += in.w * (0.5f * in.x - 0.418688f * in.y - 0.081312f * in.z);
+
+    in = src[isrc + 1];
+    sum += in.w;
+    cb += in.w * (-0.168736f * in.x - 0.331264f * in.y + 0.5f * in.z);
+    cr += in.w * (0.5f * in.x - 0.418688f * in.y - 0.081312f * in.z);
+
+    // For this to work, dstride must equal the output frame width
+    // and be a multiple of four. The Cb plane follows luma; Cr follows Cb.
+    idst = dstride * height + dstride / 2 * y + x;
+    dst[idst] = dclampf(rctx, 1023.0f, cb / sum + 0.5f);
+    idst += dstride * height / 4;
+    dst[idst] = dclampf(rctx, 1023.0f, cr / sum + 0.5f);
+
     rctxs[rb_incr(rb->tail, tid)] = rctx;
 }
 ''')
 
-pixfmtlib = devlib(deps=[rgba8lib, rgba16lib, yuv444plib, yuv444p10lib])
diff --git a/cuburn/code/tests/__init__.py b/cuburn/code/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/cuburn/code/tests/test_output.py b/cuburn/code/tests/test_output.py
new file mode 100644
index 0000000..e82d4ee
--- /dev/null
+++ b/cuburn/code/tests/test_output.py
@@ -0,0 +1,124 @@
+import argparse
+import unittest
+import numpy as np
+
+import pycuda.driver as cuda
+import pycuda.autoinit
+
+from cuburn import render
+from cuburn.output import launchC
+from cuburn.code import output
+from cuburn.code import util
+
+class ProfileTest(unittest.TestCase, util.ClsMod):
+    lib = util.devlib(deps=[output.pixfmtlib])
+
+    def __init__(self, *args, **kwargs):
+        super(ProfileTest, self).__init__(*args, **kwargs)
+        self.load()
+        self.fb = render.Framebuffers()
+        self.dim = self.fb.calc_dim(640, 360)
+        self.fb.alloc(self.dim)
+
+    def test_clamping_below_0(self):
+        ins = np.empty((self.dim.ah, self.dim.astride, 4), dtype='f4')
+        ins[:] = -1
+        cuda.memcpy_htod(self.fb.d_front, ins)
+
+        launchC('f32_to_yuv444p', self.mod, None, self.dim, self.fb,
+                self.fb.d_rb, self.fb.d_seeds)
+
+        outs = np.empty((3, self.dim.h, self.dim.w), dtype='u1')
+        cuda.memcpy_dtoh(outs, self.fb.d_back)
+        self.assertTrue(np.all(outs[0] == 0))
+        self.assertTrue(np.all(outs[1] >= 127))
+        self.assertTrue(np.all(outs[1] <= 128))
+        self.assertTrue(np.all(outs[2] >= 127))
+        self.assertTrue(np.all(outs[2] <= 128))
+
+    def test_clamping_above_1(self):
+        ins = np.empty((self.dim.ah, self.dim.astride, 4), dtype='f4')
+        ins[:] = 5
+        cuda.memcpy_htod(self.fb.d_front, ins)
+
+        launchC('f32_to_yuv444p', self.mod, None, self.dim, self.fb,
+                self.fb.d_rb, self.fb.d_seeds)
+
+        outs = np.empty((3, self.dim.h, self.dim.w), dtype='u1')
+        cuda.memcpy_dtoh(outs, self.fb.d_back)
+        self.assertTrue(np.all(outs[0] == 255))
+        self.assertTrue(np.all(outs[1] >= 127))
+        self.assertTrue(np.all(outs[1] <= 128))
+        self.assertTrue(np.all(outs[2] >= 127))
+        self.assertTrue(np.all(outs[2] <= 128))
+
+    def test_yuv444p10_zero_passthru(self):
+        ins = np.zeros((self.dim.ah, self.dim.astride, 4), dtype='f4')
+        cuda.memcpy_htod(self.fb.d_front, ins)
+
+        launchC('f32_to_yuv444p10', self.mod, None, self.dim, self.fb,
+                self.fb.d_rb, self.fb.d_seeds)
+
+        outs = np.empty((3, self.dim.h, self.dim.w), dtype='u2')
+        cuda.memcpy_dtoh(outs, self.fb.d_back)
+        self.assertTrue(np.all(outs[0] == 0))
+        self.assertTrue(np.all(510 < outs[1]))
+        self.assertTrue(np.all(outs[1] < 513))
+        self.assertTrue(np.all(510 < outs[2]))
+        self.assertTrue(np.all(outs[2] < 513))
+
+    def test_yuv444p10_chroma_address_preservation(self):
+        ins = np.empty((self.dim.ah, self.dim.astride, 4), dtype='f4')
+        # Set everything to 0 except a few pixels
+        ins[:] = 0
+        ins[self.fb.gutter,self.fb.gutter,:] = [0, 1, 0, 1]
+        ins[self.fb.gutter+1,self.fb.gutter+1,:] = [0, 1, 0, 1]
+        cuda.memcpy_htod(self.fb.d_front, ins)
+
+        launchC('f32_to_yuv444p10', self.mod, None, self.dim, self.fb,
+                self.fb.d_rb, self.fb.d_seeds)
+
+        outs = np.empty((3, self.dim.h, self.dim.w), dtype='u2')
+        cuda.memcpy_dtoh(outs, self.fb.d_back)
+        self.assertTrue(outs[0,0,0] > 0)
+        self.assertTrue(outs[0,1,1] > 0)
+        self.assertTrue(outs[1,0,0] < 500)
+        self.assertTrue(outs[1,1,1] < 500)
+
+    def test_yuv420p10_chroma_address_preservation(self):
+        ins = np.empty((self.dim.ah, self.dim.astride, 4), dtype='f4')
+        # Set everything to 0 except a few pixels
+        ins[:] = 0
+
+        # chroma loc (0,0): one pixel on
+        ins[self.fb.gutter,self.fb.gutter,:] = [0, 1, 0, 1]
+
+        # chroma loc (1,1): average of two strong pixels
+        ins[self.fb.gutter+2,self.fb.gutter+2,:] = [0, 1, 0, 1]
+        ins[self.fb.gutter+3,self.fb.gutter+3,:] = [1, 0, 0, 1]
+
+        cuda.memcpy_htod(self.fb.d_front, ins)
+
+        launchC('f32_to_yuv420p10', self.mod, None, self.dim, self.fb,
+                self.fb.d_rb, self.fb.d_seeds)
+
+        w, h = self.dim.w, self.dim.h
+        outs = np.empty((3, h, w), dtype='u2')
+        cuda.memcpy_dtoh(outs, self.fb.d_back)
+        out_cb = outs[1,:h//4].reshape(h//2,w//2)  # first chroma plane written by the kernel (Cb)
+        out_cr = outs[1,h//4:h//2].reshape(h//2,w//2)  # second chroma plane (Cr)
+
+        # chroma blocking doesn't affect luma blocking
+        self.assertTrue(outs[0,0,0] > 0)
+        self.assertTrue(outs[0,1,0] == 0)
+        self.assertTrue(outs[0,0,1] == 0)
+        self.assertTrue(outs[0,1,1] == 0)
+
+        # locations are preserved
+        self.assertTrue(outs[0,2,2] > 0)
+        self.assertTrue(outs[0,3,3] > 0)
+
+        # Cb from first pixel makes it through, neighbor is fine
+        self.assertTrue(172 <= out_cb[0,0] <= 174)
+        self.assertTrue(511 <= out_cb[0,1] <= 512)
+        self.assertTrue(511 <= out_cb[1,0] <= 512)