mirror of
synced 2025-03-13 06:51:28 -04:00
Pixlib fixes, a new yuv420p10 pix format, tests.
This commit is contained in:
@ -1,19 +1,17 @@
from util import devlib, ringbuflib
from mwc import mwclib
ditherlib = devlib(deps=[mwclib], defs=r'''
pixfmtlib = devlib(deps=[ringbuflib, mwclib], defs=r'''
// Clamp an input between 0 and a given peak (inclusive), dithering its output,
// with full clamping for pixels that are true-black for compressibility.
//
// rctx  random-number state, passed by reference to the mwc_next_* helpers
// peak  upper bound of the result; also the factor `in` is multiplied by
// in    value to convert; anything that is not > 0 yields exactly 0.0f and
//       receives no dither
//
// Returns the scaled input plus a random offset, capped at `peak`.
__device__ float dclampf(mwc_st &rctx, float peak, float in) {
float ret = 0.0f;
// Non-positive inputs skip the dither entirely and keep the 0.0f set above.
if (in > 0.0f) {
// NOTE(review): the next two statements both assign `ret`, so only the second
// value survives. They look like the before and after sides of a single
// change -- confirm which one the file is meant to keep.
// Variant 1: offset of 0.49f * mwc_next_11, with an explicit floor at 0.0f.
ret = fminf(peak, fmaxf(0.0f, in * peak + 0.49f * mwc_next_11(rctx)));
// Variant 2: offset of 0.99f * mwc_next_01 and no floor; presumably that
// helper never returns a negative value -- TODO confirm its range.
ret = fminf(peak, in * peak + 0.99f * mwc_next_01(rctx));
return ret;
rgba8lib = devlib(deps=[ringbuflib, mwclib, ditherlib], defs=r'''
// Perform a conversion from float32 values to uint8 ones, applying
// pixel- and channel-independent dithering to reduce suprathreshold banding
// artifacts. Clamps values larger than 1.0f.
@ -44,9 +42,7 @@ __global__ void f32_to_rgba_u8(
dst[idst] = out;
rctxs[rb_incr(rb->tail, tid)] = rctx;
rgba16lib = devlib(deps=[ringbuflib, mwclib, ditherlib], defs=r'''
// Perform a conversion from float32 values to uint16 ones, as above.
__global__ void f32_to_rgba_u16(
ushort4 *dst, const float4 *src,
@ -73,9 +69,7 @@ __global__ void f32_to_rgba_u16(
dst[idst] = out;
rctxs[rb_incr(rb->tail, tid)] = rctx;
yuv444plib = devlib(deps=[ringbuflib, mwclib, ditherlib], defs=r'''
// Convert from rgb444 to planar YUV with no chroma subsampling.
// Uses JPEG full-range color primaries.
__global__ void f32_to_yuv444p(
@ -106,9 +100,7 @@ __global__ void f32_to_yuv444p(
dst[idst] = out.z;
rctxs[rb_incr(rb->tail, tid)] = rctx;
yuv444p10lib = devlib(deps=[ringbuflib, mwclib, ditherlib], defs=r'''
// Convert from rgb444 to planar YUV 10-bit, using JPEG full-range primaries.
// TODO(strobe): Decide how YouTube will handle Rec. 2020, and then do that here.
__global__ void f32_to_yuv444p10(
@ -125,20 +117,76 @@ __global__ void f32_to_yuv444p10(
mwc_st rctx = rctxs[rb_incr(rb->head, tid)];
float4 in = src[isrc];
uchar3 out = make_uchar3(
dclampf(rctx, 1024.0f, 0.299f * in.x + 0.587f * in.y + 0.114f * in.z),
dclampf(rctx, 1024.0f, -0.168736f * in.x - 0.331264f * in.y + 0.5f * in.z + 0.5f),
dclampf(rctx, 1024.0f, 0.5f * in.x - 0.418688f * in.y - 0.081312f * in.z + 0.5f)
ushort3 out = make_ushort3(
dclampf(rctx, 1023.0f, 0.299f * in.x + 0.587f * in.y + 0.114f * in.z),
dclampf(rctx, 1023.0f, -0.168736f * in.x - 0.331264f * in.y + 0.5f * in.z + 0.5f),
dclampf(rctx, 1023.0f, 0.5f * in.x - 0.418688f * in.y - 0.081312f * in.z + 0.5f)
int idst = dstride * y + x;
dst[idst] = out.x;
idst += dstride * height;
dst[idst] = out.y;
dst[idst] = 1023.0f * (-0.168736f * in.x - 0.331264f * in.y + 0.5f * in.z + 0.5f);
idst += dstride * height;
dst[idst] = out.z;
rctxs[rb_incr(rb->tail, tid)] = rctx;
// Convert from rgb444 to planar YUV 10-bit, using JPEG full-range primaries.
// Luma is written at full resolution. Chroma is subsampled 2x2: each chroma
// sample is the average of a 2x2 block of source pixels weighted by their
// .w component.
//
// dst      output samples: luma plane first (dstride * height samples), then
//          the Cb plane, then the Cr plane dstride * height / 4 samples later
// src      float4 source pixels; x/y/z feed the colour matrix, w is the
//          chroma averaging weight
// gutter   offset, in pixels, added to both source coordinates
// dstride  output row length; for the chroma addressing to work it must equal
//          the output frame width and be a multiple of four
// sstride  source row length, in pixels
// height   number of output rows
// rb       ring buffer whose head and tail are advanced through rb_incr
// rctxs    random-number contexts indexed by the values rb_incr returns
__global__ void f32_to_yuv420p10(
        uint16_t *dst, const float4 *src,
        int gutter, int dstride, int sstride, int height,
        ringbuf *rb, mwc_st *rctxs)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    // Valid output coordinates are [0, dstride) x [0, height). Comparing with
    // `>` would let x == dstride or y == height through and write one sample
    // into the following row or plane.
    if (x >= dstride || y >= height) return;

    int tid = blockDim.x * threadIdx.y + threadIdx.x;
    mwc_st rctx = rctxs[rb_incr(rb->head, tid)];

    // Perform luma using real addressing
    float4 px = src[sstride * (y + gutter) + x + gutter];
    dst[dstride * y + x] = dclampf(rctx, 1023.0f,
            0.299f * px.x + 0.587f * px.y + 0.114f * px.z);

    // Only the threads covering the top-left quarter of the frame produce a
    // chroma sample. This is a conditional block rather than an early return
    // so that every thread which loaded a context from the head also stores
    // it back at the tail, as the other conversion kernels do.
    // NOTE(review): rb_incr is defined elsewhere -- confirm it expects to be
    // reached by all of these threads.
    if (x * 2 < dstride && y * 2 < height) {
        // Recompute addressing and collect weighted averages
        // TODO(strobe): characterize overflow here
        int base = sstride * (y * 2 + gutter) + x * 2 + gutter;

        // The tiny initial weight keeps the division below finite when all
        // four weights are zero.
        float sum = 1e-12f;
        float cb = 0.0f;
        float cr = 0.0f;
        for (int dy = 0; dy < 2; dy++) {
            for (int dx = 0; dx < 2; dx++) {
                px = src[base + sstride * dy + dx];
                sum += px.w;
                cb += px.w * (-0.168736f * px.x - 0.331264f * px.y + 0.5f * px.z);
                cr += px.w * (0.5f * px.x - 0.418688f * px.y - 0.081312f * px.z);
            }
        }

        // For this to work, dstride must equal the output frame width
        // and be a multiple of four.
        int idst = dstride * height + dstride / 2 * y + x;
        dst[idst] = dclampf(rctx, 1023.0f, cb / sum + 0.5f);
        idst += dstride * height / 4;
        dst[idst] = dclampf(rctx, 1023.0f, cr / sum + 0.5f);
    }

    rctxs[rb_incr(rb->tail, tid)] = rctx;
}
pixfmtlib = devlib(deps=[rgba8lib, rgba16lib, yuv444plib, yuv444p10lib])
Normal file
Normal file
Normal file
Normal file
@ -0,0 +1,124 @@
import argparse
import unittest
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
from cuburn import render
from cuburn.output import launchC
from cuburn.code import output
from cuburn.code import util
class ProfileTest(unittest.TestCase, util.ClsMod):
    """Check the pixel-format conversion kernels against known inputs.

    Every test fills the front framebuffer with float32 RGBA data, launches
    one conversion kernel and inspects the planes read back from the back
    framebuffer.
    """

    lib = util.devlib(deps=[output.pixfmtlib])

    def __init__(self, *args, **kwargs):
        super(ProfileTest, self).__init__(*args, **kwargs)
        self.fb = render.Framebuffers()
        self.dim = self.fb.calc_dim(640, 360)

    def _make_input(self, fill):
        """Return a float32 RGBA source frame with every component = fill."""
        ins = np.empty((self.dim.ah, self.dim.astride, 4), dtype='f4')
        ins[:] = fill
        return ins

    def _convert(self, kernel, ins, dtype):
        """Upload ins, launch the named kernel and return its output.

        The result is read back as an array of shape (3, h, w) with the
        given dtype.
        """
        cuda.memcpy_htod(self.fb.d_front, ins)
        launchC(kernel, self.mod, None, self.dim, self.fb,
                self.fb.d_rb, self.fb.d_seeds)
        outs = np.empty((3, self.dim.h, self.dim.w), dtype=dtype)
        cuda.memcpy_dtoh(outs, self.fb.d_back)
        return outs

    def _assert_all_between(self, plane, low, high):
        """Assert that every sample of plane lies in [low, high]."""
        self.assertTrue(np.all(plane >= low))
        self.assertTrue(np.all(plane <= high))

    def test_clamping_below_0(self):
        """Negative input clamps luma to 0 and leaves chroma neutral."""
        outs = self._convert('f32_to_yuv444p', self._make_input(-1), 'u1')
        self.assertTrue(np.all(outs[0] == 0))
        self._assert_all_between(outs[1], 127, 128)
        self._assert_all_between(outs[2], 127, 128)

    def test_clamping_above_1(self):
        """Input above 1.0 clamps luma to 255 and leaves chroma neutral."""
        outs = self._convert('f32_to_yuv444p', self._make_input(5), 'u1')
        self.assertTrue(np.all(outs[0] == 255))
        self._assert_all_between(outs[1], 127, 128)
        self._assert_all_between(outs[2], 127, 128)

    def test_yuv444p10_zero_passthru(self):
        """All-zero input gives zero luma and mid-scale 10-bit chroma."""
        ins = np.zeros((self.dim.ah, self.dim.astride, 4), dtype='f4')
        outs = self._convert('f32_to_yuv444p10', ins, 'u2')
        self.assertTrue(np.all(outs[0] == 0))
        # Integer samples: 510 < v < 513 is the same as 511 <= v <= 512.
        self._assert_all_between(outs[1], 511, 512)
        self._assert_all_between(outs[2], 511, 512)

    def test_yuv444p10_chroma_address_preservation(self):
        """Lit pixels show up at the same coordinates in luma and chroma."""
        gutter = self.fb.gutter
        # Set everything to 0 except a few pixels
        ins = self._make_input(0)
        ins[gutter, gutter, :] = [0, 1, 0, 1]
        ins[gutter + 1, gutter + 1, :] = [0, 1, 0, 1]
        outs = self._convert('f32_to_yuv444p10', ins, 'u2')
        self.assertTrue(outs[0, 0, 0] > 0)
        self.assertTrue(outs[0, 1, 1] > 0)
        self.assertTrue(outs[1, 0, 0] < 500)
        self.assertTrue(outs[1, 1, 1] < 500)

    def test_yuv420p10_chroma_address_preservation(self):
        """Subsampled chroma lands in the right cell; luma stays per-pixel."""
        gutter = self.fb.gutter
        # Set everything to 0 except a few pixels
        ins = self._make_input(0)
        # chroma loc (0,0): one pixel on
        ins[gutter, gutter, :] = [0, 1, 0, 1]
        # chroma loc (1,1): average of two strong pixels
        ins[gutter + 2, gutter + 2, :] = [0, 1, 0, 1]
        ins[gutter + 3, gutter + 3, :] = [1, 0, 0, 1]
        outs = self._convert('f32_to_yuv420p10', ins, 'u2')

        w, h = self.dim.w, self.dim.h
        # The kernel stores the Cb plane directly after luma, so it fills the
        # first h // 4 full-width rows of the second plane-sized slab; Cr
        # follows in rows h // 4 .. h // 2. Floor division keeps the slice and
        # reshape arguments integral under both Python 2 and Python 3.
        out_cb = outs[1, :h // 4].reshape(h // 2, w // 2)

        # chroma blocking doesn't affect luma blocking
        self.assertTrue(outs[0, 0, 0] > 0)
        self.assertTrue(outs[0, 1, 0] == 0)
        self.assertTrue(outs[0, 0, 1] == 0)
        self.assertTrue(outs[0, 1, 1] == 0)

        # locations are preserved
        self.assertTrue(outs[0, 2, 2] > 0)
        self.assertTrue(outs[0, 3, 3] > 0)

        # chroma from first pixel makes it through, neighbor is fine
        # (Cb of pure green is 1023 * (0.5 - 0.331264), about 172.6)
        self.assertTrue(172 <= out_cb[0, 0] <= 174)
        self.assertTrue(511 <= out_cb[0, 1] <= 512)
        self.assertTrue(511 <= out_cb[1, 0] <= 512)
Reference in New Issue
Block a user