mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 11:40:04 -05:00
Pixlib fixes, a new yuv420p10 pix format, tests.
This commit is contained in:
parent
49ccf75c5f
commit
42f9ae2824
@ -1,19 +1,17 @@
|
|||||||
from util import devlib, ringbuflib
|
from util import devlib, ringbuflib
|
||||||
from mwc import mwclib
|
from mwc import mwclib
|
||||||
|
|
||||||
ditherlib = devlib(deps=[mwclib], defs=r'''
|
pixfmtlib = devlib(deps=[ringbuflib, mwclib], defs=r'''
|
||||||
// Clamp an input between 0 and a given peak (inclusive), dithering its output,
|
// Clamp an input between 0 and a given peak (inclusive), dithering its output,
|
||||||
// with full clamping for pixels that are true-black for compressibility.
|
// with full clamping for pixels that are true-black for compressibility.
|
||||||
__device__ float dclampf(mwc_st &rctx, float peak, float in) {
    // Anything that is not strictly positive (the comparison is also false
    // for NaN) becomes exactly zero, and rctx is left untouched.
    if (!(in > 0.0f)) return 0.0f;

    // Scale to the output range, add the dither offset drawn from rctx, and
    // cap the result at peak.
    float dithered = in * peak + 0.99f * mwc_next_01(rctx);
    return fminf(peak, dithered);
}
|
||||||
''')
|
|
||||||
|
|
||||||
rgba8lib = devlib(deps=[ringbuflib, mwclib, ditherlib], defs=r'''
|
|
||||||
// Perform a conversion from float32 values to uint8 ones, applying
|
// Perform a conversion from float32 values to uint8 ones, applying
|
||||||
// pixel- and channel-independent dithering to reduce suprathreshold banding
|
// pixel- and channel-independent dithering to reduce suprathreshold banding
|
||||||
// artifacts. Clamps values larger than 1.0f.
|
// artifacts. Clamps values larger than 1.0f.
|
||||||
@ -44,9 +42,7 @@ __global__ void f32_to_rgba_u8(
|
|||||||
dst[idst] = out;
|
dst[idst] = out;
|
||||||
rctxs[rb_incr(rb->tail, tid)] = rctx;
|
rctxs[rb_incr(rb->tail, tid)] = rctx;
|
||||||
}
|
}
|
||||||
''')
|
|
||||||
|
|
||||||
rgba16lib = devlib(deps=[ringbuflib, mwclib, ditherlib], defs=r'''
|
|
||||||
// Perform a conversion from float32 values to uint16 ones, as above.
|
// Perform a conversion from float32 values to uint16 ones, as above.
|
||||||
__global__ void f32_to_rgba_u16(
|
__global__ void f32_to_rgba_u16(
|
||||||
ushort4 *dst, const float4 *src,
|
ushort4 *dst, const float4 *src,
|
||||||
@ -73,9 +69,7 @@ __global__ void f32_to_rgba_u16(
|
|||||||
dst[idst] = out;
|
dst[idst] = out;
|
||||||
rctxs[rb_incr(rb->tail, tid)] = rctx;
|
rctxs[rb_incr(rb->tail, tid)] = rctx;
|
||||||
}
|
}
|
||||||
''')
|
|
||||||
|
|
||||||
yuv444plib = devlib(deps=[ringbuflib, mwclib, ditherlib], defs=r'''
|
|
||||||
// Convert from rgb444 to planar YUV with no chroma subsampling.
|
// Convert from rgb444 to planar YUV with no chroma subsampling.
|
||||||
// Uses JPEG full-range color primaries.
|
// Uses JPEG full-range color primaries.
|
||||||
__global__ void f32_to_yuv444p(
|
__global__ void f32_to_yuv444p(
|
||||||
@ -106,9 +100,7 @@ __global__ void f32_to_yuv444p(
|
|||||||
dst[idst] = out.z;
|
dst[idst] = out.z;
|
||||||
rctxs[rb_incr(rb->tail, tid)] = rctx;
|
rctxs[rb_incr(rb->tail, tid)] = rctx;
|
||||||
}
|
}
|
||||||
''')
|
|
||||||
|
|
||||||
yuv444p10lib = devlib(deps=[ringbuflib, mwclib, ditherlib], defs=r'''
|
|
||||||
// Convert from rgb444 to planar YUV 10-bit, using JPEG full-range primaries.
|
// Convert from rgb444 to planar YUV 10-bit, using JPEG full-range primaries.
|
||||||
// TODO(strobe): Decide how YouTube will handle Rec. 2020, and then do that here.
|
// TODO(strobe): Decide how YouTube will handle Rec. 2020, and then do that here.
|
||||||
__global__ void f32_to_yuv444p10(
|
__global__ void f32_to_yuv444p10(
|
||||||
@ -125,20 +117,76 @@ __global__ void f32_to_yuv444p10(
|
|||||||
mwc_st rctx = rctxs[rb_incr(rb->head, tid)];
|
mwc_st rctx = rctxs[rb_incr(rb->head, tid)];
|
||||||
|
|
||||||
float4 in = src[isrc];
|
float4 in = src[isrc];
|
||||||
uchar3 out = make_uchar3(
|
ushort3 out = make_ushort3(
|
||||||
dclampf(rctx, 1024.0f, 0.299f * in.x + 0.587f * in.y + 0.114f * in.z),
|
dclampf(rctx, 1023.0f, 0.299f * in.x + 0.587f * in.y + 0.114f * in.z),
|
||||||
dclampf(rctx, 1024.0f, -0.168736f * in.x - 0.331264f * in.y + 0.5f * in.z + 0.5f),
|
dclampf(rctx, 1023.0f, -0.168736f * in.x - 0.331264f * in.y + 0.5f * in.z + 0.5f),
|
||||||
dclampf(rctx, 1024.0f, 0.5f * in.x - 0.418688f * in.y - 0.081312f * in.z + 0.5f)
|
dclampf(rctx, 1023.0f, 0.5f * in.x - 0.418688f * in.y - 0.081312f * in.z + 0.5f)
|
||||||
);
|
);
|
||||||
|
|
||||||
int idst = dstride * y + x;
|
int idst = dstride * y + x;
|
||||||
dst[idst] = out.x;
|
dst[idst] = out.x;
|
||||||
idst += dstride * height;
|
idst += dstride * height;
|
||||||
dst[idst] = out.y;
|
dst[idst] = 1023.0f * (-0.168736f * in.x - 0.331264f * in.y + 0.5f * in.z + 0.5f);
|
||||||
idst += dstride * height;
|
idst += dstride * height;
|
||||||
dst[idst] = out.z;
|
dst[idst] = out.z;
|
||||||
|
|
||||||
|
rctxs[rb_incr(rb->tail, tid)] = rctx;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert from rgb444 to planar YUV 10-bit, using JPEG full-range primaries.
// Perform subsampling of chroma using weighted averages: each chroma sample
// covers a 2x2 block of source pixels, and each pixel's contribution is
// weighted by its .w component.
//
// Each thread handles one luma sample at (x, y). Threads whose (x, y) also
// lies inside the half-width, half-height chroma grid additionally write one
// Cb and one Cr sample. dst holds the luma plane first, the Cb plane at
// element offset dstride * height, and the Cr plane dstride * height / 4
// elements after that.
//
// For this to work, dstride must equal the output frame width and be a
// multiple of four.
__global__ void f32_to_yuv420p10(
        uint16_t *dst, const float4 *src,
        int gutter, int dstride, int sstride, int height,
        ringbuf *rb, mwc_st *rctxs)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    // Valid output coordinates are [0, dstride) x [0, height). Using '>='
    // keeps threads at x == dstride or y == height from storing past the end
    // of a row or into the first chroma row.
    if (x >= dstride || y >= height) return;
    int tid = blockDim.x * threadIdx.y + threadIdx.x;
    mwc_st rctx = rctxs[rb_incr(rb->head, tid)];

    // Perform luma using real addressing
    int isrc = sstride * (y + gutter) + x + gutter;
    int idst = dstride * y + x;
    float4 in = src[isrc];
    dst[idst] = dclampf(rctx, 1023.0f, 0.299f * in.x + 0.587f * in.y + 0.114f * in.z);

    // Drop into subsampling mode for chroma components. This is a branch
    // rather than an early return so that every thread that dithered a luma
    // sample still stores its advanced generator state below.
    // NOTE(review): rb_incr is defined outside this view; if it contains a
    // block-wide barrier, the early return above should be revisited too.
    if (x * 2 < dstride && y * 2 < height) {
        // Recompute addressing and collect weighted averages
        // TODO(strobe): characterize overflow here
        int ibase = sstride * (y * 2 + gutter) + x * 2 + gutter;

        // The tiny bias keeps the division finite when all four weights
        // are zero.
        float sum = 1e-12f;
        float cb = 0.0f;
        float cr = 0.0f;
        for (int dy = 0; dy < 2; dy++) {
            for (int dx = 0; dx < 2; dx++) {
                float4 px = src[ibase + dy * sstride + dx];
                sum += px.w;
                cb += px.w * (-0.168736f * px.x - 0.331264f * px.y + 0.5f * px.z);
                cr += px.w * (0.5f * px.x - 0.418688f * px.y - 0.081312f * px.z);
            }
        }

        // Cb plane follows the luma plane; rows are dstride / 2 wide.
        idst = dstride * height + dstride / 2 * y + x;
        dst[idst] = dclampf(rctx, 1023.0f, cb / sum + 0.5f);
        // Cr plane follows the quarter-size Cb plane.
        idst += dstride * height / 4;
        dst[idst] = dclampf(rctx, 1023.0f, cr / sum + 0.5f);
    }

    rctxs[rb_incr(rb->tail, tid)] = rctx;
}
|
||||||
''')
|
''')
|
||||||
|
|
||||||
pixfmtlib = devlib(deps=[rgba8lib, rgba16lib, yuv444plib, yuv444p10lib])
|
|
||||||
|
0
cuburn/code/tests/__init__.py
Normal file
0
cuburn/code/tests/__init__.py
Normal file
124
cuburn/code/tests/test_output.py
Normal file
124
cuburn/code/tests/test_output.py
Normal file
@ -0,0 +1,124 @@
|
|||||||
|
import argparse
|
||||||
|
import unittest
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
import pycuda.driver as cuda
|
||||||
|
import pycuda.autoinit
|
||||||
|
|
||||||
|
from cuburn import render
|
||||||
|
from cuburn.output import launchC
|
||||||
|
from cuburn.code import output
|
||||||
|
from cuburn.code import util
|
||||||
|
|
||||||
|
class ProfileTest(unittest.TestCase, util.ClsMod):
|
||||||
|
lib = util.devlib(deps=[output.pixfmtlib])
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
    """Forward the arguments to the parent initializer, call self.load(),
    and allocate framebuffers sized via calc_dim(640, 360)."""
    super(ProfileTest, self).__init__(*args, **kwargs)
    self.load()
    # Keep the framebuffer object and its computed dimensions on the
    # instance; the test methods read both.
    self.fb = framebuffers = render.Framebuffers()
    self.dim = dims = framebuffers.calc_dim(640, 360)
    framebuffers.alloc(dims)
|
||||||
|
|
||||||
|
def test_clamping_below_0(self):
    """Run f32_to_yuv444p on an input filled with -1 and check that the
    first output plane is all zero while the second and third planes
    contain only 127 or 128."""
    src = np.empty((self.dim.ah, self.dim.astride, 4), dtype='f4')
    src.fill(-1)
    cuda.memcpy_htod(self.fb.d_front, src)

    launchC('f32_to_yuv444p', self.mod, None, self.dim, self.fb,
            self.fb.d_rb, self.fb.d_seeds)

    planes = np.empty((3, self.dim.h, self.dim.w), dtype='u1')
    cuda.memcpy_dtoh(planes, self.fb.d_back)

    self.assertTrue(np.all(planes[0] == 0))
    for plane in (planes[1], planes[2]):
        self.assertTrue(np.all(plane >= 127))
        self.assertTrue(np.all(plane <= 128))
|
||||||
|
|
||||||
|
def test_clamping_above_1(self):
    """Run f32_to_yuv444p on an input filled with 5 and check that the
    first output plane is all 255 while the second and third planes
    contain only 127 or 128."""
    src = np.empty((self.dim.ah, self.dim.astride, 4), dtype='f4')
    src.fill(5)
    cuda.memcpy_htod(self.fb.d_front, src)

    launchC('f32_to_yuv444p', self.mod, None, self.dim, self.fb,
            self.fb.d_rb, self.fb.d_seeds)

    planes = np.empty((3, self.dim.h, self.dim.w), dtype='u1')
    cuda.memcpy_dtoh(planes, self.fb.d_back)

    self.assertTrue(np.all(planes[0] == 255))
    for plane in (planes[1], planes[2]):
        self.assertTrue(np.all(plane >= 127))
        self.assertTrue(np.all(plane <= 128))
|
||||||
|
|
||||||
|
def test_yuv444p10_zero_passthru(self):
    """Run f32_to_yuv444p10 on an all-zero input and check that the first
    output plane is all zero while the second and third planes stay
    strictly between 510 and 513."""
    src = np.zeros((self.dim.ah, self.dim.astride, 4), dtype='f4')
    cuda.memcpy_htod(self.fb.d_front, src)

    launchC('f32_to_yuv444p10', self.mod, None, self.dim, self.fb,
            self.fb.d_rb, self.fb.d_seeds)

    # Output samples are read back as 16-bit unsigned values.
    planes = np.empty((3, self.dim.h, self.dim.w), dtype='u2')
    cuda.memcpy_dtoh(planes, self.fb.d_back)

    self.assertTrue(np.all(planes[0] == 0))
    for plane in (planes[1], planes[2]):
        self.assertTrue(np.all(plane > 510))
        self.assertTrue(np.all(plane < 513))
|
||||||
|
|
||||||
|
def test_yuv444p10_chroma_address_preservation(self):
    """Light two diagonal pixels in an otherwise zero input, run
    f32_to_yuv444p10, and check that the same two positions are non-zero
    in the first output plane and below 500 in the second."""
    lit = ((0, 0), (1, 1))
    gutter = self.fb.gutter

    # Everything is zero except the pixels listed in `lit`, which are
    # offset by the gutter on the input side.
    src = np.zeros((self.dim.ah, self.dim.astride, 4), dtype='f4')
    for row, col in lit:
        src[gutter + row, gutter + col, :] = [0, 1, 0, 1]
    cuda.memcpy_htod(self.fb.d_front, src)

    launchC('f32_to_yuv444p10', self.mod, None, self.dim, self.fb,
            self.fb.d_rb, self.fb.d_seeds)

    planes = np.empty((3, self.dim.h, self.dim.w), dtype='u2')
    cuda.memcpy_dtoh(planes, self.fb.d_back)

    for row, col in lit:
        self.assertTrue(planes[0, row, col] > 0)
    for row, col in lit:
        self.assertTrue(planes[1, row, col] < 500)
|
||||||
|
|
||||||
|
def test_yuv420p10_chroma_address_preservation(self):
|
||||||
|
ins = np.empty((self.dim.ah, self.dim.astride, 4), dtype='f4')
|
||||||
|
# Set everything to 0 except a few pixels
|
||||||
|
ins[:] = 0
|
||||||
|
|
||||||
|
# chroma loc (0,0): one pixel on
|
||||||
|
ins[self.fb.gutter,self.fb.gutter,:] = [0, 1, 0, 1]
|
||||||
|
|
||||||
|
# chroma loc (1,1): average of two strong pixels
|
||||||
|
ins[self.fb.gutter+2,self.fb.gutter+2,:] = [0, 1, 0, 1]
|
||||||
|
ins[self.fb.gutter+3,self.fb.gutter+3,:] = [1, 0, 0, 1]
|
||||||
|
|
||||||
|
cuda.memcpy_htod(self.fb.d_front, ins)
|
||||||
|
|
||||||
|
launchC('f32_to_yuv420p10', self.mod, None, self.dim, self.fb,
|
||||||
|
self.fb.d_rb, self.fb.d_seeds)
|
||||||
|
|
||||||
|
w, h = self.dim.w, self.dim.h
|
||||||
|
outs = np.empty((3, h, w), dtype='u2')
|
||||||
|
cuda.memcpy_dtoh(outs, self.fb.d_back)
|
||||||
|
out_cr = outs[1,:h/4].reshape(h/2,w/2)
|
||||||
|
out_cb = outs[1,h/4:h/2].reshape(h/2,w/2)
|
||||||
|
|
||||||
|
# chroma blocking doesn't affect luma blocking
|
||||||
|
self.assertTrue(outs[0,0,0] > 0)
|
||||||
|
self.assertTrue(outs[0,1,0] == 0)
|
||||||
|
self.assertTrue(outs[0,0,1] == 0)
|
||||||
|
self.assertTrue(outs[0,1,1] == 0)
|
||||||
|
|
||||||
|
# locations are preserved
|
||||||
|
self.assertTrue(outs[0,2,2] > 0)
|
||||||
|
self.assertTrue(outs[0,3,3] > 0)
|
||||||
|
|
||||||
|
# chroma from first pixel makes it through, neighbor is fine
|
||||||
|
self.assertTrue(172 <= out_cr[0,0] <= 174)
|
||||||
|
self.assertTrue(511 <= out_cr[0,1] <= 512)
|
||||||
|
self.assertTrue(511 <= out_cr[1,0] <= 512)
|
Loading…
Reference in New Issue
Block a user