mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 03:30:05 -05:00
Use YUV during accumulation
This commit is contained in:
parent
a524db2c1d
commit
c572f62d7d
@ -25,6 +25,12 @@ void colorclip(float4 *pixbuf, float gamma, float vibrance, float highpow,
|
||||
pixbuf[i] = make_float4(bkgd.x, bkgd.y, bkgd.z, 0.0f);
|
||||
return;
|
||||
}
|
||||
pix.y -= 0.5f * pix.w;
|
||||
pix.z -= 0.5f * pix.w;
|
||||
float3 tmp = yuv2rgb(make_float3(pix.x, pix.y, pix.z));
|
||||
pix.x = tmp.x;
|
||||
pix.y = tmp.y;
|
||||
pix.z = tmp.z;
|
||||
|
||||
pix.x = fmaxf(0.0f, pix.x);
|
||||
pix.y = fmaxf(0.0f, pix.y);
|
||||
|
@ -370,7 +370,7 @@ float4 interp_color(const float *times, const float4 *sources, float time) {
|
||||
|
||||
float4 left = sources[blockDim.x * (idx - 1) + threadIdx.x];
|
||||
float4 right = sources[blockDim.x * (idx) + threadIdx.x];
|
||||
float3 rgb;
|
||||
float3 yuv;
|
||||
|
||||
float3 l3 = make_float3(left.x, left.y, left.z);
|
||||
float3 r3 = make_float3(right.x, right.y, right.z);
|
||||
@ -395,10 +395,8 @@ float4 interp_color(const float *times, const float4 *sources, float time) {
|
||||
if (hsv.x < 0.0f)
|
||||
hsv.x += 6.0f;
|
||||
|
||||
rgb = hsv2rgb(hsv);
|
||||
yuv = rgb2yuv(hsv2rgb(hsv));
|
||||
{{elif mode.startswith('yuv')}}
|
||||
float3 yuv;
|
||||
|
||||
{{if mode == 'yuv'}}
|
||||
float3 lyuv = rgb2yuv(l3);
|
||||
float3 ryuv = rgb2yuv(r3);
|
||||
@ -412,11 +410,12 @@ float4 interp_color(const float *times, const float4 *sources, float time) {
|
||||
yuv.y = radius * cosf(angle);
|
||||
yuv.z = radius * sinf(angle);
|
||||
{{endif}}
|
||||
|
||||
rgb = yuv2rgb(yuv);
|
||||
{{endif}}
|
||||
|
||||
return make_float4(rgb.x, rgb.y, rgb.z, left.w * lf + right.w * rf);
|
||||
yuv.y += 0.5f;
|
||||
yuv.z += 0.5f;
|
||||
|
||||
return make_float4(yuv.x, yuv.y, yuv.z, left.w * lf + right.w * rf);
|
||||
}
|
||||
|
||||
__global__
|
||||
@ -424,13 +423,13 @@ void interp_palette(uchar4 *outs,
|
||||
const float *times, const float4 *sources,
|
||||
float tstart, float tstep) {
|
||||
float time = tstart + blockIdx.x * tstep;
|
||||
float4 rgba = interp_color(times, sources, time);
|
||||
float4 yuva = interp_color(times, sources, time);
|
||||
|
||||
uchar4 out;
|
||||
out.x = rgba.x * 255.0f;
|
||||
out.y = rgba.y * 255.0f;
|
||||
out.z = rgba.z * 255.0f;
|
||||
out.w = rgba.w * 255.0f;
|
||||
out.x = yuva.x * 255.0f;
|
||||
out.y = yuva.y * 255.0f;
|
||||
out.z = yuva.z * 255.0f;
|
||||
out.w = yuva.w * 255.0f;
|
||||
outs[blockDim.x * blockIdx.x + threadIdx.x] = out;
|
||||
}
|
||||
|
||||
@ -443,16 +442,16 @@ void interp_palette_flat(mwc_st *rctxs,
|
||||
mwc_st rctx = rctxs[gid];
|
||||
|
||||
float time = tstart + blockIdx.x * tstep;
|
||||
float4 rgba = interp_color(times, sources, time);
|
||||
float4 yuva = interp_color(times, sources, time);
|
||||
|
||||
// TODO: use YUV; pack Y at full precision, UV at quarter
|
||||
// TODO: pack Y at full precision, UV at quarter
|
||||
uint2 out;
|
||||
|
||||
uint32_t r = min(255, (uint32_t) (rgba.x * 255.0f + 0.49f * mwc_next_11(rctx)));
|
||||
uint32_t g = min(255, (uint32_t) (rgba.y * 255.0f + 0.49f * mwc_next_11(rctx)));
|
||||
uint32_t b = min(255, (uint32_t) (rgba.z * 255.0f + 0.49f * mwc_next_11(rctx)));
|
||||
out.y = (1 << 22) | (r << 4);
|
||||
out.x = (g << 18) | b;
|
||||
uint32_t y = min(255, (uint32_t) (yuva.x * 255.0f + 0.49f * mwc_next_11(rctx)));
|
||||
uint32_t u = min(255, (uint32_t) (yuva.y * 255.0f + 0.49f * mwc_next_11(rctx)));
|
||||
uint32_t v = min(255, (uint32_t) (yuva.z * 255.0f + 0.49f * mwc_next_11(rctx)));
|
||||
out.y = (1 << 22) | (y << 4);
|
||||
out.x = (u << 18) | v;
|
||||
surf2Dwrite(out, flatpal, 8 * threadIdx.x, blockIdx.x);
|
||||
rctxs[gid] = rctx;
|
||||
}
|
||||
|
@ -361,9 +361,15 @@ void iter(
|
||||
{{if info.acc_mode == 'atomic'}}
|
||||
asm volatile ({{crep("""
|
||||
{
|
||||
// To prevent overflow, we need to flush each pixel before the density
|
||||
// wraps at 1024 points. This atomic segment performs writes to the
|
||||
// integer buffer, occasionally checking the results. If they exceed a
|
||||
// threshold, it zeros that cell in the integer buffer, converts the
|
||||
// former contents to floats, and adds them to the float4 buffer.
|
||||
|
||||
.reg .pred p;
|
||||
.reg .u32 off, color, hi, lo, d, r, g, b;
|
||||
.reg .f32 colorf, rf, gf, bf, df;
|
||||
.reg .u32 off, color, hi, lo, d, y, u, v;
|
||||
.reg .f32 colorf, yf, uf, vf, df;
|
||||
.reg .u64 ptr, val;
|
||||
|
||||
// TODO: coord dithering better, or pre-supersampled palette?
|
||||
@ -371,38 +377,59 @@ void iter(
|
||||
cvt.rni.u32.f32 color, colorf;
|
||||
shl.b32 color, color, 3;
|
||||
|
||||
// Load the pre-packed 64-bit uint from the palette surf
|
||||
suld.b.2d.v2.b32.clamp {lo, hi}, [flatpal, {color, %2}];
|
||||
mov.b64 val, {lo, hi};
|
||||
|
||||
// Calculate the output address in the atomic integer accumulator
|
||||
shl.b32 off, %3, 3;
|
||||
cvt.u64.u32 ptr, off;
|
||||
add.u64 ptr, ptr, %4;
|
||||
|
||||
// 97% of the time, do an atomic add, then jump to the end without
|
||||
// stalling the thread waiting for the data value
|
||||
setp.le.f32 p, %5, 0.97;
|
||||
@p red.global.add.u64 [ptr], val;
|
||||
@p bra oflow_end;
|
||||
|
||||
// 3% of the time, do the atomic add, and wait for the results
|
||||
atom.global.add.u64 val, [ptr], val;
|
||||
mov.b64 {lo, hi}, val;
|
||||
|
||||
// If the density is less than 256, jump to the end
|
||||
setp.lo.u32 p, hi, (256 << 22);
|
||||
@p bra oflow_end;
|
||||
|
||||
// Atomically swap the integer cell with 0 and read its current value
|
||||
atom.global.exch.b64 val, [ptr], 0;
|
||||
mov.b64 {lo, hi}, val;
|
||||
|
||||
// If the integer cell is zero, another thread has captured the full value
|
||||
// in between the first atomic read and the second, so we can skip to the
|
||||
// end again.
|
||||
setp.eq.u32 p, hi, 0;
|
||||
@p bra oflow_end;
|
||||
|
||||
// Extract the values from the packed integer, convert to floats, and add
|
||||
// them to the floating-point buffer.
|
||||
shr.u32 d, hi, 22;
|
||||
bfe.u32 r, hi, 4, 18;
|
||||
bfe.u32 g, lo, 18, 14;
|
||||
bfi.b32 g, hi, g, 14, 4;
|
||||
and.b32 b, lo, ((1<<18)-1);
|
||||
cvt.rn.f32.u32 rf, r;
|
||||
cvt.rn.f32.u32 gf, g;
|
||||
cvt.rn.f32.u32 bf, b;
|
||||
bfe.u32 y, hi, 4, 18;
|
||||
bfe.u32 u, lo, 18, 14;
|
||||
bfi.b32 u, hi, u, 14, 4;
|
||||
and.b32 v, lo, ((1<<18)-1);
|
||||
cvt.rn.f32.u32 yf, y;
|
||||
cvt.rn.f32.u32 uf, u;
|
||||
cvt.rn.f32.u32 vf, v;
|
||||
cvt.rn.f32.u32 df, d;
|
||||
mul.rn.ftz.f32 rf, rf, (1.0/255.0);
|
||||
mul.rn.ftz.f32 gf, gf, (1.0/255.0);
|
||||
mul.rn.ftz.f32 bf, bf, (1.0/255.0);
|
||||
mul.rn.ftz.f32 yf, yf, (1.0/255.0);
|
||||
mul.rn.ftz.f32 uf, uf, (1.0/255.0);
|
||||
mul.rn.ftz.f32 vf, vf, (1.0/255.0);
|
||||
shl.b32 off, %3, 4;
|
||||
cvt.u64.u32 ptr, off;
|
||||
add.u64 ptr, ptr, %6;
|
||||
red.global.add.f32 [ptr], rf;
|
||||
red.global.add.f32 [ptr+4], gf;
|
||||
red.global.add.f32 [ptr+8], bf;
|
||||
red.global.add.f32 [ptr], yf;
|
||||
red.global.add.f32 [ptr+4], uf;
|
||||
red.global.add.f32 [ptr+8], vf;
|
||||
red.global.add.f32 [ptr+12], df;
|
||||
oflow_end:
|
||||
}
|
||||
@ -445,9 +472,9 @@ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
|
||||
if (i >= nbins) return;
|
||||
asm volatile ({{crep("""
|
||||
{
|
||||
.reg .u32 off, hi, lo, d, r, g, b;
|
||||
.reg .u32 off, hi, lo, d, y, u, v;
|
||||
.reg .u64 val, ptr;
|
||||
.reg .f32 rf, gf, bf, df, rg, gg, bg, dg;
|
||||
.reg .f32 yf, uf, vf, df, yg, ug, vg, dg;
|
||||
|
||||
// TODO: use explicit movs to handle this
|
||||
shl.b32 off, %0, 3;
|
||||
@ -457,21 +484,22 @@ void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
|
||||
shl.b32 off, %0, 4;
|
||||
cvt.u64.u32 ptr, off;
|
||||
add.u64 ptr, ptr, %2;
|
||||
ld.global.v4.f32 {rg,gg,bg,dg}, [ptr];
|
||||
ld.global.v4.f32 {yg,ug,vg,dg}, [ptr];
|
||||
shr.u32 d, hi, 22;
|
||||
bfe.u32 r, hi, 4, 18;
|
||||
bfe.u32 g, lo, 18, 14;
|
||||
bfi.b32 g, hi, g, 14, 4;
|
||||
and.b32 b, lo, ((1<<18)-1);
|
||||
cvt.rn.f32.u32 rf, r;
|
||||
cvt.rn.f32.u32 gf, g;
|
||||
cvt.rn.f32.u32 bf, b;
|
||||
bfe.u32 y, hi, 4, 18;
|
||||
bfe.u32 u, lo, 18, 14;
|
||||
bfi.b32 u, hi, u, 14, 4;
|
||||
and.b32 v, lo, ((1<<18)-1);
|
||||
cvt.rn.f32.u32 yf, y;
|
||||
cvt.rn.f32.u32 uf, u;
|
||||
cvt.rn.f32.u32 vf, v;
|
||||
cvt.rn.f32.u32 df, d;
|
||||
fma.rn.ftz.f32 rg, rf, (1.0/255.0), rg;
|
||||
fma.rn.ftz.f32 gg, gf, (1.0/255.0), gg;
|
||||
fma.rn.ftz.f32 bg, bf, (1.0/255.0), bg;
|
||||
fma.rn.ftz.f32 yg, yf, (1.0/255.0), yg;
|
||||
fma.rn.ftz.f32 ug, uf, (1.0/255.0), ug;
|
||||
fma.rn.ftz.f32 vg, vf, (1.0/255.0), vg;
|
||||
|
||||
add.rn.ftz.f32 dg, df, dg;
|
||||
st.global.v4.f32 [ptr], {rg,gg,bg,dg};
|
||||
st.global.v4.f32 [ptr], {yg,ug,vg,dg};
|
||||
}
|
||||
""")}} :: "r"(i), "l"(atom_ptr), "l"(out_ptr));
|
||||
}
|
||||
@ -640,4 +668,3 @@ oflow_write_end:
|
||||
NWARPS = self.NTHREADS / 32,
|
||||
std_xforms = [n for n in sorted(genome.xforms) if n != 'final'],
|
||||
**globals())
|
||||
|
||||
|
@ -154,8 +154,8 @@ void write_half(float &xy, float x, float y, float den) {
|
||||
}
|
||||
|
||||
|
||||
/* This conversion uses the JPEG full-range standard, though it does *not* add
|
||||
* an offset to UV to bias them into the positive regime. */
|
||||
/* This conversion uses the JPEG full-range standard. Note that UV have range
|
||||
* [-0.5, 0.5], so consider biasing the results. */
|
||||
__device__
|
||||
float3 rgb2yuv(float3 rgb) {
|
||||
return make_float3(
|
||||
|
Loading…
Reference in New Issue
Block a user