diff --git a/cuburn/code/output.py b/cuburn/code/output.py
index 808698b..1644e74 100644
--- a/cuburn/code/output.py
+++ b/cuburn/code/output.py
@@ -1,7 +1,19 @@
 from util import devlib, ringbuflib
 from mwc import mwclib
 
-rgba8lib = devlib(deps=[ringbuflib, mwclib], defs=r'''
+ditherlib = devlib(deps=[mwclib], defs=r'''
+// Clamp an input between 0 and a given peak (inclusive), dithering its output,
+// with full clamping for pixels that are true-black for compressibility.
+__device__ float dclampf(mwc_st &rctx, float peak, float in) {
+  float ret = 0.0f;
+  if (in > 0.0f) {
+    ret = fminf(peak, fmaxf(0.0f, in * peak + 0.49f * mwc_next_11(rctx)));
+  }
+  return ret;
+}
+''')
+
+rgba8lib = devlib(deps=[ringbuflib, mwclib, ditherlib], defs=r'''
 // Perform a conversion from float32 values to uint8 ones, applying
 // pixel- and channel-independent dithering to reduce suprathreshold banding
 // artifacts. Clamps values larger than 1.0f.
@@ -22,10 +34,10 @@ __global__ void f32_to_rgba_u8(
 
     float4 in = src[isrc];
     uchar4 out = make_uchar4(
-        fminf(1.0f, in.x) * 255.0f + 0.49f * mwc_next_11(rctx),
-        fminf(1.0f, in.y) * 255.0f + 0.49f * mwc_next_11(rctx),
-        fminf(1.0f, in.z) * 255.0f + 0.49f * mwc_next_11(rctx),
-        fminf(1.0f, in.w) * 255.0f + 0.49f * mwc_next_11(rctx)
+        dclampf(rctx, 255.0f, in.x),
+        dclampf(rctx, 255.0f, in.y),
+        dclampf(rctx, 255.0f, in.z),
+        dclampf(rctx, 255.0f, in.w)
     );
 
     int idst = dstride * y + x;
@@ -34,7 +46,7 @@ __global__ void f32_to_rgba_u8(
 }
 ''')
 
-rgba16lib = devlib(deps=[ringbuflib, mwclib], defs=r'''
+rgba16lib = devlib(deps=[ringbuflib, mwclib, ditherlib], defs=r'''
 // Perform a conversion from float32 values to uint16 ones, as above.
 __global__ void f32_to_rgba_u16(
     ushort4 *dst, const float4 *src,
@@ -51,10 +63,10 @@ __global__ void f32_to_rgba_u16(
 
     float4 in = src[isrc];
     ushort4 out = make_ushort4(
-        fminf(1.0f, in.x) * 65535.0f + 0.49f * mwc_next_11(rctx),
-        fminf(1.0f, in.y) * 65535.0f + 0.49f * mwc_next_11(rctx),
-        fminf(1.0f, in.z) * 65535.0f + 0.49f * mwc_next_11(rctx),
-        fminf(1.0f, in.w) * 65535.0f + 0.49f * mwc_next_11(rctx)
+        dclampf(rctx, 65535.0f, in.x),
+        dclampf(rctx, 65535.0f, in.y),
+        dclampf(rctx, 65535.0f, in.z),
+        dclampf(rctx, 65535.0f, in.w)
     );
 
     int idst = dstride * y + x;