diff --git a/cuburn/code/util.py b/cuburn/code/util.py
index 5c119a9..6954c38 100644
--- a/cuburn/code/util.py
+++ b/cuburn/code/util.py
@@ -104,13 +104,24 @@ void zero_dptr(float* dptr, int size) {
     }
 }
 
+/* read_half decodes, and write_half encodes, two floating-point values
+ * packed into one 32-bit word (typed as 'float' for convenience, although
+ * its bits are not a real float). Each value is stored as a u16 holding its
+ * proportion of a third value 'd', as in 'ux = u16( x / d * (2^16-1) )'.
+ * This is used during accumulation.
+ *
+ * TODO: also write a function that will efficiently add a value to the packed
+ * values while incrementing the density, to improve the speed of this
+ * approach when the alpha channel is present.
+ */
+
 __device__
 void read_half(float &x, float &y, float xy, float den) {
     asm("\n\t{"
         "\n\t   .reg .u16       x, y;"
         "\n\t   .reg .f32       rc;"
         "\n\t   mov.b32         {x, y},     %2;"
-        "\n\t   mul.f32         rc,         %3,     0f37800000;" // 1/65536.
+        "\n\t   mul.f32         rc,         %3,     0f37800080;" // 1/65535.
         "\n\t   cvt.rn.f32.u16     %0,         x;"
         "\n\t   cvt.rn.f32.u16     %1,         y;"
         "\n\t   mul.f32         %0,         %0,     rc;"
@@ -125,7 +136,7 @@ void write_half(float &xy, float x, float y, float den) {
         "\n\t   .reg .u16       x, y;"
         "\n\t   .reg .f32       rc, xf, yf;"
         "\n\t   rcp.approx.f32  rc,         %3;"
-        "\n\t   mul.f32         rc,         rc,     65536.0;"
+        "\n\t   mul.f32         rc,         rc,     65535.0;"
         "\n\t   mul.f32         xf,         %1,     rc;"
         "\n\t   mul.f32         yf,         %2,     rc;"
         "\n\t   cvt.rni.u16.f32 x,  xf;"