From dd645bcbf65f62335867d3d1e8f42b5d12806211 Mon Sep 17 00:00:00 2001
From: Steven Robertson <steven@strobe.cc>
Date: Sat, 15 Oct 2011 00:29:22 -0400
Subject: [PATCH] Use one dither offset per block.

---
 cuburn/code/iter.py | 28 +++++++++++++---------------
 cuburn/code/util.py |  7 ++++---
 2 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py
index 4d50ca8..4089288 100644
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@@ -110,6 +110,16 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
         cosel[threadIdx.x] = mwc_next_01(rctx);
     {{endif}}
 
+    if (threadIdx.y == 1 && threadIdx.x == 0) {
+        float ditherwidth = {{packer.get("0.33 * cp.spatial_filter_radius")}};
+        float u0 = mwc_next_01(rctx);
+        float r = ditherwidth * sqrt(-2.0f * log2f(u0) / M_LOG2E);
+
+        float u1 = 2.0f * M_PI * mwc_next_01(rctx);
+        info.cam_xo += r * cos(u1);
+        info.cam_yo += r * sin(u1);
+    }
+
     __syncthreads();
     int consec_bad = -{{features.fuse}};
 
@@ -197,21 +207,9 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
         {{apply_affine('x', 'y', 'cx', 'cy', packer,
                        'cp.camera_transform', 'cam')}}
         {{endif}}
+        uint32_t ix = trunca(cx), iy = trunca(cy);
 
-        // TODO: verify that constants get premultiplied
-        float ditherwidth = {{packer.get("0.33 * cp.spatial_filter_radius")}};
-        float u0 = mwc_next_01(rctx);
-        float r = ditherwidth * sqrt(-2.0f * log2f(u0) / M_LOG2E);
-
-        // TODO: provide mwc_next_0_2pi()
-        float u1 = 2.0f * M_PI * mwc_next_01(rctx);
-
-        float ditherx = r * cos(u1);
-        float dithery = r * sin(u1);
-        int ix = trunca(cx+ditherx), iy = trunca(cy+dithery);
-
-        if (ix < 0 || ix >= {{features.acc_width}} ||
-            iy < 0 || iy >= {{features.acc_height}} ) {
+        if (ix >= {{features.acc_width}} || iy >= {{features.acc_height}} ) {
             consec_bad++;
             if (consec_bad > {{features.max_oob}}) {
                 x = mwc_next_11(rctx);
@@ -222,7 +220,7 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
             continue;
         }
 
-        int i = iy * {{features.acc_stride}} + ix;
+        uint32_t i = iy * {{features.acc_stride}} + ix;
 
         float4 outcol = tex2D(palTex, color, {{packer.get("cp_step_frac")}});
         float4 pix = accbuf[i];
diff --git a/cuburn/code/util.py b/cuburn/code/util.py
index 8f0ab1b..f6e8dcb 100644
--- a/cuburn/code/util.py
+++ b/cuburn/code/util.py
@@ -89,9 +89,10 @@ uint32_t gtid() {
 }
 
 __device__
-int trunca(float f) {
-    // truncate as used in address calculations
-    int ret;
+uint32_t trunca(float f) {
+    // truncate as used in address calculations. note the use of a signed
+    // conversion is intentional here (simplifies image bounds checking).
+    uint32_t ret;
     asm("cvt.rni.s32.f32    %0,     %1;" : "=r"(ret) : "f"(f));
     return ret;
 }