diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py
index 0825dbf..acc6af9 100644
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@@ -351,7 +351,7 @@ void iter(
         *accbuf = pix;
 {{elif info.acc_mode == 'deferred'}}
         // 'color' gets the top 9 bits. TODO: add dithering via precalc.
-        uint32_t icolor = cc * 512.0f;
+        uint32_t icolor = fminf(1.0f, cc) * 511.0f;
         asm("bfi.b32    %0, %1, %0, 23, 9;" : "+r"(i) : "r"(icolor));
         *log = i;
 {{endif}}
@@ -377,8 +377,8 @@ __device__
 void write_shmem_helper(
         float4 *acc,
         const int glo_idx,
-        const int dr,
-        const int gb
+        const uint32_t dr,
+        const uint32_t gb
 ) {
     float4 pix = acc[glo_idx];
     pix.x += (dr & 0xffff) / 255.0f;
@@ -461,7 +461,7 @@ write_shmem(
         bfe_decl(shr_addr, entry, 0, SHAB);
         bfe_decl(color, entry, 23, 9);
 
-        float colorf = color / 512.0f;
+        float colorf = color / 511.0f;
         float4 outcol = tex2D(palTex, colorf, time);
 
         // TODO: change texture sampler to return shorts and avoid this
diff --git a/cuburn/render.py b/cuburn/render.py
index 1834237..32aa68b 100644
--- a/cuburn/render.py
+++ b/cuburn/render.py
@@ -246,15 +246,14 @@ class Renderer(object):
                 for i in range(nrounds):
                     iter_fun(np.uint64(d_log), d_seeds, d_points, d_infos,
                              block=(32, self._iter.NTHREADS/32, 1),
-                             grid=(ntemporal_samples, 1),
-                             texrefs=[tref], stream=iter_stream)
+                             grid=(ntemporal_samples, 1), stream=iter_stream)
                     _sync_stream(write_stream, iter_stream)
                     sorter.sort(d_log_sorted, d_log, log_size, start_bit, True,
                                 stream=write_stream)
                     _sync_stream(iter_stream, write_stream)
                     write_fun(d_accum, d_log_sorted, sorter.dglobal, log_shift,
                               block=(1024, 1, 1), grid=(nwriteblocks, 1),
-                              stream=write_stream)
+                              texrefs=[tref], stream=write_stream)
             else:
                 iter_fun(np.uint64(d_accum), d_seeds, d_points, d_infos,
                          block=(32, self._iter.NTHREADS/32, 1),