diff --git a/cuburn/code/filtering.py b/cuburn/code/filtering.py
index 0e4e8d9..10515bc 100644
--- a/cuburn/code/filtering.py
+++ b/cuburn/code/filtering.py
@@ -11,7 +11,7 @@ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
                float linrange, float lingam, float3 bkgd) {
     // TODO: test if over an edge of the framebuffer - currently gutters are
     // used and up to 256 pixels are ignored, which breaks when width<256
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int i = (gridDim.x * blockIdx.y + blockIdx.x) * blockDim.x + threadIdx.x;
     float4 pix = pixbuf[i];
 
     if (pix.w <= 0) {
diff --git a/cuburn/code/util.py b/cuburn/code/util.py
index fc6eeeb..d6676d7 100644
--- a/cuburn/code/util.py
+++ b/cuburn/code/util.py
@@ -103,7 +103,7 @@ uint32_t trunca(float f) {
 
 __global__
 void zero_dptr(float* dptr, int size) {
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int i = (gridDim.x * blockIdx.y + blockIdx.x) * blockDim.x + threadIdx.x;
     if (i < size) {
         dptr[i] = 0.0f;
     }
@@ -161,8 +161,9 @@ void write_half(float &xy, float x, float y, float den) {
         number of 4-byte words in the pointer.
         """
         zero = mod.get_function("zero_dptr")
+        blocks = int(np.ceil(np.sqrt(size / 1024 + 1)))
         zero(dptr, np.int32(size), stream=stream,
-             block=(1024, 1, 1), grid=(size/1024+1, 1))
+             block=(1024, 1, 1), grid=(blocks, blocks, 1))
 
 class DataPackerView(object):
     """
diff --git a/cuburn/render.py b/cuburn/render.py
index b9ad522..d79659d 100644
--- a/cuburn/render.py
+++ b/cuburn/render.py
@@ -337,8 +337,9 @@ class Animation(object):
             bkgd = vec.make_float3(*(bkgd / n))
 
             color_fun = self.mod.get_function("colorclip")
+            blocks = int(np.ceil(np.sqrt(nbins / 256)))
             color_fun(d_out, gam, vib, hipow, lin, lingam, bkgd,
-                      block=(256, 1, 1), grid=(nbins / 256, 1),
+                      block=(256, 1, 1), grid=(blocks, blocks),
                       stream=filt_stream)
             cuda.memcpy_dtoh_async(h_out, d_out, filt_stream)