diff --git a/cuburn/code/filter.py b/cuburn/code/filter.py
new file mode 100644
index 0000000..c34392f
--- /dev/null
+++ b/cuburn/code/filter.py
@@ -0,0 +1,90 @@
+
+from cuburn.code.util import *
+
+class ColorClip(HunkOCode):
+    """CUDA source hunk providing the ``logfilt`` color-filtering kernel.
+
+    ``defs`` holds the kernel text. ``logfilt`` rewrites one ``float4`` of
+    ``pixbuf`` per thread, in place: pixels whose ``w`` component is not
+    positive are skipped; the rest are log-scaled using ``k1`` and ``k2``,
+    adjusted by ``gamma``, ``vibrancy`` and ``highpow``, and have their
+    color channels clamped to at most 1.
+
+    The pixel index is ``512 * blockIdx.x + threadIdx.x`` and is not
+    bounds-checked, so the launch must not address past the end of
+    ``pixbuf``.
+    """
+    defs = """
+__global__
+void logfilt(float4 *pixbuf, float k1, float k2,
+             float gamma, float vibrancy, float highpow) {
+    // TODO: test if over an edge of the framebuffer
+    int i = 512 * blockIdx.x + threadIdx.x;
+    float4 pix = pixbuf[i];
+
+    // Pixels whose w component is not positive are left untouched.
+    if (pix.w <= 0) return;
+
+    // Log-scale all four channels. Literals carry an 'f' suffix so the
+    // arithmetic is not promoted to double precision.
+    float ls = k1 * logf(1.0f + pix.w * k2) / pix.w;
+    pix.x *= ls;
+    pix.y *= ls;
+    pix.z *= ls;
+    pix.w *= ls;
+
+    // Keep the log-scaled pixel for the (1 - vibrancy) term at the end.
+    float4 opix = pix;
+
+    // TODO: linearized bottom range
+    float alpha = powf(pix.w, gamma);
+    ls = vibrancy * alpha / pix.w;
+
+    float maxc = fmaxf(pix.x, fmaxf(pix.y, pix.z));
+    // Guard the reciprocal: with maxc == 0 a plain 1 / maxc is inf, and the
+    // blend in the else branch then computes 0 * inf = NaN, which the final
+    // fminf() turns into 1.0 (a white pixel). newls only carries a non-zero
+    // weight when maxc * ls > 1, which cannot hold for maxc == 0.
+    float newls = (maxc != 0.0f) ? 1.0f / maxc : 0.0f;
+
+    // TODO: detect if highlight power is globally disabled and drop
+    // this branch
+
+    if (maxc * ls > 1 && highpow >= 0) {
+        // TODO: does CUDA autopromote the int here to a float before GPU?
+        float lsratio = powf(newls / ls, highpow);
+
+        pix.x *= newls;
+        pix.y *= newls;
+        pix.z *= newls;
+        maxc  *= newls;
+
+        // Reduce saturation (according to the HSV model) by proportionally
+        // increasing the values of the other colors.
+
+        pix.x = maxc - (maxc - pix.x) * lsratio;
+        pix.y = maxc - (maxc - pix.y) * lsratio;
+        pix.z = maxc - (maxc - pix.z) * lsratio;
+
+    } else {
+        // The negated highpow is the blend weight between newls and ls; it
+        // is forced to 1 (plain ls scaling) when maxc * ls <= 1 or when the
+        // negated value exceeds 1.
+        highpow = -highpow;
+        if (highpow > 1 || maxc * ls <= 1) highpow = 1;
+        float adj = ((1.0f - highpow) * newls + highpow * ls);
+        pix.x *= adj;
+        pix.y *= adj;
+        pix.z *= adj;
+    }
+
+    // Add back the non-vibrant contribution and clamp each channel to 1.
+    pix.x = fminf(1.0f, pix.x + (1.0f - vibrancy) * powf(opix.x, gamma));
+    pix.y = fminf(1.0f, pix.y + (1.0f - vibrancy) * powf(opix.y, gamma));
+    pix.z = fminf(1.0f, pix.z + (1.0f - vibrancy) * powf(opix.z, gamma));
+
+    pixbuf[i] = pix;
+}
+"""
+
+
diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py
index e8318b4..4d947dc 100644
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@@ -10,7 +10,7 @@ from pycuda.compiler import SourceModule
 import numpy as np
 
 from fr0stlib.pyflam3 import flam3_interpolate
-from cuburn.code import mwc, variations
+from cuburn.code import mwc, variations, filter
 from cuburn.code.util import *
 from cuburn.render import Genome
 
@@ -109,7 +109,7 @@ void iter(mwc_st *msts, const iter_info *infos, float *accbuf, float *denbuf) {
         }
 
         // TODO: dither?
-        int i = ((int)((y + 1.0f) * 255.0f) * 512)
+        int i = ((int)((1.0f - y) * 255.0f) * 512)
               +  (int)((x + 1.0f) * 255.0f);
 
         // since info was declared const, C++ barfs unless it's loaded first
@@ -120,7 +120,6 @@ void iter(mwc_st *msts, const iter_info *infos, float *accbuf, float *denbuf) {
         accbuf[i*4+2]   += outcol.z;
         accbuf[i*4+3]   += outcol.w;
         denbuf[i] += 1.0f;
-
     }
 }
 """)
@@ -136,7 +135,7 @@ def silly(features, cps):
     seeds = mwc.MWC.make_seeds(512 * nsteps)
 
     iter = IterCode(features)
-    code = assemble_code(BaseCode, mwc.MWC, iter, iter.packer)
+    code = assemble_code(BaseCode, mwc.MWC, iter, iter.packer, filter.ColorClip)
     print code
     mod = SourceModule(code, options=['-use_fast_math'], keep=True)
 
@@ -168,10 +167,26 @@ def silly(features, cps):
     tref.set_format(cuda.array_format.UNSIGNED_INT8, 4)
     tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
 
+    abufd = cuda.to_device(abuf)
+    dbufd = cuda.to_device(dbuf)
+
     fun = mod.get_function("iter")
-    t = fun(InOut(seeds), In(infos), InOut(abuf), InOut(dbuf),
+    t = fun(InOut(seeds), In(infos), abufd, dbufd,
         block=(512,1,1), grid=(nsteps,1), time_kernel=True)
     print "Completed render in %g seconds" % t
 
+    f = np.float32
+
+    k1 = cp.contrast * cp.brightness * 268 / 256
+    area = 1
+    k2 = 4 / (cp.contrast * 5000)
+
+    fun = mod.get_function("logfilt")
+    t = fun(abufd, f(k1), f(k2),
+        f(1 / cp.gamma), f(cp.vibrancy), f(cp.highlight_power),
+        block=(512,1,1), grid=(512,1), time_kernel=True)
+    print "Completed color filtering in %g seconds" % t
+
+    abuf = cuda.from_device_like(abufd, abuf)
     return abuf, dbuf
 
diff --git a/main.py b/main.py
index 15ebd7d..028355c 100644
--- a/main.py
+++ b/main.py
@@ -48,7 +48,7 @@ def main(args):
     if '-g' not in args:
         return
 
-    imgbuf = (accum * 255).astype(np.uint8)
+    imgbuf = (np.minimum(accum * 255, 255)).astype(np.uint8)
 
     window = pyglet.window.Window(1600, 900)
     image = pyglet.image.ImageData(512, 512, 'RGBA', imgbuf.tostring())