From 964b11efdfc57c64770469290ce4558d1dfc7070 Mon Sep 17 00:00:00 2001
From: Steven Robertson <steven@strobe.cc>
Date: Fri, 20 Jan 2012 11:27:21 -0500
Subject: [PATCH] Wavelet experiments

---
 cuburn/code/filtering.py | 275 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 243 insertions(+), 32 deletions(-)
diff --git a/cuburn/code/filtering.py b/cuburn/code/filtering.py
index 8f8ad63..355c50b 100644
--- a/cuburn/code/filtering.py
+++ b/cuburn/code/filtering.py
@@ -2,6 +2,7 @@
 import numpy as np
 from numpy import float32 as f32, int32 as i32
 
+import pycuda.driver as cuda
 import pycuda.compiler
 from pycuda.gpuarray import vec
 
@@ -106,45 +107,142 @@ void logscale(float4 *pixbuf, float4 *outbuf, float k1, float k2) {
     outbuf[i] = pix;
 }
 
-// SS must be no greater than 4x
+__device__ void fmav(float4 &dst, float4 src, float scale) {
+    dst.x += src.x * scale;
+    dst.y += src.y * scale;
+    dst.z += src.z * scale;
+    dst.w += src.w * scale;
+}
+
+// The 7-tap filters are missing the leading zero to compensate for delay. I
+// have no frigging clue what the theory behind that is, but it seems to work.
+__constant__ float daub97_lo[9] = {
+     0.03782845550726404f, -0.023849465019556843f, -0.11062440441843718f,
+     0.37740285561283066f,  0.8526986790088938f,    0.37740285561283066f,
+    -0.11062440441843718f, -0.023849465019556843f,  0.03782845550726404f
+};
+__constant__ float daub97_hi[9] = {
+                           -0.06453888262869706f,   0.04068941760916406f,
+     0.41809227322161724f, -0.7884856164055829f,    0.41809227322161724f,
+     0.04068941760916406f, -0.06453888262869706f,   0.0f,
+};
+__constant__ float daub97_ilo[9] = {
+                           -0.06453888262869706f,  -0.04068941760916406f,
+     0.41809227322161724f,  0.7884856164055829f,    0.41809227322161724f,
+    -0.04068941760916406f, -0.06453888262869706f,   0.0f
+};
+__constant__ float daub97_ihi[9] = {
+    -0.03782845550726404f, -0.023849465019556843f,  0.11062440441843718f,
+     0.37740285561283066f, -0.8526986790088938f,    0.37740285561283066f,
+     0.11062440441843718f, -0.023849465019556843f, -0.03782845550726404f
+};
+
+/*
+#define S 0.7071067811f
+__constant__ float daub97[4][9] = {
+  { 0, 0, 0, S, S}, {0, 0, 0, -S, S}, {0, 0, 0, S, S}, {0, 0, 0, S, -S}};
+*/
+
+texture<float4, cudaTextureType2D> conv_down_src;
 __global__
-void blur_density_h(float *scratch, const float4 *pixbuf, int astride) {
-    // TODO: respect supersampling? configurable blur width?
-    int x = blockIdx.x * blockDim.x + threadIdx.x;
-    if (x > astride) return;
-    int y = blockIdx.y;
-    const float *dsrc =
-        reinterpret_cast<const float*>(&pixbuf[astride * y]) + 3;
+void conv_down(float4 *dst, int astride, int as_eff, int ah_eff,
+               int vert, float xo, float yo) {
+    int xi = blockIdx.x * blockDim.x + threadIdx.x;
+    int yi = blockIdx.y;
 
-    float den = 0.0f;
+    float4 lo = make_float4(0, 0, 0, 0);
+    float4 hi = make_float4(0, 0, 0, 0);
 
-    den += dsrc[4*max(0, x-2)] * 0.00135f;
-    den += dsrc[4*max(0, x-1)] * 0.1573f;
-    den += dsrc[4*x] * 0.6827f;
-    den += dsrc[4*min(astride-1, x+1)] * 0.1573f;
-    den += dsrc[4*min(astride-1, x+2)] * 0.00135f;
-    scratch[astride * y + x] = den;
+    if (vert) {
+        if (xi >= as_eff) return;
+        float x = xi - xo, y = yi * 2 - yo;
+
+        #pragma unroll
+        for (int i = 0; i < 9; i++) {
+            float4 src = tex2D(conv_down_src, x, y);
+            fmav(lo, src, daub97_lo[i]);
+            fmav(hi, src, daub97_hi[i]);
+            y -= 1.0f;
+            if (y < 0) y += ah_eff;
+        }
+        dst[(yi + ah_eff / 2) * astride + xi] = hi;
+    } else {
+        if (xi >= as_eff / 2) return;
+        float x = xi * 2 - xo, y = yi - yo;
+
+        #pragma unroll
+        for (int i = 0; i < 9; i++) {
+            float4 src = tex2D(conv_down_src, x, y);
+            fmav(lo, src, daub97_lo[i]);
+            fmav(hi, src, daub97_hi[i]);
+            x -= 1.0f;
+            if (x < 0) x += as_eff;
+        }
+        dst[yi * astride + xi + as_eff / 2] = hi;
+    }
+    dst[yi * astride + xi] = lo;
+}
+
+texture<float4, cudaTextureType2D> conv_up_src_lo;
+texture<float4, cudaTextureType2D> conv_up_src_hi;
+__global__
+void conv_up(float4 *dst, int astride, int as_eff,
+             int vert, int xo, int yo) {
+    int xi = blockIdx.x * blockDim.x + threadIdx.x;
+    int yi = blockIdx.y;
+    if (xi >= as_eff) return;
+
+    float4 out = make_float4(0, 0, 0, 0);
+    if (vert) {
+        float x = xi, y = yi / 2;
+        for (int i = ~yi & 1; i < 9; i+=2) {
+            fmav(out, tex2D(conv_up_src_lo, x, y), daub97_ilo[8-i]);
+            fmav(out, tex2D(conv_up_src_hi, x, y), daub97_ihi[8-i]);
+            y += 1.0f;
+            if (y >= gridDim.y / 2) y = 0.0f;
+        }
+    } else {
+        float x = xi / 2, y = yi;
+        for (int i = ~xi & 1; i < 9; i+=2) {
+            fmav(out, tex2D(conv_up_src_lo, x, y), daub97_ilo[8-i]);
+            fmav(out, tex2D(conv_up_src_hi, x, y), daub97_ihi[8-i]);
+            x += 1.0f;
+            if (x >= as_eff / 2) x = 0.0f;
+        }
+    }
+    xi += xo;
+    yi += yo;
+    if (xi >= as_eff) xi -= as_eff;
+    if (yi >= gridDim.y) yi -= gridDim.y;
+    dst[yi * astride + xi] = out;
 }
 
 __global__
-void blur_density_v(float4 *pixbuf, const float *scratch,
-                    int astride, int aheight) {
-    // TODO: respect supersampling? configurable blur width?
+void simple_thresh(float4 *buf, int astride, float thr, int min_x, int min_y) {
     int x = blockIdx.x * blockDim.x + threadIdx.x;
-    if (x > astride) return;
     int y = blockIdx.y;
-    const float *dsrc = scratch + x;
+    if (x >= astride || (x < min_x && y < min_y)) return;
 
-    float den = 0.0f;
+    float4 val = buf[y * astride + x];
+    float fact = 1.0f - expf(powf(fabsf(val.w), 1.2) * -0.2f);
+    val.x *= fact;
+    val.y *= fact;
+    val.z *= fact;
+    val.w *= fact;
+    buf[y * astride + x] = val;
+}
 
-    den += dsrc[astride*max(0, y-2)] * 0.00135f;
-    den += dsrc[astride*max(0, y-1)] * 0.1573f;
-    den += dsrc[astride*y] * 0.6827f;
-    den += dsrc[astride*min(astride-1, y+1)] * 0.1573f;
-    den += dsrc[astride*min(astride-1, y+2)] * 0.00135f;
-
-    float *ddst = reinterpret_cast<float*>(&pixbuf[astride * y + x]) + 3;
-    *ddst = min(*ddst, den);
+__global__
+void fma_buf(float4 *dst, const float4 *sub, int astride, float scale) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int i = blockIdx.y * astride + x;
+    if (x >= astride) return;
+    float4 d = dst[i], s = sub[i];
+    d.x += s.x * scale;
+    d.y += s.y * scale;
+    d.z += s.z * scale;
+    d.w += s.w * scale;
+    dst[i] = d;
 }
 
 #define W 21        // Filter width (regardless of standard deviation chosen)
@@ -280,23 +378,136 @@ class Filtering(object):
 
     def __init__(self):
         self.init_mod()
+        self.scratch = None
 
     def blur_density(self, ddst, dscratch, dim, stream=None):
         blur_h = self.mod.get_function("blur_density_h")
         blur_h(dscratch, ddst, i32(dim.astride), block=(192, 1, 1),
-               grid=(dim.astride / 192, dim.ah), stream=stream)
+               grid=(1 + dim.astride / 192, dim.ah), stream=stream)
         blur_v = self.mod.get_function("blur_density_v")
         blur_v(ddst, dscratch, i32(dim.astride), i32(dim.ah), block=(192, 1, 1),
-               grid=(dim.astride / 192, dim.ah), stream=stream)
+               grid=(1 + dim.astride / 192, dim.ah), stream=stream)
 
     def de(self, ddst, dsrc, gnm, dim, tc, stream=None):
+        from cuburn.render import argset
+        print dim.ah
+        np.set_printoptions(linewidth=160, precision=4)
+
         k1 = f32(gnm.color.brightness(tc) * 268 / 256)
         # Old definition of area is (w*h/(s*s)). Since new scale 'ns' is now
         # s/w, new definition is (w*h/(s*s*w*w)) = (h/(s*s*w))
         area = dim.h / (gnm.camera.scale(tc) ** 2 * dim.w)
         k2 = f32(1.0 / (area * gnm.spp(tc)))
 
-        if gnm.de.radius == 0:
+        # Stride in bytes, buffer size
+        sb = 16 * dim.astride
+        bs = sb * dim.ah
+
+        if self.scratch is None:
+            self.scratch = cuda.mem_alloc(bs)
+            self.hi = cuda.mem_alloc(bs)
+        q = np.zeros((dim.ah, dim.astride * 4), dtype=np.float32)
+        q[100:102,128:1024] = 1
+        #cuda.memcpy_htod(dsrc, q)
+
+        dsc = argset(cuda.ArrayDescriptor(), height=dim.ah,
+                width=dim.astride, format=cuda.array_format.FLOAT,
+                num_channels=4)
+        bl = (192, 1, 1)
+        gr = lambda x, y: (1 + dim.astride / (192 << x), dim.ah >> y)
+
+        def get_tref(name):
+            tref = self.mod.get_texref(name)
+            tref.set_filter_mode(cuda.filter_mode.POINT)
+            tref.set_address_mode(0, cuda.address_mode.WRAP)
+            tref.set_address_mode(1, cuda.address_mode.WRAP)
+            print tref.get_address_mode(0)
+            print tref.get_address_mode(1)
+            return tref
+        conv_down_src, conv_up_src_lo, conv_up_src_hi = map(get_tref,
+                ['conv_down_src', 'conv_up_src_lo', 'conv_up_src_hi'])
+
+        conv_down = self.mod.get_function('conv_down')
+        conv_up = self.mod.get_function('conv_up')
+
+        memcpy = cuda.Memcpy2D()
+        memcpy.src_pitch = sb
+        memcpy.src_height = memcpy.dst_height = memcpy.height = dim.ah
+        memcpy.set_src_device(self.scratch)
+        memcpy.set_dst_device(self.hi)
+
+        STEPS=3
+
+        print dim
+        def th(x):
+            x = np.int64(x*1e6)
+            v = np.nonzero(x)[0]
+            print np.array((v, x[v]))
+
+        for xo, yo in [(0, 0), (1, 1), (3, 3), (1, 3), (3, 1), (2, 2), (0, 0)]:
+            for i in range(STEPS):
+                xon, yon = (xo, yo) if i == 0 else (0, 0)
+                as_eff, ah_eff = dim.astride >> i, dim.ah >> i
+                dsc.width, dsc.height = as_eff, ah_eff
+                conv_down_src.set_address_2d(dsrc, dsc, sb)
+                conv_down(self.scratch, i32(dim.astride),
+                          i32(as_eff), i32(ah_eff), i32(1), f32(xon), f32(yon),
+                          block=bl, grid=gr(i, i+1),
+                          texrefs=[conv_down_src], stream=stream)
+                #cuda.memcpy_dtod_async(self.scratch, dsrc, bs, stream)
+                conv_down_src.set_address_2d(self.scratch, dsc, sb)
+                conv_down(dsrc, i32(dim.astride),
+                          i32(as_eff), i32(ah_eff), i32(0), f32(xon), f32(yon),
+                          block=bl, grid=gr(i+1, i),
+                          texrefs=[conv_down_src], stream=stream)
+                #cuda.memcpy_dtod_async(dsrc, self.scratch, bs, stream)
+                #th(cuda.from_device_like(self.scratch, q).T[128])
+            cuda.memcpy_dtod_async(ddst, dsrc, bs, stream)
+
+            thresh = self.mod.get_function('simple_thresh')
+            thresh(dsrc, i32(dim.astride), f32(1),
+                   i32(dim.astride >> STEPS), i32(dim.ah >> STEPS),
+                   block=bl, grid=gr(0, 0), stream=stream)
+
+            for i in list(reversed(range(STEPS))):
+                xon, yon = (xo, yo) if i == 0 else (0, 0)
+                dsc.width, dsc.height = dim.astride >> i, dim.ah >> (i+1)
+                conv_up_src_lo.set_address_2d(dsrc, dsc, sb)
+                hi_addr = int(dsrc) + sb * (dim.ah >> (i+1))
+                conv_up_src_hi.set_address_2d(hi_addr, dsc, sb)
+                conv_up(self.scratch, i32(dim.astride), i32(dim.astride >> i),
+                        i32(1), i32(xon), i32(yon),
+                        block=bl, grid=gr(i, i), stream=stream,
+                        texrefs=[conv_up_src_lo, conv_up_src_hi])
+                sb_2 = sb >> (i+1)
+                memcpy.dst_pitch = sb_2
+                memcpy.width_in_bytes = sb_2
+                memcpy.src_x_in_bytes = sb_2
+                memcpy(stream)
+                dsc.width, dsc.height = dim.astride >> (i+1), dim.ah >> i
+                conv_up_src_lo.set_address_2d(self.scratch, dsc, sb)
+                conv_up_src_hi.set_address_2d(self.hi, dsc, sb>>(i+1))
+                conv_up(dsrc, i32(dim.astride), i32(dim.astride >> i),
+                        i32(0), i32(xon), i32(yon),
+                        block=bl, grid=gr(i, i), stream=stream,
+                        texrefs=[conv_up_src_lo, conv_up_src_hi])
+                #cuda.memcpy_dtod_async(dsrc, self.scratch, bs, stream)
+
+            #th(cuda.from_device_like(self.scratch, q).T[128])
+        cuda.memcpy_dtod_async(ddst, dsrc, bs, stream)
+
+        #y = cuda.from_device_like(self.scratch, q)
+        #print y[93:110,128].T
+
+        #print x[93:110,128].T + y[93:110,128].T
+
+        #fun = self.mod.get_function('fma_buf')
+        #fun(self.scratch, self.lo, i32(dim.astride), f32(1),
+            #block=bl, grid=gr, stream=stream)
+        #cuda.memcpy_dtod_async(ddst, self.scratch, bs, stream=stream)
+        #return
+
+        if gnm.de.radius(tc) == 0 or True:
             nbins = dim.ah * dim.astride
             fun = self.mod.get_function("logscale")
             t = fun(dsrc, ddst, k1, k2,