Refactor API

--HG-- rename : cuburn/code/filter.py => cuburn/code/filtering.py
2025-07-12 03:05:14 -04:00 · 2011-06-11 15:59:10 -04:00
parent 6f3c27007a
commit e79df46c66
6 changed files with 362 additions and 142 deletions
--- a/cuburn/code/filtering.py
+++ b/cuburn/code/filtering.py
@ -223,24 +223,22 @@ void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf,

 """)

-    def invoke(self, mod, abufd, obufd, dbufd):
+    def invoke(self, mod, abufd, obufd, dbufd, stream=None):
        # TODO: add no-est version
        # TODO: come up with a general way to average these parameters

        k1 = self.cp.brightness * 268 / 256
        area = self.features.acc_width * self.features.acc_height / self.cp.ppu ** 2
        k2 = 1 / (area * self.cp.adj_density)
-        print k1, k2, area

        if self.cp.estimator == 0:
            fun = mod.get_function("logscale")
            t = fun(abufd, obufd, np.float32(k1), np.float32(k2),
                    block=(self.features.acc_width, 1, 1),
-                    grid=(self.features.acc_height, 1), time_kernel=True)
+                    grid=(self.features.acc_height, 1), stream=stream)
        else:
            fun = mod.get_function("density_est")
-            t = fun(abufd, obufd, dbufd, np.float32(k1), np.float32(k2),
-                    block=(32, 32, 1), grid=(self.features.acc_width/32, 1),
-                    time_kernel=True)
-            print "Density estimation: %g" % t
+            fun(abufd, obufd, dbufd, np.float32(k1), np.float32(k2),
+                block=(32, 32, 1), grid=(self.features.acc_width/32, 1),
+                stream=stream)

--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@ -2,20 +2,13 @@
 The main iteration loop.
 """

-from ctypes import byref, memset, sizeof
-
-import pycuda.driver as cuda
-from pycuda.driver import In, Out, InOut
-from pycuda.compiler import SourceModule
-import numpy as np
-from scipy import ndimage
-
-from fr0stlib.pyflam3 import flam3_interpolate
-from cuburn.code import mwc, variations, filter
+from cuburn.code import mwc, variations
 from cuburn.code.util import *
-from cuburn.render import Genome

 class IterCode(HunkOCode):
+    # The number of threads per block
+    NTHREADS = 512
+
    def __init__(self, features):
        self.features = features
        self.packer = DataPacker('iter_info')
@ -69,14 +62,14 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
    iter_info *info_glob = &(infos[blockIdx.x]);

    // load info to shared memory cooperatively
-    for (int i = threadIdx.y * 32 + threadIdx.x;
+    for (int i = threadIdx.y * blockDim.x + threadIdx.x;
         i * 4 < sizeof(iter_info); i += blockDim.x * blockDim.y)
        reinterpret_cast<float*>(&info)[i] =
            reinterpret_cast<float*>(info_glob)[i];

    int consec_bad = -{{features.fuse}};
-    // TODO: make nsteps adjustable via genome
-    int nsamps = {{packer.get('cp.width * cp.height / 512000. * cp.adj_density')}};
+    // TODO: remove '512' constant
+    int nsamps = {{packer.get('cp.width * cp.height / (cp.ntemporal_samples * 512.) * cp.adj_density')}};

    float x, y, color;
    x = mwc_next_11(&rctx);
@ -157,86 +150,3 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
                packer = self.packer.view('info'),
                **globals())

-def render(features, cps):
-    # TODO: make this adjustable via genome
-    nsteps = 1000
-    abuf = np.zeros((features.acc_height, features.acc_stride, 4), dtype=np.float32)
-    dbuf = np.zeros((features.acc_height, features.acc_stride), dtype=np.float32)
-    seeds = mwc.MWC.make_seeds(512 * nsteps)
-
-    iter = IterCode(features)
-    de = filter.DensityEst(features, cps[0])
-    code = assemble_code(BaseCode, mwc.MWC, iter.packer, iter,
-                         filter.ColorClip, de)
-
-    for lno, line in enumerate(code.split('\n')):
-        print '%3d %s' % (lno, line)
-    mod = SourceModule(code,
-            options=['-use_fast_math', '-maxrregcount', '32'])
-
-    cps_as_array = (Genome * len(cps))()
-    for i, cp in enumerate(cps):
-        cps_as_array[i] = cp
-
-    infos = []
-    pal = np.empty((16, 256, 4), dtype=np.uint8)
-
-    # TODO: move this into a common function
-    if len(cps) > 1:
-        cp = Genome()
-        memset(byref(cp), 0, sizeof(cp))
-
-        sampAt = [int(i/15.*(nsteps-1)) for i in range(16)]
-        for n in range(nsteps):
-            flam3_interpolate(cps_as_array, 2, float(n)/nsteps - 0.5,
-                              0, byref(cp))
-            cp._init()
-            if n in sampAt:
-                pidx = sampAt.index(n)
-                for i, e in enumerate(cp.palette.entries):
-                    pal[pidx][i] = np.uint8(np.array(e.color) * 255.0)
-            infos.append(iter.packer.pack(cp=cp, cp_step_frac=float(n)/nsteps))
-    else:
-        for i, e in enumerate(cps[0].palette.entries):
-            pal[0][i] = np.uint8(np.array(e.color) * 255.0)
-        pal[1:] = pal[0]
-        infos.append(iter.packer.pack(cp=cps[0], cp_step_frac=0))
-        infos *= nsteps
-
-    infos = np.concatenate(infos)
-
-    dpal = cuda.make_multichannel_2d_array(pal, 'C')
-    tref = mod.get_texref('palTex')
-    tref.set_array(dpal)
-    tref.set_format(cuda.array_format.UNSIGNED_INT8, 4)
-    tref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
-    tref.set_filter_mode(cuda.filter_mode.LINEAR)
-
-    abufd = cuda.to_device(abuf)
-    dbufd = cuda.to_device(dbuf)
-
-    fun = mod.get_function("iter")
-    fun.set_cache_config(cuda.func_cache.PREFER_L1)
-    t = fun(InOut(seeds), InOut(infos), abufd, dbufd,
-        block=(32,16,1), grid=(nsteps,1), time_kernel=True)
-    print "Completed render in %g seconds" % t
-
-    f = np.float32
-
-    npix = features.acc_width * features.acc_height
-
-    # TODO: just allocate
-    obufd = cuda.to_device(abuf)
-    dbuf = cuda.from_device_like(dbufd, dbuf)
-    dbuf = ndimage.filters.gaussian_filter(dbuf, 0.6)
-    dbufd = cuda.to_device(dbuf)
-    de.invoke(mod, abufd, obufd, dbufd)
-
-    fun = mod.get_function("colorclip")
-    t = fun(obufd, f(1 / cp.gamma), f(cp.vibrancy), f(cp.highlight_power),
-        block=(256,1,1), grid=(npix/256,1), time_kernel=True)
-    print "Completed color filtering in %g seconds" % t
-
-    abuf = cuda.from_device_like(obufd, abuf)
-    return abuf, dbuf
-
--- a/cuburn/code/util.py
+++ b/cuburn/code/util.py
@ -66,8 +66,26 @@ int trunca(float f) {
    asm("cvt.rni.s32.f32    %0,     %1;" : "=r"(ret) : "f"(f));
    return ret;
 }
+
+__global__
+void zero_dptr(float* dptr, int size) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < size) {
+        dptr[i] = 0.0f;
+    }
+}
 """

+    @staticmethod
+    def zero_dptr(mod, dptr, size, stream=None):
+        """
+        A memory zeroer which can be embedded in a stream. Size is the
+        number of 4-byte words in the pointer.
+        """
+        zero = mod.get_function("zero_dptr")
+        zero(dptr, np.int32(size), stream=stream,
+             block=(1024, 1, 1), grid=(size/1024+1, 1))
+
 class DataPackerView(object):
    """
    View of a data packer. Intended to be initialized using DataPacker.view().