mirror of
				https://github.com/stevenrobertson/cuburn.git
				synced 2025-11-03 18:00:55 -05:00 
			
		
		
		
	Saving unsuccessful separable filtering code
This commit is contained in:
		@ -63,4 +63,127 @@ void logfilt(float4 *pixbuf, float k1, float k2,
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
"""
 | 
					"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class DensityEst(HunkOCode):
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    NOTE: for now, this *must* be invoked with a block size of (32,16,1), and
 | 
				
			||||||
 | 
					    a grid size of (W/32) for vertical filtering or (H/32) for horizontal.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    It will probably fail for images that are not multiples of 32.
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def __init__(self, features, cp):
 | 
				
			||||||
 | 
					        self.features, self.cp = features, cp
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @property
 | 
				
			||||||
 | 
					    def defs(self):
 | 
				
			||||||
 | 
					        return self.defs_tmpl.substitute(features=self.features, cp=self.cp)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    defs_tmpl = Template("""
 | 
				
			||||||
 | 
					#define W 15        // Filter width (regardless of standard deviation chosen)
 | 
				
			||||||
 | 
					#define W2 7        // Half of filter width, rounded down
 | 
				
			||||||
 | 
					#define NW 16       // Number of warps in each set of points
 | 
				
			||||||
 | 
					#define FW 30       // Width of local result storage per-lane (NW+W2+W2)
 | 
				
			||||||
 | 
					#define BX 32       // The size of a block's X dimension (== 1 warp)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					__global__
 | 
				
			||||||
 | 
					void density_est(float4 *pixbuf, float *denbuf, int vertical) {
 | 
				
			||||||
 | 
					    __shared__ float r[BX*FW], g[BX*FW], b[BX*FW], a[BX*FW];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    int ibase;    // The index of the first element within this lane's strip
 | 
				
			||||||
 | 
					    int imax;     // The maximum offset from the first element in the strip
 | 
				
			||||||
 | 
					    int istride;  // Number of indices until next point to filter
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if (vertical) {
 | 
				
			||||||
 | 
					        ibase = threadIdx.x + blockIdx.x * BX;
 | 
				
			||||||
 | 
					        imax = {{features.acc_height}};
 | 
				
			||||||
 | 
					        istride = {{features.acc_stride}};
 | 
				
			||||||
 | 
					    } else {
 | 
				
			||||||
 | 
					        ibase = (blockIdx.x * BX + threadIdx.x) * {{features.acc_stride}};
 | 
				
			||||||
 | 
					        imax = {{features.acc_width}};
 | 
				
			||||||
 | 
					        istride = 1;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for (int i = threadIdx.x + BX*threadIdx.y; i < BX*FW; i += NW * BX)
 | 
				
			||||||
 | 
					        r[i] = g[i] = b[i] = a[i] = 0.0f;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for (int i = threadIdx.y; i < imax; i += NW) {
 | 
				
			||||||
 | 
					        int idx = ibase+i*istride;
 | 
				
			||||||
 | 
					        float4 in = pixbuf[idx];
 | 
				
			||||||
 | 
					        float den = denbuf[idx];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        int j = (threadIdx.y + W2) * 32 + threadIdx.x;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        float sd = {{0.35 * cp.estimator}} / powf(den+1.0f, {{cp.estimator_curve}});
 | 
				
			||||||
 | 
					        {{if cp.estimator_minimum > 1}}
 | 
				
			||||||
 | 
					        sd = fmaxf(sd, {{cp.estimator_minimum}});
 | 
				
			||||||
 | 
					        {{endif}}
 | 
				
			||||||
 | 
					        sd *= sd;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        // TODO: log scaling here? matches flam3, but, ick
 | 
				
			||||||
 | 
					        // TODO: investigate harm caused by varying standard deviation in a
 | 
				
			||||||
 | 
					        // separable environment
 | 
				
			||||||
 | 
					        float coeff = rsqrtf(2.0f*M_PI*sd*sd);
 | 
				
			||||||
 | 
					        atomicAdd(r+j, in.x * coeff);
 | 
				
			||||||
 | 
					        atomicAdd(g+j, in.y * coeff);
 | 
				
			||||||
 | 
					        atomicAdd(b+j, in.z * coeff);
 | 
				
			||||||
 | 
					        atomicAdd(a+j, in.w * coeff);
 | 
				
			||||||
 | 
					        sd = -0.5/sd;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        // #pragma unroll
 | 
				
			||||||
 | 
					        for (int k = 1; k <= W2; k++) {
 | 
				
			||||||
 | 
					            float scale = exp(sd*k*k)*coeff;
 | 
				
			||||||
 | 
					            idx = j+k*32;
 | 
				
			||||||
 | 
					            atomicAdd(r+idx, in.x * scale);
 | 
				
			||||||
 | 
					            atomicAdd(g+idx, in.y * scale);
 | 
				
			||||||
 | 
					            atomicAdd(b+idx, in.z * scale);
 | 
				
			||||||
 | 
					            atomicAdd(a+idx, in.w * scale);
 | 
				
			||||||
 | 
					            idx = j-k*32;
 | 
				
			||||||
 | 
					            atomicAdd(r+idx, in.x * scale);
 | 
				
			||||||
 | 
					            atomicAdd(g+idx, in.y * scale);
 | 
				
			||||||
 | 
					            atomicAdd(b+idx, in.z * scale);
 | 
				
			||||||
 | 
					            atomicAdd(a+idx, in.w * scale);
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        __syncthreads();
 | 
				
			||||||
 | 
					        float4 out;
 | 
				
			||||||
 | 
					        j = threadIdx.y * BX + threadIdx.x;
 | 
				
			||||||
 | 
					        out.x = r[j];
 | 
				
			||||||
 | 
					        out.y = g[j];
 | 
				
			||||||
 | 
					        out.z = b[j];
 | 
				
			||||||
 | 
					        out.w = a[j];
 | 
				
			||||||
 | 
					        idx = ibase+(i-W2)*istride;
 | 
				
			||||||
 | 
					        if (idx > 0)
 | 
				
			||||||
 | 
					            pixbuf[idx] = out;
 | 
				
			||||||
 | 
					        __syncthreads();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        // TODO: shift instead of copying
 | 
				
			||||||
 | 
					        idx = threadIdx.x + BX * threadIdx.y;
 | 
				
			||||||
 | 
					        if (threadIdx.y < NW-2) {
 | 
				
			||||||
 | 
					            r[idx] = r[idx + BX*NW];
 | 
				
			||||||
 | 
					            g[idx] = g[idx + BX*NW];
 | 
				
			||||||
 | 
					            b[idx] = b[idx + BX*NW];
 | 
				
			||||||
 | 
					            a[idx] = a[idx + BX*NW];
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					        __syncthreads();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        r[idx + BX*(NW-2)] = 0.0f;
 | 
				
			||||||
 | 
					        g[idx + BX*(NW-2)] = 0.0f;
 | 
				
			||||||
 | 
					        b[idx + BX*(NW-2)] = 0.0f;
 | 
				
			||||||
 | 
					        a[idx + BX*(NW-2)] = 0.0f;
 | 
				
			||||||
 | 
					        __syncthreads();
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					""")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def invoke(self, mod, abufd, dbufd):
 | 
				
			||||||
 | 
					        fun = mod.get_function("density_est")
 | 
				
			||||||
 | 
					        t = fun(abufd, dbufd, np.int32(0),
 | 
				
			||||||
 | 
					                block=(32, 16, 1), grid=(self.features.acc_height/32,1),
 | 
				
			||||||
 | 
					                time_kernel=True)
 | 
				
			||||||
 | 
					        print "Horizontal density estimation: %g" % t
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        t = fun(abufd, dbufd, np.int32(1),
 | 
				
			||||||
 | 
					                block=(32, 16, 1), grid=(self.features.acc_width/32,1),
 | 
				
			||||||
 | 
					                time_kernel=True)
 | 
				
			||||||
 | 
					        print "Vertical density estimation: %g" % t
 | 
				
			||||||
 | 
				
			|||||||
@ -160,11 +160,13 @@ def render(features, cps):
 | 
				
			|||||||
    seeds = mwc.MWC.make_seeds(512 * nsteps)
 | 
					    seeds = mwc.MWC.make_seeds(512 * nsteps)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    iter = IterCode(features)
 | 
					    iter = IterCode(features)
 | 
				
			||||||
    code = assemble_code(BaseCode, mwc.MWC, iter.packer, iter, filter.ColorClip)
 | 
					    de = filter.DensityEst(features, cps[0])
 | 
				
			||||||
 | 
					    code = assemble_code(BaseCode, mwc.MWC, iter.packer, iter,
 | 
				
			||||||
 | 
					                         filter.ColorClip, de)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for lno, line in enumerate(code.split('\n')):
 | 
					    for lno, line in enumerate(code.split('\n')):
 | 
				
			||||||
        print '%3d %s' % (lno, line)
 | 
					        print '%3d %s' % (lno, line)
 | 
				
			||||||
    mod = SourceModule(code, keep=True,
 | 
					    mod = SourceModule(code,
 | 
				
			||||||
            options=['-use_fast_math', '-maxrregcount', '32'])
 | 
					            options=['-use_fast_math', '-maxrregcount', '32'])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cps_as_array = (Genome * len(cps))()
 | 
					    cps_as_array = (Genome * len(cps))()
 | 
				
			||||||
@ -222,6 +224,8 @@ def render(features, cps):
 | 
				
			|||||||
    area = features.width * features.height / cp.ppu ** 2
 | 
					    area = features.width * features.height / cp.ppu ** 2
 | 
				
			||||||
    k2 = 1 / (area * cp.adj_density)
 | 
					    k2 = 1 / (area * cp.adj_density)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    de.invoke(mod, abufd, dbufd)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    fun = mod.get_function("logfilt")
 | 
					    fun = mod.get_function("logfilt")
 | 
				
			||||||
    t = fun(abufd, f(k1), f(k2),
 | 
					    t = fun(abufd, f(k1), f(k2),
 | 
				
			||||||
        f(1 / cp.gamma), f(cp.vibrancy), f(cp.highlight_power),
 | 
					        f(1 / cp.gamma), f(cp.vibrancy), f(cp.highlight_power),
 | 
				
			||||||
@ -229,10 +233,10 @@ def render(features, cps):
 | 
				
			|||||||
    print "Completed color filtering in %g seconds" % t
 | 
					    print "Completed color filtering in %g seconds" % t
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    abuf = cuda.from_device_like(abufd, abuf)
 | 
					    abuf = cuda.from_device_like(abufd, abuf)
 | 
				
			||||||
    dbuf = cuda.from_device_like(dbufd, dbuf)
 | 
					 | 
				
			||||||
    return abuf, dbuf
 | 
					    return abuf, dbuf
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# TODO: find a better place to stick this code
 | 
					# TODO: find a better place to stick this code
 | 
				
			||||||
class MemBench(HunkOCode):
 | 
					class MemBench(HunkOCode):
 | 
				
			||||||
    decls = """
 | 
					    decls = """
 | 
				
			||||||
 | 
				
			|||||||
@ -112,6 +112,9 @@ class Features(object):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        self.width = genomes[0].width
 | 
					        self.width = genomes[0].width
 | 
				
			||||||
        self.height = genomes[0].height
 | 
					        self.height = genomes[0].height
 | 
				
			||||||
 | 
					        self.acc_width = genomes[0].width
 | 
				
			||||||
 | 
					        self.acc_height = genomes[0].height
 | 
				
			||||||
 | 
					        self.acc_stride = genomes[0].width
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class XFormFeatures(object):
 | 
					class XFormFeatures(object):
 | 
				
			||||||
    def __init__(self, xforms, xform_id):
 | 
					    def __init__(self, xforms, xform_id):
 | 
				
			||||||
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user