diff --git a/cuburn/code/filtering.py b/cuburn/code/filtering.py index 3a64d48..2ad17bf 100644 --- a/cuburn/code/filtering.py +++ b/cuburn/code/filtering.py @@ -15,7 +15,7 @@ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow, float4 pix = pixbuf[i]; if (pix.w <= 0) { - pixbuf[i] = make_float4(bkgd.x, bkgd.y, bkgd.z, 0); + pixbuf[i] = make_float4(bkgd.x, bkgd.y, bkgd.z, 0.0f); return; } @@ -121,7 +121,9 @@ __device__ void de_add(int ibase, int ii, int jj, float4 scaled) { __global__ void logscale(float4 *pixbuf, float4 *outbuf, float k1, float k2) { int i = blockDim.x * blockIdx.x + threadIdx.x; + float den; float4 pix = pixbuf[i]; + read_pix(pix, den); float ls = fmaxf(0, k1 * logf(1.0f + pix.w * k2) / pix.w); pix.x *= ls; @@ -138,7 +140,7 @@ void logscale(float4 *pixbuf, float4 *outbuf, float k1, float k2) { #define MAX_SD 4.33333333f __global__ -void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf, +void density_est(float4 *pixbuf, float4 *outbuf, float est_sd, float neg_est_curve, float est_min, float k1, float k2) { for (int i = threadIdx.x + 32*threadIdx.y; i < FW2; i += 32) @@ -151,7 +153,8 @@ void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf, + blockIdx.x * 32 + threadIdx.x + W2; float4 in = pixbuf[idx]; - float den = denbuf[idx]; + float den; + read_pix(in, den); if (in.w > 0 && den > 0) { float ls = k1 * logf(1.0f + in.w * k2) / in.w; @@ -279,7 +282,7 @@ void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf, ''') - def invoke(self, mod, cp, abufd, obufd, dbufd, stream=None): + def invoke(self, mod, cp, abufd, obufd, stream=None): # TODO: add no-est version # TODO: come up with a general way to average these parameters @@ -301,7 +304,7 @@ void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf, neg_est_curve = np.float32(-cp.estimator_curve) est_min = np.float32(cp.estimator_minimum / 3.) fun = mod.get_function("density_est") - fun(abufd, obufd, dbufd, est_sd, neg_est_curve, est_min, k1, k2, + fun(abufd, obufd, est_sd, neg_est_curve, est_min, k1, k2, block=(32, 32, 1), grid=(self.features.acc_width/32, 1), stream=stream) diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py index a5f87f1..d0f978e 100644 --- a/cuburn/code/iter.py +++ b/cuburn/code/iter.py @@ -16,13 +16,33 @@ class IterCode(HunkOCode): bodies = [self._xfbody(i,x) for i,x in enumerate(self.features.xforms)] bodies.append(iterbody) self.defs = '\n'.join(bodies) + self.decls += self.pix_helpers.substitute(features=features) decls = """ // Note: for normalized lookups, uchar4 actually returns floats texture palTex; __shared__ iter_info info; + """ + pix_helpers = Template(""" +__device__ +void read_pix(float4 &pix, float &den) { + den = pix.w; + {{if features.pal_has_alpha}} + read_half(pix.z, pix.w, pix.z, den); + {{endif}} +} + +__device__ +void write_pix(float4 &pix, float den) { + {{if features.pal_has_alpha}} + write_half(pix.z, pix.z, pix.w, den); + {{endif}} + pix.w = den; +} +""") + def _xfbody(self, xfid, xform): px = self.packer.view('info', 'xf%d_' % xfid) px.sub('xf', 'cp.xforms[%d]' % xfid) @@ -204,12 +224,17 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) { float4 outcol = tex2D(palTex, color, {{packer.get("cp_step_frac")}}); float4 pix = accbuf[i]; + float den; + // TODO: unify read/write_pix cycle when alpha is needed + read_pix(pix, den); pix.x += outcol.x; pix.y += outcol.y; pix.z += outcol.z; pix.w += outcol.w; - accbuf[i] = pix; // TODO: atomic operations (or better) - denbuf[i] += 1.0f; + den += 1.0f; + + write_pix(pix, den); + accbuf[i] = pix; } } ''') diff --git a/cuburn/code/util.py b/cuburn/code/util.py index 83bee2f..5c119a9 100644 --- a/cuburn/code/util.py +++ b/cuburn/code/util.py @@ -50,7 +50,7 @@ class BaseCode(HunkOCode): #include """ - defs = """ + defs = r""" #undef M_E #undef M_LOG2E #undef M_LOG10E @@ -103,6 +103,39 @@ void zero_dptr(float* dptr, int size) { dptr[i] = 0.0f; } } + +__device__ +void read_half(float &x, float &y, float xy, float den) { + asm("\n\t{" + "\n\t .reg .u16 x, y;" + "\n\t .reg .f32 rc;" + "\n\t mov.b32 {x, y}, %2;" + "\n\t mul.f32 rc, %3, 0f37800000;" // 1/65536. + "\n\t cvt.rn.f32.u16 %0, x;" + "\n\t cvt.rn.f32.u16 %1, y;" + "\n\t mul.f32 %0, %0, rc;" + "\n\t mul.f32 %1, %1, rc;" + "\n\t}" + : "=f"(x), "=f"(y) : "f"(xy), "f"(den)); +} + +__device__ +void write_half(float &xy, float x, float y, float den) { + asm("\n\t{" + "\n\t .reg .u16 x, y;" + "\n\t .reg .f32 rc, xf, yf;" + "\n\t rcp.approx.f32 rc, %3;" + "\n\t mul.f32 rc, rc, 65536.0;" + "\n\t mul.f32 xf, %1, rc;" + "\n\t mul.f32 yf, %2, rc;" + "\n\t cvt.rni.u16.f32 x, xf;" + "\n\t cvt.rni.u16.f32 y, yf;" + "\n\t mov.b32 %0, {x, y};" + "\n\t}" + : "=f"(xy) : "f"(x), "f"(y), "f"(den)); +} + + """ @staticmethod diff --git a/cuburn/render.py b/cuburn/render.py index 2698e0a..90a8a7a 100644 --- a/cuburn/render.py +++ b/cuburn/render.py @@ -229,7 +229,6 @@ class _AnimRenderer(object): memset(byref(self._cen_cp), 0, sizeof(self._cen_cp)) self.nbins = anim.features.acc_height * anim.features.acc_stride - self.d_den = cuda.mem_alloc(4 * self.nbins) self.d_accum = cuda.mem_alloc(16 * self.nbins) self.d_out = cuda.mem_alloc(16 * self.nbins) self.d_infos = cuda.mem_alloc(anim._iter.packer.align * self.ncps) @@ -245,8 +244,6 @@ class _AnimRenderer(object): a._interp(cen_time, cen_cp) palette = self._interp_colors(cen_time, cen_cp) - util.BaseCode.zero_dptr(a.mod, self.d_den, self.nbins, - self.stream) util.BaseCode.zero_dptr(a.mod, self.d_accum, 4 * self.nbins, self.stream) @@ -311,7 +308,7 @@ class _AnimRenderer(object): # TODO: get block config from IterCode # TODO: print timing information iter_fun(self.d_seeds[b], np.uint64(d_info_off), - self.d_accum, self.d_den, texrefs=[tref], + self.d_accum, texrefs=[tref], block=(32, 16, 1), grid=(len(block_times), 1), stream=self.stream) @@ -327,8 +324,7 @@ class _AnimRenderer(object): util.BaseCode.zero_dptr(a.mod, self.d_out, 4 * self.nbins, self.stream) - a._de.invoke(a.mod, Genome(cen_cp), - self.d_accum, self.d_out, self.d_den, + a._de.invoke(a.mod, Genome(cen_cp), self.d_accum, self.d_out, self.stream) f = np.float32 @@ -381,11 +377,6 @@ class _AnimRenderer(object): g = a.features.gutter obuf_dim = (a.features.acc_height, a.features.acc_stride, 4) out = cuda.from_device(self.d_out, obuf_dim, np.float32) - #dacc = cuda.from_device(self.d_accum, obuf_dim, np.float32) - #daccw = dacc[:,:,3] - #print daccw.sum() - # TODO: performance? - g = a.features.gutter out = np.delete(out, np.s_[:g], axis=0) out = np.delete(out, np.s_[:g], axis=1) out = np.delete(out, np.s_[-g:], axis=0) @@ -447,6 +438,10 @@ class Features(object): else: self.final_xform_index = None + alphas = np.array([c.color[3] for g in genomes + for c in g.palette.entries]) + self.pal_has_alpha = np.any(alphas != 1.0) + self.max_cps = max([cp.ntemporal_samples for cp in genomes]) self.width = genomes[0].width