mirror of
https://github.com/stevenrobertson/cuburn.git
synced 2025-02-05 11:40:04 -05:00
Speed enhancement: alpha packing.
When the alpha channel is used in a color palette, the code now replaces the blue channel in the accumulation buffer with a pair of two U16s, which encode the values of the blue and alpha channels as a fraction of the value of the density. When the alpha channel is always 1.0, the blue channel works as normal. Density is now always the last element in the accumulation buffer. Eliminating the separate IO operations improved total runtime by more than 30% on my card, while the extra calculations reduced that to 20% when alpha was present (though that can be optimized further).
This commit is contained in:
parent
a052f7f4c5
commit
618b51b1b1
@ -15,7 +15,7 @@ void colorclip(float4 *pixbuf, float gamma, float vibrancy, float highpow,
|
||||
float4 pix = pixbuf[i];
|
||||
|
||||
if (pix.w <= 0) {
|
||||
pixbuf[i] = make_float4(bkgd.x, bkgd.y, bkgd.z, 0);
|
||||
pixbuf[i] = make_float4(bkgd.x, bkgd.y, bkgd.z, 0.0f);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -121,7 +121,9 @@ __device__ void de_add(int ibase, int ii, int jj, float4 scaled) {
|
||||
__global__
|
||||
void logscale(float4 *pixbuf, float4 *outbuf, float k1, float k2) {
|
||||
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
float den;
|
||||
float4 pix = pixbuf[i];
|
||||
read_pix(pix, den);
|
||||
|
||||
float ls = fmaxf(0, k1 * logf(1.0f + pix.w * k2) / pix.w);
|
||||
pix.x *= ls;
|
||||
@ -138,7 +140,7 @@ void logscale(float4 *pixbuf, float4 *outbuf, float k1, float k2) {
|
||||
#define MAX_SD 4.33333333f
|
||||
|
||||
__global__
|
||||
void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf,
|
||||
void density_est(float4 *pixbuf, float4 *outbuf,
|
||||
float est_sd, float neg_est_curve, float est_min,
|
||||
float k1, float k2) {
|
||||
for (int i = threadIdx.x + 32*threadIdx.y; i < FW2; i += 32)
|
||||
@ -151,7 +153,8 @@ void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf,
|
||||
+ blockIdx.x * 32 + threadIdx.x + W2;
|
||||
|
||||
float4 in = pixbuf[idx];
|
||||
float den = denbuf[idx];
|
||||
float den;
|
||||
read_pix(in, den);
|
||||
|
||||
if (in.w > 0 && den > 0) {
|
||||
float ls = k1 * logf(1.0f + in.w * k2) / in.w;
|
||||
@ -279,7 +282,7 @@ void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf,
|
||||
|
||||
''')
|
||||
|
||||
def invoke(self, mod, cp, abufd, obufd, dbufd, stream=None):
|
||||
def invoke(self, mod, cp, abufd, obufd, stream=None):
|
||||
# TODO: add no-est version
|
||||
# TODO: come up with a general way to average these parameters
|
||||
|
||||
@ -301,7 +304,7 @@ void density_est(float4 *pixbuf, float4 *outbuf, float *denbuf,
|
||||
neg_est_curve = np.float32(-cp.estimator_curve)
|
||||
est_min = np.float32(cp.estimator_minimum / 3.)
|
||||
fun = mod.get_function("density_est")
|
||||
fun(abufd, obufd, dbufd, est_sd, neg_est_curve, est_min, k1, k2,
|
||||
fun(abufd, obufd, est_sd, neg_est_curve, est_min, k1, k2,
|
||||
block=(32, 32, 1), grid=(self.features.acc_width/32, 1),
|
||||
stream=stream)
|
||||
|
||||
|
@ -16,13 +16,33 @@ class IterCode(HunkOCode):
|
||||
bodies = [self._xfbody(i,x) for i,x in enumerate(self.features.xforms)]
|
||||
bodies.append(iterbody)
|
||||
self.defs = '\n'.join(bodies)
|
||||
self.decls += self.pix_helpers.substitute(features=features)
|
||||
|
||||
decls = """
|
||||
// Note: for normalized lookups, uchar4 actually returns floats
|
||||
texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> palTex;
|
||||
__shared__ iter_info info;
|
||||
|
||||
"""
|
||||
|
||||
pix_helpers = Template("""
|
||||
__device__
|
||||
void read_pix(float4 &pix, float &den) {
|
||||
den = pix.w;
|
||||
{{if features.pal_has_alpha}}
|
||||
read_half(pix.z, pix.w, pix.z, den);
|
||||
{{endif}}
|
||||
}
|
||||
|
||||
__device__
|
||||
void write_pix(float4 &pix, float den) {
|
||||
{{if features.pal_has_alpha}}
|
||||
write_half(pix.z, pix.z, pix.w, den);
|
||||
{{endif}}
|
||||
pix.w = den;
|
||||
}
|
||||
""")
|
||||
|
||||
def _xfbody(self, xfid, xform):
|
||||
px = self.packer.view('info', 'xf%d_' % xfid)
|
||||
px.sub('xf', 'cp.xforms[%d]' % xfid)
|
||||
@ -204,12 +224,17 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
|
||||
|
||||
float4 outcol = tex2D(palTex, color, {{packer.get("cp_step_frac")}});
|
||||
float4 pix = accbuf[i];
|
||||
float den;
|
||||
// TODO: unify read/write_pix cycle when alpha is needed
|
||||
read_pix(pix, den);
|
||||
pix.x += outcol.x;
|
||||
pix.y += outcol.y;
|
||||
pix.z += outcol.z;
|
||||
pix.w += outcol.w;
|
||||
accbuf[i] = pix; // TODO: atomic operations (or better)
|
||||
denbuf[i] += 1.0f;
|
||||
den += 1.0f;
|
||||
|
||||
write_pix(pix, den);
|
||||
accbuf[i] = pix;
|
||||
}
|
||||
}
|
||||
''')
|
||||
|
@ -50,7 +50,7 @@ class BaseCode(HunkOCode):
|
||||
#include<stdio.h>
|
||||
"""
|
||||
|
||||
defs = """
|
||||
defs = r"""
|
||||
#undef M_E
|
||||
#undef M_LOG2E
|
||||
#undef M_LOG10E
|
||||
@ -103,6 +103,39 @@ void zero_dptr(float* dptr, int size) {
|
||||
dptr[i] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
__device__
|
||||
void read_half(float &x, float &y, float xy, float den) {
|
||||
asm("\n\t{"
|
||||
"\n\t .reg .u16 x, y;"
|
||||
"\n\t .reg .f32 rc;"
|
||||
"\n\t mov.b32 {x, y}, %2;"
|
||||
"\n\t mul.f32 rc, %3, 0f37800000;" // 1/65536.
|
||||
"\n\t cvt.rn.f32.u16 %0, x;"
|
||||
"\n\t cvt.rn.f32.u16 %1, y;"
|
||||
"\n\t mul.f32 %0, %0, rc;"
|
||||
"\n\t mul.f32 %1, %1, rc;"
|
||||
"\n\t}"
|
||||
: "=f"(x), "=f"(y) : "f"(xy), "f"(den));
|
||||
}
|
||||
|
||||
__device__
|
||||
void write_half(float &xy, float x, float y, float den) {
|
||||
asm("\n\t{"
|
||||
"\n\t .reg .u16 x, y;"
|
||||
"\n\t .reg .f32 rc, xf, yf;"
|
||||
"\n\t rcp.approx.f32 rc, %3;"
|
||||
"\n\t mul.f32 rc, rc, 65536.0;"
|
||||
"\n\t mul.f32 xf, %1, rc;"
|
||||
"\n\t mul.f32 yf, %2, rc;"
|
||||
"\n\t cvt.rni.u16.f32 x, xf;"
|
||||
"\n\t cvt.rni.u16.f32 y, yf;"
|
||||
"\n\t mov.b32 %0, {x, y};"
|
||||
"\n\t}"
|
||||
: "=f"(xy) : "f"(x), "f"(y), "f"(den));
|
||||
}
|
||||
|
||||
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
|
@ -229,7 +229,6 @@ class _AnimRenderer(object):
|
||||
memset(byref(self._cen_cp), 0, sizeof(self._cen_cp))
|
||||
|
||||
self.nbins = anim.features.acc_height * anim.features.acc_stride
|
||||
self.d_den = cuda.mem_alloc(4 * self.nbins)
|
||||
self.d_accum = cuda.mem_alloc(16 * self.nbins)
|
||||
self.d_out = cuda.mem_alloc(16 * self.nbins)
|
||||
self.d_infos = cuda.mem_alloc(anim._iter.packer.align * self.ncps)
|
||||
@ -245,8 +244,6 @@ class _AnimRenderer(object):
|
||||
a._interp(cen_time, cen_cp)
|
||||
palette = self._interp_colors(cen_time, cen_cp)
|
||||
|
||||
util.BaseCode.zero_dptr(a.mod, self.d_den, self.nbins,
|
||||
self.stream)
|
||||
util.BaseCode.zero_dptr(a.mod, self.d_accum, 4 * self.nbins,
|
||||
self.stream)
|
||||
|
||||
@ -311,7 +308,7 @@ class _AnimRenderer(object):
|
||||
# TODO: get block config from IterCode
|
||||
# TODO: print timing information
|
||||
iter_fun(self.d_seeds[b], np.uint64(d_info_off),
|
||||
self.d_accum, self.d_den, texrefs=[tref],
|
||||
self.d_accum, texrefs=[tref],
|
||||
block=(32, 16, 1), grid=(len(block_times), 1),
|
||||
stream=self.stream)
|
||||
|
||||
@ -327,8 +324,7 @@ class _AnimRenderer(object):
|
||||
|
||||
util.BaseCode.zero_dptr(a.mod, self.d_out, 4 * self.nbins,
|
||||
self.stream)
|
||||
a._de.invoke(a.mod, Genome(cen_cp),
|
||||
self.d_accum, self.d_out, self.d_den,
|
||||
a._de.invoke(a.mod, Genome(cen_cp), self.d_accum, self.d_out,
|
||||
self.stream)
|
||||
|
||||
f = np.float32
|
||||
@ -381,11 +377,6 @@ class _AnimRenderer(object):
|
||||
g = a.features.gutter
|
||||
obuf_dim = (a.features.acc_height, a.features.acc_stride, 4)
|
||||
out = cuda.from_device(self.d_out, obuf_dim, np.float32)
|
||||
#dacc = cuda.from_device(self.d_accum, obuf_dim, np.float32)
|
||||
#daccw = dacc[:,:,3]
|
||||
#print daccw.sum()
|
||||
# TODO: performance?
|
||||
g = a.features.gutter
|
||||
out = np.delete(out, np.s_[:g], axis=0)
|
||||
out = np.delete(out, np.s_[:g], axis=1)
|
||||
out = np.delete(out, np.s_[-g:], axis=0)
|
||||
@ -447,6 +438,10 @@ class Features(object):
|
||||
else:
|
||||
self.final_xform_index = None
|
||||
|
||||
alphas = np.array([c.color[3] for g in genomes
|
||||
for c in g.palette.entries])
|
||||
self.pal_has_alpha = np.any(alphas != 1.0)
|
||||
|
||||
self.max_cps = max([cp.ntemporal_samples for cp in genomes])
|
||||
|
||||
self.width = genomes[0].width
|
||||
|
Loading…
Reference in New Issue
Block a user