Add faster no-L1 accum

This commit is contained in:
Steven Robertson 2011-10-15 00:32:30 -04:00
parent dd645bcbf6
commit c7728d3507
3 changed files with 65 additions and 17 deletions

View File

@ -41,6 +41,60 @@ void write_pix(float4 &pix, float den) {
{{endif}}
pix.w = den;
}
__device__
void update_pix(uint64_t ptr, uint32_t i, float4 c) {
{{if features.pal_has_alpha}}
asm volatile ({{crep('''
{
.reg .u16 sz, sw;
.reg .u64 base, off;
.reg .f32 x, y, z, w, den, rc, tz, tw;
// TODO: this limits the accumulation buffer to <4GB
shl.b32 %0, %0, 4;
cvt.u64.u32 off, %0;
add.u64 base, %1, off;
ld.cg.v4.f32 {x, y, z, den}, [base];
add.f32 x, x, %2;
add.f32 y, y, %3;
mov.b32 {sz, sw}, z;
cvt.rn.f32.u16 tz, sz;
cvt.rn.f32.u16 tw, sw;
mul.f32 tz, tz, den;
mul.f32 tw, tz, den;
fma.f32 tz, %4, 65535.0, tz;
fma.f32 tw, %5, 65535.0, tw;
add.f32 den, 1.0;
rcp.approx.f32 rc, den;
mul.f32 tz, tz, rc;
mul.f32 tw, tw, rc;
cvt.rni.u16.f32 sz, tz;
cvt.rni.u16.f32 sw, tw;
mov.b32 z, {sz, sw};
st.cs.v4.f32 [base], {x, y, z, den};
}
''')}} : "+r"(i) : "l"(ptr), "f"(c.x), "f"(c.y), "f"(c.z), "f"(c.w));
{{else}}
asm volatile ({{crep('''
{
.reg .u64 base, off;
.reg .f32 x, y, z, den;
// TODO: this limits the accumulation buffer to <4GB
shl.b32 %0, %0, 4;
cvt.u64.u32 off, %0;
add.u64 base, %1, off;
ld.cg.v4.f32 {x, y, z, den}, [base];
add.f32 x, x, %2;
add.f32 y, y, %3;
add.f32 z, z, %4;
add.f32 den, den, 1.0;
st.cs.v4.f32 [base], {x, y, z, den};
}
''')}} : "+r"(i) : "l"(ptr), "f"(c.x), "f"(c.y), "f"(c.z));
{{endif}}
}
""")
def _xfbody(self, xfid, xform):
@ -86,7 +140,7 @@ void apply_xf{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
def _iterbody(self):
tmpl = Template(r'''
__global__
void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) {
__shared__ int nsamps;
mwc_st rctx = msts[gtid()];
iter_info *info_glob = &(infos[blockIdx.x]);
@ -223,18 +277,8 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
uint32_t i = iy * {{features.acc_stride}} + ix;
float4 outcol = tex2D(palTex, color, {{packer.get("cp_step_frac")}});
float4 pix = accbuf[i];
float den;
// TODO: unify read/write_pix cycle when alpha is needed
read_pix(pix, den);
pix.x += outcol.x;
pix.y += outcol.y;
pix.z += outcol.z;
pix.w += outcol.w;
den += 1.0f;
update_pix(accbuf_ptr, i, outcol);
write_pix(pix, den);
accbuf[i] = pix;
}
msts[gtid()] = rctx;
}

View File

@ -5,9 +5,13 @@ Provides tools and miscellaneous functions for building device code.
import numpy as np
import tempita
def crep(s):
"""Escape for PTX assembly"""
return '"%s"' % s.encode("string_escape")
class Template(tempita.Template):
default_namespace = tempita.Template.default_namespace.copy()
Template.default_namespace.update({'np': np})
Template.default_namespace.update({'np': np, 'crep': crep})
class HunkOCode(object):
"""An apparently passive container for device code."""

View File

@ -320,7 +320,7 @@ class _AnimRenderer(object):
d_seeds = self.d_seeds if on_main else self.d_alt_seeds
if not d_seeds:
seeds = mwc.MWC.make_seeds(iter.IterCode.NTHREADS *
seeds = mwc.MWC.make_seeds(a._iter.NTHREADS *
self.cps_per_block)
if self.sync:
d_seeds = cuda.to_device(seeds)
@ -363,9 +363,9 @@ class _AnimRenderer(object):
h_infos[:] = infos
cuda.memcpy_htod_async(d_info_off, h_infos, stream)
# TODO: get block config from IterCode
iter_fun(d_seeds, np.uintp(d_info_off), self.d_accum,
block=(32, 16, 1), grid=(len(block_times), 1),
iter_fun(d_seeds, np.uintp(d_info_off), np.uint64(self.d_accum),
block=(32, a._iter.NTHREADS/32, 1),
grid=(len(block_times), 1),
texrefs=[tref], stream=stream)
if self.sync and self.sleep: