diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py index 4089288..99c3645 100644 --- a/cuburn/code/iter.py +++ b/cuburn/code/iter.py @@ -41,6 +41,60 @@ void write_pix(float4 &pix, float den) { {{endif}} pix.w = den; } + +__device__ +void update_pix(uint64_t ptr, uint32_t i, float4 c) { + {{if features.pal_has_alpha}} + asm volatile ({{crep(''' + { + .reg .u16 sz, sw; + .reg .u64 base, off; + .reg .f32 x, y, z, w, den, rc, tz, tw; + + // TODO: this limits the accumulation buffer to <4GB + shl.b32 %0, %0, 4; + cvt.u64.u32 off, %0; + add.u64 base, %1, off; + ld.cg.v4.f32 {x, y, z, den}, [base]; + add.f32 x, x, %2; + add.f32 y, y, %3; + mov.b32 {sz, sw}, z; + cvt.rn.f32.u16 tz, sz; + cvt.rn.f32.u16 tw, sw; + mul.f32 tz, tz, den; + mul.f32 tw, tz, den; + fma.f32 tz, %4, 65535.0, tz; + fma.f32 tw, %5, 65535.0, tw; + add.f32 den, 1.0; + rcp.approx.f32 rc, den; + mul.f32 tz, tz, rc; + mul.f32 tw, tw, rc; + cvt.rni.u16.f32 sz, tz; + cvt.rni.u16.f32 sw, tw; + mov.b32 z, {sz, sw}; + st.cs.v4.f32 [base], {x, y, z, den}; + } + ''')}} : "+r"(i) : "l"(ptr), "f"(c.x), "f"(c.y), "f"(c.z), "f"(c.w)); + {{else}} + asm volatile ({{crep(''' + { + .reg .u64 base, off; + .reg .f32 x, y, z, den; + + // TODO: this limits the accumulation buffer to <4GB + shl.b32 %0, %0, 4; + cvt.u64.u32 off, %0; + add.u64 base, %1, off; + ld.cg.v4.f32 {x, y, z, den}, [base]; + add.f32 x, x, %2; + add.f32 y, y, %3; + add.f32 z, z, %4; + add.f32 den, den, 1.0; + st.cs.v4.f32 [base], {x, y, z, den}; + } + ''')}} : "+r"(i) : "l"(ptr), "f"(c.x), "f"(c.y), "f"(c.z)); + {{endif}} +} """) def _xfbody(self, xfid, xform): @@ -86,7 +140,7 @@ void apply_xf{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) { def _iterbody(self): tmpl = Template(r''' __global__ -void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) { +void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) { __shared__ int nsamps; mwc_st rctx = msts[gtid()]; iter_info *info_glob = &(infos[blockIdx.x]); @@ -223,18 +277,8 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) { uint32_t i = iy * {{features.acc_stride}} + ix; float4 outcol = tex2D(palTex, color, {{packer.get("cp_step_frac")}}); - float4 pix = accbuf[i]; - float den; - // TODO: unify read/write_pix cycle when alpha is needed - read_pix(pix, den); - pix.x += outcol.x; - pix.y += outcol.y; - pix.z += outcol.z; - pix.w += outcol.w; - den += 1.0f; + update_pix(accbuf_ptr, i, outcol); - write_pix(pix, den); - accbuf[i] = pix; } msts[gtid()] = rctx; } diff --git a/cuburn/code/util.py b/cuburn/code/util.py index f6e8dcb..fc6eeeb 100644 --- a/cuburn/code/util.py +++ b/cuburn/code/util.py @@ -5,9 +5,13 @@ Provides tools and miscellaneous functions for building device code. import numpy as np import tempita +def crep(s): + """Escape for PTX assembly""" + return '"%s"' % s.encode("string_escape") + class Template(tempita.Template): default_namespace = tempita.Template.default_namespace.copy() -Template.default_namespace.update({'np': np}) +Template.default_namespace.update({'np': np, 'crep': crep}) class HunkOCode(object): """An apparently passive container for device code.""" diff --git a/cuburn/render.py b/cuburn/render.py index 887b7f8..87a1acd 100644 --- a/cuburn/render.py +++ b/cuburn/render.py @@ -320,7 +320,7 @@ class _AnimRenderer(object): d_seeds = self.d_seeds if on_main else self.d_alt_seeds if not d_seeds: - seeds = mwc.MWC.make_seeds(iter.IterCode.NTHREADS * + seeds = mwc.MWC.make_seeds(a._iter.NTHREADS * self.cps_per_block) if self.sync: d_seeds = cuda.to_device(seeds) @@ -363,9 +363,9 @@ class _AnimRenderer(object): h_infos[:] = infos cuda.memcpy_htod_async(d_info_off, h_infos, stream) - # TODO: get block config from IterCode - iter_fun(d_seeds, np.uintp(d_info_off), self.d_accum, - block=(32, 16, 1), grid=(len(block_times), 1), + iter_fun(d_seeds, np.uintp(d_info_off), np.uint64(self.d_accum), + block=(32, a._iter.NTHREADS/32, 1), + grid=(len(block_times), 1), texrefs=[tref], stream=stream) if self.sync and self.sleep: