Add faster no-L1 accum

2025-12-20 12:06:17 -05:00 · 2011-10-15 00:32:30 -04:00
parent dd645bcbf6
commit c7728d3507
3 changed files with 65 additions and 17 deletions
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@ -41,6 +41,60 @@ void write_pix(float4 &pix, float den) {
    {{endif}}
    pix.w = den;
 }
+
+__device__
+void update_pix(uint64_t ptr, uint32_t i, float4 c) {
+    {{if features.pal_has_alpha}}
+    asm volatile ({{crep('''
+    {
+        .reg .u16       sz, sw;
+        .reg .u64       base, off;
+        .reg .f32       x, y, z, w, den, rc, tz, tw;
+
+        // TODO: this limits the accumulation buffer to <4GB
+        shl.b32         %0,     %0,     4;
+        cvt.u64.u32     off,    %0;
+        add.u64         base,   %1,     off;
+        ld.cg.v4.f32    {x, y, z, den},         [base];
+        add.f32         x,      x,      %2;
+        add.f32         y,      y,      %3;
+        mov.b32         {sz, sw},       z;
+        cvt.rn.f32.u16  tz,     sz;
+        cvt.rn.f32.u16  tw,     sw;
+        mul.f32         tz,     tz,     den;
+        mul.f32         tw,     tz,     den;
+        fma.f32         tz,     %4,     65535.0,    tz;
+        fma.f32         tw,     %5,     65535.0,    tw;
+        add.f32         den,    1.0;
+        rcp.approx.f32  rc,     den;
+        mul.f32         tz,     tz,     rc;
+        mul.f32         tw,     tw,     rc;
+        cvt.rni.u16.f32 sz,     tz;
+        cvt.rni.u16.f32 sw,     tw;
+        mov.b32         z,      {sz, sw};
+        st.cs.v4.f32    [base], {x, y, z, den};
+    }
+    ''')}} : "+r"(i) : "l"(ptr), "f"(c.x), "f"(c.y), "f"(c.z), "f"(c.w));
+    {{else}}
+    asm volatile ({{crep('''
+    {
+        .reg .u64       base, off;
+        .reg .f32       x, y, z, den;
+
+        // TODO: this limits the accumulation buffer to <4GB
+        shl.b32         %0,     %0,     4;
+        cvt.u64.u32     off,    %0;
+        add.u64         base,   %1,     off;
+        ld.cg.v4.f32    {x, y, z, den},         [base];
+        add.f32         x,      x,      %2;
+        add.f32         y,      y,      %3;
+        add.f32         z,      z,      %4;
+        add.f32         den,    den,    1.0;
+        st.cs.v4.f32    [base], {x, y, z, den};
+    }
+    ''')}} : "+r"(i) : "l"(ptr), "f"(c.x), "f"(c.y), "f"(c.z));
+    {{endif}}
+}
 """)

    def _xfbody(self, xfid, xform):
@ -86,7 +140,7 @@ void apply_xf{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
    def _iterbody(self):
        tmpl = Template(r'''
 __global__
-void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
+void iter(mwc_st *msts, iter_info *infos, uint64_t accbuf_ptr) {
    __shared__ int nsamps;
    mwc_st rctx = msts[gtid()];
    iter_info *info_glob = &(infos[blockIdx.x]);
@ -223,18 +277,8 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
        uint32_t i = iy * {{features.acc_stride}} + ix;

        float4 outcol = tex2D(palTex, color, {{packer.get("cp_step_frac")}});
-        float4 pix = accbuf[i];
-        float den;
-        // TODO: unify read/write_pix cycle when alpha is needed
-        read_pix(pix, den);
-        pix.x += outcol.x;
-        pix.y += outcol.y;
-        pix.z += outcol.z;
-        pix.w += outcol.w;
-        den += 1.0f;
+        update_pix(accbuf_ptr, i, outcol);

-        write_pix(pix, den);
-        accbuf[i] = pix;
    }
    msts[gtid()] = rctx;
 }
--- a/cuburn/code/util.py
+++ b/cuburn/code/util.py
@ -5,9 +5,13 @@ Provides tools and miscellaneous functions for building device code.
 import numpy as np
 import tempita

+def crep(s):
+    """Escape for PTX assembly"""
+    return '"%s"' % s.encode("string_escape")
+
 class Template(tempita.Template):
    default_namespace = tempita.Template.default_namespace.copy()
-Template.default_namespace.update({'np': np})
+Template.default_namespace.update({'np': np, 'crep': crep})

 class HunkOCode(object):
    """An apparently passive container for device code."""
--- a/cuburn/render.py
+++ b/cuburn/render.py
@ -320,7 +320,7 @@ class _AnimRenderer(object):
            d_seeds = self.d_seeds if on_main else self.d_alt_seeds

            if not d_seeds:
-                seeds = mwc.MWC.make_seeds(iter.IterCode.NTHREADS *
+                seeds = mwc.MWC.make_seeds(a._iter.NTHREADS *
                                           self.cps_per_block)
                if self.sync:
                    d_seeds = cuda.to_device(seeds)
@ -363,9 +363,9 @@ class _AnimRenderer(object):
                h_infos[:] = infos
                cuda.memcpy_htod_async(d_info_off, h_infos, stream)

-            # TODO: get block config from IterCode
-            iter_fun(d_seeds, np.uintp(d_info_off), self.d_accum,
-                     block=(32, 16, 1), grid=(len(block_times), 1),
+            iter_fun(d_seeds, np.uintp(d_info_off), np.uint64(self.d_accum),
+                     block=(32, a._iter.NTHREADS/32, 1),
+                     grid=(len(block_times), 1),
                     texrefs=[tref], stream=stream)

            if self.sync and self.sleep: