New atomic write mode

2025-10-14 00:51:35 -04:00 · 2011-12-10 12:18:00 -05:00
parent a0fd3f965f
commit 6c50e6dadc
3 changed files with 130 additions and 26 deletions
--- a/cuburn/code/interp.py
+++ b/cuburn/code/interp.py
@ -270,8 +270,8 @@ void interp_palette_hsv_flat(mwc_st *rctxs,
    uint32_t r = min(255, (uint32_t) (rgba.x * 255.0f + 0.49f * mwc_next_11(rctx)));
    uint32_t g = min(255, (uint32_t) (rgba.y * 255.0f + 0.49f * mwc_next_11(rctx)));
    uint32_t b = min(255, (uint32_t) (rgba.z * 255.0f + 0.49f * mwc_next_11(rctx)));
-    out.x = (1 << 22) | (r << 4);
-    out.y = (g << 18) | b;
+    out.y = (1 << 22) | (r << 4);
+    out.x = (g << 18) | b;
    surf2Dwrite(out, flatpal, 8 * threadIdx.x, blockIdx.x);
    rctxs[gid] = rctx;
 }
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@ -187,8 +187,10 @@ void iter(
        uint64_t out_ptr,
        mwc_st *msts,
        float4 *points,
-        const iter_params *all_params,
-        int nsamps_to_generate
+        const iter_params *all_params
+{{if info.acc_mode == 'atomic'}}
+      , uint64_t atom_ptr
+{{endif}}
 ) {
    const iter_params *global_params = &(all_params[blockIdx.x]);

@ -207,10 +209,14 @@ void iter(
    int this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;
    mwc_st rctx = msts[this_rb_idx];

-{{if info.acc_mode != 'deferred'}}
+{{if info.acc_mode == 'global'}}
    __shared__ float time_frac;
    time_frac = blockIdx.x / (float) gridDim.x;
 {{else}}
+    {{if info.acc_mode == 'atomic'}}
+    // TODO: spare the register, reuse at call site?
+    int time = blockIdx.x >> 4;
+    {{endif}}
    float color_dither = 0.49f * mwc_next_11(rctx);
 {{endif}}

@ -224,10 +230,10 @@ void iter(
    // Shared memory size can be reduced by a factor of four using a slower
    // 4-stage reduce, but on Fermi hardware shmem use isn't a bottleneck
    __shared__ float swap[{{4*NTHREADS}}];
-    __shared__ float cosel[{{NWARPS}}];
+    __shared__ float cosel[{{2*NWARPS}}];

    // This is normally done after the swap-sync in the main loop
-    if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
+    if (threadIdx.y == 0 && threadIdx.x < {{NWARPS*2}})
        cosel[threadIdx.x] = mwc_next_01(rctx);
    __syncthreads();
 {{endif}}
@ -295,7 +301,7 @@ void iter(
        __syncthreads();

        // We select the next xforms here, since we've just synced.
-        if (threadIdx.y == 0 && threadIdx.x < {{NWARPS}})
+        if (threadIdx.y == 0 && threadIdx.x < {{NWARPS*2}})
            cosel[threadIdx.x] = mwc_next_01(rctx);

        fuse = swap[sr];
@ -345,15 +351,60 @@ void iter(
        }

        uint32_t i = iy * acc_size.stride + ix;
-
 {{if info.acc_mode == 'atomic'}}
-        float4 outcol = tex2D(palTex, cc, time_frac);
-        float *accbuf_f = reinterpret_cast<float*>(out_ptr + (16*i));
-        atomicAdd(accbuf_f,   outcol.x);
-        atomicAdd(accbuf_f+1, outcol.y);
-        atomicAdd(accbuf_f+2, outcol.z);
-        atomicAdd(accbuf_f+3, 1.0f);
-{{elif info.acc_mode == 'global'}}
+        asm volatile ({{crep("""
+{
+    .reg .pred  p;
+    .reg .u32   off, color, hi, lo, d, r, g, b;
+    .reg .f32   colorf, rf, gf, bf, df;
+    .reg .u64   ptr, val;
+
+    // TODO: coord dithering better, or pre-supersampled palette?
+    fma.rn.ftz.f32      colorf, %0,     255.0,  %1;
+    cvt.rni.u32.f32     color,  colorf;
+    shl.b32             color,  color,  3;
+
+    suld.b.2d.v2.b32.clamp      {lo, hi},   [flatpal, {color, %2}];
+    mov.b64             val,    {lo, hi};
+    shl.b32             off,    %3,     3;
+    cvt.u64.u32         ptr,    off;
+    add.u64             ptr,    ptr,    %4;
+    setp.le.f32         p,      %5,     0.98;
+@p  red.global.add.u64  [ptr],  val;
+@p  bra                 oflow_end;
+    atom.global.add.u64 val,    [ptr],  val;
+    mov.b64             {lo, hi},       val;
+    setp.lo.u32         p,      hi,     (256 << 22);
+@p  bra                 oflow_end;
+    atom.global.exch.b64 val,   [ptr],  0;
+    mov.b64             {lo, hi},       val;
+    shr.u32             d,      hi,     22;
+    bfe.u32             r,      hi,     4,      18;
+    bfe.u32             g,      lo,     18,     14;
+    bfi.b32             g,      hi,     g,      14,     4;
+    and.b32             b,      lo,     ((1<<18)-1);
+    cvt.rn.f32.u32      rf,     r;
+    cvt.rn.f32.u32      gf,     g;
+    cvt.rn.f32.u32      bf,     b;
+    cvt.rn.f32.u32      df,     d;
+    mul.rn.ftz.f32      rf,     rf,     (1.0/255.0);
+    mul.rn.ftz.f32      gf,     gf,     (1.0/255.0);
+    mul.rn.ftz.f32      bf,     bf,     (1.0/255.0);
+    shl.b32             off,    %3,     4;
+    cvt.u64.u32         ptr,    off;
+    add.u64             ptr,    ptr,    %6;
+    red.global.add.f32  [ptr],          rf;
+    red.global.add.f32  [ptr+4],        gf;
+    red.global.add.f32  [ptr+8],        bf;
+    red.global.add.f32  [ptr+12],       df;
+oflow_end:
+}
+        """)}}  ::  "f"(cc), "f"(color_dither), "r"(time), "r"(i),
+                    "l"(atom_ptr), "f"(cosel[threadIdx.y + {{NWARPS}}]),
+                    "l"(out_ptr));
+{{endif}}
+
+{{if info.acc_mode == 'global'}}
        float4 outcol = tex2D(palTex, cc, time_frac);
        float4 *accbuf = reinterpret_cast<float4*>(out_ptr + (16*i));
        float4 pix = *accbuf;
@ -380,6 +431,47 @@ void iter(
    return;
 }

+{{if info.acc_mode == 'atomic'}}
+__global__
+void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
+    int i = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
+    asm volatile ({{crep("""
+{
+    .reg .u32   off, hi, lo, d, r, g, b;
+    .reg .u64   val, ptr;
+    .reg .f32   rf, gf, bf, df, rg, gg, bg, dg;
+
+    // TODO: use explicit movs to handle this
+    shl.b32             off,    %0,     3;
+    cvt.u64.u32         ptr,    off;
+    add.u64             ptr,    ptr,    %1;
+    ld.global.v2.u32    {lo, hi},   [ptr];
+    shl.b32             off,    %0,     4;
+    cvt.u64.u32         ptr,    off;
+    add.u64             ptr,    ptr,    %2;
+    ld.global.v4.f32    {rg,gg,bg,dg},  [ptr];
+    shr.u32             d,      hi,     22;
+    bfe.u32             r,      hi,     4,      18;
+    bfe.u32             g,      lo,     18,     14;
+    bfi.b32             g,      hi,     g,      14,     4;
+    and.b32             b,      lo,     ((1<<18)-1);
+    cvt.rn.f32.u32      rf,     r;
+    cvt.rn.f32.u32      gf,     g;
+    cvt.rn.f32.u32      bf,     b;
+    cvt.rn.f32.u32      df,     d;
+    fma.rn.ftz.f32      rg,     rf,     (1.0/255.0),    rg;
+    fma.rn.ftz.f32      gg,     gf,     (1.0/255.0),    gg;
+    fma.rn.ftz.f32      bg,     bf,     (1.0/255.0),    bg;
+    add.rn.ftz.f32      dg,     df,     dg;
+    st.global.v4.f32    [ptr],  {rg,gg,bg,dg};
+}
+    """)}}  ::  "r"(i), "l"(atom_ptr), "l"(out_ptr));
+}
+
+{{endif}}
+
+{{if info.acc_mode == 'deferred'}}
+
 // Block size, shared accumulation bits, shared accumulation width.
 #define BS 1024
 #define SHAB 12
@ -462,8 +554,8 @@ write_shmem(
    shl.b32         color,  color,  3;
    cvt.rni.u32.f32 time,   %1;

-    suld.b.2d.v2.b32.clamp  {hi, lo},   [flatpal, {color, time}];
-    ld.shared.v2.u32    {hiw, low},     [shoff];
+    suld.b.2d.v2.b32.clamp  {lo, hi},   [flatpal, {color, time}];
+    ld.shared.v2.u32    {low, hiw},     [shoff];
    add.cc.u32          lo,     lo,     low;
    addc.u32            hi,     hi,     hiw;
    setp.hs.u32         q,      hi,     (1023 << 22);
@ -513,7 +605,7 @@ oflow_write_end:
        asm({{crep("""
 {
    .reg .u32 hi, lo;
-    ld.shared.v2.u32   {hi, lo},    [%4];
+    ld.shared.v2.u32   {lo, hi},    [%4];
    shr.u32         %0,     hi,     22;
    bfe.u32         %1,     hi,     4,      18;
    bfe.u32         %2,     lo,     18,     14;
@ -530,7 +622,7 @@ oflow_write_end:
        glo_idx += (BS << 8);
    }
 }
-
+{{endif}}
 ''', 'iter_kern')
        return tmpl.substitute(
                info = self.info,
--- a/cuburn/render.py
+++ b/cuburn/render.py
@ -8,7 +8,7 @@ from itertools import cycle, repeat, chain, izip
 from ctypes import *
 from cStringIO import StringIO
 import numpy as np
-from numpy import int32 as i32
+from numpy import int32 as i32, uint64 as u64
 from scipy import ndimage

 from fr0stlib import pyflam3
@ -110,7 +110,6 @@ class Renderer(object):
        reset_rb_fun = self.mod.get_function("reset_rb")
        packer_fun = self.mod.get_function("interp_iter_params")
        iter_fun = self.mod.get_function("iter")
-        write_fun = self.mod.get_function("write_shmem")

        info = self.info

@ -119,6 +118,7 @@ class Renderer(object):
        filt_stream = cuda.Stream()
        if info.acc_mode == 'deferred':
            write_stream = cuda.Stream()
+            write_fun = self.mod.get_function("write_shmem")
        else:
            write_stream = iter_stream

@ -132,6 +132,9 @@ class Renderer(object):
        # Extra padding in accum helps with write_shmem overruns
        d_accum = cuda.mem_alloc(16 * nbins + (1<<16))
        d_out = cuda.mem_alloc(16 * nbins)
+        if info.acc_mode == 'atomic':
+            d_atom = cuda.mem_alloc(8 * nbins)
+            flush_fun = self.mod.get_function("flush_atom")

        acc_size = np.array([info.acc_width, info.acc_height, info.acc_stride])
        d_acc_size = self.mod.get_global('acc_size')[0]
@ -191,7 +194,7 @@ class Renderer(object):
        d_palint_vals = cuda.to_device(
                np.concatenate(map(info.db.palettes.get, pals[1::2])))

-        if info.acc_mode == 'deferred':
+        if info.acc_mode in ('deferred', 'atomic'):
            palette_fun = self.mod.get_function("interp_palette_hsv_flat")
            dsc = cuda.ArrayDescriptor3D()
            dsc.height = info.palette_height
@ -227,7 +230,7 @@ class Renderer(object):

        for idx, start, stop in times:
            twidth = np.float32((stop-start) / info.palette_height)
-            if info.acc_mode == 'deferred':
+            if info.acc_mode in ('deferred', 'atomic'):
                palette_fun(d_seeds, d_palint_times, d_palint_vals,
                            np.float32(start), twidth,
                            block=(256,1,1), grid=(info.palette_height,1),
@ -257,6 +260,8 @@ class Renderer(object):
                #print '%60s %g' % ('_'.join(n), i)

            util.BaseCode.fill_dptr(self.mod, d_accum, 4 * nbins, write_stream)
+            if info.acc_mode == 'atomic':
+                util.BaseCode.fill_dptr(self.mod, d_atom, 2 * nbins, write_stream)
            nrounds = ( (info.density * info.width * info.height)
                      / (ntemporal_samples * 256 * 256) ) + 1
            if info.acc_mode == 'deferred':
@ -273,9 +278,16 @@ class Renderer(object):
                              block=(1024, 1, 1), grid=(nwriteblocks, 1),
                              stream=write_stream)
            else:
-                iter_fun(np.uint64(d_accum), d_seeds, d_points, d_infos,
-                         block=(32, self._iter.NTHREADS/32, 1),
+                args = [u64(d_accum), d_seeds, d_points, d_infos]
+                if info.acc_mode == 'atomic':
+                    args.append(u64(d_atom))
+                iter_fun(*args, block=(32, self._iter.NTHREADS/32, 1),
                         grid=(ntemporal_samples, nrounds), stream=iter_stream)
+                if info.acc_mode == 'atomic':
+                    nblocks = int(np.ceil(np.sqrt(nbins/float(512))))
+                    flush_fun(u64(d_accum), u64(d_atom), i32(nbins),
+                              block=(512, 1, 1), grid=(nblocks, nblocks),
+                              stream=iter_stream)

            util.BaseCode.fill_dptr(self.mod, d_out, 4 * nbins, filt_stream)
            _sync_stream(filt_stream, write_stream)