"""
The main iteration loop.
"""

from cuburn.code import mwc, variations, interp
from cuburn.code.util import *

def precalc_densities(pcp, std_xforms):
    # This pattern recurs a few times for precalc segments. Unfortunately,
    # namespace stuff means it's not easy to functionalize this boilerplate
    pre_cp = pcp._precalc()
    pre_cp._code(Template(r"""
        float sum = 0.0f;

        {{for n in std_xforms}}
        float den_{{n}} = {{pre_cp.xforms[n].density}};
        sum += den_{{n}};
        {{endfor}}

        float rsum = 1.0f / sum;
        sum = 0.0f;

        {{for n in std_xforms[:-1]}}
        sum += den_{{n}} * rsum;
        {{pre_cp._set('den_' + n)}} = sum;
        {{endfor}}
    """, name='precalc_densities').substitute(locals()))

def precalc_chaos(pcp, std_xforms):
    pre_cp = pcp._precalc()
    pre_cp._code(Template("""
        float sum, rsum;

        {{for p in std_xforms}}
        sum = 0.0f;

        {{for n in std_xforms}}
        float den_{{p}}_{{n}} = {{pre_cp.xforms[p].chaos[n]}};
        sum += den_{{p}}_{{n}};
        {{endfor}}

        rsum = 1.0f / sum;
        sum = 0.0f;

        {{for n in std_xforms[:-1]}}
        sum += den_{{p}}_{{n}} * rsum;
        {{pre_cp._set('chaos_%s_%s' % (p, n))}} = sum;
        {{endfor}}
        {{endfor}}
    """, name='precalc_chaos').substitute(locals()))

def precalc_camera(pcam):
    pre_cam = pcam._precalc()

    # Maxima code to check my logic:
    #   matrix([1,0,0.5*width + g],[0,1,0.5*height+g],[0,0,1])
    # . matrix([width * scale,0,0], [0,width * scale,0], [0,0,1])
    # . matrix([cosr,-sinr,0], [sinr,cosr,0], [0,0,1])
    # . matrix([1,0,-cenx],[0,1,-ceny],[0,0,1])
    # . matrix([X],[Y],[1]);

    pre_cam._code(Template(r"""
        float rot = {{pre_cam.rotation}} * M_PI / 180.0f;
        float rotsin = sin(rot), rotcos = cos(rot);
        float cenx = {{pre_cam.center.x}}, ceny = {{pre_cam.center.y}};
        float scale = {{pre_cam.scale}} * acc_size.width;

        {{pre_cam._set('xx')}} = scale * rotcos;
        {{pre_cam._set('xy')}} = scale * -rotsin;
        {{pre_cam._set('xo')}} = scale * (rotsin * ceny - rotcos * cenx)
                               + 0.5f * acc_size.awidth;

        {{pre_cam._set('yx')}} = scale * rotsin;
        {{pre_cam._set('yy')}} = scale * rotcos;
        {{pre_cam._set('yo')}} = scale * -(rotsin * cenx + rotcos * ceny)
                               + 0.5f * acc_size.aheight;
    """).substitute(locals()))

def precalc_xf_affine(px):
    pre = px._precalc()
    pre._code(Template(r"""
        float pri = {{pre.angle}} * M_PI / 180.0f;
        float spr = {{pre.spread}} * M_PI / 180.0f;

        float magx = {{pre.magnitude.x}};
        float magy = {{pre.magnitude.y}};

        {{pre._set('xx')}} = magx * cos(pri-spr);
        {{pre._set('yx')}} = -magx * sin(pri-spr);
        {{pre._set('xy')}} = -magy * cos(pri+spr);
        {{pre._set('yy')}} = magy * sin(pri+spr);
        {{pre._set('xo')}} = {{pre.offset.x}};
        {{pre._set('yo')}} = -{{pre.offset.y}};
    """).substitute(locals()))

class IterCode(HunkOCode):
    # The number of threads per block
    NTHREADS = 256

    def __init__(self, info, genome):
        self.packer = interp.GenomePacker('iter_params')
        self.pcp = self.packer.view('params', genome, 'cp')

        iterbody = self._iterbody(info, genome)
        bodies = [self._xfbody(i,x) for i,x in sorted(genome.xforms.items())]
        bodies.append(iterbody)
        self.defs = '\n'.join(bodies)

    decls = """
// Note: for normalized lookups, uchar4 actually returns floats
texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> palTex;
__shared__ iter_params params;
__device__ int rb_head, rb_tail, rb_size;
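
// Accumulation-buffer geometry. width/height give the output image size in
// pixels; awidth/aheight appear to include the gutter added for filtering
// (note the '0.5*width + g' term in precalc_camera's Maxima check); astride
// is the row stride used when computing bin indices.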
typedef struct {
    uint32_t    width;
    uint32_t    height;
    uint32_t    awidth;
    uint32_t    aheight;
    uint32_t    astride;
} acc_size_t;
__constant__ acc_size_t acc_size;
"""

    def _xfbody(self, xfid, xform):
        px = self.pcp.xforms[xfid]
        tmpl = Template(r"""
__device__
void apply_xf_{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
    float tx, ty;

    {{precalc_xf_affine(px.affine)}}
    {{apply_affine('ox', 'oy', 'tx', 'ty', px.affine)}}

    ox = 0;
    oy = 0;

    {{for name in xform.variations}}
    if (1) {
    {{py:pv = px.variations[name]}}
        float w = {{pv.weight}};
        {{variations.var_code[name].substitute(locals())}}
    }
    {{endfor}}

    {{if 'post' in xform}}
    tx = ox;
    ty = oy;
    {{precalc_xf_affine(px.post)}}
    {{apply_affine('tx', 'ty', 'ox', 'oy', px.post)}}
    {{endif}}
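
    // Blend the point's color coordinate toward this xform's palette position;
    // color_speed sets how quickly the blend converges.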
    float csp = {{px.color_speed}};
    color = color * (1.0f - csp) + {{px.color}} * csp;
};
""", 'apply_xf_'+xfid)
        g = dict(globals())
        g.update(locals())
        return tmpl.substitute(g)

    def _iterbody(self, info, genome):
        tmpl = Template(r'''

__global__ void reset_rb(int size) {
    rb_head = rb_tail = 0;
    rb_size = size;
}
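
// The RNG and point state for every thread live in a global ring buffer
// indexed by rb_head/rb_tail. Each block claims a slot of 32 * blockDim.y
// entries when it starts and returns its state at the tail when it finishes,
// so trajectories persist across kernel launches regardless of launch order
// or grid size.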

__global__
void iter(
        uint64_t out_ptr,
        mwc_st *msts,
        float4 *points,
        const iter_params *all_params
{{if info.acc_mode == 'atomic'}}
        , uint64_t atom_ptr
{{endif}}
) {
    const iter_params *global_params = &(all_params[blockIdx.x]);

    // load params to shared memory cooperatively
    for (int i = threadIdx.y * blockDim.x + threadIdx.x;
         i < (sizeof(iter_params) / 4); i += blockDim.x * blockDim.y)
        reinterpret_cast<float*>(&params)[i] =
            reinterpret_cast<const float*>(global_params)[i];

    __shared__ int rb_idx;
    if (threadIdx.y == 1 && threadIdx.x == 1)
        rb_idx = 32 * blockDim.y * (atomicAdd(&rb_head, 1) % rb_size);

    __syncthreads();
    int this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;
    mwc_st rctx = msts[this_rb_idx];

    {{precalc_camera(pcp.camera)}}
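
    // Camera dither: one thread perturbs the block's shared camera offset by a
    // Gaussian sample (note log2f(u) / M_LOG2E == ln(u), so this is Box-Muller
    // with sigma of roughly a third of dither_width), giving the whole block a
    // single shared sub-pixel jitter for this launch.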
    if (threadIdx.y == 5 && threadIdx.x == 4) {
        float ditherwidth = {{pcp.camera.dither_width}} * 0.33f;
        float u0 = mwc_next_01(rctx);
        float r = ditherwidth * sqrtf(-2.0f * log2f(u0) / M_LOG2E);

        float u1 = 2.0f * M_PI * mwc_next_01(rctx);
        {{pcp.camera.xo}} += r * cos(u1);
        {{pcp.camera.yo}} += r * sin(u1);
    }

{{if info.acc_mode == 'global'}}
    __shared__ float time_frac;
    time_frac = blockIdx.x / (float) gridDim.x;
{{else}}
{{if info.acc_mode == 'atomic'}}
    // TODO: spare the register, reuse at call site?
    int time = blockIdx.x >> 4;
{{endif}}
    float color_dither = 0.49f * mwc_next_11(rctx);
{{endif}}

    // TODO: 4th channel unused. Kill or use for something helpful
    float4 old_point = points[this_rb_idx];
    float x = old_point.x, y = old_point.y, color = old_point.z;

{{if info.chaos_used}}
    int last_xf_used = 0;
{{else}}
    // Shared memory size can be reduced by a factor of four using a slower
    // 4-stage reduce, but on Fermi hardware shmem use isn't a bottleneck
    __shared__ float swap[{{4*NTHREADS}}];
    __shared__ float cosel[{{2*NWARPS}}];

    // This is normally done after the swap-sync in the main loop
    if (threadIdx.y == 0 && threadIdx.x < {{NWARPS*2}})
        cosel[threadIdx.x] = mwc_next_01(rctx);
    __syncthreads();
{{endif}}

    bool fuse = false;

    // This condition checks for large numbers, Infs, and NaNs.
    if (!(-(fabsf(x) + fabsf(y)) > -1.0e6f)) {
        x = mwc_next_11(rctx);
        y = mwc_next_11(rctx);
        color = mwc_next_01(rctx);
        fuse = true;
    }
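
    // A point that fails the sanity check above is re-seeded at random and
    // marked as fused: it keeps iterating, but its samples are skipped at
    // accumulation time (the 'if (fuse) continue' below), which appears to
    // serve as the usual fuse period for not-yet-converged points.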

    // TODO: link up with FUSE, etc
    for (int round = 0; round < 256; round++) {

{{if info.chaos_used}}

        {{precalc_chaos(pcp, std_xforms)}}

        // For now, we don't attempt to use the swap buffer when chaos is used
        float xfsel = mwc_next_01(rctx);

        {{for prior_xform_idx, prior_xform_name in enumerate(std_xforms)}}
        if (last_xf_used == {{prior_xform_idx}}) {
            {{for xform_idx, xform_name in enumerate(std_xforms[:-1])}}
            if (xfsel <= {{pcp['chaos_'+prior_xform_name+'_'+xform_name]}}) {
                apply_xf_{{xform_name}}(x, y, color, rctx);
                last_xf_used = {{xform_idx}};
            } else
            {{endfor}}
            {
                apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);
                last_xf_used = {{len(std_xforms)-1}};
            }
        } else
        {{endfor}}
        {
            printf("Something went *very* wrong.\n");
            asm("trap;");
        }

{{else}}

        {{precalc_densities(pcp, std_xforms)}}
        float xfsel = cosel[threadIdx.y];

        {{for xform_name in std_xforms[:-1]}}
        if (xfsel <= {{pcp['den_'+xform_name]}}) {
            apply_xf_{{xform_name}}(x, y, color, rctx);
        } else
        {{endfor}}
            apply_xf_{{std_xforms[-1]}}(x, y, color, rctx);

        // Rotate points between threads.
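        // Each warp shares a single selector (cosel[threadIdx.y]) per round, so
        // all 32 of its points apply the same xform; swapping points between
        // threads each round keeps any one trajectory from following a single
        // warp's correlated sequence of xform choices.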
        int swr = threadIdx.y + threadIdx.x
                + (round & 1) * (threadIdx.x / {{NTHREADS / 32}});
        int sw = (swr * 32 + threadIdx.x) & {{NTHREADS-1}};
        int sr = threadIdx.y * 32 + threadIdx.x;

        swap[sw] = fuse ? 1.0f : 0.0f;
        swap[sw+{{NTHREADS}}] = x;
        swap[sw+{{2*NTHREADS}}] = y;
        swap[sw+{{3*NTHREADS}}] = color;
        __syncthreads();

        // We select the next xforms here, since we've just synced.
        if (threadIdx.y == 0 && threadIdx.x < {{NWARPS*2}})
            cosel[threadIdx.x] = mwc_next_01(rctx);

        fuse = swap[sr];
        x = swap[sr+{{NTHREADS}}];
        y = swap[sr+{{2*NTHREADS}}];
        color = swap[sr+{{3*NTHREADS}}];

{{endif}}

{{if info.acc_mode == 'deferred'}}
        int tid = threadIdx.y * 32 + threadIdx.x;
        int offset = 4 * (256 * (256 * blockIdx.x + round) + tid);
        int *log = reinterpret_cast<int*>(out_ptr + offset);
{{endif}}

        if (fuse) {
{{if info.acc_mode == 'deferred'}}
            *log = 0xffffffff;
{{endif}}
            continue;
        }

{{if 'final' in cp.xforms}}
        float fx = x, fy = y, fcolor = color;
        apply_xf_final(fx, fy, fcolor, rctx);
{{endif}}

        float cx, cy, cc;

{{if 'final' in cp.xforms}}
        {{apply_affine('fx', 'fy', 'cx', 'cy', pcp.camera)}}
        cc = fcolor;
{{else}}
        {{apply_affine('x', 'y', 'cx', 'cy', pcp.camera)}}
        cc = color;
{{endif}}

        uint32_t ix = trunca(cx), iy = trunca(cy);

        if (ix >= acc_size.astride || iy >= acc_size.aheight) {
{{if info.acc_mode == 'deferred'}}
            *log = 0xffffffff;
{{endif}}
            continue;
        }

        uint32_t i = iy * acc_size.astride + ix;

{{if info.acc_mode == 'atomic'}}
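        // The atomic path packs each bin into one 64-bit word: density in
        // hi[31:22] and red, green and blue in three 18-bit fields spread over
        // the remaining bits (see the unpacking below). Most samples are added
        // with a fire-and-forget red.global.add; when the warp's selector
        // exceeds 0.97 (roughly 3% of rounds) a full atomic add is used so the
        // thread can detect density overflow, in which case the bin is swapped
        // to zero and its contents are unpacked and folded into the float4
        // accumulator at out_ptr.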
        asm volatile ({{crep("""
{
.reg .pred p;
.reg .u32 off, color, hi, lo, d, r, g, b;
.reg .f32 colorf, rf, gf, bf, df;
.reg .u64 ptr, val;
// TODO: coord dithering better, or pre-supersampled palette?
fma.rn.ftz.f32 colorf, %0, 255.0, %1;
cvt.rni.u32.f32 color, colorf;
shl.b32 color, color, 3;
suld.b.2d.v2.b32.clamp {lo, hi}, [flatpal, {color, %2}];
mov.b64 val, {lo, hi};
shl.b32 off, %3, 3;
cvt.u64.u32 ptr, off;
add.u64 ptr, ptr, %4;
setp.le.f32 p, %5, 0.97;
@p red.global.add.u64 [ptr], val;
@p bra oflow_end;
atom.global.add.u64 val, [ptr], val;
mov.b64 {lo, hi}, val;
setp.lo.u32 p, hi, (256 << 22);
@p bra oflow_end;
atom.global.exch.b64 val, [ptr], 0;
mov.b64 {lo, hi}, val;
shr.u32 d, hi, 22;
bfe.u32 r, hi, 4, 18;
bfe.u32 g, lo, 18, 14;
bfi.b32 g, hi, g, 14, 4;
and.b32 b, lo, ((1<<18)-1);
cvt.rn.f32.u32 rf, r;
cvt.rn.f32.u32 gf, g;
cvt.rn.f32.u32 bf, b;
cvt.rn.f32.u32 df, d;
mul.rn.ftz.f32 rf, rf, (1.0/255.0);
mul.rn.ftz.f32 gf, gf, (1.0/255.0);
mul.rn.ftz.f32 bf, bf, (1.0/255.0);
shl.b32 off, %3, 4;
cvt.u64.u32 ptr, off;
add.u64 ptr, ptr, %6;
red.global.add.f32 [ptr], rf;
red.global.add.f32 [ptr+4], gf;
red.global.add.f32 [ptr+8], bf;
red.global.add.f32 [ptr+12], df;
oflow_end:
}
""")}} :: "f"(cc), "f"(color_dither), "r"(time), "r"(i),
"l"(atom_ptr), "f"(cosel[threadIdx.y + {{NWARPS}}]),
"l"(out_ptr));
{{endif}}

{{if info.acc_mode == 'global'}}
        float4 outcol = tex2D(palTex, cc, time_frac);
        float4 *accbuf = reinterpret_cast<float4*>(out_ptr + (16*i));
        float4 pix = *accbuf;
        pix.x += outcol.x;
        pix.y += outcol.y;
        pix.z += outcol.z;
        pix.w += 1.0f;
        *accbuf = pix;
{{elif info.acc_mode == 'deferred'}}
        // 'color' gets the top 8 bits. TODO: add dithering via precalc.
        uint32_t icolor = fminf(1.0f, cc) * 255.0f + color_dither;
        asm("bfi.b32 %0, %1, %0, 24, 8;" : "+r"(i) : "r"(icolor));
        *log = i;
{{endif}}
    }

    if (threadIdx.x == 0 && threadIdx.y == 0)
        rb_idx = 32 * blockDim.y * (atomicAdd(&rb_tail, 1) % rb_size);
    __syncthreads();
    this_rb_idx = rb_idx + threadIdx.x + 32 * threadIdx.y;

    points[this_rb_idx] = make_float4(x, y, color, 0.0f);
    msts[this_rb_idx] = rctx;
    return;
}
{{if info.acc_mode == 'atomic'}}
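
// flush_atom is intended to run once iteration is finished in the 'atomic'
// accumulation mode: it unpacks whatever remains in the packed 64-bit atomic
// buffer and folds it into the float4 accumulation buffer at out_ptr.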
__global__
void flush_atom(uint64_t out_ptr, uint64_t atom_ptr, int nbins) {
    int i = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    if (i >= nbins) return;

    asm volatile ({{crep("""
{
.reg .u32 off, hi, lo, d, r, g, b;
.reg .u64 val, ptr;
.reg .f32 rf, gf, bf, df, rg, gg, bg, dg;
// TODO: use explicit movs to handle this
shl.b32 off, %0, 3;
cvt.u64.u32 ptr, off;
add.u64 ptr, ptr, %1;
ld.global.v2.u32 {lo, hi}, [ptr];
shl.b32 off, %0, 4;
cvt.u64.u32 ptr, off;
add.u64 ptr, ptr, %2;
ld.global.v4.f32 {rg,gg,bg,dg}, [ptr];
shr.u32 d, hi, 22;
bfe.u32 r, hi, 4, 18;
bfe.u32 g, lo, 18, 14;
bfi.b32 g, hi, g, 14, 4;
and.b32 b, lo, ((1<<18)-1);
cvt.rn.f32.u32 rf, r;
cvt.rn.f32.u32 gf, g;
cvt.rn.f32.u32 bf, b;
cvt.rn.f32.u32 df, d;
fma.rn.ftz.f32 rg, rf, (1.0/255.0), rg;
fma.rn.ftz.f32 gg, gf, (1.0/255.0), gg;
fma.rn.ftz.f32 bg, bf, (1.0/255.0), bg;
add.rn.ftz.f32 dg, df, dg;
st.global.v4.f32 [ptr], {rg,gg,bg,dg};
}
""")}} :: "r"(i), "l"(atom_ptr), "l"(out_ptr));
}
{{endif}}
{{if info.acc_mode == 'deferred'}}
// Block size, shared accumulation bits, shared accumulation width.
#define BS 1024
#define SHAB 12
#define SHAW (1<<SHAB)
// Read the point log, accumulate in shared memory, and write the results.
// This kernel is to be launched with one block for every 4,096 addresses to
// be processed, and will handle those addresses.
//
// log_bounds is an array mapping radix values to the first index in the log
// with that radix position. For performance reasons in other parts of the
// code, the radix may actually include bits within the lower SHAB part of the
// address, or it might not cover the first few bits after the SHAB part;
// log_bounds_shift covers that. glob_addr_bits specifies the number of bits
// above SHAB which are address bits.
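//
// With BS = 1024 and SHAB = 12 as defined above, each block owns a
// 4,096-address tile of the accumulator, held in shared memory as two u32
// words per address and flushed to the global float4 buffer at the end of
// the kernel.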

__global__ void
__launch_bounds__(BS, 1)
write_shmem(
        float4 *acc,
        const uint32_t *log,
        const uint32_t *log_bounds,
        uint32_t nbins
) {
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

    // Shared accumulators. Each of the block's 4,096 addresses gets two u32
    // words, which together hold the packed {density, red, green, blue}
    // accumulator for that address.
    __shared__ uint32_t s_acc[SHAW*2];

    int idx = tid;
    for (int i = 0; i < (SHAW * 2 / BS); i++) {
        s_acc[idx] = 0;
        idx += BS;
    }
    __syncthreads();

    // Shut the compiler up
    idx = s_acc[0];

    // log_bounds[] holds inclusive prefix sums, so that log_bounds[0] is the
    // largest index with radix 0, and so on.
    int lb_idx_hi = bid & 0xff;
    int lb_idx_lo = lb_idx_hi - 1;

    int idx_lo;
    if (lb_idx_lo > 0) idx_lo = log_bounds[lb_idx_lo] & ~(BS - 1);
    else               idx_lo = 0;
    int idx_hi = (log_bounds[lb_idx_hi] & ~(BS - 1)) + BS;

    float rnrounds = {{'%d.0f' % info.palette_height}} / (idx_hi - idx_lo);
    float time = tid * rnrounds;
    float time_step = BS * rnrounds;

    int magic = ((blockIdx.x & 0xff) << 3) + ((blockIdx.x & 0xf00) << 12);
    int magic_mask = 0xf007f8;
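
    // A log entry carries the color index in bits 31:24 and the bin address in
    // bits 23:0. Address bits {23:20, 10:3} identify the owning block and must
    // match 'magic' under 'magic_mask'; the remaining bits {19:11, 2:0} form
    // the 12-bit offset into this block's shared tile (compare the shoff
    // computation in the asm below and glo_idx at the end of the kernel).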
    for (int i = idx_lo + tid; i < idx_hi; i += BS) {
        int entry = log[i];

        asm volatile ({{crep("""
{
.reg .pred q;
.reg .u32 shoff, color, time, d, r, g, b, hi, lo, hiw, low, tmp;
.reg .u64 ptr;
.reg .f32 rf, gf, bf, df, rg, gg, dg, bg;
// TODO: opacity
and.b32 tmp, %0, %4;
setp.eq.u32 q, tmp, %3;
@!q bra before_sync;
and.b32 shoff, %0, 0xff800;
shr.b32 shoff, shoff, 5;
bfi.b32 shoff, %0, shoff, 3, 3;
bfe.u32 color, %0, 24, 8;
shl.b32 color, color, 3;
cvt.rni.u32.f32 time, %1;
suld.b.2d.v2.b32.clamp {lo, hi}, [flatpal, {color, time}];
ld.shared.v2.u32 {low, hiw}, [shoff];
add.cc.u32 lo, lo, low;
addc.u32 hi, hi, hiw;
setp.hs.u32 q, hi, (1023 << 22);
@q bra oflow_sync;
st.shared.v2.u32 [shoff], {hi, lo};
before_sync:
bar.sync 0;
bra oflow_write_end;
oflow_sync:
st.shared.v2.u32 [shoff], {0, 0};
// TODO: opacity
bfi.b32 shoff, %0, 0, 4, 24;
cvt.u64.u32 ptr, shoff;
add.u64 ptr, ptr, %2;
ld.global.v4.f32 {dg,bg,gg,rg}, [ptr];
bar.sync 0;
bfe.u32 r, hi, 4, 18;
bfe.u32 g, lo, 18, 14;
bfi.b32 g, hi, g, 14, 4;
and.b32 b, lo, ((1<<18)-1);
cvt.rn.f32.u32 rf, r;
cvt.rn.f32.u32 gf, g;
cvt.rn.f32.u32 bf, b;
fma.rn.ftz.f32 rf, rf, (1.0/255.0), rg;
fma.rn.ftz.f32 gf, gf, (1.0/255.0), gg;
fma.rn.ftz.f32 bf, bf, (1.0/255.0), bg;
add.f32 df, df, dg;
st.global.v4.f32 [ptr], {df,bf,gf,rf};
oflow_write_end:
}
""")}} :: "r"(entry), "f"(time), "l"(acc), "r"(magic), "r"(magic_mask));
        time += time_step;
    }

    __syncthreads();
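
    // Flush the tile: each thread unpacks its share of the packed shared
    // accumulators and adds the results to the global float4 buffer.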
    idx = tid;
    int glo_idx = magic | (((idx << 8) | idx) & 0xff807);

    for (int i = 0; i < (SHAW / BS) && glo_idx < nbins; i++) {
        int d, r, g, b;
        float4 pix = acc[glo_idx];

        asm({{crep("""
{
.reg .u32 hi, lo;
ld.shared.v2.u32 {lo, hi}, [%4];
shr.u32 %0, hi, 22;
bfe.u32 %1, hi, 4, 18;
bfe.u32 %2, lo, 18, 14;
bfi.b32 %2, hi, %2, 14, 4;
and.b32 %3, lo, ((1<<18)-1);
}
""")}} : "=r"(d), "=r"(r), "=r"(g), "=r"(b) : "r"(idx*8));
        pix.x += r / 255.0f;
        pix.y += g / 255.0f;
        pix.z += b / 255.0f;
        pix.w += d;

        acc[glo_idx] = pix;
        idx += BS;
        glo_idx += (BS << 8);
    }
}
{{endif}}
''', 'iter_kern')
        return tmpl.substitute(
                info = info,
                cp = genome,
                pcp = self.pcp,
                NTHREADS = self.NTHREADS,
                NWARPS = self.NTHREADS / 32,
                std_xforms = [n for n in sorted(genome.xforms) if n != 'final'],
                **globals())