diff --git a/cuburn/code/iter.py b/cuburn/code/iter.py
index 05ed89f..a5f87f1 100644
--- a/cuburn/code/iter.py
+++ b/cuburn/code/iter.py
@@ -29,8 +29,8 @@ __shared__ iter_info info;
 
         tmpl = Template("""
 __device__
-void apply_xf{{xfid}}(float *ix, float *iy, float *icolor, mwc_st *rctx) {
-    float tx, ty, ox = *ix, oy = *iy;
+void apply_xf{{xfid}}(float &ox, float &oy, float &color, mwc_st &rctx) {
+    float tx, ty;
 
     {{apply_affine_flam3('ox', 'oy', 'tx', 'ty', px, 'xf.c', 'pre')}}
 
@@ -55,11 +55,8 @@ void apply_xf{{xfid}}(float *ix, float *iy, float *icolor, mwc_st *rctx) {
     {{endif}}
     {{endfor}}
 
-    *ix = ox;
-    *iy = oy;
-
     float csp = {{px.get('xf.color_speed')}};
-    *icolor = *icolor * (1.0f - csp) + {{px.get('xf.color')}} * csp;
+    color = color * (1.0f - csp) + {{px.get('xf.color')}} * csp;
 };
 """)
         g = dict(globals())
@@ -87,9 +84,9 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
     int consec_bad = -{{features.fuse}};
 
     float x, y, color;
-    x = mwc_next_11(&rctx);
-    y = mwc_next_11(&rctx);
-    color = mwc_next_01(&rctx);
+    x = mwc_next_11(rctx);
+    y = mwc_next_11(rctx);
+    color = mwc_next_01(rctx);
 
     {{if features.chaos_used}}
     int last_xf_used = 0;
@@ -102,12 +99,12 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
     while (1) {
         {{if features.chaos_used}}
         // For now, we can't use the swap buffer with chaos enabled
-        float xfsel = mwc_next_01(&rctx);
+        float xfsel = mwc_next_01(rctx);
         // Needed to match the behavior of the loop with swapping
         __syncthreads();
         {{else}}
         if (threadIdx.y == 0 && threadIdx.x < 16) {
-            cosel[threadIdx.x] = mwc_next_01(&rctx);
+            cosel[threadIdx.x] = mwc_next_01(rctx);
         }
         __syncthreads();
         float xfsel = cosel[threadIdx.y];
@@ -122,7 +119,7 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
         {{for density_col_idx,  this_xform_idx in enumerate(features.std_xforms)}}
         if (last_xf_used == {{prior_xform_idx}} &&
                 xfsel <= {{packer.get("cp.chaos_densities[%d][%d]" % (density_row_idx, density_col_idx))}}) {
-            apply_xf{{this_xform_idx}}(&x, &y, &color, &rctx);
+            apply_xf{{this_xform_idx}}(x, y, color, rctx);
             last_xf_used = {{this_xform_idx}};
         } else
         {{endfor}}
@@ -130,7 +127,7 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
         {{else}}
         {{for density_col_idx, this_xform_idx in enumerate(features.std_xforms)}}
         if (xfsel <= {{packer.get("cp.norm_density[%d]" % (density_col_idx))}}) {
-            apply_xf{{this_xform_idx}}(&x, &y, &color, &rctx);
+            apply_xf{{this_xform_idx}}(x, y, color, rctx);
         } else
         {{endfor}}
         {{endif}}
@@ -166,7 +163,7 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
 
         {{if features.final_xform_index}}
         float fx = x, fy = y, fcolor;
-        apply_xf{{features.final_xform_index}}(&fx, &fy, &fcolor, &rctx);
+        apply_xf{{features.final_xform_index}}(fx, fy, fcolor, rctx);
         {{endif}}
 
         // TODO: this may not optimize well, verify.
@@ -181,11 +178,11 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
 
         // TODO: verify that constants get premultiplied
         float ditherwidth = {{packer.get("0.33 * cp.spatial_filter_radius")}};
-        float u0 = mwc_next_01(&rctx);
+        float u0 = mwc_next_01(rctx);
         float r = ditherwidth * sqrt(-2.0f * log2f(u0) / M_LOG2E);
 
         // TODO: provide mwc_next_0_2pi()
-        float u1 = 2.0f * M_PI * mwc_next_01(&rctx);
+        float u1 = 2.0f * M_PI * mwc_next_01(rctx);
 
         float ditherx = r * cos(u1);
         float dithery = r * sin(u1);
@@ -195,9 +192,9 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
             iy < 0 || iy >= {{features.acc_height}} ) {
             consec_bad++;
             if (consec_bad > {{features.max_oob}}) {
-                x = mwc_next_11(&rctx);
-                y = mwc_next_11(&rctx);
-                color = mwc_next_01(&rctx);
+                x = mwc_next_11(rctx);
+                y = mwc_next_11(rctx);
+                color = mwc_next_01(rctx);
                 consec_bad = -{{features.fuse}};
             }
             continue;
@@ -214,7 +211,6 @@ void iter(mwc_st *msts, iter_info *infos, float4 *accbuf, float *denbuf) {
         accbuf[i] = pix;    // TODO: atomic operations (or better)
         denbuf[i] += 1.0f;
     }
-    asm volatile ("membar.cta;");
 }
 ''')
         return tmpl.substitute(
diff --git a/cuburn/code/mwc.py b/cuburn/code/mwc.py
index 5df718f..6234b56 100644
--- a/cuburn/code/mwc.py
+++ b/cuburn/code/mwc.py
@@ -16,20 +16,20 @@ typedef struct {
 """
 
     defs = r"""
-__device__ uint32_t mwc_next(mwc_st *st) {
+__device__ uint32_t mwc_next(mwc_st &st) {
     asm("{\n\t.reg .u64 val;\n\t"
         "cvt.u64.u32  val, %0;\n\t"
         "mad.wide.u32 val, %1, %2, val;\n\t"
         "mov.b64 {%1, %0}, val;\n\t}\n\t"
-        : "+r"(st->carry), "+r"(st->state) : "r"(st->mul));
-    return st->state;
+        : "+r"(st.carry), "+r"(st.state) : "r"(st.mul));
+    return st.state;
 }
 
-__device__ float mwc_next_01(mwc_st *st) {
+__device__ float mwc_next_01(mwc_st &st) {
     return mwc_next(st) * (1.0f / 4294967296.0f);
 }
 
-__device__ float mwc_next_11(mwc_st *st) {
+__device__ float mwc_next_11(mwc_st &st) {
     uint32_t val = mwc_next(st);
     float ret;
     asm("cvt.rn.f32.s32 %0,     %1;\n\t"