diff --git a/Source/EmberCL/DEOpenCLKernelCreator.cpp b/Source/EmberCL/DEOpenCLKernelCreator.cpp
index 6dc5d9d..e5ad8f7 100644
--- a/Source/EmberCL/DEOpenCLKernelCreator.cpp
+++ b/Source/EmberCL/DEOpenCLKernelCreator.cpp
@@ -56,8 +56,8 @@ DEOpenCLKernelCreator::DEOpenCLKernelCreator(bool doublePrecision, bool nVidia)
 /// Kernel source and entry point properties, getters only.
 /// </summary>
 
-string DEOpenCLKernelCreator::LogScaleAssignDEKernel() { return m_LogScaleAssignDEKernel; }
-string DEOpenCLKernelCreator::LogScaleAssignDEEntryPoint() { return m_LogScaleAssignDEEntryPoint; }
+const string& DEOpenCLKernelCreator::LogScaleAssignDEKernel() const { return m_LogScaleAssignDEKernel; }
+const string& DEOpenCLKernelCreator::LogScaleAssignDEEntryPoint() const { return m_LogScaleAssignDEEntryPoint; }
 
 /// <summary>
 /// Get the kernel source for the specified supersample and filterWidth.
@@ -65,7 +65,7 @@ string DEOpenCLKernelCreator::LogScaleAssignDEEntryPoint() { return m_LogScaleAs
 /// <param name="ss">The supersample being used</param>
 /// <param name="filterWidth">Filter width</param>
 /// <returns>The kernel source</returns>
-string DEOpenCLKernelCreator::GaussianDEKernel(size_t ss, uint filterWidth)
+const string& DEOpenCLKernelCreator::GaussianDEKernel(size_t ss, uint filterWidth) const
 {
 #ifndef ROW_ONLY_DE
 	if (filterWidth > MaxDEFilterSize())
@@ -101,7 +101,7 @@ string DEOpenCLKernelCreator::GaussianDEKernel(size_t ss, uint filterWidth)
 /// <param name="ss">The supersample being used</param>
 /// <param name="filterWidth">Filter width</param>
 /// <returns>The name of the density estimation filtering entry point kernel function</returns>
-string DEOpenCLKernelCreator::GaussianDEEntryPoint(size_t ss, uint filterWidth)
+const string& DEOpenCLKernelCreator::GaussianDEEntryPoint(size_t ss, uint filterWidth) const
 {
 #ifndef ROW_ONLY_DE
 	if (filterWidth > MaxDEFilterSize())
diff --git a/Source/EmberCL/DEOpenCLKernelCreator.h b/Source/EmberCL/DEOpenCLKernelCreator.h
index 68305e0..4793036 100644
--- a/Source/EmberCL/DEOpenCLKernelCreator.h
+++ b/Source/EmberCL/DEOpenCLKernelCreator.h
@@ -34,10 +34,10 @@ public:
 	DEOpenCLKernelCreator(bool doublePrecision, bool nVidia);
 
 	//Accessors.
-	string LogScaleAssignDEKernel();
-	string LogScaleAssignDEEntryPoint();
-	string GaussianDEKernel(size_t ss, uint filterWidth);
-	string GaussianDEEntryPoint(size_t ss, uint filterWidth);
+	const string& LogScaleAssignDEKernel() const;
+	const string& LogScaleAssignDEEntryPoint() const;
+	const string& GaussianDEKernel(size_t ss, uint filterWidth) const;
+	const string& GaussianDEEntryPoint(size_t ss, uint filterWidth) const;
 
 	//Miscellaneous static functions.
 	static uint MaxDEFilterSize();
diff --git a/Source/EmberCL/EmberCLFunctions.h b/Source/EmberCL/EmberCLFunctions.h
index 4230646..c9048c7 100644
--- a/Source/EmberCL/EmberCLFunctions.h
+++ b/Source/EmberCL/EmberCLFunctions.h
@@ -351,14 +351,11 @@ static const char* CarToRasFunctionString =
 	"}\n"
 	"\n";
 
-static string AtomicString(bool doublePrecision, bool dp64AtomicSupport)
+static string AtomicString()
 {
 	ostringstream os;
 
-	//If they want single precision, or if they want double precision and have dp atomic support.
-	if (!doublePrecision || dp64AtomicSupport)
-	{
-		os <<
+	os <<
 		"void AtomicAdd(volatile __global real_bucket_t* source, const real_bucket_t operand)\n"
 		"{\n"
 		"	union\n"
@@ -379,32 +376,6 @@ static string AtomicString(bool doublePrecision, bool dp64AtomicSupport)
 		"		newVal.realVal = prevVal.realVal + operand;\n"
 		"	} while (atomic_cmpxchg((volatile __global atomi*)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);\n"
 		"}\n";
-	}
-	else//They want double precision and do not have dp atomic support.
-	{
-		os <<
-		"void AtomicAdd(volatile __global double* source, const double operand)\n"
-		"{\n"
-		"	union\n"
-		"	{\n"
-		"		uint intVal[2];\n"
-		"		double realVal;\n"
-		"	} newVal;\n"
-		"\n"
-		"	union\n"
-		"	{\n"
-		"		uint intVal[2];\n"
-		"		double realVal;\n"
-		"	} prevVal;\n"
-		"\n"
-		"	do\n"
-		"	{\n"
-		"		prevVal.realVal = *source;\n"
-		"		newVal.realVal = prevVal.realVal + operand;\n"
-		"	} while ((atomic_cmpxchg((volatile __global uint*)source, prevVal.intVal[0], newVal.intVal[0])     != prevVal.intVal[0]) ||\n"
-		"			 (atomic_cmpxchg((volatile __global uint*)source + 1, prevVal.intVal[1], newVal.intVal[1]) != prevVal.intVal[1]));\n"
-		"}\n";
-	}
 
 	return os.str();
 }
diff --git a/Source/EmberCL/EmberCLStructs.h b/Source/EmberCL/EmberCLStructs.h
index 59b7521..fa31933 100644
--- a/Source/EmberCL/EmberCLStructs.h
+++ b/Source/EmberCL/EmberCLStructs.h
@@ -39,7 +39,7 @@ static string ConstantDefinesString(bool doublePrecision)
 		   << "#endif\n"
 		   << "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n"//Only supported on nVidia.
 		   << "typedef long intPrec;\n"
-		   << "typedef ulong atomi;\n"
+		   << "typedef uint atomi;\n"//Same size as real_bucket_t, always 4 bytes.
 		   << "typedef double real_t;\n"
 		   << "typedef float real_bucket_t;\n"//Assume buckets are always float, even though iter calcs are in double.
 		   << "typedef double4 real4;\n"
diff --git a/Source/EmberCL/FinalAccumOpenCLKernelCreator.cpp b/Source/EmberCL/FinalAccumOpenCLKernelCreator.cpp
index e06ce79..7ebf7ee 100644
--- a/Source/EmberCL/FinalAccumOpenCLKernelCreator.cpp
+++ b/Source/EmberCL/FinalAccumOpenCLKernelCreator.cpp
@@ -37,24 +37,24 @@ FinalAccumOpenCLKernelCreator::FinalAccumOpenCLKernelCreator(bool doublePrecisio
 /// Kernel source and entry point properties, getters only.
 /// </summary>
 
-string FinalAccumOpenCLKernelCreator::GammaCorrectionWithAlphaCalcKernel()        { return m_GammaCorrectionWithAlphaCalcKernel;	    }
-string FinalAccumOpenCLKernelCreator::GammaCorrectionWithAlphaCalcEntryPoint()    { return m_GammaCorrectionWithAlphaCalcEntryPoint;    }
-string FinalAccumOpenCLKernelCreator::GammaCorrectionWithoutAlphaCalcKernel()     { return m_GammaCorrectionWithoutAlphaCalcKernel;     }
-string FinalAccumOpenCLKernelCreator::GammaCorrectionWithoutAlphaCalcEntryPoint() { return m_GammaCorrectionWithoutAlphaCalcEntryPoint; }
+const string& FinalAccumOpenCLKernelCreator::GammaCorrectionWithAlphaCalcKernel()        const { return m_GammaCorrectionWithAlphaCalcKernel;	    }
+const string& FinalAccumOpenCLKernelCreator::GammaCorrectionWithAlphaCalcEntryPoint()    const { return m_GammaCorrectionWithAlphaCalcEntryPoint;    }
+const string& FinalAccumOpenCLKernelCreator::GammaCorrectionWithoutAlphaCalcKernel()     const { return m_GammaCorrectionWithoutAlphaCalcKernel;     }
+const string& FinalAccumOpenCLKernelCreator::GammaCorrectionWithoutAlphaCalcEntryPoint() const { return m_GammaCorrectionWithoutAlphaCalcEntryPoint; }
 
-string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipKernel()                                   { return m_FinalAccumEarlyClipKernel;                                   }
-string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipEntryPoint()                               { return m_FinalAccumEarlyClipEntryPoint;                               }
-string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel()        { return m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel;        }
-string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint()    { return m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint;    }
-string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel()     { return m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel;     }
-string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint() { return m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint; }
+const string& FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipKernel()                                   const { return m_FinalAccumEarlyClipKernel;                                   }
+const string& FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipEntryPoint()                               const { return m_FinalAccumEarlyClipEntryPoint;                               }
+const string& FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel()        const { return m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel;        }
+const string& FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint()    const { return m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint;    }
+const string& FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel()     const { return m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel;     }
+const string& FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint() const { return m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint; }
 
-string FinalAccumOpenCLKernelCreator::FinalAccumLateClipKernel()                                   { return m_FinalAccumLateClipKernel;                                   }
-string FinalAccumOpenCLKernelCreator::FinalAccumLateClipEntryPoint()                               { return m_FinalAccumLateClipEntryPoint;                               }
-string FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel()        { return m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel;        }
-string FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint()    { return m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint;    }
-string FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel()     { return m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel;     }
-string FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint() { return m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint; }
+const string& FinalAccumOpenCLKernelCreator::FinalAccumLateClipKernel()                                   const { return m_FinalAccumLateClipKernel;                                   }
+const string& FinalAccumOpenCLKernelCreator::FinalAccumLateClipEntryPoint()                               const { return m_FinalAccumLateClipEntryPoint;                               }
+const string& FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel()        const { return m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel;        }
+const string& FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint()    const { return m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint;    }
+const string& FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel()     const { return m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel;     }
+const string& FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint() const { return m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint; }
 
 /// <summary>
 /// Get the gamma correction entry point.
@@ -62,7 +62,7 @@ string FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithoutAlphaCalcWithAlph
 /// <param name="channels">The number of channels used, 3 or 4.</param>
 /// <param name="transparency">True if channels equals 4 and using transparency, else false.</param>
 /// <returns>The name of the gamma correction entry point kernel function</returns>
-string FinalAccumOpenCLKernelCreator::GammaCorrectionEntryPoint(size_t channels, bool transparency)
+const string& FinalAccumOpenCLKernelCreator::GammaCorrectionEntryPoint(size_t channels, bool transparency) const
 {
 	bool alphaCalc = ((channels > 3) && transparency);
 	return alphaCalc ? m_GammaCorrectionWithAlphaCalcEntryPoint : m_GammaCorrectionWithoutAlphaCalcEntryPoint;
@@ -74,7 +74,7 @@ string FinalAccumOpenCLKernelCreator::GammaCorrectionEntryPoint(size_t channels,
 /// <param name="channels">The number of channels used, 3 or 4.</param>
 /// <param name="transparency">True if channels equals 4 and using transparency, else false.</param>
 /// <returns>The gamma correction kernel string</returns>
-string FinalAccumOpenCLKernelCreator::GammaCorrectionKernel(size_t channels, bool transparency)
+const string& FinalAccumOpenCLKernelCreator::GammaCorrectionKernel(size_t channels, bool transparency) const
 {
 	bool alphaCalc = ((channels > 3) && transparency);
 	return alphaCalc ? m_GammaCorrectionWithAlphaCalcKernel : m_GammaCorrectionWithoutAlphaCalcKernel;
@@ -89,7 +89,7 @@ string FinalAccumOpenCLKernelCreator::GammaCorrectionKernel(size_t channels, boo
 /// <param name="alphaBase">Storage for the alpha base value used in the kernel. 0 if transparency is true, else 255.</param>
 /// <param name="alphaScale">Storage for the alpha scale value used in the kernel. 255 if transparency is true, else 0.</param>
 /// <returns>The name of the final accumulation entry point kernel function</returns>
-string FinalAccumOpenCLKernelCreator::FinalAccumEntryPoint(bool earlyClip, size_t channels, bool transparency, double& alphaBase, double& alphaScale)
+const string& FinalAccumOpenCLKernelCreator::FinalAccumEntryPoint(bool earlyClip, size_t channels, bool transparency, double& alphaBase, double& alphaScale) const
 {
 	bool alphaCalc = ((channels > 3) && transparency);
 	bool alphaAccum = channels > 3;
@@ -109,7 +109,7 @@ string FinalAccumOpenCLKernelCreator::FinalAccumEntryPoint(bool earlyClip, size_
 		else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency.
 			return FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint();
 		else
-			return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense.
+			return m_Empty;//Cannot have alphaCalc and !alphaAccum, it makes no sense.
 	}
 	else
 	{
@@ -120,7 +120,7 @@ string FinalAccumOpenCLKernelCreator::FinalAccumEntryPoint(bool earlyClip, size_
 		else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency.
 			return FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint();
 		else
-			return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense.
+			return m_Empty;//Cannot have alphaCalc and !alphaAccum, it makes no sense.
 	}
 }
 
@@ -131,7 +131,7 @@ string FinalAccumOpenCLKernelCreator::FinalAccumEntryPoint(bool earlyClip, size_
 /// <param name="channels">The number of channels used, 3 or 4.</param>
 /// <param name="transparency">True if channels equals 4 and using transparency, else false.</param>
 /// <returns>The final accumulation kernel string</returns>
-string FinalAccumOpenCLKernelCreator::FinalAccumKernel(bool earlyClip, size_t channels, bool transparency)
+const string& FinalAccumOpenCLKernelCreator::FinalAccumKernel(bool earlyClip, size_t channels, bool transparency) const
 {
 	bool alphaCalc = (channels > 3 && transparency);
 	bool alphaAccum = channels > 3;
@@ -145,7 +145,7 @@ string FinalAccumOpenCLKernelCreator::FinalAccumKernel(bool earlyClip, size_t ch
 		else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency.
 			return FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel();
 		else
-			return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense.
+			return m_Empty;//Cannot have alphaCalc and !alphaAccum, it makes no sense.
 	}
 	else
 	{
@@ -156,7 +156,7 @@ string FinalAccumOpenCLKernelCreator::FinalAccumKernel(bool earlyClip, size_t ch
 		else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency.
 			return FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel();
 		else
-			return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense.
+			return m_Empty;//Cannot have alphaCalc and !alphaAccum, it makes no sense.
 	}
 }
 
diff --git a/Source/EmberCL/FinalAccumOpenCLKernelCreator.h b/Source/EmberCL/FinalAccumOpenCLKernelCreator.h
index 6d6ee26..1018768 100644
--- a/Source/EmberCL/FinalAccumOpenCLKernelCreator.h
+++ b/Source/EmberCL/FinalAccumOpenCLKernelCreator.h
@@ -25,29 +25,29 @@ class EMBERCL_API FinalAccumOpenCLKernelCreator
 public:
 	FinalAccumOpenCLKernelCreator(bool doublePrecision);
 
-	string GammaCorrectionWithAlphaCalcKernel();
-	string GammaCorrectionWithAlphaCalcEntryPoint();
+	const string& GammaCorrectionWithAlphaCalcKernel() const;
+	const string& GammaCorrectionWithAlphaCalcEntryPoint() const;
 
-	string GammaCorrectionWithoutAlphaCalcKernel();
-	string GammaCorrectionWithoutAlphaCalcEntryPoint();
+	const string& GammaCorrectionWithoutAlphaCalcKernel() const;
+	const string& GammaCorrectionWithoutAlphaCalcEntryPoint() const;
 
-	string FinalAccumEarlyClipKernel();
-	string FinalAccumEarlyClipEntryPoint();
-	string FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel();
-	string FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint();
-	string FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel();
-	string FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint();
+	const string& FinalAccumEarlyClipKernel() const;
+	const string& FinalAccumEarlyClipEntryPoint() const;
+	const string& FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel() const;
+	const string& FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint() const;
+	const string& FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel() const;
+	const string& FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint() const;
 
-	string FinalAccumLateClipKernel();
-	string FinalAccumLateClipEntryPoint();
-	string FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel();
-	string FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint();
-	string FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel();
-	string FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint();
-	string GammaCorrectionEntryPoint(size_t channels, bool transparency);
-	string GammaCorrectionKernel(size_t channels, bool transparency);
-	string FinalAccumEntryPoint(bool earlyClip, size_t channels, bool transparency, double& alphaBase, double& alphaScale);
-	string FinalAccumKernel(bool earlyClip, size_t channels, bool transparency);
+	const string& FinalAccumLateClipKernel() const;
+	const string& FinalAccumLateClipEntryPoint() const;
+	const string& FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel() const;
+	const string& FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint() const;
+	const string& FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel() const;
+	const string& FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint() const;
+	const string& GammaCorrectionEntryPoint(size_t channels, bool transparency) const;
+	const string& GammaCorrectionKernel(size_t channels, bool transparency) const;
+	const string& FinalAccumEntryPoint(bool earlyClip, size_t channels, bool transparency, double& alphaBase, double& alphaScale) const;
+	const string& FinalAccumKernel(bool earlyClip, size_t channels, bool transparency) const;
 
 private:
 	string CreateFinalAccumKernelString(bool earlyClip, size_t channels, bool transparency);
@@ -56,6 +56,7 @@ private:
 	string CreateFinalAccumKernelString(bool earlyClip, bool alphaCalc, bool alphaAccum);
 	string CreateGammaCorrectionFunctionString(bool globalBucket, bool alphaCalc, bool alphaAccum, bool finalOut);
 	string CreateCalcNewRgbFunctionString(bool globalBucket);
+
 	string m_GammaCorrectionWithAlphaCalcKernel;
 	string m_GammaCorrectionWithAlphaCalcEntryPoint;
 
@@ -76,6 +77,7 @@ private:
 	string m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel;//False, true.
 	string m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint;
 
+	string m_Empty;
 	bool m_DoublePrecision;
 };
 }
diff --git a/Source/EmberCL/IterOpenCLKernelCreator.cpp b/Source/EmberCL/IterOpenCLKernelCreator.cpp
index d1a8e23..d634152 100644
--- a/Source/EmberCL/IterOpenCLKernelCreator.cpp
+++ b/Source/EmberCL/IterOpenCLKernelCreator.cpp
@@ -10,11 +10,9 @@ namespace EmberCLns
 /// Constructor that sets up some basic entry point strings and creates
 /// the zeroization kernel string since it requires no conditional inputs.
 /// </summary>
-/// <param name="nVidia">True if running on an nVidia card, else false.</param>
 template <typename T>
-IterOpenCLKernelCreator<T>::IterOpenCLKernelCreator(bool nVidia)
+IterOpenCLKernelCreator<T>::IterOpenCLKernelCreator()
 {
-	m_NVidia = nVidia;
 	m_IterEntryPoint = "IterateKernel";
 	m_ZeroizeEntryPoint = "ZeroizeKernel";
 	m_ZeroizeKernel = CreateZeroizeKernelString();
@@ -24,9 +22,9 @@ IterOpenCLKernelCreator<T>::IterOpenCLKernelCreator(bool nVidia)
 /// Accessors.
 /// </summary>
 
-template <typename T> string IterOpenCLKernelCreator<T>::ZeroizeKernel() { return m_ZeroizeKernel; }
-template <typename T> string IterOpenCLKernelCreator<T>::ZeroizeEntryPoint() { return m_ZeroizeEntryPoint; }
-template <typename T> string IterOpenCLKernelCreator<T>::IterEntryPoint() { return m_IterEntryPoint; }
+template <typename T> const string& IterOpenCLKernelCreator<T>::ZeroizeKernel() const { return m_ZeroizeKernel; }
+template <typename T> const string& IterOpenCLKernelCreator<T>::ZeroizeEntryPoint() const { return m_ZeroizeEntryPoint; }
+template <typename T> const string& IterOpenCLKernelCreator<T>::IterEntryPoint() const { return m_IterEntryPoint; }
 
 /// <summary>
 /// Create the iteration kernel string using the Cuburn method.
@@ -221,8 +219,12 @@ string IterOpenCLKernelCreator<T>::CreateIterKernelString(Ember<T>& ember, strin
 		EmberCLStructString <<
 		UnionCLStructString <<
 		CarToRasCLStructString <<
-		CarToRasFunctionString <<
-		AtomicString(doublePrecision, m_NVidia) <<
+		CarToRasFunctionString;
+
+	if (lockAccum)
+		os << AtomicString();
+
+	os <<
 		xformFuncs.str() <<
 		"__kernel void " << m_IterEntryPoint << "(\n" <<
 		"	uint iterCount,\n"
diff --git a/Source/EmberCL/IterOpenCLKernelCreator.h b/Source/EmberCL/IterOpenCLKernelCreator.h
index 4eb1e2a..9054ce1 100644
--- a/Source/EmberCL/IterOpenCLKernelCreator.h
+++ b/Source/EmberCL/IterOpenCLKernelCreator.h
@@ -23,10 +23,10 @@ template <typename T>
 class EMBERCL_API IterOpenCLKernelCreator
 {
 public:
-	IterOpenCLKernelCreator(bool nVidia);
-	string ZeroizeKernel();
-	string ZeroizeEntryPoint();
-	string IterEntryPoint();
+	IterOpenCLKernelCreator();
+	const string& ZeroizeKernel() const;
+	const string& ZeroizeEntryPoint() const;
+	const string& IterEntryPoint() const;
 	string CreateIterKernelString(Ember<T>& ember, string& parVarDefines, bool lockAccum = false, bool doAccum = true);
 	static void ParVarIndexDefines(Ember<T>& ember, pair<string, vector<T>>& params, bool doVals = true, bool doString = true);
 	static bool IsBuildRequired(Ember<T>& ember1, Ember<T>& ember2);
@@ -38,7 +38,6 @@ private:
 	string m_IterEntryPoint;
 	string m_ZeroizeKernel;
 	string m_ZeroizeEntryPoint;
-	bool m_NVidia;
 };
 
 #ifdef OPEN_CL_TEST_AREA
diff --git a/Source/EmberCL/RendererCL.cpp b/Source/EmberCL/RendererCL.cpp
index 118051b..c7faf68 100644
--- a/Source/EmberCL/RendererCL.cpp
+++ b/Source/EmberCL/RendererCL.cpp
@@ -15,7 +15,7 @@ namespace EmberCLns
 template <typename T, typename bucketT>
 RendererCL<T, bucketT>::RendererCL(uint platform, uint device, bool shared, GLuint outputTexID)
 	:
-	m_IterOpenCLKernelCreator(false),
+	m_IterOpenCLKernelCreator(),
 	m_DEOpenCLKernelCreator(typeid(T) == typeid(double), false),
 	m_FinalAccumOpenCLKernelCreator(typeid(T) == typeid(double))
 {
@@ -105,7 +105,7 @@ bool RendererCL<T, bucketT>::Init(uint platform, uint device, bool shared, GLuin
 	{
 		m_NVidia = ToLower(m_Wrapper.DeviceAndPlatformNames()).find_first_of("nvidia") != string::npos && m_Wrapper.LocalMemSize() > (32 * 1024);
 		m_WarpSize = m_NVidia ? 32 : 64;
-		m_IterOpenCLKernelCreator = IterOpenCLKernelCreator<T>(m_NVidia);
+		m_IterOpenCLKernelCreator = IterOpenCLKernelCreator<T>();
 		m_DEOpenCLKernelCreator = DEOpenCLKernelCreator(m_DoublePrecision, m_NVidia);
 
 		string zeroizeProgram = m_IterOpenCLKernelCreator.ZeroizeKernel();
@@ -286,7 +286,7 @@ bool RendererCL<T, bucketT>::WriteRandomPoints()
 /// </summary>
 /// <returns>The string representation of the kernel for the last built iter program.</returns>
 template <typename T, typename bucketT>
-string RendererCL<T, bucketT>::IterKernel() { return m_IterKernel; }
+const string& RendererCL<T, bucketT>::IterKernel() const { return m_IterKernel; }
 
 
 /// <summary>
@@ -294,14 +294,14 @@ string RendererCL<T, bucketT>::IterKernel() { return m_IterKernel; }
 /// </summary>
 /// <returns>The string representation of the kernel for the last built density filtering program.</returns>
 template <typename T, typename bucketT>
-string RendererCL<T, bucketT>::DEKernel() { return m_DEOpenCLKernelCreator.GaussianDEKernel(Supersample(), m_DensityFilterCL.m_FilterWidth); }
+const string& RendererCL<T, bucketT>::DEKernel() const { return m_DEOpenCLKernelCreator.GaussianDEKernel(Supersample(), m_DensityFilterCL.m_FilterWidth); }
 
 /// <summary>
 /// Get the kernel string for the last built final accumulation program.
 /// </summary>
 /// <returns>The string representation of the kernel for the last built final accumulation program.</returns>
 template <typename T, typename bucketT>
-string RendererCL<T, bucketT>::FinalAccumKernel() { return m_FinalAccumOpenCLKernelCreator.FinalAccumKernel(EarlyClip(), Renderer<T, bucketT>::NumChannels(), Transparency()); }
+const string& RendererCL<T, bucketT>::FinalAccumKernel() const { return m_FinalAccumOpenCLKernelCreator.FinalAccumKernel(EarlyClip(), Renderer<T, bucketT>::NumChannels(), Transparency()); }
 
 /// <summary>
 /// Virtual functions overridden from RendererCLBase.
@@ -567,8 +567,8 @@ bool RendererCL<T, bucketT>::Alloc()
 	m_XformsCL.resize(m_Ember.TotalXformCount());
 
 	bool b = true;
-	size_t histLength = SuperSize() * sizeof(v4T);
-	size_t accumLength = SuperSize() * sizeof(v4T);
+	size_t histLength = SuperSize() * sizeof(v4bT);
+	size_t accumLength = SuperSize() * sizeof(v4bT);
 	const char* loc = __FUNCTION__;
 
 	if (b && !(b = m_Wrapper.AddBuffer(m_EmberBufferName,               sizeof(m_EmberCL))))						  { this->m_ErrorReport.push_back(loc); }
@@ -694,18 +694,37 @@ EmberStats RendererCL<T, bucketT>::Iterate(size_t iterCount, size_t temporalSamp
 	EmberStats stats;//Do not record bad vals with with GPU. If the user needs to investigate bad vals, use the CPU.
 	const char* loc = __FUNCTION__;
 
-	IterOpenCLKernelCreator<T>::ParVarIndexDefines(m_Ember, m_Params, true, false);//Always do this to get the values (but no string), regardless of whether a rebuild is necessary.
-
-	//Don't know the size of the parametric varations parameters buffer until the ember is examined.
-	//So set it up right before the run.
-	if (!m_Params.second.empty())
+	//Only need to do this once at the beginning of a new render. Last iter will always be 0 at the beginning of a full render or temporal sample.
+	if (m_LastIter == 0)
 	{
-		if (!m_Wrapper.AddAndWriteBuffer(m_ParVarsBufferName, m_Params.second.data(), m_Params.second.size() * sizeof(m_Params.second[0])))
+		ConvertEmber(m_Ember, m_EmberCL, m_XformsCL);
+		ConvertCarToRas(*CoordMap());
+
+		if (b && !(b = m_Wrapper.WriteBuffer(m_EmberBufferName,		 reinterpret_cast<void*>(&m_EmberCL),							   sizeof(m_EmberCL))))						    { this->m_ErrorReport.push_back(loc); }
+		if (b && !(b = m_Wrapper.WriteBuffer(m_XformsBufferName,	 reinterpret_cast<void*>(m_XformsCL.data()),					   sizeof(m_XformsCL[0]) * m_XformsCL.size()))) { this->m_ErrorReport.push_back(loc); }
+		if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_DistBufferName, reinterpret_cast<void*>(const_cast<byte*>(XformDistributions())), XformDistributionsSize())))				    { this->m_ErrorReport.push_back(loc); }//Will be resized for xaos.
+		if (b && !(b = m_Wrapper.WriteBuffer(m_CarToRasBufferName,   reinterpret_cast<void*>(&m_CarToRasCL),						   sizeof(m_CarToRasCL))))					    { this->m_ErrorReport.push_back(loc); }
+
+		if (b && !(b = m_Wrapper.AddAndWriteImage("Palette", CL_MEM_READ_ONLY, m_PaletteFormat, m_DmapCL.m_Entries.size(), 1, 0, m_DmapCL.m_Entries.data()))) { this->m_ErrorReport.push_back(loc); }
+
+		if (b)
 		{
-			m_Abort = true;
-			this->m_ErrorReport.push_back(loc);
-			return stats;
+			IterOpenCLKernelCreator<T>::ParVarIndexDefines(m_Ember, m_Params, true, false);//Do this on the first iteration of a render, once the writes above succeeded, to get the values (but no string), regardless of whether a rebuild is necessary.
+
+			//Don't know the size of the parametric variations parameters buffer until the ember is examined.
+			//So set it up right before the run.
+			if (!m_Params.second.empty())
+			{
+				if (!m_Wrapper.AddAndWriteBuffer(m_ParVarsBufferName, m_Params.second.data(), m_Params.second.size() * sizeof(m_Params.second[0])))
+				{
+					m_Abort = true;
+					this->m_ErrorReport.push_back(loc);
+					return stats;
+				}
+			}
 		}
+		else
+			return stats;
 	}
 
 	//Rebuilding is expensive, so only do it if it's required.
@@ -716,7 +735,7 @@ EmberStats RendererCL<T, bucketT>::Iterate(size_t iterCount, size_t temporalSamp
 	{
 		m_IterTimer.Tic();//Tic() here to avoid including build time in iter time measurement.
 
-		if (m_Stats.m_Iters == 0)//Only reset the call count on the beginning of a new render. Do not reset on KEEP_ITERATING.
+		if (m_LastIter == 0)//Only reset the call count on the beginning of a new render. Do not reset on KEEP_ITERATING.
 			m_Calls = 0;
 
 		b = RunIter(iterCount, temporalSample, stats.m_Iters);
@@ -772,10 +791,8 @@ bool RendererCL<T, bucketT>::BuildIterProgramForEmber(bool doAccum)
 
 /// <summary>
 /// Run the iteration kernel.
-/// Fusing on the CPU is done once per sub batch, usually 10,000 iters, however
-/// determining when to do it in OpenCL is much more difficult.
-/// Currently it's done once every 4 kernel calls which seems to be a good balance
-/// between quality of the final image and performance.
+/// Fusing on the CPU is done once per sub batch, usually 10,000 iters. Here,
+/// the same fusing frequency is kept, but is done per kernel thread.
 /// </summary>
 /// <param name="iterCount">The number of iterations to run</param>
 /// <param name="temporalSample">The temporal sample this is running for</param>
@@ -803,16 +820,6 @@ bool RendererCL<T, bucketT>::RunIter(size_t iterCount, size_t temporalSample, si
 
 	if (kernelIndex != -1)
 	{
-		ConvertEmber(m_Ember, m_EmberCL, m_XformsCL);
-		m_CarToRasCL = ConvertCarToRas(*CoordMap());
-
-		if (b && !(b = m_Wrapper.WriteBuffer      (m_EmberBufferName,    reinterpret_cast<void*>(&m_EmberCL),							   sizeof(m_EmberCL))))						    { this->m_ErrorReport.push_back(loc); }
-		if (b && !(b = m_Wrapper.WriteBuffer	  (m_XformsBufferName,   reinterpret_cast<void*>(m_XformsCL.data()),					   sizeof(m_XformsCL[0]) * m_XformsCL.size()))) { this->m_ErrorReport.push_back(loc); }
-		if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_DistBufferName,     reinterpret_cast<void*>(const_cast<byte*>(XformDistributions())), XformDistributionsSize())))				    { this->m_ErrorReport.push_back(loc); }//Will be resized for xaos.
-		if (b && !(b = m_Wrapper.WriteBuffer      (m_CarToRasBufferName, reinterpret_cast<void*>(&m_CarToRasCL),						   sizeof(m_CarToRasCL))))					    { this->m_ErrorReport.push_back(loc); }
-
-		if (b && !(b = m_Wrapper.AddAndWriteImage("Palette", CL_MEM_READ_ONLY, m_PaletteFormat, m_DmapCL.m_Entries.size(), 1, 0, m_DmapCL.m_Entries.data()))) { this->m_ErrorReport.push_back(loc); }
-
 		//If animating, treat each temporal sample as a newly started render for fusing purposes.
 		if (temporalSample > 0)
 			m_Calls = 0;
@@ -1255,13 +1262,13 @@ bool RendererCL<T, bucketT>::RunDensityFilterPrivate(uint kernelIndex, uint grid
 template <typename T, typename bucketT>
 int RendererCL<T, bucketT>::MakeAndGetDensityFilterProgram(size_t ss, uint filterWidth)
 {
-	string deEntryPoint = m_DEOpenCLKernelCreator.GaussianDEEntryPoint(ss, filterWidth);
+	auto& deEntryPoint = m_DEOpenCLKernelCreator.GaussianDEEntryPoint(ss, filterWidth);
 	int kernelIndex = m_Wrapper.FindKernelIndex(deEntryPoint);
 	const char* loc = __FUNCTION__;
 
 	if (kernelIndex == -1)//Has not been built yet.
 	{
-		string kernel = m_DEOpenCLKernelCreator.GaussianDEKernel(ss, filterWidth);
+		auto& kernel = m_DEOpenCLKernelCreator.GaussianDEKernel(ss, filterWidth);
 		bool b = m_Wrapper.AddProgram(deEntryPoint, kernel, deEntryPoint, m_DoublePrecision);
 
 		if (b)
@@ -1288,13 +1295,13 @@ int RendererCL<T, bucketT>::MakeAndGetDensityFilterProgram(size_t ss, uint filte
 template <typename T, typename bucketT>
 int RendererCL<T, bucketT>::MakeAndGetFinalAccumProgram(double& alphaBase, double& alphaScale)
 {
-	string finalAccumEntryPoint = m_FinalAccumOpenCLKernelCreator.FinalAccumEntryPoint(EarlyClip(), Renderer<T, bucketT>::NumChannels(), Transparency(), alphaBase, alphaScale);
+	auto& finalAccumEntryPoint = m_FinalAccumOpenCLKernelCreator.FinalAccumEntryPoint(EarlyClip(), Renderer<T, bucketT>::NumChannels(), Transparency(), alphaBase, alphaScale);
 	int kernelIndex = m_Wrapper.FindKernelIndex(finalAccumEntryPoint);
 	const char* loc = __FUNCTION__;
 
 	if (kernelIndex == -1)//Has not been built yet.
 	{
-		string kernel = m_FinalAccumOpenCLKernelCreator.FinalAccumKernel(EarlyClip(), Renderer<T, bucketT>::NumChannels(), Transparency());
+		auto& kernel = m_FinalAccumOpenCLKernelCreator.FinalAccumKernel(EarlyClip(), Renderer<T, bucketT>::NumChannels(), Transparency());
 		bool b = m_Wrapper.AddProgram(finalAccumEntryPoint, kernel, finalAccumEntryPoint, m_DoublePrecision);
 
 		if (b)
@@ -1313,13 +1320,13 @@ int RendererCL<T, bucketT>::MakeAndGetFinalAccumProgram(double& alphaBase, doubl
 template <typename T, typename bucketT>
 int RendererCL<T, bucketT>::MakeAndGetGammaCorrectionProgram()
 {
-	string gammaEntryPoint = m_FinalAccumOpenCLKernelCreator.GammaCorrectionEntryPoint(Renderer<T, bucketT>::NumChannels(), Transparency());
+	auto& gammaEntryPoint = m_FinalAccumOpenCLKernelCreator.GammaCorrectionEntryPoint(Renderer<T, bucketT>::NumChannels(), Transparency());
 	int kernelIndex = m_Wrapper.FindKernelIndex(gammaEntryPoint);
 	const char* loc = __FUNCTION__;
 
 	if (kernelIndex == -1)//Has not been built yet.
 	{
-		string kernel = m_FinalAccumOpenCLKernelCreator.GammaCorrectionKernel(Renderer<T, bucketT>::NumChannels(), Transparency());
+		auto& kernel = m_FinalAccumOpenCLKernelCreator.GammaCorrectionKernel(Renderer<T, bucketT>::NumChannels(), Transparency());
 		bool b = m_Wrapper.AddProgram(gammaEntryPoint, kernel, gammaEntryPoint, m_DoublePrecision);
 
 		if (b)
@@ -1454,21 +1461,17 @@ void RendererCL<T, bucketT>::ConvertEmber(Ember<T>& ember, EmberCL<T>& emberCL,
 /// <param name="carToRas">The CarToRas object to convert</param>
-/// <returns>The CarToRasCL object</returns>
+/// <returns>Nothing; the converted values are stored in the m_CarToRasCL member</returns>
 template <typename T, typename bucketT>
-CarToRasCL<T> RendererCL<T, bucketT>::ConvertCarToRas(const CarToRas<T>& carToRas)
+void RendererCL<T, bucketT>::ConvertCarToRas(const CarToRas<T>& carToRas)
 {
-	CarToRasCL<T> carToRasCL;
-
-	carToRasCL.m_RasWidth = uint(carToRas.RasWidth());
-	carToRasCL.m_PixPerImageUnitW = carToRas.PixPerImageUnitW();
-	carToRasCL.m_RasLlX = carToRas.RasLlX();
-	carToRasCL.m_PixPerImageUnitH = carToRas.PixPerImageUnitH();
-	carToRasCL.m_RasLlY = carToRas.RasLlY();
-	carToRasCL.m_CarLlX = carToRas.CarLlX();
-	carToRasCL.m_CarLlY = carToRas.CarLlY();
-	carToRasCL.m_CarUrX = carToRas.CarUrX();
-	carToRasCL.m_CarUrY = carToRas.CarUrY();
-
-	return carToRasCL;
+	m_CarToRasCL.m_RasWidth = uint(carToRas.RasWidth());
+	m_CarToRasCL.m_PixPerImageUnitW = carToRas.PixPerImageUnitW();
+	m_CarToRasCL.m_RasLlX = carToRas.RasLlX();
+	m_CarToRasCL.m_PixPerImageUnitH = carToRas.PixPerImageUnitH();
+	m_CarToRasCL.m_RasLlY = carToRas.RasLlY();
+	m_CarToRasCL.m_CarLlX = carToRas.CarLlX();
+	m_CarToRasCL.m_CarLlY = carToRas.CarLlY();
+	m_CarToRasCL.m_CarUrX = carToRas.CarUrX();
+	m_CarToRasCL.m_CarUrY = carToRas.CarUrY();
 }
 
 /// <summary>
diff --git a/Source/EmberCL/RendererCL.h b/Source/EmberCL/RendererCL.h
index b1b3e6d..bf714ba 100644
--- a/Source/EmberCL/RendererCL.h
+++ b/Source/EmberCL/RendererCL.h
@@ -124,9 +124,9 @@ public:
 #ifdef TEST_CL
 	bool WriteRandomPoints();
 #endif
-	string IterKernel();
-	string DEKernel();
-	string FinalAccumKernel();
+	const string& IterKernel() const;
+	const string& DEKernel() const;
+	const string& FinalAccumKernel() const;
 
 	//Virtual functions overridden from RendererCLBase.
 	virtual bool ReadFinal(byte* pixels);
@@ -179,7 +179,7 @@ private:
 	void ConvertDensityFilter();
 	void ConvertSpatialFilter();
 	void ConvertEmber(Ember<T>& ember, EmberCL<T>& emberCL, vector<XformCL<T>>& xformsCL);
-	static CarToRasCL<T> ConvertCarToRas(const CarToRas<T>& carToRas);
+	void ConvertCarToRas(const CarToRas<T>& carToRas);
 
 	bool m_Init;
 	bool m_NVidia;
diff --git a/Source/EmberTester/EmberTester.cpp b/Source/EmberTester/EmberTester.cpp
index 4aee375..21ba90b 100644
--- a/Source/EmberTester/EmberTester.cpp
+++ b/Source/EmberTester/EmberTester.cpp
@@ -71,7 +71,7 @@ Ember<T> CreateBasicEmber(uint width, uint height, uint ss, T quality, T centerX
 string GetEmberCLKernelString(Ember<float>& ember, bool iter, bool log, bool de, uint ss, bool accum)
 {
 	ostringstream os;
-	IterOpenCLKernelCreator<float> iterCreator(false);
+	IterOpenCLKernelCreator<float> iterCreator;
 	DEOpenCLKernelCreator deCreator(false, false);
 	FinalAccumOpenCLKernelCreator accumCreator(false);
 	pair<string, vector<float>> pair;
diff --git a/Source/Fractorium/AboutDialog.ui b/Source/Fractorium/AboutDialog.ui
index 4a13f20..5384bf2 100644
--- a/Source/Fractorium/AboutDialog.ui
+++ b/Source/Fractorium/AboutDialog.ui
@@ -7,7 +7,7 @@
     <x>0</x>
     <y>0</y>
     <width>488</width>
-    <height>567</height>
+    <height>595</height>
    </rect>
   </property>
   <property name="sizePolicy">
@@ -58,7 +58,7 @@
       </font>
      </property>
      <property name="text">
-      <string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p align=&quot;center&quot;&gt;&lt;br/&gt;&lt;span style=&quot; font-size:12pt;&quot;&gt;Fractorium 0.4.1.9 Beta&lt;/span&gt;&lt;/p&gt;&lt;p align=&quot;center&quot;&gt;&lt;span style=&quot; font-size:10pt;&quot;&gt;&lt;br/&gt;A Qt-based fractal flame editor which uses a C++ re-write of the flam3 algorithm named Ember and a GPU capable version named EmberCL which implements a portion of the cuburn algorithm in OpenCL.&lt;/span&gt;&lt;/p&gt;&lt;p align=&quot;center&quot;&gt;&lt;span style=&quot; font-size:10pt;&quot;&gt;Matt Feemster&lt;/span&gt;&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
+      <string>&lt;html&gt;&lt;head/&gt;&lt;body&gt;&lt;p align=&quot;center&quot;&gt;&lt;br/&gt;Fractorium 0.4.1.9 Beta&lt;/p&gt;&lt;p align=&quot;center&quot;&gt;&lt;span style=&quot; font-size:10pt;&quot;&gt;&lt;br/&gt;A Qt-based fractal flame editor which uses a C++ re-write of the flam3 algorithm named Ember and a GPU capable version named EmberCL which implements a portion of the cuburn algorithm in OpenCL.&lt;/span&gt;&lt;/p&gt;&lt;p align=&quot;center&quot;&gt;&lt;span style=&quot; font-size:10pt;&quot;&gt;Lead: Matt Feemster&lt;/span&gt;&lt;/p&gt;&lt;p align=&quot;center&quot;&gt;&lt;span style=&quot; font-size:10pt;&quot;&gt;Contributors: Simon Detheridge&lt;/span&gt;&lt;/p&gt;&lt;/body&gt;&lt;/html&gt;</string>
      </property>
      <property name="textFormat">
       <enum>Qt::RichText</enum>