diff --git a/Builds/MSVC/Installer/FractoriumInstaller.wixproj b/Builds/MSVC/Installer/FractoriumInstaller.wixproj
index 4951f58..5704658 100644
--- a/Builds/MSVC/Installer/FractoriumInstaller.wixproj
+++ b/Builds/MSVC/Installer/FractoriumInstaller.wixproj
@@ -6,7 +6,7 @@
3.7
{c8096c47-e358-438c-a520-146d46b0637d}
2.0
- Fractorium_Beta_0.4.1.4
+ Fractorium_Beta_0.4.1.5
Package
$(MSBuildExtensionsPath32)\Microsoft\WiX\v3.x\Wix.targets
$(MSBuildExtensionsPath)\Microsoft\WiX\v3.x\Wix.targets
diff --git a/Builds/MSVC/Installer/Product.wxs b/Builds/MSVC/Installer/Product.wxs
index 0eb3367..9f1b883 100644
--- a/Builds/MSVC/Installer/Product.wxs
+++ b/Builds/MSVC/Installer/Product.wxs
@@ -1,6 +1,6 @@
-
+
@@ -13,7 +13,7 @@
-
+
::m_Brightness>(embers, coefs, size);
- InterpT<&Ember::m_HighlightPower>(embers, coefs, size);
- InterpT<&Ember::m_Gamma>(embers, coefs, size);
- InterpT<&Ember::m_Vibrancy>(embers, coefs, size);
- InterpT<&Ember::m_Hue>(embers, coefs, size);
+ //Interpolate ember parameters, these should be in the same order the members are declared.
InterpI<&Ember::m_FinalRasW>(embers, coefs, size);
InterpI<&Ember::m_FinalRasH>(embers, coefs, size);
+ InterpI<&Ember::m_SubBatchSize>(embers, coefs, size);
+ InterpI<&Ember::m_FuseCount>(embers, coefs, size);
InterpI<&Ember::m_Supersample>(embers, coefs, size);
- InterpT<&Ember::m_CenterX>(embers, coefs, size);
- InterpT<&Ember::m_CenterY>(embers, coefs, size);
- InterpT<&Ember::m_RotCenterY>(embers, coefs, size);
- InterpX, &Ember::m_Background>(embers, coefs, size); m_Background.a = bgAlphaSave;//Don't interp alpha.
- InterpT<&Ember::m_PixelsPerUnit>(embers, coefs, size);
- InterpT<&Ember::m_SpatialFilterRadius>(embers, coefs, size);
- InterpT<&Ember::m_TemporalFilterExp>(embers, coefs, size);
- InterpT<&Ember::m_TemporalFilterWidth>(embers, coefs, size);
+ InterpI<&Ember::m_TemporalSamples>(embers, coefs, size);
InterpT<&Ember::m_Quality>(embers, coefs, size);
+ InterpT<&Ember::m_PixelsPerUnit>(embers, coefs, size);
InterpT<&Ember::m_Zoom>(embers, coefs, size);
InterpT<&Ember::m_CamZPos>(embers, coefs, size);
InterpT<&Ember::m_CamPerspective>(embers, coefs, size);
@@ -775,12 +770,23 @@ public:
InterpT<&Ember::m_CamPitch>(embers, coefs, size);
InterpT<&Ember::m_CamDepthBlur>(embers, coefs, size);
InterpX::m_CamMat>(embers, coefs, size);
+ InterpT<&Ember::m_CenterX>(embers, coefs, size);
+ InterpT<&Ember::m_CenterY>(embers, coefs, size);
+ InterpT<&Ember::m_RotCenterY>(embers, coefs, size);
InterpT<&Ember::m_Rotate>(embers, coefs, size);
- InterpI<&Ember::m_TemporalSamples>(embers, coefs, size);
+ InterpT<&Ember::m_Hue>(embers, coefs, size);
+ InterpT<&Ember::m_Brightness>(embers, coefs, size);
+ InterpT<&Ember::m_Gamma>(embers, coefs, size);
+ InterpT<&Ember::m_Vibrancy>(embers, coefs, size);
+ InterpT<&Ember::m_GammaThresh>(embers, coefs, size);
+ InterpT<&Ember::m_HighlightPower>(embers, coefs, size);
+ InterpX, &Ember::m_Background>(embers, coefs, size); m_Background.a = bgAlphaSave;//Don't interp alpha.
+ InterpT<&Ember::m_TemporalFilterExp>(embers, coefs, size);
+ InterpT<&Ember::m_TemporalFilterWidth>(embers, coefs, size);
InterpT<&Ember::m_MaxRadDE>(embers, coefs, size);
InterpT<&Ember::m_MinRadDE>(embers, coefs, size);
InterpT<&Ember::m_CurveDE>(embers, coefs, size);
- InterpT<&Ember::m_GammaThresh>(embers, coefs, size);
+ InterpT<&Ember::m_SpatialFilterRadius>(embers, coefs, size);
//An extra step needed here due to the OOD that was not needed in the original.
//A small price to pay for the conveniences it affords us elsewhere.
@@ -1382,6 +1388,8 @@ public:
<< "Quality: " << m_Quality << endl
<< "Pixels Per Unit: " << m_PixelsPerUnit << endl
<< "Original Pixels Per Unit: " << m_OrigPixPerUnit << endl
+ << "Sub Batch Size: " << m_SubBatchSize << endl
+ << "Fuse Count: " << m_FuseCount << endl
<< "Zoom: " << m_Zoom << endl
<< "ZPos: " << m_CamZPos << endl
<< "Perspective: " << m_CamPerspective << endl
@@ -1459,6 +1467,14 @@ public:
size_t m_OrigFinalRasH;//the dimension may change in an editor and the originals are needed for the aspect ratio.
T m_OrigPixPerUnit;
+ //The number of iterations run per sub batch, i.e. the size of the chunks the iteration
+ //trajectory is broken into before accumulating to the histogram. This was a rendering
+ //parameter in flam3 but has been made a member here so that it can be adjusted more easily.
+ size_t m_SubBatchSize;
+
+ //The number of iterations to disregard for each sub batch. This was a rendering parameter in flam3 but has been made a member here
+ //so that it can be adjusted more easily.
+ size_t m_FuseCount;
+
//The multiplier in size of the histogram and DE filtering buffers. Must be at least one, preferrably never larger than 4, only useful at 2.
//Xml field: "supersample" or "overample (deprecated)".
size_t m_Supersample;
diff --git a/Source/Ember/EmberDefines.h b/Source/Ember/EmberDefines.h
index 6adb8eb..85a831a 100644
--- a/Source/Ember/EmberDefines.h
+++ b/Source/Ember/EmberDefines.h
@@ -36,7 +36,7 @@ namespace EmberNs
extern void sincos(float x, float *s, float *c);
#endif
-#define EMBER_VERSION "0.4.1.4"
+#define EMBER_VERSION "0.4.1.5"
#define EPS6 T(1e-6)
#define EPS std::numeric_limits::epsilon()//Apoplugin.h uses -20, but it's more mathematically correct to do it this way.
#define ISAAC_SIZE 4
@@ -54,6 +54,7 @@ namespace EmberNs
#define COLORMAP_LENGTH 256//These will need to change if 2D palette support is ever added, or variable sized palettes.
#define COLORMAP_LENGTH_MINUS_1 255
#define WHITE 255
+#define DEFAULT_SBS (1024 * 10)
#define XC (const xmlChar*)
#define BadVal(x) (((x) != (x)) || ((x) > 1e10) || ((x) < -1e10))
#define Rint(A) floor((A) + (((A) < 0) ? T(-0.5) : T(0.5)))
diff --git a/Source/Ember/EmberToXml.h b/Source/Ember/EmberToXml.h
index f345279..79f5eba 100644
--- a/Source/Ember/EmberToXml.h
+++ b/Source/Ember/EmberToXml.h
@@ -156,6 +156,8 @@ public:
os << " temporal_filter_width=\"" << ember.m_TemporalFilterWidth << "\"";
os << " quality=\"" << ember.m_Quality << "\"";
os << " temporal_samples=\"" << ember.m_TemporalSamples << "\"";
+ os << " sub_batch_size=\"" << ember.m_SubBatchSize << "\"";
+ os << " fuse=\"" << ember.m_FuseCount << "\"";
os << " background=\"" << ember.m_Background.r << " " << ember.m_Background.g << " " << ember.m_Background.b << "\"";
os << " brightness=\"" << ember.m_Brightness << "\"";
os << " gamma=\"" << ember.m_Gamma << "\"";
diff --git a/Source/Ember/Iterator.h b/Source/Ember/Iterator.h
index c1f4db5..29bd4e7 100644
--- a/Source/Ember/Iterator.h
+++ b/Source/Ember/Iterator.h
@@ -16,6 +16,17 @@ namespace EmberNs
using Iterator::DoFinalXform; \
using Iterator::DoBadVals;
+template <typename T, typename bucketT> class Renderer;
+
+template <typename T>
+struct IterParams
+{
+ size_t m_Count;
+ size_t m_Skip;
+ //T m_OneColDiv2;
+ //T m_OneRowDiv2;
+};
+
///
/// Iterator base class.
/// Iterating is one loop level outside of the inner xform application loop so it's still very important
@@ -69,7 +80,7 @@ public:
/// The buffer to store the output points
/// The random context to use
/// The number of bad values
- virtual size_t Iterate(Ember& ember, size_t count, size_t skip, Point* samples, QTIsaac& rand) { return 0; }
+ virtual size_t Iterate(Ember<T>& ember, IterParams<T>& params, Point<T>* samples, QTIsaac<ISAAC_SIZE, ISAAC_INT>& rand) { return 0; }
///
/// Initialize the xform selection vector by normalizing the weights of all xforms and
@@ -278,7 +289,7 @@ public:
/// The buffer to store the output points
/// The random context to use
/// The number of bad values
- virtual size_t Iterate(Ember& ember, size_t count, size_t skip, Point* samples, QTIsaac& rand)
+ virtual size_t Iterate(Ember<T>& ember, IterParams<T>& params, Point<T>* samples, QTIsaac<ISAAC_SIZE, ISAAC_INT>& rand) override
{
size_t i, badVals = 0;
Point tempPoint, p1;
@@ -290,7 +301,7 @@ public:
{
p1 = samples[0];
- for (i = 0; i < skip; i++)//Fuse.
+ for (i = 0; i < params.m_Skip; i++)//Fuse.
{
if (xforms[NextXformFromIndex(rand.Rand())].Apply(&p1, &p1, rand))
DoBadVals(xforms, badVals, &p1, rand);
@@ -299,7 +310,7 @@ public:
DoFinalXform(ember, p1, samples, rand);//Apply to last fuse point and store as the first element in samples.
ember.Proj(samples[0], rand);
- for (i = 1; i < count; i++)//Real loop.
+ for (i = 1; i < params.m_Count; i++)//Real loop.
{
if (xforms[NextXformFromIndex(rand.Rand())].Apply(&p1, &p1, rand))
DoBadVals(xforms, badVals, &p1, rand);
@@ -312,7 +323,7 @@ public:
{
p1 = samples[0];
- for (i = 0; i < skip; i++)//Fuse.
+ for (i = 0; i < params.m_Skip; i++)//Fuse.
{
if (xforms[NextXformFromIndex(rand.Rand())].Apply(&p1, &p1, rand))
DoBadVals(xforms, badVals, &p1, rand);
@@ -321,7 +332,7 @@ public:
samples[0] = p1;
ember.Proj(samples[0], rand);
- for (i = 1; i < count; i++)//Real loop.
+ for (i = 1; i < params.m_Count; i++)//Real loop.
{
if (xforms[NextXformFromIndex(rand.Rand())].Apply(&p1, &samples[i], rand))
DoBadVals(xforms, badVals, samples + i, rand);
@@ -337,7 +348,7 @@ public:
{
p1 = samples[0];
- for (i = 0; i < skip; i++)//Fuse.
+ for (i = 0; i < params.m_Skip; i++)//Fuse.
{
if (xforms[NextXformFromIndex(rand.Rand())].Apply(&p1, &p1, rand))
DoBadVals(xforms, badVals, &p1, rand);
@@ -345,7 +356,7 @@ public:
DoFinalXform(ember, p1, samples, rand);//Apply to last fuse point and store as the first element in samples.
- for (i = 1; i < count; i++)//Real loop.
+ for (i = 1; i < params.m_Count; i++)//Real loop.
{
if (xforms[NextXformFromIndex(rand.Rand())].Apply(&p1, &p1, rand))//Feed the resulting value of applying the randomly selected xform back into the next iter, and not the result of applying the final xform.
DoBadVals(xforms, badVals, &p1, rand);
@@ -357,7 +368,7 @@ public:
{
p1 = samples[0];
- for (i = 0; i < skip; i++)//Fuse.
+ for (i = 0; i < params.m_Skip; i++)//Fuse.
{
if (xforms[NextXformFromIndex(rand.Rand())].Apply(&p1, &p1, rand))
DoBadVals(xforms, badVals, &p1, rand);
@@ -365,9 +376,11 @@ public:
samples[0] = p1;
- for (i = 0; i < count - 1; i++)//Real loop.
+ for (i = 0; i < params.m_Count - 1; i++)//Real loop.
+ {
if (xforms[NextXformFromIndex(rand.Rand())].Apply(samples + i, samples + i + 1, rand))
DoBadVals(xforms, badVals, samples + i + 1, rand);
+ }
}
}
@@ -442,7 +455,7 @@ public:
/// The buffer to store the output points
/// The random context to use
/// The number of bad values
- virtual size_t Iterate(Ember& ember, size_t count, size_t skip, Point* samples, QTIsaac& rand)
+ virtual size_t Iterate(Ember<T>& ember, IterParams<T>& params, Point<T>* samples, QTIsaac<ISAAC_SIZE, ISAAC_INT>& rand) override
{
size_t i, xformIndex;
size_t lastXformUsed = 0;
@@ -456,7 +469,7 @@ public:
{
p1 = samples[0];
- for (i = 0; i < skip; i++)//Fuse.
+ for (i = 0; i < params.m_Skip; i++)//Fuse.
{
xformIndex = NextXformFromIndex(rand.Rand(), lastXformUsed);
@@ -469,7 +482,7 @@ public:
DoFinalXform(ember, p1, samples, rand);//Apply to last fuse point and store as the first element in samples.
ember.Proj(samples[0], rand);
- for (i = 1; i < count; i++)//Real loop.
+ for (i = 1; i < params.m_Count; i++)//Real loop.
{
xformIndex = NextXformFromIndex(rand.Rand(), lastXformUsed);
@@ -485,7 +498,7 @@ public:
{
p1 = samples[0];
- for (i = 0; i < skip; i++)//Fuse.
+ for (i = 0; i < params.m_Skip; i++)//Fuse.
{
xformIndex = NextXformFromIndex(rand.Rand(), lastXformUsed);
@@ -498,7 +511,7 @@ public:
samples[0] = p1;
ember.Proj(samples[0], rand);
- for (i = 1; i < count; i++)//Real loop.
+ for (i = 1; i < params.m_Count; i++)//Real loop.
{
xformIndex = NextXformFromIndex(rand.Rand(), lastXformUsed);
@@ -517,7 +530,7 @@ public:
{
p1 = samples[0];
- for (i = 0; i < skip; i++)//Fuse.
+ for (i = 0; i < params.m_Skip; i++)//Fuse.
{
xformIndex = NextXformFromIndex(rand.Rand(), lastXformUsed);
@@ -529,7 +542,7 @@ public:
DoFinalXform(ember, p1, samples, rand);//Apply to last fuse point and store as the first element in samples.
- for (i = 1; i < count; i++)//Real loop.
+ for (i = 1; i < params.m_Count; i++)//Real loop.
{
xformIndex = NextXformFromIndex(rand.Rand(), lastXformUsed);
@@ -544,7 +557,7 @@ public:
{
p1 = samples[0];
- for (i = 0; i < skip; i++)//Fuse.
+ for (i = 0; i < params.m_Skip; i++)//Fuse.
{
xformIndex = NextXformFromIndex(rand.Rand(), lastXformUsed);
@@ -556,7 +569,7 @@ public:
samples[0] = p1;
- for (i = 0; i < count - 1; i++)//Real loop.
+ for (i = 0; i < params.m_Count - 1; i++)//Real loop.
{
xformIndex = NextXformFromIndex(rand.Rand(), lastXformUsed);
diff --git a/Source/Ember/Renderer.cpp b/Source/Ember/Renderer.cpp
index afe8619..9462e88 100644
--- a/Source/Ember/Renderer.cpp
+++ b/Source/Ember/Renderer.cpp
@@ -691,7 +691,7 @@ bool Renderer::Alloc()
(m_SuperSize != m_HistBuckets.size()) ||
(m_SuperSize != m_AccumulatorBuckets.size()) ||
(m_ThreadsToUse != m_Samples.size()) ||
- (m_Samples[0].size() != m_SubBatchSize);
+ (m_Samples[0].size() != SubBatchSize());
if (lock)
EnterResize();
@@ -728,14 +728,14 @@ bool Renderer::Alloc()
for (size_t i = 0; i < m_Samples.size(); i++)
{
- if (m_Samples[i].size() != m_SubBatchSize)
+ if (m_Samples[i].size() != SubBatchSize())
{
- m_Samples[i].resize(m_SubBatchSize);
+ m_Samples[i].resize(SubBatchSize());
if (m_ReclaimOnResize)
m_Samples[i].shrink_to_fit();
- b &= (m_Samples[i].size() == m_SubBatchSize);
+ b &= (m_Samples[i].size() == SubBatchSize());
}
}
@@ -1154,7 +1154,7 @@ eRenderStatus Renderer::AccumulatorToFinalImage(unsigned char* pixel
/// This function will be called multiple times for an interactive rendering, and
/// once for a straight through render.
/// The iteration is reset and fused in each thread after each sub batch is done
-/// which by default is 10,000 iterations.
+/// which by default is 10,240 iterations.
///
/// The number of iterations to run
/// The temporal sample this is running for
@@ -1164,7 +1164,6 @@ EmberStats Renderer::Iterate(size_t iterCount, size_t temporalSample
{
//Timing t2(4);
m_IterTimer.Tic();
- size_t fuse = EarlyClip() ? 100 : 15;//EarlyClip was one way of detecting a later version of flam3, so it used 100 which is a better value.
size_t totalItersPerThread = (size_t)ceil((double)iterCount / (double)m_ThreadsToUse);
double percent, etaMs;
EmberStats stats;
@@ -1180,17 +1179,21 @@ EmberStats Renderer::Iterate(size_t iterCount, size_t temporalSample
parallel_for(size_t(0), m_ThreadsToUse, [&] (size_t threadIndex)
{
#endif
- Timing t;
- size_t subBatchSize = (size_t)min(totalItersPerThread, (size_t)m_SubBatchSize);
+ //Timing t;
+ IterParams<T> params;
m_BadVals[threadIndex] = 0;
+ params.m_Count = min(totalItersPerThread, SubBatchSize());
+ params.m_Skip = FuseCount();
+ //params.m_OneColDiv2 = m_CarToRas.OneCol() / 2;
+ //params.m_OneRowDiv2 = m_CarToRas.OneRow() / 2;
//Sub batch iterations, loop 2.
- for (m_SubBatch[threadIndex] = 0; (m_SubBatch[threadIndex] < totalItersPerThread) && !m_Abort; m_SubBatch[threadIndex] += subBatchSize)
+ for (m_SubBatch[threadIndex] = 0; (m_SubBatch[threadIndex] < totalItersPerThread) && !m_Abort; m_SubBatch[threadIndex] += params.m_Count)
{
- //Must recalculate the number of iters to run on each sub batch because the last batch will most likely have less than m_SubBatchSize iters.
+ //Must recalculate the number of iters to run on each sub batch because the last batch will most likely have less than SubBatchSize iters.
//For example, if 51,000 are requested, and the sbs is 10,000, it should run 5 sub batches of 10,000 iters, and one final sub batch of 1,000 iters.
- subBatchSize = min(subBatchSize, totalItersPerThread - m_SubBatch[threadIndex]);
+ params.m_Count = min(params.m_Count, totalItersPerThread - m_SubBatch[threadIndex]);
//Use first as random point, the rest are iterated points.
//Note that this gets reset with a new random point for each subBatchSize iterations.
@@ -1203,14 +1206,14 @@ EmberStats Renderer::Iterate(size_t iterCount, size_t temporalSample
//Finally, iterate.
//t.Tic();
//Iterating, loop 3.
- m_BadVals[threadIndex] += m_Iterator->Iterate(m_Ember, subBatchSize, fuse, m_Samples[threadIndex].data(), m_Rand[threadIndex]);
+ m_BadVals[threadIndex] += m_Iterator->Iterate(m_Ember, params, m_Samples[threadIndex].data(), m_Rand[threadIndex]);
//iterationTime += t.Toc();
if (m_LockAccum)
m_AccumCs.Enter();
//t.Tic();
//Map temp buffer samples into the histogram using the palette for color.
- Accumulate(m_Samples[threadIndex].data(), subBatchSize, &m_Dmap);
+ Accumulate(m_Rand[threadIndex], m_Samples[threadIndex].data(), params.m_Count, &m_Dmap);
//accumulationTime += t.Toc();
if (m_LockAccum)
m_AccumCs.Leave();
@@ -1347,6 +1350,8 @@ template ePaletteMode Renderer::
template size_t Renderer::TemporalSamples() const { return m_Ember.m_TemporalSamples; }
template size_t Renderer::FinalRasW() const { return m_Ember.m_FinalRasW; }
template size_t Renderer::FinalRasH() const { return m_Ember.m_FinalRasH; }
+template <typename T, typename bucketT> size_t Renderer<T, bucketT>::SubBatchSize() const { return m_Ember.m_SubBatchSize; }
+template <typename T, typename bucketT> size_t Renderer<T, bucketT>::FuseCount() const { return m_Ember.m_FuseCount; }
///
/// Non-virtual iterator wrappers.
@@ -1396,11 +1401,13 @@ void Renderer::PrepFinalAccumVals(Color& background, T& g, T& lin
/// The number of samples
/// The palette to use
template
-void Renderer::Accumulate(Point* samples, size_t sampleCount, const Palette* palette)
+void Renderer<T, bucketT>::Accumulate(QTIsaac<ISAAC_SIZE, ISAAC_INT>& rand, Point<T>* samples, size_t sampleCount, const Palette<bucketT>* palette)
{
size_t histIndex, intColorIndex, histSize = m_HistBuckets.size();
bucketT colorIndex, colorIndexFrac;
const glm::detail::tvec4* dmap = &(palette->m_Entries[0]);
+ //T oneColDiv2 = m_CarToRas.OneCol() / 2;
+ //T oneRowDiv2 = m_CarToRas.OneRow() / 2;
//It's critical to understand what's going on here as it's one of the most important parts of the algorithm.
//A color value gets retrieved from the palette and
@@ -1413,24 +1420,37 @@ void Renderer::Accumulate(Point* samples, size_t sampleCount, con
//Splitting these conditionals into separate loops makes no speed difference.
for (size_t i = 0; i < sampleCount && !m_Abort; i++)
{
+ Point<T> p(samples[i]);//Slightly faster to cache this.
+
if (Rotate() != 0)
{
- T p00 = samples[i].m_X - CenterX();
- T p11 = samples[i].m_Y - m_Ember.m_RotCenterY;
+ T p00 = p.m_X - CenterX();
+ T p11 = p.m_Y - m_Ember.m_RotCenterY;
- samples[i].m_X = (p00 * m_RotMat.A()) + (p11 * m_RotMat.B()) + CenterX();
- samples[i].m_Y = (p00 * m_RotMat.D()) + (p11 * m_RotMat.E()) + m_Ember.m_RotCenterY;
+ p.m_X = (p00 * m_RotMat.A()) + (p11 * m_RotMat.B()) + CenterX();
+ p.m_Y = (p00 * m_RotMat.D()) + (p11 * m_RotMat.E()) + m_Ember.m_RotCenterY;
}
+ //T angle = rand.Frand01() * M_2PI;
+ //T r = exp(T(0.5) * sqrt(-log(rand.Frand01()))) - 1;
+
+ //T r = (rand.Frand01() + rand.Frand01() - 1);
+ //T r = (rand.Frand01() + rand.Frand01() + rand.Frand01() + rand.Frand01() - 2);
+
+ //p.m_X += (r * oneColDiv2) * cos(angle);
+ //p.m_Y += (r * oneRowDiv2) * sin(angle);
+ //p.m_X += r * cos(angle);
+ //p.m_Y += r * sin(angle);
+
//Checking this first before converting gives better performance than converting and checking a single value, which the original did.
//Second, an interesting optimization observation is that when keeping the bounds vars within m_CarToRas and calling its InBounds() member function,
//rather than here as members, about a 7% speedup is achieved. This is possibly due to the fact that data from m_CarToRas is accessed
//right after the call to Convert(), so some caching efficiencies get realized.
- if (m_CarToRas.InBounds(samples[i]))
+ if (m_CarToRas.InBounds(p))
{
- if (samples[i].m_VizAdjusted != 0)
+ if (p.m_VizAdjusted != 0)
{
- m_CarToRas.Convert(samples[i], histIndex);
+ m_CarToRas.Convert(p, histIndex);
//There is a very slim chance that a point will be right on the border and will technically be in bounds, passing the InBounds() test,
//but ends up being mapped to a histogram bucket that is out of bounds due to roundoff error. Perform one final check before proceeding.
@@ -1445,7 +1465,7 @@ void Renderer::Accumulate(Point* samples, size_t sampleCount, con
//Use overloaded addition and multiplication operators in vec4 to perform the accumulation.
if (PaletteMode() == PALETTE_LINEAR)
{
- colorIndex = (bucketT)samples[i].m_ColorX * COLORMAP_LENGTH;
+ colorIndex = (bucketT)p.m_ColorX * COLORMAP_LENGTH;
intColorIndex = (size_t)colorIndex;
if (intColorIndex < 0)
@@ -1463,19 +1483,19 @@ void Renderer::Accumulate(Point* samples, size_t sampleCount, con
colorIndexFrac = colorIndex - (bucketT)intColorIndex;//Interpolate between intColorIndex and intColorIndex + 1.
}
- if (samples[i].m_VizAdjusted == 1)
+ if (p.m_VizAdjusted == 1)
m_HistBuckets[histIndex] += ((dmap[intColorIndex] * (1 - colorIndexFrac)) + (dmap[intColorIndex + 1] * colorIndexFrac));
else
- m_HistBuckets[histIndex] += (((dmap[intColorIndex] * (1 - colorIndexFrac)) + (dmap[intColorIndex + 1] * colorIndexFrac)) * (bucketT)samples[i].m_VizAdjusted);
+ m_HistBuckets[histIndex] += (((dmap[intColorIndex] * (1 - colorIndexFrac)) + (dmap[intColorIndex + 1] * colorIndexFrac)) * (bucketT)p.m_VizAdjusted);
}
else if (PaletteMode() == PALETTE_STEP)
{
- intColorIndex = Clamp((size_t)(samples[i].m_ColorX * COLORMAP_LENGTH), 0, COLORMAP_LENGTH_MINUS_1);
+ intColorIndex = Clamp((size_t)(p.m_ColorX * COLORMAP_LENGTH), 0, COLORMAP_LENGTH_MINUS_1);
- if (samples[i].m_VizAdjusted == 1)
+ if (p.m_VizAdjusted == 1)
m_HistBuckets[histIndex] += dmap[intColorIndex];
else
- m_HistBuckets[histIndex] += (dmap[intColorIndex] * (bucketT)samples[i].m_VizAdjusted);
+ m_HistBuckets[histIndex] += (dmap[intColorIndex] * (bucketT)p.m_VizAdjusted);
}
}
}
diff --git a/Source/Ember/Renderer.h b/Source/Ember/Renderer.h
index 8772eb1..7942c18 100644
--- a/Source/Ember/Renderer.h
+++ b/Source/Ember/Renderer.h
@@ -134,6 +134,8 @@ public:
virtual size_t TemporalSamples() const override;
virtual size_t FinalRasW() const override;
virtual size_t FinalRasH() const override;
+ virtual size_t SubBatchSize() const override;
+ virtual size_t FuseCount() const override;
//Non-virtual iterator wrappers.
const unsigned char* XformDistributions() const;
@@ -144,9 +146,9 @@ protected:
//Non-virtual functions that might be needed by a derived class.
void PrepFinalAccumVals(Color& background, T& g, T& linRange, T& vibrancy);
-private:
+ private:
//Miscellaneous non-virtual functions used only in this class.
- void Accumulate(Point* samples, size_t sampleCount, const Palette* palette);
+ void Accumulate(QTIsaac<ISAAC_SIZE, ISAAC_INT>& rand, Point<T>* samples, size_t sampleCount, const Palette<bucketT>* palette);
/*inline*/ void AddToAccum(const glm::detail::tvec4& bucket, intmax_t i, intmax_t ii, intmax_t j, intmax_t jj);
template void GammaCorrection(glm::detail::tvec4& bucket, Color& background, T g, T linRange, T vibrancy, bool doAlpha, bool scale, accumT* correctedChannels);
diff --git a/Source/Ember/RendererBase.cpp b/Source/Ember/RendererBase.cpp
index 2a4c9df..0110d3f 100644
--- a/Source/Ember/RendererBase.cpp
+++ b/Source/Ember/RendererBase.cpp
@@ -15,7 +15,6 @@ RendererBase::RendererBase()
m_YAxisUp = false;
m_InsertPalette = false;
m_ReclaimOnResize = false;
- m_SubBatchSize = 1024 * 10;
m_NumChannels = 3;
m_BytesPerChannel = 1;
m_SuperSize = 0;
@@ -412,17 +411,6 @@ void RendererBase::Transparency(bool transparency)
ChangeVal([&] { m_Transparency = transparency; }, ACCUM_ONLY);
}
-///
-/// Set the sub batch size. This is the size of of the chunks that the iteration
-/// trajectory will be broken up into.
-/// Reset the rendering process.
-///
-/// The sub batch size to set
-void RendererBase::SubBatchSize(size_t sbs)
-{
- ChangeVal([&] { m_SubBatchSize = sbs; }, FULL_RENDER);
-}
-
///
/// Set the callback object.
///
@@ -583,14 +571,6 @@ void RendererBase::NumChannels(size_t numChannels)
/// The number of threads used when rendering
size_t RendererBase::ThreadCount() const { return m_ThreadsToUse; }
-///
-/// Get the sub batch size. This is the size of of the chunks that the iteration
-/// trajectory will be broken up into.
-/// Default: 10k.
-///
-/// The sub batch size
-size_t RendererBase::SubBatchSize() const { return m_SubBatchSize; }
-
///
/// Get the renderer type enum.
/// CPU_RENDERER for this class, other values for derived classes.
diff --git a/Source/Ember/RendererBase.h b/Source/Ember/RendererBase.h
index 01c390f..ae235ad 100644
--- a/Source/Ember/RendererBase.h
+++ b/Source/Ember/RendererBase.h
@@ -149,7 +149,6 @@ public:
void ReclaimOnResize(bool reclaimOnResize);
bool Transparency() const;
void Transparency(bool transparency);
- void SubBatchSize(size_t subBatchSize);
void Callback(RenderCallback* callback);
void ThreadCount(size_t threads, const char* seedString = nullptr);
size_t BytesPerChannel() const;
@@ -161,7 +160,6 @@ public:
//Virtual render properties, getters and setters.
virtual void NumChannels(size_t numChannels);
virtual size_t ThreadCount() const;
- virtual size_t SubBatchSize() const;
virtual eRendererType RendererType() const;
//Abstract render properties, getters only.
@@ -169,6 +167,8 @@ public:
virtual size_t HistBucketSize() const = 0;
virtual size_t FinalRasW() const = 0;
virtual size_t FinalRasH() const = 0;
+ virtual size_t SubBatchSize() const = 0;
+ virtual size_t FuseCount() const = 0;
virtual double ScaledQuality() const = 0;
virtual double LowerLeftX(bool gutter = true) const = 0;
virtual double LowerLeftY(bool gutter = true) const = 0;
@@ -207,10 +207,8 @@ protected:
size_t m_DensityFilterOffset;
size_t m_NumChannels;
size_t m_BytesPerChannel;
- size_t m_SubBatchSize;
size_t m_ThreadsToUse;
size_t m_VibGamCount;
- size_t m_LastPass;
size_t m_LastTemporalSample;
double m_LastIterPercent;
size_t m_LastIter;
diff --git a/Source/Ember/SheepTools.h b/Source/Ember/SheepTools.h
index a7b0ba4..cdc83ab 100644
--- a/Source/Ember/SheepTools.h
+++ b/Source/Ember/SheepTools.h
@@ -879,7 +879,6 @@ public:
m_Renderer->EarlyClip(true);
m_Renderer->PixelAspectRatio(1);
m_Renderer->ThreadCount(Timing::ProcessorCount());
- m_Renderer->SubBatchSize(10000);
m_Renderer->Callback(nullptr);
if (m_Renderer->Run(m_FinalImage) != RENDER_OK)
@@ -1280,8 +1279,16 @@ public:
/// The number of iterations ran
size_t EstimateBoundingBox(Ember& ember, T eps, size_t samples, T* bmin, T* bmax)
{
+ bool newAlloc = false;
size_t i, lowTarget, highTarget;
T min[2], max[2];
+ IterParams<T> params;
+
+ m_Renderer->SetEmber(ember);
+ m_Renderer->CreateSpatialFilter(newAlloc);
+ m_Renderer->CreateDEFilter(newAlloc);
+ m_Renderer->ComputeBounds();
+ m_Renderer->ComputeCamera();
if (ember.XaosPresent())
m_Iterator = m_XaosIterator.get();
@@ -1290,8 +1297,12 @@ public:
m_Iterator->InitDistributions(ember);
m_Samples.resize(samples);
+ params.m_Count = samples;
+ params.m_Skip = 20;
+ //params.m_OneColDiv2 = m_Renderer->CoordMap()->OneCol() / 2;
+ //params.m_OneRowDiv2 = m_Renderer->CoordMap()->OneRow() / 2;
- size_t bv = m_Iterator->Iterate(ember, samples, 20, m_Samples.data(), m_Rand);//Use a special fuse of 20, all other calls to this will use 15, or 100.
+ size_t bv = m_Iterator->Iterate(ember, params, m_Samples.data(), m_Rand);//Use a special fuse of 20, all other calls to this will use 15, or 100.
if (bv / T(samples) > eps)
eps = 3 * bv / T(samples);
diff --git a/Source/Ember/Utils.h b/Source/Ember/Utils.h
index b3d5734..3d01023 100644
--- a/Source/Ember/Utils.h
+++ b/Source/Ember/Utils.h
@@ -33,6 +33,17 @@ static inline void ForEach(c& container, fn func)
std::for_each(container.begin(), container.end(), func);
}
+/// <summary>
+/// Thin wrapper around computing the total size of a vector.
+/// </summary>
+/// <param name="vec">The vector to compute the size of</param>
+/// <returns>The size of one element times the length.</returns>
+template <typename T>
+static inline size_t SizeOf(vector<T>& vec)
+{
+ return sizeof(vec[0]) * vec.size();
+}
+
///
/// After a run completes, information about what was run can be saved as strings to the comments
/// section of a jpg or png file. This class is just a container for those values.
@@ -276,7 +287,7 @@ static void ClearVec(vector& vec, bool arrayDelete = false)
template
static inline void Memset(vector& vec, int val = 0)
{
- memset((void*)vec.data(), val, vec.size() * sizeof(vec[0]));
+ memset((void*)vec.data(), val, SizeOf(vec));
}
///
diff --git a/Source/Ember/XmlToEmber.h b/Source/Ember/XmlToEmber.h
index 64de8b8..b80a14e 100644
--- a/Source/Ember/XmlToEmber.h
+++ b/Source/Ember/XmlToEmber.h
@@ -593,6 +593,8 @@ private:
else if (ParseAndAssignInt(curAtt->name, attStr, "oversample", currentEmber.m_Supersample , ret)) { }
else if (ParseAndAssignInt(curAtt->name, attStr, "supersample", currentEmber.m_Supersample , ret)) { }
else if (ParseAndAssignInt(curAtt->name, attStr, "temporal_samples", currentEmber.m_TemporalSamples, ret)) { }
+ else if (ParseAndAssignInt(curAtt->name, attStr, "sub_batch_size", currentEmber.m_SubBatchSize , ret)) { }
+ else if (ParseAndAssignInt(curAtt->name, attStr, "fuse", currentEmber.m_FuseCount , ret)) { }
else if (ParseAndAssignInt(curAtt->name, attStr, "soloxform", soloXform , ret)) { }
else if (ParseAndAssignInt(curAtt->name, attStr, "new_linear", newLinear , ret)) { }
diff --git a/Source/EmberAnimate/EmberAnimate.cpp b/Source/EmberAnimate/EmberAnimate.cpp
index c216715..6466385 100644
--- a/Source/EmberAnimate/EmberAnimate.cpp
+++ b/Source/EmberAnimate/EmberAnimate.cpp
@@ -186,6 +186,9 @@ bool EmberAnimate(EmberOptions& opt)
if (opt.Supersample() > 0)
embers[i].m_Supersample = opt.Supersample();
+ if (opt.SubBatchSize() != DEFAULT_SBS)
+ embers[i].m_SubBatchSize = opt.SubBatchSize();
+
embers[i].m_Quality *= T(opt.QualityScale());
embers[i].m_FinalRasW = (unsigned int)((T)embers[i].m_FinalRasW * opt.SizeScale());
embers[i].m_FinalRasH = (unsigned int)((T)embers[i].m_FinalRasH * opt.SizeScale());
@@ -250,13 +253,12 @@ bool EmberAnimate(EmberOptions& opt)
renderer->YAxisUp(opt.YAxisUp());
renderer->LockAccum(opt.LockAccum());
renderer->InsertPalette(opt.InsertPalette());
- renderer->SubBatchSize(opt.SubBatchSize());
renderer->PixelAspectRatio(T(opt.AspectRatio()));
renderer->Transparency(opt.Transparency());
renderer->NumChannels(channels);
renderer->BytesPerChannel(opt.BitsPerChannel() / 8);
renderer->Callback(opt.DoProgress() ? progress.get() : NULL);
-
+
//Begin run.
for (ftime = opt.FirstFrame(); ftime <= opt.LastFrame(); ftime += opt.Dtime())
{
diff --git a/Source/EmberAnimate/EmberAnimate.rc b/Source/EmberAnimate/EmberAnimate.rc
index 09e4dc1..7ec4eec 100644
--- a/Source/EmberAnimate/EmberAnimate.rc
+++ b/Source/EmberAnimate/EmberAnimate.rc
@@ -49,8 +49,8 @@ END
//
VS_VERSION_INFO VERSIONINFO
- FILEVERSION 0,4,1,4
- PRODUCTVERSION 0,4,1,4
+ FILEVERSION 0,4,1,5
+ PRODUCTVERSION 0,4,1,5
FILEFLAGSMASK 0x3fL
#ifdef _DEBUG
FILEFLAGS 0x1L
@@ -67,12 +67,12 @@ BEGIN
BEGIN
VALUE "CompanyName", "Open Source"
VALUE "FileDescription", "Renders fractal flames as animations with motion blur"
- VALUE "FileVersion", "0.4.1.4"
+ VALUE "FileVersion", "0.4.1.5"
VALUE "InternalName", "EmberAnimate.rc"
VALUE "LegalCopyright", "Copyright (C) Matt Feemster 2013, GPL v3"
VALUE "OriginalFilename", "EmberAnimate.rc"
VALUE "ProductName", "Ember Animate"
- VALUE "ProductVersion", "0.4.1.4"
+ VALUE "ProductVersion", "0.4.1.5"
END
END
BLOCK "VarFileInfo"
diff --git a/Source/EmberCL/DEOpenCLKernelCreator.cpp b/Source/EmberCL/DEOpenCLKernelCreator.cpp
index fb5b677..f1da81a 100644
--- a/Source/EmberCL/DEOpenCLKernelCreator.cpp
+++ b/Source/EmberCL/DEOpenCLKernelCreator.cpp
@@ -25,7 +25,6 @@ template <>
DEOpenCLKernelCreator::DEOpenCLKernelCreator(bool nVidia)
{
m_NVidia = nVidia;
- m_LogScaleSumDEEntryPoint = "LogScaleSumDensityFilterKernel";
m_LogScaleAssignDEEntryPoint = "LogScaleAssignDensityFilterKernel";
m_GaussianDEWithoutSsEntryPoint = "GaussianDEWithoutSsKernel";
m_GaussianDESsWithScfEntryPoint = "GaussianDESsWithScfKernel";
@@ -33,7 +32,6 @@ DEOpenCLKernelCreator::DEOpenCLKernelCreator(bool nVidia)
m_GaussianDEWithoutSsNoCacheEntryPoint = "GaussianDEWithoutSsNoCacheKernel";
m_GaussianDESsWithScfNoCacheEntryPoint = "GaussianDESsWithScfNoCacheKernel";
m_GaussianDESsWithoutScfNoCacheEntryPoint = "GaussianDESsWithoutScfNoCacheKernel";
- m_LogScaleSumDEKernel = CreateLogScaleSumDEKernelString();
m_LogScaleAssignDEKernel = CreateLogScaleAssignDEKernelString();
m_GaussianDEWithoutSsKernel = CreateGaussianDEKernel(1);
m_GaussianDESsWithScfKernel = CreateGaussianDEKernel(2);
@@ -56,25 +54,39 @@ DEOpenCLKernelCreator::DEOpenCLKernelCreator(bool nVidia)
template <>
DEOpenCLKernelCreator::DEOpenCLKernelCreator(bool nVidia)
{
+#ifdef ROW_ONLY_DE
+ m_NVidia = nVidia;
+ m_LogScaleAssignDEEntryPoint = "LogScaleAssignDensityFilterKernel";
+ m_GaussianDEWithoutSsEntryPoint = "GaussianDEWithoutSsKernel";
+ m_GaussianDESsWithScfEntryPoint = "GaussianDESsWithScfKernel";
+ m_GaussianDESsWithoutScfEntryPoint = "GaussianDESsWithoutScfKernel";
+ m_GaussianDEWithoutSsNoCacheEntryPoint = "GaussianDEWithoutSsNoCacheKernel";
+ m_GaussianDESsWithScfNoCacheEntryPoint = "GaussianDESsWithScfNoCacheKernel";
+ m_GaussianDESsWithoutScfNoCacheEntryPoint = "GaussianDESsWithoutScfNoCacheKernel";
+ m_LogScaleAssignDEKernel = CreateLogScaleAssignDEKernelString();
+ m_GaussianDEWithoutSsKernel = CreateGaussianDEKernel(1);
+ m_GaussianDESsWithScfKernel = CreateGaussianDEKernel(2);
+ m_GaussianDESsWithoutScfKernel = CreateGaussianDEKernel(3);
+ m_GaussianDEWithoutSsNoCacheKernel = CreateGaussianDEKernelNoLocalCache(1);
+ m_GaussianDESsWithScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache(2);
+ m_GaussianDESsWithoutScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache(3);
+#else
m_NVidia = nVidia;
- m_LogScaleSumDEEntryPoint = "LogScaleSumDensityFilterKernel";
m_LogScaleAssignDEEntryPoint = "LogScaleAssignDensityFilterKernel";
m_GaussianDEWithoutSsNoCacheEntryPoint = "GaussianDEWithoutSsNoCacheKernel";
m_GaussianDESsWithScfNoCacheEntryPoint = "GaussianDESsWithScfNoCacheKernel";
m_GaussianDESsWithoutScfNoCacheEntryPoint = "GaussianDESsWithoutScfNoCacheKernel";
- m_LogScaleSumDEKernel = CreateLogScaleSumDEKernelString();
m_LogScaleAssignDEKernel = CreateLogScaleAssignDEKernelString();
m_GaussianDEWithoutSsNoCacheKernel = CreateGaussianDEKernelNoLocalCache(1);
m_GaussianDESsWithScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache(2);
m_GaussianDESsWithoutScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache(3);
+#endif
}
///
/// Kernel source and entry point properties, getters only.
///
-template string DEOpenCLKernelCreator::LogScaleSumDEKernel() { return m_LogScaleSumDEKernel; }
-template string DEOpenCLKernelCreator::LogScaleSumDEEntryPoint() { return m_LogScaleSumDEEntryPoint; }
template string DEOpenCLKernelCreator::LogScaleAssignDEKernel() { return m_LogScaleAssignDEKernel; }
template string DEOpenCLKernelCreator::LogScaleAssignDEEntryPoint() { return m_LogScaleAssignDEEntryPoint; }
@@ -87,6 +99,7 @@ template string DEOpenCLKernelCreator::LogScaleAssignDEEntryPoin
template
string DEOpenCLKernelCreator::GaussianDEKernel(size_t ss, unsigned int filterWidth)
{
+#ifndef ROW_ONLY_DE
if ((typeid(T) == typeid(double)) || (filterWidth > MaxDEFilterSize()))//Type double does not use cache.
{
if (ss > 1)
@@ -100,6 +113,7 @@ string DEOpenCLKernelCreator::GaussianDEKernel(size_t ss, unsigned int filter
return m_GaussianDEWithoutSsNoCacheKernel;
}
else
+#endif
{
if (ss > 1)
{
@@ -122,6 +136,7 @@ string DEOpenCLKernelCreator::GaussianDEKernel(size_t ss, unsigned int filter
template
string DEOpenCLKernelCreator::GaussianDEEntryPoint(size_t ss, unsigned int filterWidth)
{
+#ifndef ROW_ONLY_DE
if ((typeid(T) == typeid(double)) || (filterWidth > MaxDEFilterSize()))//Type double does not use cache.
{
if (ss > 1)
@@ -135,6 +150,7 @@ string DEOpenCLKernelCreator::GaussianDEEntryPoint(size_t ss, unsigned int fi
return m_GaussianDEWithoutSsNoCacheEntryPoint;
}
else
+#endif
{
if (ss > 1)
{
@@ -194,45 +210,6 @@ unsigned int DEOpenCLKernelCreator::SolveMaxBoxSize(unsigned int localMem)
return (unsigned int)floor(sqrt(floor((T)localMem / 16.0)));//Divide by 16 because each element is float4.
}
-///
-/// Create the log scale kernel string, using summation.
-/// This means each cell will be added to, rather than just assigned.
-/// Since adding is slower than assigning, this should only be used when Passes > 1,
-/// otherwise use the kernel created from CreateLogScaleAssignDEKernelString().
-///
-/// The kernel string
-template
-string DEOpenCLKernelCreator::CreateLogScaleSumDEKernelString()
-{
- ostringstream os;
-
- os <<
- ConstantDefinesString(typeid(T) == typeid(double)) <<
- DensityFilterCLStructString <<
- "__kernel void " << m_LogScaleSumDEEntryPoint << "(\n"
- " const __global real4* histogram,\n"
- " __global real4* accumulator,\n"
- " __constant DensityFilterCL* logFilter\n"
- "\t)\n"
- "{\n"
- " if ((GLOBAL_ID_X < logFilter->m_SuperRasW) && (GLOBAL_ID_Y < logFilter->m_SuperRasH))\n"
- " {\n"
- " uint index = (GLOBAL_ID_Y * logFilter->m_SuperRasW) + GLOBAL_ID_X;\n"
- "\n"
- " if (histogram[index].w != 0)\n"
- " {\n"
- " real_t logScale = (logFilter->m_K1 * log(1.0 + histogram[index].w * logFilter->m_K2)) / histogram[index].w;\n"
- "\n"
- " accumulator[index] += histogram[index] * logScale;\n"//Using a single real4 vector operation doubles the speed from doing each component individually.
- " }\n"
- "\n"
- " barrier(CLK_GLOBAL_MEM_FENCE);\n"//Just to be safe. Makes no speed difference to do all of the time or only when there's a hit.
- " }\n"
- "}\n";
-
- return os.str();
-}
-
///
/// Create the log scale kernel string, using assignment.
/// Use this when Passes == 1.
@@ -270,6 +247,215 @@ string DEOpenCLKernelCreator::CreateLogScaleAssignDEKernelString()
return os.str();
}
+#ifdef ROW_ONLY_DE
+template
+string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
+{
+ bool doSS = ss > 1;
+ bool doScf = !(ss & 1);
+ ostringstream os;
+
+ os <<
+ ConstantDefinesString(typeid(T) == typeid(double)) <<
+ DensityFilterCLStructString <<
+ UnionCLStructString <<
+ "__kernel void " << GaussianDEEntryPoint(ss, MaxDEFilterSize()) << "(\n" <<
+ " const __global real4* histogram,\n"
+ " __global real4reals* accumulator,\n"
+ " __constant DensityFilterCL* densityFilter,\n"
+ " const __global real_t* filterCoefs,\n"
+ " const __global real_t* filterWidths,\n"
+ " const __global uint* coefIndices,\n"
+ " const uint chunkSizeW,\n"
+ " const uint chunkSizeH,\n"
+ " const uint chunkW,\n"
+ " const uint chunkH\n"
+ "\t)\n"
+ "{\n"
+ " uint rowsToProcess = 32;\n"//Rows to process.
+ "\n"
+ " if (((((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) ||\n"
+ " ((((BLOCK_ID_Y * chunkSizeH) + chunkH) * rowsToProcess) + THREAD_ID_Y >= densityFilter->m_SuperRasH))\n"
+ " return;\n"
+ "\n";
+
+ if (doSS)
+ {
+ os <<
+ " uint ss = (uint)floor((real_t)densityFilter->m_Supersample / 2.0);\n"
+ " int densityBoxLeftX;\n"
+ " int densityBoxRightX;\n"
+ " int densityBoxTopY;\n"
+ " int densityBoxBottomY;\n"
+ "\n";
+
+ if (doScf)
+ os <<
+ " real_t scfact = pow(densityFilter->m_Supersample / (densityFilter->m_Supersample + 1.0), 2.0);\n";
+ }
+
+ os <<
+ " uint fullTempBoxWidth;\n"
+ " uint leftBound, rightBound, topBound, botBound;\n"
+ " uint blockHistStartRow, blockHistEndRow, histCol;\n"
+ " uint blockHistStartCol, boxReadStartCol, boxReadEndCol;\n"
+ " uint accumWriteStartCol, colsToWrite, colOffset, colsToWriteOffset;\n"
+ " int histRow, filterRow, accumWriteOffset;\n"
+ "\n"
+ " fullTempBoxWidth = BLOCK_SIZE_X + (densityFilter->m_FilterWidth * 2);\n"
+ //Compute the bounds of the area to be sampled, which is just the ends minus the super sample minus 1.
+ " leftBound = densityFilter->m_Supersample - 1;\n"
+ " rightBound = densityFilter->m_SuperRasW - (densityFilter->m_Supersample - 1);\n"
+ " topBound = densityFilter->m_Supersample - 1;\n"
+ " botBound = densityFilter->m_SuperRasH - (densityFilter->m_Supersample - 1);\n"
+ "\n"
+ //Start and end values are the indices in the histogram read from
+ //and written to in the accumulator. They are not the indices for the local block of data.
+ //Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds.
+ " blockHistStartRow = min(botBound, topBound + (((BLOCK_ID_Y * chunkSizeH) + chunkH) * rowsToProcess));\n"//The first histogram row this block will process.
+ " blockHistEndRow = min(botBound, blockHistStartRow + rowsToProcess);\n"//The last histogram row this block will process, clamped to the last row.
+ " blockHistStartCol = min(rightBound, leftBound + (((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X));\n"//The first histogram column this block will process.
+ " boxReadStartCol = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first box col this block will read from when copying to the accumulator.
+ " boxReadEndCol = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + BLOCK_SIZE_X, densityFilter->m_SuperRasW - blockHistStartCol);\n"//The last box col this block will read from when copying to the accumulator.
+ "\n"
+ //Last, the indices in the global accumulator that the local bounds will be writing to.
+ " accumWriteStartCol = blockHistStartCol - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first column in the accumulator this block will write to.
+ " colsToWrite = ceil((real_t)(boxReadEndCol - boxReadStartCol) / (real_t)BLOCK_SIZE_X);\n"//Elements per thread to be written to the accumulator.
+ " histCol = blockHistStartCol + THREAD_ID_X;\n"//The histogram column this individual thread will be reading from.
+ "\n"
+ " if (histCol >= rightBound)\n"
+ " return;\n"
+ "\n"
+ //Compute the col position in this local box to serve as the center position
+ //from which filter application offsets are computed.
+ //These are the local indices for the local data that are temporarily accumulated to before
+ //writing out to the global accumulator.
+ " uint boxCol = densityFilter->m_FilterWidth + THREAD_ID_X;\n"
+ " uint colsToZeroOffset, colsToZero = ceil((real_t)fullTempBoxWidth / (real_t)(BLOCK_SIZE_X));\n"//Usually is 2.
+ " int i, j, k, jmin, jmax;\n"
+ " uint filterSelectInt, filterCoefIndex;\n"
+ " real_t cacheLog;\n"
+ " real_t filterSelect;\n"
+ " real4 bucket;\n"
+ ;
+
+ os << " __local real4reals filterBox[192];\n";//Must be >= fullTempBoxWidth.
+
+ os <<
+ "\n"
+ " colsToZeroOffset = colsToZero * THREAD_ID_X;\n"
+ " colsToWriteOffset = colsToWrite * THREAD_ID_X;\n"
+ " k = (int)densityFilter->m_FilterWidth;\n"//Need a signed int to use below, really is filter width, but reusing a variable to save space.
+ "\n"
+ " for (histRow = blockHistStartRow; histRow < blockHistEndRow; histRow++)\n"//Process pixels by row, for 32 rows.
+ " {\n"
+ " bucket = histogram[(histRow * densityFilter->m_SuperRasW) + histCol];\n"
+ "\n"
+ " if (bucket.w != 0)\n"
+ " cacheLog = (densityFilter->m_K1 * log(1.0 + bucket.w * densityFilter->m_K2)) / bucket.w;\n"
+ "\n";
+
+ if (doSS)
+ {
+ os <<
+ " filterSelect = 0;\n"
+ " densityBoxLeftX = histCol - min(histCol, ss);\n"
+ " densityBoxRightX = histCol + min(ss, (densityFilter->m_SuperRasW - histCol) - 1);\n"
+ " densityBoxTopY = histRow - min((uint)histRow, ss);\n"
+ " densityBoxBottomY = histRow + min(ss, (densityFilter->m_SuperRasH - histRow) - 1);\n"
+ "\n"
+ " for (j = densityBoxTopY; j <= densityBoxBottomY; j++)\n"
+ " {\n"
+ " for (i = densityBoxLeftX; i <= densityBoxRightX; i++)\n"
+ " {\n"
+ " filterSelect += histogram[(j * densityFilter->m_SuperRasW) + i].w;\n"
+ " }\n"
+ " }\n"
+ "\n";
+
+ if (doScf)
+ os << " filterSelect *= scfact;\n";
+ }
+ else
+ {
+ os
+ << " filterSelect = bucket.w;\n";
+ }
+
+ os <<
+ "\n"
+ " if (filterSelect > densityFilter->m_MaxFilteredCounts)\n"
+ " filterSelectInt = densityFilter->m_MaxFilterIndex;\n"
+ " else if (filterSelect <= DE_THRESH)\n"
+ " filterSelectInt = (int)ceil(filterSelect) - 1;\n"
+ " else if (filterSelect != 0)\n"
+ " filterSelectInt = (int)DE_THRESH + (int)floor(pow((real_t)(filterSelect - DE_THRESH), densityFilter->m_Curve));\n"
+ " else\n"
+ " filterSelectInt = 0;\n"
+ "\n"
+ " if (filterSelectInt > densityFilter->m_MaxFilterIndex)\n"
+ " filterSelectInt = densityFilter->m_MaxFilterIndex;\n"
+ "\n"
+ " filterCoefIndex = filterSelectInt * densityFilter->m_KernelSize;\n"
+ "\n"
+ //With this new method, only accumulate to the temp local buffer first. Write to the final accumulator last.
+ //For each loop through, note that there is a local memory barrier call inside of each call to AddToAccumNoCheck().
+ //If this isn't done, pixel errors occur and even an out of resources error occurs because too many writes are done to the same place in memory at once.
+ " jmin = min(k, histRow);\n"
+ " jmax = (int)min((densityFilter->m_SuperRasH - 1) - histRow, densityFilter->m_FilterWidth);\n"
+ "\n"
+ " for (j = -jmin; j <= jmax; j++)\n"
+ " {\n"
+ " for (i = 0; i < colsToZero && (colsToZeroOffset + i) < fullTempBoxWidth; i++)\n"//Each thread zeroizes a few columns.
+ " {\n"
+ " filterBox[colsToZeroOffset + i].m_Real4 = 0;\n"
+ " }\n"
+ "\n"
+ " barrier(CLK_LOCAL_MEM_FENCE);\n"
+ "\n"
+ " if (bucket.w != 0)\n"
+ " {\n"
+ " filterRow = abs(j) * (densityFilter->m_FilterWidth + 1);\n"
+ "\n"
+ " for (i = -k; i <= k; i++)\n"
+ " {\n"
+ " filterSelectInt = filterCoefIndex + coefIndices[filterRow + abs(i)];\n"//Really is filterCoeffIndexPlusOffset, but reusing a variable to save space.
+ " filterBox[i + boxCol].m_Real4 += (bucket * (filterCoefs[filterSelectInt] * cacheLog));\n"
+ " }\n"
+ " }\n"
+ "\n"
+ " barrier(CLK_LOCAL_MEM_FENCE);\n"
+ "\n"
+ //At this point, all threads in this block have applied the filter to their surrounding pixels and stored the results in the temp local box.
+ //Add the cells of it that are in bounds to the global accumulator.
+ //Compute offsets in local box to read from, and offsets into global accumulator to write to.
+ //Use a method here that is similar to the zeroization above: Each thread (column) in the first row iterates through all of the
+ //rows and adds a few columns to the accumulator.
+ //" if (THREAD_ID_X == 0)\n"
+ //" {\n"
+ //" for (int kk = boxReadStartCol, i = 0; kk < boxReadEndCol; kk++, i++)\n"//Each thread writes a few columns.//Could do away with kk//TODO//OPT
+ //" {\n"
+ //" accumulator[((histRow + j) * densityFilter->m_SuperRasW) + (accumWriteStartCol + i)].m_Real4 += filterBox[kk].m_Real4;\n"
+ //" }\n"
+ //" }\n"
+ " accumWriteOffset = ((histRow + j) * densityFilter->m_SuperRasW) + accumWriteStartCol;\n"
+ "\n"
+ " for (i = 0; i < colsToWrite; i++)\n"//Each thread writes a few columns.
+ " {\n"
+ " colOffset = colsToWriteOffset + i;\n"
+ "\n"
+ " if (boxReadStartCol + colOffset < boxReadEndCol)\n"
+ " accumulator[accumWriteOffset + colOffset].m_Real4 += filterBox[boxReadStartCol + colOffset].m_Real4;\n"
+ " }\n"
+ " }\n"//for() filter rows.
+ " barrier(CLK_GLOBAL_MEM_FENCE);\n"
+ " }\n"//for() histogram rows.
+ "}\n";
+
+ return os.str();
+}
+
+#else
///
/// Create the gaussian density filtering kernel string.
/// 6 different methods of processing were tried before settling on this final and fastest 7th one.
@@ -281,7 +467,7 @@ string DEOpenCLKernelCreator::CreateLogScaleAssignDEKernelString()
/// This allows writing to the global buffer without ever overlapping or using atomics.
/// The supersample parameter will produce three different kernels.
/// SS = 1, SS > 1 && SS even, SS > 1 && SS odd.
-/// The width of the kernl this runs in must be evenly divisible by 16 or else artifacts will occur.
+/// The width of the kernel this runs in must be evenly divisible by 16 or else artifacts will occur.
/// Note that because this function uses so many variables and is so complex, OpenCL can easily run
/// out of resources in some cases. Certain variables had to be reused to condense the kernel footprint
/// down enough to be able to run a block size of 32x32.
@@ -311,18 +497,15 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
" const __global uint* coefIndices,\n"
" const uint chunkSizeW,\n"
" const uint chunkSizeH,\n"
- " const uint rowParity,\n"
- " const uint colParity\n"
+ " const uint chunkW,\n"
+ " const uint chunkH\n"
"\t)\n"
"{\n"
- //Parity determines if this function should execute.
- " if ((GLOBAL_ID_X >= densityFilter->m_SuperRasW) ||\n"
- " (GLOBAL_ID_Y >= densityFilter->m_SuperRasH) ||\n"
- " ((BLOCK_ID_X % chunkSizeW) != colParity) ||\n"
- " ((BLOCK_ID_Y % chunkSizeH) != rowParity)) \n"
+ " if (((((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) ||\n"
+ " ((((BLOCK_ID_Y * chunkSizeH) + chunkH) * BLOCK_SIZE_Y) + THREAD_ID_Y >= densityFilter->m_SuperRasH))\n"
" return;\n"
"\n";
-
+
if (doSS)
{
os <<
@@ -367,13 +550,13 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
//Start and end values are the indices in the histogram read from
//and written to in the accumulator. They are not the indices for the local block of data.
//Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds.
- " blockHistStartRow = min(botBound, topBound + (BLOCK_ID_Y * BLOCK_SIZE_Y));\n"//The first histogram row this block will process.
+ " blockHistStartRow = min(botBound, topBound + (((BLOCK_ID_Y * chunkSizeH) + chunkH) * BLOCK_SIZE_Y));\n"//The first histogram row this block will process.
" blockHistEndRow = min(botBound, blockHistStartRow + BLOCK_SIZE_Y);\n"//The last histogram row this block will process, clamped to the last row.
" boxReadStartRow = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartRow);\n"//The first row in the local box to read from when writing back to the final accumulator for this block.
" boxReadEndRow = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + BLOCK_SIZE_Y, densityFilter->m_SuperRasH - blockHistStartRow);\n"//The last row in the local box to read from when writing back to the final accumulator for this block.
- " blockHistStartCol = min(rightBound, leftBound + (BLOCK_ID_X * BLOCK_SIZE_X));\n"//The first histogram column this block will process.
- " boxReadStartCol = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first box row this block will read from when copying to the accumulator.
- " boxReadEndCol = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + BLOCK_SIZE_X, densityFilter->m_SuperRasW - blockHistStartCol);\n"//The last box row this block will read from when copying to the accumulator.
+ " blockHistStartCol = min(rightBound, leftBound + (((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X));\n"//The first histogram column this block will process.
+ " boxReadStartCol = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first box col this block will read from when copying to the accumulator.
+ " boxReadEndCol = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + BLOCK_SIZE_X, densityFilter->m_SuperRasW - blockHistStartCol);\n"//The last box col this block will read from when copying to the accumulator.
"\n"
//Last, the indices in the global accumulator that the local bounds will be writing to.
" accumWriteStartRow = blockHistStartRow - min(densityFilter->m_FilterWidth, blockHistStartRow);\n"//Will be fw - 0 except for boundary columns, it will be less.
@@ -496,7 +679,7 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
" {\n"
" filterSelectInt = filterCoefIndex + coefIndices[(abs(j) * (densityFilter->m_FilterWidth + 1)) + abs(i)];\n"//Really is filterCoeffIndexPlusOffset, but reusing a variable to save space.
"\n"
- " if (filterCoefs[filterSelectInt] != 0)\n"
+ " if (filterCoefs[filterSelectInt] != 0)\n"//This conditional actually improves speed, despite SIMT being bad at conditionals.
" {\n"
" filterBox[(i + boxCol) + ((j + boxRow) * fullTempBoxWidth)].m_Real4 += (bucket * (filterCoefs[filterSelectInt] * cacheLog));\n"
" }\n"
@@ -511,14 +694,14 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
"\n"
" if (THREAD_ID_Y == 0)\n"
" {\n"
- //At this point, all threads in this block have applied the filter to their surrounding pixel and stored the results in the temp local box.
+ //At this point, all threads in this block have applied the filter to their surrounding pixels and stored the results in the temp local box.
//Add the cells of it that are in bounds to the global accumulator.
//Compute offsets in local box to read from, and offsets into global accumulator to write to.
//Use a method here that is similar to the zeroization above: Each thread (column) in the first row iterates through all of the
//rows and adds a few columns to the accumulator.
" for (i = boxReadStartRow, j = accumWriteStartRow; i < boxReadEndRow; i++, j++)\n"
" {\n"
- " for (k = 0; k < colsToWrite; k++)\n"//Write a few columns.
+ " for (k = 0; k < colsToWrite; k++)\n"//Each thread writes a few columns.
" {\n"
" boxCol = (colsToWrite * THREAD_ID_X) + k;\n"//Really is colOffset, but reusing a variable to save space.
"\n"
@@ -532,6 +715,7 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
return os.str();
}
+#endif
///
/// Create the gaussian density filtering kernel string, but use no local cache and perform
@@ -543,7 +727,7 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
/// on the CPU because the frequent global memory access brings performance to a crawl.
/// The supersample parameter will produce three different kernels.
/// SS = 1, SS > 1 && SS even, SS > 1 && SS odd.
-/// The width of the kernl this runs in must be evenly divisible by 16 or else artifacts will occur.
+/// The width of the kernel this runs in must be evenly divisible by 16 or else artifacts will occur.
/// Note that because this function uses so many variables and is so complex, OpenCL can easily run
/// out of resources in some cases. Certain variables had to be reused to condense the kernel footprint
/// down enough to be able to run a block size of 32x32.
@@ -572,15 +756,12 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernelNoLocalCache(size_t ss)
" const __global uint* coefIndices,\n"
" const uint chunkSizeW,\n"
" const uint chunkSizeH,\n"
- " const uint rowParity,\n"
- " const uint colParity\n"
+ " const uint chunkW,\n"
+ " const uint chunkH\n"
"\t)\n"
"{\n"
- //Parity determines if this function should execute.
- " if ((GLOBAL_ID_X >= densityFilter->m_SuperRasW) ||\n"
- " (GLOBAL_ID_Y >= densityFilter->m_SuperRasH) ||\n"
- " ((BLOCK_ID_X % chunkSizeW) != colParity) ||\n"
- " ((BLOCK_ID_Y % chunkSizeH) != rowParity)) \n"
+ " if (((((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) ||\n"
+ " ((((BLOCK_ID_Y * chunkSizeH) + chunkH) * BLOCK_SIZE_Y) + THREAD_ID_Y >= densityFilter->m_SuperRasH))\n"
" return;\n"
"\n";
@@ -606,10 +787,10 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernelNoLocalCache(size_t ss)
"\n"
//Start and end values are the indices in the histogram read from and written to in the accumulator.
//Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds.
- " uint blockHistStartRow = min(botBound, topBound + (BLOCK_ID_Y * BLOCK_SIZE_Y));\n"//The first histogram row this block will process.
+ " uint blockHistStartRow = min(botBound, topBound + (((BLOCK_ID_Y * chunkSizeH) + chunkH) * BLOCK_SIZE_Y));\n"//The first histogram row this block will process.
" uint threadHistRow = blockHistStartRow + THREAD_ID_Y;\n"//The histogram row this individual thread will be reading from.
"\n"
- " uint blockHistStartCol = min(rightBound, leftBound + (BLOCK_ID_X * BLOCK_SIZE_X));\n"//The first histogram column this block will process.
+ " uint blockHistStartCol = min(rightBound, leftBound + (((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X));\n"//The first histogram column this block will process.
" uint threadHistCol = blockHistStartCol + THREAD_ID_X;\n"//The histogram column this individual thread will be reading from.
"\n"
" int i, j;\n"
diff --git a/Source/EmberCL/DEOpenCLKernelCreator.h b/Source/EmberCL/DEOpenCLKernelCreator.h
index 802ddd6..a56cef3 100644
--- a/Source/EmberCL/DEOpenCLKernelCreator.h
+++ b/Source/EmberCL/DEOpenCLKernelCreator.h
@@ -8,6 +8,8 @@
/// DEOpenCLKernelCreator class.
///
+//#define ROW_ONLY_DE 1
+
namespace EmberCLns
{
///
@@ -35,8 +37,6 @@ public:
DEOpenCLKernelCreator(bool nVidia);
//Accessors.
- string LogScaleSumDEKernel();
- string LogScaleSumDEEntryPoint();
string LogScaleAssignDEKernel();
string LogScaleAssignDEEntryPoint();
string GaussianDEKernel(size_t ss, unsigned int filterWidth);
@@ -49,14 +49,10 @@ public:
private:
//Kernel creators.
- string CreateLogScaleSumDEKernelString();
string CreateLogScaleAssignDEKernelString();
string CreateGaussianDEKernel(size_t ss);
string CreateGaussianDEKernelNoLocalCache(size_t ss);
-
- string m_LogScaleSumDEKernel;
- string m_LogScaleSumDEEntryPoint;
-
+
string m_LogScaleAssignDEKernel;
string m_LogScaleAssignDEEntryPoint;
diff --git a/Source/EmberCL/EmberCLStructs.h b/Source/EmberCL/EmberCLStructs.h
index 26328d4..6aa4c34 100644
--- a/Source/EmberCL/EmberCLStructs.h
+++ b/Source/EmberCL/EmberCLStructs.h
@@ -181,9 +181,6 @@ static const char* XformCLStructString =
"} XformCL;\n"
"\n";
-#define MAX_CL_XFORM 21//These must always match.
-#define MAX_CL_XFORM_STRING "21"
-
///
/// A structure on the host used to hold all of the needed information for an ember used on the device to iterate in OpenCL.
/// Template argument expected to be float or double.
@@ -191,7 +188,6 @@ static const char* XformCLStructString =
template
struct ALIGN EmberCL
{
- XformCL m_Xforms[MAX_CL_XFORM];
T m_CamZPos;
T m_CamPerspective;
T m_CamYaw;
@@ -209,7 +205,6 @@ struct ALIGN EmberCL
static const char* EmberCLStructString =
"typedef struct __attribute__ " ALIGN_CL " _EmberCL\n"
"{\n"
-" XformCL m_Xforms[" MAX_CL_XFORM_STRING "];\n"
" real_t m_CamZPos;\n"
" real_t m_CamPerspective;\n"
" real_t m_CamYaw;\n"
diff --git a/Source/EmberCL/IterOpenCLKernelCreator.cpp b/Source/EmberCL/IterOpenCLKernelCreator.cpp
index 6c944e6..220938b 100644
--- a/Source/EmberCL/IterOpenCLKernelCreator.cpp
+++ b/Source/EmberCL/IterOpenCLKernelCreator.cpp
@@ -1,6 +1,9 @@
#include "EmberCLPch.h"
#include "IterOpenCLKernelCreator.h"
+//#define STRAIGHT_RAND 1
+#define USE_CASE 1
+
namespace EmberCLns
{
///
@@ -233,8 +236,9 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin
"__kernel void " << m_IterEntryPoint << "(\n" <<
" uint iterCount,\n"
" uint fuseCount,\n"
- " uint seed,\n"
+ " __global uint2* seeds,\n"
" __constant EmberCL* ember,\n"
+ " __constant XformCL* xforms,\n"
" __constant real_t* parVars,\n"
" __global uchar* xformDistributions,\n"//Using uchar is quicker than uint. Can't be constant because the size can be too large to fit when using xaos.//FINALOPT
" __constant CarToRasCL* carToRas,\n"
@@ -246,13 +250,14 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin
"{\n"
" bool fuse, ok;\n"
" uint threadIndex = INDEX_IN_BLOCK_2D;\n"
+ " uint pointsIndex = INDEX_IN_GRID_2D;\n"
" uint i, itersToDo;\n"
" uint consec = 0;\n"
//" int badvals = 0;\n"
" uint histIndex;\n"
" real_t p00, p01;\n"
" Point firstPoint, secondPoint, tempPoint;\n"
- " uint2 mwc;\n"
+ " uint2 mwc = seeds[pointsIndex];\n"
" float4 palColor1;\n"
" int2 iPaletteCoord;\n"
" const sampler_t paletteSampler = CLK_NORMALIZED_COORDS_FALSE |\n"//Coords from 0 to 255.
@@ -265,12 +270,11 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin
os <<
"\n"
+#ifndef STRAIGHT_RAND
" __local Point swap[NTHREADS];\n"
" __local uint xfsel[NWARPS];\n"
+#endif
"\n"
- " uint pointsIndex = INDEX_IN_GRID_2D;\n"
- " mwc.x = (pointsIndex + 1 * seed);\n"
- " mwc.y = ((BLOCK_ID_X + 1) * (pointsIndex + 1) * seed);\n"
" iPaletteCoord.y = 0;\n"
"\n"
" if (fuseCount > 0)\n"
@@ -295,9 +299,11 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin
//This along with the randomness that the point shuffle provides gives sufficient randomness
//to produce results identical to those produced on the CPU.
os <<
+#ifndef STRAIGHT_RAND
" if (THREAD_ID_Y == 0 && THREAD_ID_X < NWARPS)\n"
" xfsel[THREAD_ID_X] = MwcNext(&mwc) % " << CHOOSE_XFORM_GRAIN << ";\n"//It's faster to do the % here ahead of time than every time an xform is looked up to use inside the loop.
"\n"
+#endif
" barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
" for (i = 0; i < itersToDo; i++)\n"
@@ -309,22 +315,51 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin
" do\n"
" {\n";
- //If xaos is present, the cuburn method is effectively ceased. Every thread will be picking a random xform.
+ //If xaos is present, a hybrid of the cuburn method is used.
+ //This makes each thread in a row pick the same offset into a distribution, using xfsel.
+ //However, the distribution the offset is in, is determined by firstPoint.m_LastXfUsed.
if (ember.XaosPresent())
{
os <<
+#ifdef STRAIGHT_RAND
" secondPoint.m_LastXfUsed = xformDistributions[MwcNext(&mwc) % " << CHOOSE_XFORM_GRAIN << " + (" << CHOOSE_XFORM_GRAIN << " * (firstPoint.m_LastXfUsed + 1u))];\n\n";
- //" secondPoint.m_LastXfUsed = xformDistributions[xfsel[THREAD_ID_Y] + (" << CHOOSE_XFORM_GRAIN << " * (firstPoint.m_LastXfUsed + 1u))];\n\n";//Partial cuburn hybrid.
+#else
+ " secondPoint.m_LastXfUsed = xformDistributions[xfsel[THREAD_ID_Y] + (" << CHOOSE_XFORM_GRAIN << " * (firstPoint.m_LastXfUsed + 1u))];\n\n";//Partial cuburn hybrid.
+#endif
}
else
{
os <<
- //" secondPoint.m_LastXfUsed = xformDistributions[MwcNext(&mwc) % " << CHOOSE_XFORM_GRAIN << "];\n\n";//For testing, using straight rand flam4/fractron style instead of cuburn.
+#ifdef STRAIGHT_RAND
+ " secondPoint.m_LastXfUsed = xformDistributions[MwcNext(&mwc) % " << CHOOSE_XFORM_GRAIN << "];\n\n";//For testing, using straight rand flam4/fractron style instead of cuburn.
+#else
" secondPoint.m_LastXfUsed = xformDistributions[xfsel[THREAD_ID_Y]];\n\n";
+#endif
}
for (i = 0; i < ember.XformCount(); i++)
{
+#ifdef USE_CASE
+ if (i == 0)
+ {
+ os <<
+ " switch (secondPoint.m_LastXfUsed)\n"
+ " {\n";
+ }
+
+ os <<
+ " case " << i << ":\n"
+ " {\n" <<
+ " Xform" << i << "(&(xforms[" << i << "]), parVars, &firstPoint, &secondPoint, &mwc);\n" <<
+ " break;\n"
+ " }\n";
+
+ if (i == ember.XformCount() - 1)
+ {
+ os <<
+ " }\n";
+ }
+#else
if (i == 0)
os <<
" if (secondPoint.m_LastXfUsed == " << i << ")\n";
@@ -334,9 +369,11 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin
os <<
" {\n" <<
- " Xform" << i << "(&(ember->m_Xforms[" << i << "]), parVars, &firstPoint, &secondPoint, &mwc);\n" <<
+ " Xform" << i << "(&(xforms[" << i << "]), parVars, &firstPoint, &secondPoint, &mwc);\n" <<
" }\n";
+#endif
}
+
os <<
"\n"
" ok = !BadVal(secondPoint.m_X) && !BadVal(secondPoint.m_Y);\n"
@@ -360,6 +397,7 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin
" secondPoint.m_Y = MwcNextNeg1Pos1(&mwc);\n"
" secondPoint.m_Z = 0.0;\n"
" }\n"
+#ifndef STRAIGHT_RAND
"\n"//Rotate points between threads. This is how randomization is achieved.
" uint swr = threadXY + ((i & 1u) * threadXDivRows);\n"
" uint sw = (swr * THREADS_PER_WARP + THREAD_ID_X) & threadsMinus1;\n"
@@ -368,16 +406,16 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin
//Write to another thread's location.
" swap[sw] = secondPoint;\n"
"\n"
-
//Populate randomized xform index buffer with new random values.
" if (THREAD_ID_Y == 0 && THREAD_ID_X < NWARPS)\n"
" xfsel[THREAD_ID_X] = MwcNext(&mwc) % " << CHOOSE_XFORM_GRAIN << ";\n"
"\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
- "\n"
-
//Another thread will have written to this thread's location, so read the new value and use it for accumulation below.
" firstPoint = swap[threadIndex];\n"
+#else
+ " firstPoint = secondPoint;\n"//For testing, using straight rand flam4/fractron style instead of cuburn.
+#endif
"\n"
" if (fuse)\n"
" {\n"
@@ -399,14 +437,14 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin
//CPU takes an extra step here to preserve the opacity of the randomly selected xform, rather than the final xform's opacity.
//The same thing takes place here automatically because secondPoint.m_LastXfUsed is used below to retrieve the opacity when accumulating.
- os <<
- " if ((ember->m_Xforms[" << finalIndex << "].m_Opacity == 1) || (MwcNext01(&mwc) < ember->m_Xforms[" << finalIndex << "].m_Opacity))\n"
- " {\n"
- " tempPoint.m_LastXfUsed = secondPoint.m_LastXfUsed;\n"
- " Xform" << finalIndex << "(&(ember->m_Xforms[" << finalIndex << "]), parVars, &secondPoint, &tempPoint, &mwc);\n"
- " secondPoint = tempPoint;\n"
- " }\n"
- "\n";
+ os <<
+ " if ((xforms[" << finalIndex << "].m_Opacity == 1) || (MwcNext01(&mwc) < xforms[" << finalIndex << "].m_Opacity))\n"
+ " {\n"
+ " tempPoint.m_LastXfUsed = secondPoint.m_LastXfUsed;\n"
+ " Xform" << finalIndex << "(&(xforms[" << finalIndex << "]), parVars, &secondPoint, &tempPoint, &mwc);\n"
+ " secondPoint = tempPoint;\n"
+ " }\n"
+ "\n";
}
os << CreateProjectionString(ember);
@@ -471,18 +509,18 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin
if (typeid(T) == typeid(double))
{
os <<
- " AtomicAdd(&(histogram[histIndex].m_Reals[0]), (real_t)palColor1.x * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"//Always apply opacity, even though it's usually 1.
- " AtomicAdd(&(histogram[histIndex].m_Reals[1]), (real_t)palColor1.y * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
- " AtomicAdd(&(histogram[histIndex].m_Reals[2]), (real_t)palColor1.z * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
- " AtomicAdd(&(histogram[histIndex].m_Reals[3]), (real_t)palColor1.w * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
+ " AtomicAdd(&(histogram[histIndex].m_Reals[0]), (real_t)palColor1.x * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"//Always apply opacity, even though it's usually 1.
+ " AtomicAdd(&(histogram[histIndex].m_Reals[1]), (real_t)palColor1.y * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
+ " AtomicAdd(&(histogram[histIndex].m_Reals[2]), (real_t)palColor1.z * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
+ " AtomicAdd(&(histogram[histIndex].m_Reals[3]), (real_t)palColor1.w * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
}
else
{
- os <<
- " AtomicAdd(&(histogram[histIndex].m_Reals[0]), palColor1.x * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"//Always apply opacity, even though it's usually 1.
- " AtomicAdd(&(histogram[histIndex].m_Reals[1]), palColor1.y * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
- " AtomicAdd(&(histogram[histIndex].m_Reals[2]), palColor1.z * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
- " AtomicAdd(&(histogram[histIndex].m_Reals[3]), palColor1.w * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
+ os <<
+ " AtomicAdd(&(histogram[histIndex].m_Reals[0]), palColor1.x * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"//Always apply opacity, even though it's usually 1.
+ " AtomicAdd(&(histogram[histIndex].m_Reals[1]), palColor1.y * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
+ " AtomicAdd(&(histogram[histIndex].m_Reals[2]), palColor1.z * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
+ " AtomicAdd(&(histogram[histIndex].m_Reals[3]), palColor1.w * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
}
}
else
@@ -496,12 +534,12 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin
" realColor.y = (real_t)palColor1.y;\n"
" realColor.z = (real_t)palColor1.z;\n"
" realColor.w = (real_t)palColor1.w;\n"
- " histogram[histIndex].m_Real4 += (realColor * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
+ " histogram[histIndex].m_Real4 += (realColor * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
}
else
{
- os <<
- " histogram[histIndex].m_Real4 += (palColor1 * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
+ os <<
+ " histogram[histIndex].m_Real4 += (palColor1 * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
}
}
@@ -525,6 +563,7 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin
" points[pointsIndex].m_ColorX = MwcNextNeg1Pos1(&mwc);\n"
#else
" points[pointsIndex] = firstPoint;\n"
+ " seeds[pointsIndex] = mwc;\n"
#endif
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
"}\n";
diff --git a/Source/EmberCL/OpenCLWrapper.cpp b/Source/EmberCL/OpenCLWrapper.cpp
index 0a043c5..57621a8 100644
--- a/Source/EmberCL/OpenCLWrapper.cpp
+++ b/Source/EmberCL/OpenCLWrapper.cpp
@@ -1121,12 +1121,12 @@ string OpenCLWrapper::DumpInfo()
///
/// OpenCL properties, getters only.
///
-bool OpenCLWrapper::Ok() { return m_Init; }
-bool OpenCLWrapper::Shared() { return m_Shared; }
-cl::Context OpenCLWrapper::Context() { return m_Context; }
-unsigned int OpenCLWrapper::PlatformIndex() { return m_PlatformIndex; }
-unsigned int OpenCLWrapper::DeviceIndex() { return m_DeviceIndex; }
-unsigned int OpenCLWrapper::LocalMemSize() { return m_LocalMemSize; }
+bool OpenCLWrapper::Ok() const { return m_Init; }
+bool OpenCLWrapper::Shared() const { return m_Shared; }
+cl::Context OpenCLWrapper::Context() const { return m_Context; }
+unsigned int OpenCLWrapper::PlatformIndex() const { return m_PlatformIndex; }
+unsigned int OpenCLWrapper::DeviceIndex() const { return m_DeviceIndex; }
+unsigned int OpenCLWrapper::LocalMemSize() const { return m_LocalMemSize; }
///
/// Makes the even grid dims.
diff --git a/Source/EmberCL/OpenCLWrapper.h b/Source/EmberCL/OpenCLWrapper.h
index e060d01..0369c18 100644
--- a/Source/EmberCL/OpenCLWrapper.h
+++ b/Source/EmberCL/OpenCLWrapper.h
@@ -184,12 +184,12 @@ public:
string DumpInfo();
//Accessors.
- bool Ok();
- bool Shared();
- cl::Context Context();
- unsigned int PlatformIndex();
- unsigned int DeviceIndex();
- unsigned int LocalMemSize();
+ bool Ok() const;
+ bool Shared() const;
+ cl::Context Context() const;
+ unsigned int PlatformIndex() const;
+ unsigned int DeviceIndex() const;
+ unsigned int LocalMemSize() const;
static void MakeEvenGridDims(unsigned int blockW, unsigned int blockH, unsigned int& gridW, unsigned int& gridH);
diff --git a/Source/EmberCL/RendererCL.cpp b/Source/EmberCL/RendererCL.cpp
index eebb711..bd43bb6 100644
--- a/Source/EmberCL/RendererCL.cpp
+++ b/Source/EmberCL/RendererCL.cpp
@@ -22,7 +22,9 @@ RendererCL::RendererCL(unsigned int platform, unsigned int device, bool share
//Buffer names.
m_EmberBufferName = "Ember";
+ m_XformsBufferName = "Xforms";
m_ParVarsBufferName = "ParVars";
+ m_SeedsBufferName = "Seeds";
m_DistBufferName = "Dist";
m_CarToRasBufferName = "CarToRas";
m_DEFilterParamsBufferName = "DEFilterParams";
@@ -50,6 +52,13 @@ RendererCL::RendererCL(unsigned int platform, unsigned int device, bool share
m_PaletteFormat.image_channel_data_type = CL_FLOAT;
m_FinalFormat.image_channel_order = CL_RGBA;
m_FinalFormat.image_channel_data_type = CL_UNORM_INT8;//Change if this ever supports 2BPC outputs for PNG.
+ m_Seeds.resize(IterGridKernelCount());
+
+ for (size_t i = 0; i < m_Seeds.size(); i++)
+ {
+ m_Seeds[i].x = m_Rand[0].Rand();
+ m_Seeds[i].y = m_Rand[0].Rand();
+ }
Init(platform, device, shared, outputTexID);//Init OpenCL upon construction and create programs that will not change.
}
@@ -100,14 +109,12 @@ bool RendererCL::Init(unsigned int platform, unsigned int device, bool shared
m_DEOpenCLKernelCreator = DEOpenCLKernelCreator(m_NVidia);
string zeroizeProgram = m_IterOpenCLKernelCreator.ZeroizeKernel();
- string logAssignProgram = m_DEOpenCLKernelCreator.LogScaleAssignDEKernel();
- string logSumProgram = m_DEOpenCLKernelCreator.LogScaleSumDEKernel();//Build a couple of simple programs to ensure OpenCL is working right.
+ string logAssignProgram = m_DEOpenCLKernelCreator.LogScaleAssignDEKernel();//Build a couple of simple programs to ensure OpenCL is working right.
if (b && !(b = m_Wrapper.AddProgram(m_IterOpenCLKernelCreator.ZeroizeEntryPoint(), zeroizeProgram, m_IterOpenCLKernelCreator.ZeroizeEntryPoint(), m_DoublePrecision))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddProgram(m_DEOpenCLKernelCreator.LogScaleAssignDEEntryPoint(), logAssignProgram, m_DEOpenCLKernelCreator.LogScaleAssignDEEntryPoint(), m_DoublePrecision))) { m_ErrorReport.push_back(loc); }
- if (b && !(b = m_Wrapper.AddProgram(m_DEOpenCLKernelCreator.LogScaleSumDEEntryPoint(), logSumProgram, m_DEOpenCLKernelCreator.LogScaleSumDEEntryPoint(), m_DoublePrecision))) { m_ErrorReport.push_back(loc); }
-
if (b && !(b = m_Wrapper.AddAndWriteImage("Palette", CL_MEM_READ_ONLY, m_PaletteFormat, 256, 1, 0, NULL))) { m_ErrorReport.push_back(loc); }
+ if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_SeedsBufferName, (void*)m_Seeds.data(), SizeOf(m_Seeds)))) { m_ErrorReport.push_back(loc); }
//This is the maximum box dimension for density filtering which consists of (blockSize * blockSize) + (2 * filterWidth).
//These blocks must be square, and ideally, 32x32.
@@ -123,6 +130,11 @@ bool RendererCL::Init(unsigned int platform, unsigned int device, bool shared
return b;
}
+///
+/// Set the shared output texture where final accumulation will be written to.
+///
+/// The texture ID of the shared OpenGL texture if shared
+/// True if success, else false.
template
bool RendererCL::SetOutputTexture(GLuint outputTexID)
{
@@ -149,16 +161,28 @@ bool RendererCL::SetOutputTexture(GLuint outputTexID)
/// OpenCL property accessors, getters only.
///
-template unsigned int RendererCL::IterCountPerKernel() { return m_IterCountPerKernel; }
-template unsigned int RendererCL::IterBlocksWide() { return m_IterBlocksWide; }
-template unsigned int RendererCL::IterBlocksHigh() { return m_IterBlocksHigh; }
-template unsigned int RendererCL::IterBlockWidth() { return m_IterBlockWidth; }
-template unsigned int RendererCL::IterBlockHeight() { return m_IterBlockHeight; }
-template unsigned int RendererCL::IterGridWidth() { return IterBlocksWide() * IterBlockWidth(); }
-template unsigned int RendererCL::IterGridHeight() { return IterBlocksHigh() * IterBlockHeight(); }
-template unsigned int RendererCL::TotalIterKernelCount() { return IterGridWidth() * IterGridHeight(); }
-template unsigned int RendererCL::PlatformIndex() { return m_Wrapper.PlatformIndex(); }
-template unsigned int RendererCL::DeviceIndex() { return m_Wrapper.DeviceIndex(); }
+//Iters per kernel/block/grid.
+template unsigned int RendererCL::IterCountPerKernel() const { return m_IterCountPerKernel; }
+template unsigned int RendererCL::IterCountPerBlock() const { return IterCountPerKernel() * IterBlockKernelCount(); }
+template unsigned int RendererCL::IterCountPerGrid() const { return IterCountPerKernel() * IterGridKernelCount(); }
+
+//Kernels per block.
+template unsigned int RendererCL::IterBlockKernelWidth() const { return m_IterBlockWidth; }
+template unsigned int RendererCL::IterBlockKernelHeight() const { return m_IterBlockHeight; }
+template unsigned int RendererCL::IterBlockKernelCount() const { return IterBlockKernelWidth() * IterBlockKernelHeight(); }
+
+//Kernels per grid.
+template unsigned int RendererCL::IterGridKernelWidth() const { return IterGridBlockWidth() * IterBlockKernelWidth(); }
+template unsigned int RendererCL::IterGridKernelHeight() const { return IterGridBlockHeight() * IterBlockKernelHeight(); }
+template unsigned int RendererCL::IterGridKernelCount() const { return IterGridKernelWidth() * IterGridKernelHeight(); }
+
+//Blocks per grid.
+template unsigned int RendererCL::IterGridBlockWidth() const { return m_IterBlocksWide; }
+template unsigned int RendererCL::IterGridBlockHeight() const { return m_IterBlocksHigh; }
+template unsigned int RendererCL::IterGridBlockCount() const { return IterGridBlockWidth() * IterGridBlockHeight(); }
+
+template unsigned int RendererCL::PlatformIndex() { return m_Wrapper.PlatformIndex(); }
+template unsigned int RendererCL::DeviceIndex() { return m_Wrapper.DeviceIndex(); }
///
/// Read the histogram into the host side CPU buffer.
@@ -197,10 +221,10 @@ bool RendererCL::ReadAccum()
template
bool RendererCL::ReadPoints(vector>& vec)
{
- vec.resize(TotalIterKernelCount());//Allocate the memory to read into.
+ vec.resize(IterGridKernelCount());//Allocate the memory to read into.
- if (vec.size() >= TotalIterKernelCount())
- return m_Wrapper.ReadBuffer(m_PointsBufferName, (void*)vec.data(), TotalIterKernelCount() * sizeof(PointCL));
+ if (vec.size() >= IterGridKernelCount())
+ return m_Wrapper.ReadBuffer(m_PointsBufferName, (void*)vec.data(), IterGridKernelCount() * sizeof(PointCL));
return false;
}
@@ -237,6 +261,26 @@ bool RendererCL::WritePoints(vector>& vec)
return m_Wrapper.WriteBuffer(m_PointsBufferName, (void*)vec.data(), vec.size() * sizeof(vec[0]));
}
+#ifdef TEST_CL
+template
+bool RendererCL::WriteRandomPoints()
+{
+ size_t size = IterGridKernelCount();
+ vector> vec(size);
+
+ for (int i = 0; i < size; i++)
+ {
+ vec[i].m_X = m_Rand[0].Frand11();
+ vec[i].m_Y = m_Rand[0].Frand11();
+ vec[i].m_Z = 0;
+ vec[i].m_ColorX = m_Rand[0].Frand01