0.4.1.5 Beta 11/28/2014

--User Changes Remove limit on the number of xforms allowable on the GPU. This was previously 21. Show actual strips count to be used in parens outside of user specified strips count on final render dialog. Allow for adjustment of iteration depth and fuse count per ember and save/read these values with the xml. Iteration optimizations on both CPU and GPU. Automatically adjust default quality spinner value when using CPU/GPU to 10/30, respectively. --Bug Fixes Fix severe randomization bug with OpenCL. Fix undo list off by one error when doing a new edit anywhere but the end of the undo list. Make integer variation parameters use 4 decimal places in the variations list like all the others. New build of the latest Qt to fix scroll bar drawing bug. Prevent grid from showing as much when pressing control to increase a spinner's increment speed. Still shows sometimes, but better than before. --Code Changes Pass count and fuse to iterator as a structure now to allow for passing more params in the future. Slightly different grid/block logic when running DE filtering on the GPU. Attempt a different way of doing DE, but #define out because it ended up not being faster. Restructure some things to allow for a variable length xforms buffer to be passed to the GPU. Add sub batch size and fuse count as ember members, and remove them from the renderer classes. Remove m_LastPass from Renderer. It should have been removed with passes. Pass seeds as a buffer to the OpenCL iteration kernel, rather than a single seed that gets modified. Slight optimization on CPU accum. Use case statement instead of if/else for xform choosing in OpenCL for a 2% speedup on params with large numbers of xforms. Add SizeOf() wrapper around sizeof(vec[0]) * vec.size(). Remove LogScaleSum() functions from the CPU and GPU because they're no longer used since passes were removed. Make some OpenCLWrapper getters const. Better organize RendererCL methods that return grid dimensions.
2025-10-07 13:40:53 -04:00 · 2014-11-28 01:37:51 -08:00
parent 3f29025f99
commit b29bedec38
39 changed files with 905 additions and 392 deletions
--- a/Source/EmberCL/DEOpenCLKernelCreator.cpp
+++ b/Source/EmberCL/DEOpenCLKernelCreator.cpp
@ -25,7 +25,6 @@ template <>
 DEOpenCLKernelCreator<float>::DEOpenCLKernelCreator(bool nVidia)
 {
 	m_NVidia = nVidia;
-	m_LogScaleSumDEEntryPoint                 = "LogScaleSumDensityFilterKernel";
 	m_LogScaleAssignDEEntryPoint              = "LogScaleAssignDensityFilterKernel";
 	m_GaussianDEWithoutSsEntryPoint           = "GaussianDEWithoutSsKernel";
 	m_GaussianDESsWithScfEntryPoint           = "GaussianDESsWithScfKernel";
@ -33,7 +32,6 @@ DEOpenCLKernelCreator<float>::DEOpenCLKernelCreator(bool nVidia)
 	m_GaussianDEWithoutSsNoCacheEntryPoint    = "GaussianDEWithoutSsNoCacheKernel";
 	m_GaussianDESsWithScfNoCacheEntryPoint    = "GaussianDESsWithScfNoCacheKernel";
 	m_GaussianDESsWithoutScfNoCacheEntryPoint = "GaussianDESsWithoutScfNoCacheKernel";
-	m_LogScaleSumDEKernel                     = CreateLogScaleSumDEKernelString();
 	m_LogScaleAssignDEKernel                  = CreateLogScaleAssignDEKernelString();
 	m_GaussianDEWithoutSsKernel               = CreateGaussianDEKernel(1);
 	m_GaussianDESsWithScfKernel               = CreateGaussianDEKernel(2);
@ -56,25 +54,39 @@ DEOpenCLKernelCreator<float>::DEOpenCLKernelCreator(bool nVidia)
 template <>
 DEOpenCLKernelCreator<double>::DEOpenCLKernelCreator(bool nVidia)
 {
+#ifdef ROW_ONLY_DE
+	m_NVidia = nVidia;
+	m_LogScaleAssignDEEntryPoint = "LogScaleAssignDensityFilterKernel";
+	m_GaussianDEWithoutSsEntryPoint = "GaussianDEWithoutSsKernel";
+	m_GaussianDESsWithScfEntryPoint = "GaussianDESsWithScfKernel";
+	m_GaussianDESsWithoutScfEntryPoint = "GaussianDESsWithoutScfKernel";
+	m_GaussianDEWithoutSsNoCacheEntryPoint = "GaussianDEWithoutSsNoCacheKernel";
+	m_GaussianDESsWithScfNoCacheEntryPoint = "GaussianDESsWithScfNoCacheKernel";
+	m_GaussianDESsWithoutScfNoCacheEntryPoint = "GaussianDESsWithoutScfNoCacheKernel";
+	m_LogScaleAssignDEKernel = CreateLogScaleAssignDEKernelString();
+	m_GaussianDEWithoutSsKernel = CreateGaussianDEKernel(1);
+	m_GaussianDESsWithScfKernel = CreateGaussianDEKernel(2);
+	m_GaussianDESsWithoutScfKernel = CreateGaussianDEKernel(3);
+	m_GaussianDEWithoutSsNoCacheKernel = CreateGaussianDEKernelNoLocalCache(1);
+	m_GaussianDESsWithScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache(2);
+	m_GaussianDESsWithoutScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache(3);
+#else
 	m_NVidia = nVidia;
-	m_LogScaleSumDEEntryPoint                 = "LogScaleSumDensityFilterKernel";
 	m_LogScaleAssignDEEntryPoint              = "LogScaleAssignDensityFilterKernel";
 	m_GaussianDEWithoutSsNoCacheEntryPoint    = "GaussianDEWithoutSsNoCacheKernel";
 	m_GaussianDESsWithScfNoCacheEntryPoint    = "GaussianDESsWithScfNoCacheKernel";
 	m_GaussianDESsWithoutScfNoCacheEntryPoint = "GaussianDESsWithoutScfNoCacheKernel";
-	m_LogScaleSumDEKernel                     = CreateLogScaleSumDEKernelString();
 	m_LogScaleAssignDEKernel                  = CreateLogScaleAssignDEKernelString();
 	m_GaussianDEWithoutSsNoCacheKernel        = CreateGaussianDEKernelNoLocalCache(1);
 	m_GaussianDESsWithScfNoCacheKernel        = CreateGaussianDEKernelNoLocalCache(2);
 	m_GaussianDESsWithoutScfNoCacheKernel     = CreateGaussianDEKernelNoLocalCache(3);
+#endif
 }

 /// <summary>
 /// Kernel source and entry point properties, getters only.
 /// </summary>

-template <typename T> string DEOpenCLKernelCreator<T>::LogScaleSumDEKernel() { return m_LogScaleSumDEKernel; }
-template <typename T> string DEOpenCLKernelCreator<T>::LogScaleSumDEEntryPoint() { return m_LogScaleSumDEEntryPoint; }
 template <typename T> string DEOpenCLKernelCreator<T>::LogScaleAssignDEKernel() { return m_LogScaleAssignDEKernel; }
 template <typename T> string DEOpenCLKernelCreator<T>::LogScaleAssignDEEntryPoint() { return m_LogScaleAssignDEEntryPoint; }

@ -87,6 +99,7 @@ template <typename T> string DEOpenCLKernelCreator<T>::LogScaleAssignDEEntryPoin
 template <typename T>
 string DEOpenCLKernelCreator<T>::GaussianDEKernel(size_t ss, unsigned int filterWidth)
 {
+#ifndef ROW_ONLY_DE
 	if ((typeid(T) == typeid(double)) || (filterWidth > MaxDEFilterSize()))//Type double does not use cache.
 	{
 		if (ss > 1)
@ -100,6 +113,7 @@ string DEOpenCLKernelCreator<T>::GaussianDEKernel(size_t ss, unsigned int filter
 			return m_GaussianDEWithoutSsNoCacheKernel;
 	}
 	else
+#endif
 	{
 		if (ss > 1)
 		{
@ -122,6 +136,7 @@ string DEOpenCLKernelCreator<T>::GaussianDEKernel(size_t ss, unsigned int filter
 template <typename T>
 string DEOpenCLKernelCreator<T>::GaussianDEEntryPoint(size_t ss, unsigned int filterWidth)
 {
+#ifndef ROW_ONLY_DE
 	if ((typeid(T) == typeid(double)) || (filterWidth > MaxDEFilterSize()))//Type double does not use cache.
 	{
 		if (ss > 1)
@ -135,6 +150,7 @@ string DEOpenCLKernelCreator<T>::GaussianDEEntryPoint(size_t ss, unsigned int fi
 			return m_GaussianDEWithoutSsNoCacheEntryPoint;
 	}
 	else
+#endif
 	{
 		if (ss > 1)
 		{
@ -194,45 +210,6 @@ unsigned int DEOpenCLKernelCreator<T>::SolveMaxBoxSize(unsigned int localMem)
 	return (unsigned int)floor(sqrt(floor((T)localMem / 16.0)));//Divide by 16 because each element is float4.
 }

-/// <summary>
-/// Create the log scale kernel string, using summation.
-/// This means each cell will be added to, rather than just assigned.
-/// Since adding is slower than assigning, this should only be used when Passes > 1,
-/// otherwise use the kernel created from CreateLogScaleAssignDEKernelString().
-/// </summary>
-/// <returns>The kernel string</returns>
-template <typename T>
-string DEOpenCLKernelCreator<T>::CreateLogScaleSumDEKernelString()
-{
-	ostringstream os;
- 
-	os	<<
-		ConstantDefinesString(typeid(T) == typeid(double)) <<
-		DensityFilterCLStructString <<
-		"__kernel void " << m_LogScaleSumDEEntryPoint << "(\n"
-		"	const __global real4* histogram,\n"
-		"	__global real4* accumulator,\n"
-		"	__constant DensityFilterCL* logFilter\n"
-		"\t)\n"
-		"{\n"
-		"	if ((GLOBAL_ID_X < logFilter->m_SuperRasW) && (GLOBAL_ID_Y < logFilter->m_SuperRasH))\n"
-		"	{\n"
-		"		uint index = (GLOBAL_ID_Y * logFilter->m_SuperRasW) + GLOBAL_ID_X;\n"
-		"\n"
-		"		if (histogram[index].w != 0)\n"
-		"		{\n"
-		"			real_t logScale = (logFilter->m_K1 * log(1.0 + histogram[index].w * logFilter->m_K2)) / histogram[index].w;\n"
-		"\n"
-		"			accumulator[index] += histogram[index] * logScale;\n"//Using a single real4 vector operation doubles the speed from doing each component individually.
-		"		}\n"
-		"\n"
-		"		barrier(CLK_GLOBAL_MEM_FENCE);\n"//Just to be safe. Makes no speed difference to do all of the time or only when there's a hit.
-		"	}\n"
-		"}\n";
- 
-	return os.str();
-}
-
 /// <summary>
 /// Create the log scale kernel string, using assignment.
 /// Use this when Passes == 1.
@ -270,6 +247,215 @@ string DEOpenCLKernelCreator<T>::CreateLogScaleAssignDEKernelString()
 	return os.str();
 }

+#ifdef ROW_ONLY_DE
+template <typename T>
+string DEOpenCLKernelCreator<T>::CreateGaussianDEKernel(size_t ss)
+{
+	bool doSS = ss > 1;
+	bool doScf = !(ss & 1);
+	ostringstream os;
+
+	os <<
+		ConstantDefinesString(typeid(T) == typeid(double)) <<
+		DensityFilterCLStructString <<
+		UnionCLStructString <<
+		"__kernel void " << GaussianDEEntryPoint(ss, MaxDEFilterSize()) << "(\n" <<
+		"	const __global real4* histogram,\n"
+		"	__global real4reals* accumulator,\n"
+		"	__constant DensityFilterCL* densityFilter,\n"
+		"	const __global real_t* filterCoefs,\n"
+		"	const __global real_t* filterWidths,\n"
+		"	const __global uint* coefIndices,\n"
+		"	const uint chunkSizeW,\n"
+		"	const uint chunkSizeH,\n"
+		"	const uint chunkW,\n"
+		"	const uint chunkH\n"
+		"\t)\n"
+		"{\n"
+		"	uint rowsToProcess = 32;\n"//Rows to process.
+		"\n"
+		"	if (((((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) ||\n"
+		"	    ((((BLOCK_ID_Y * chunkSizeH) + chunkH) * rowsToProcess) + THREAD_ID_Y >= densityFilter->m_SuperRasH))\n"
+		"			return;\n"
+		"\n";
+
+	if (doSS)
+	{
+		os <<
+			"	uint ss = (uint)floor((real_t)densityFilter->m_Supersample / 2.0);\n"
+			"	int densityBoxLeftX;\n"
+			"	int densityBoxRightX;\n"
+			"	int densityBoxTopY;\n"
+			"	int densityBoxBottomY;\n"
+			"\n";
+
+		if (doScf)
+			os <<
+			"	real_t scfact = pow(densityFilter->m_Supersample / (densityFilter->m_Supersample + 1.0), 2.0);\n";
+	}
+
+	os <<
+		"	uint fullTempBoxWidth;\n"
+		"	uint leftBound, rightBound, topBound, botBound;\n"
+		"	uint blockHistStartRow, blockHistEndRow, histCol;\n"
+		"	uint blockHistStartCol, boxReadStartCol, boxReadEndCol;\n"
+		"	uint accumWriteStartCol, colsToWrite, colOffset, colsToWriteOffset;\n"
+		"	int histRow, filterRow, accumWriteOffset;\n"
+		"\n"
+		"	fullTempBoxWidth = BLOCK_SIZE_X + (densityFilter->m_FilterWidth * 2);\n"
+		//Compute the bounds of the area to be sampled, which is just the ends minus the super sample minus 1.
+		"	leftBound = densityFilter->m_Supersample - 1;\n"
+		"	rightBound = densityFilter->m_SuperRasW - (densityFilter->m_Supersample - 1);\n"
+		"	topBound = densityFilter->m_Supersample - 1;\n"
+		"	botBound = densityFilter->m_SuperRasH - (densityFilter->m_Supersample - 1);\n"
+		"\n"
+		//Start and end values are the indices in the histogram read from
+		//and written to in the accumulator. They are not the indices for the local block of data.
+		//Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds.
+		"	blockHistStartRow = min(botBound, topBound + (((BLOCK_ID_Y * chunkSizeH) + chunkH) * rowsToProcess));\n"//The first histogram row this block will process.
+		"	blockHistEndRow = min(botBound, blockHistStartRow + rowsToProcess);\n"//The last histogram row this block will process, clamped to the last row.
+		"	blockHistStartCol = min(rightBound, leftBound + (((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X));\n"//The first histogram column this block will process.
+		"	boxReadStartCol = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first box col this block will read from when copying to the accumulator.
+		"	boxReadEndCol = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + BLOCK_SIZE_X, densityFilter->m_SuperRasW - blockHistStartCol);\n"//The last box col this block will read from when copying to the accumulator.
+		"\n"
+		//Last, the indices in the global accumulator that the local bounds will be writing to.
+		"	accumWriteStartCol = blockHistStartCol - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first column in the accumulator this block will write to.
+		"	colsToWrite = ceil((real_t)(boxReadEndCol - boxReadStartCol) / (real_t)BLOCK_SIZE_X);\n"//Elements per thread to be written to the accumulator.
+		"	histCol = blockHistStartCol + THREAD_ID_X;\n"//The histogram column this individual thread will be reading from.
+		"\n"
+		"	if (histCol >= rightBound)\n"
+		"		return;\n"
+		"\n"
+		//Compute the col position in this local box to serve as the center position
+		//from which filter application offsets are computed.
+		//These are the local indices for the local data that are temporarily accumulated to before
+		//writing out to the global accumulator.
+		"	uint boxCol = densityFilter->m_FilterWidth + THREAD_ID_X;\n"
+		"	uint colsToZeroOffset, colsToZero = ceil((real_t)fullTempBoxWidth / (real_t)(BLOCK_SIZE_X));\n"//Usually is 2.
+		"	int i, j, k, jmin, jmax;\n"
+		"	uint filterSelectInt, filterCoefIndex;\n"
+		"	real_t cacheLog;\n"
+		"	real_t filterSelect;\n"
+		"	real4 bucket;\n"
+		;
+
+	os << " __local real4reals filterBox[192];\n";//Must be >= fullTempBoxWidth.
+
+	os <<
+		"\n"
+		"	colsToZeroOffset = colsToZero * THREAD_ID_X;\n"
+		"	colsToWriteOffset = colsToWrite * THREAD_ID_X;\n"
+		"	k = (int)densityFilter->m_FilterWidth;\n"//Need a signed int to use below, really is filter width, but reusing a variable to save space.
+		"\n"
+		"	for (histRow = blockHistStartRow; histRow < blockHistEndRow; histRow++)\n"//Process pixels by row, for 32 rows.
+		"	{\n"
+		"		bucket = histogram[(histRow * densityFilter->m_SuperRasW) + histCol];\n"
+		"\n"
+		"		if (bucket.w != 0)\n"
+		"			cacheLog = (densityFilter->m_K1 * log(1.0 + bucket.w * densityFilter->m_K2)) / bucket.w;\n"
+		"\n";
+		
+	if (doSS)
+	{
+		os <<
+			"	filterSelect = 0;\n"
+			"	densityBoxLeftX = histCol - min(histCol, ss);\n"
+			"	densityBoxRightX = histCol + min(ss, (densityFilter->m_SuperRasW - histCol) - 1);\n"
+			"	densityBoxTopY = histRow - min((uint)histRow, ss);\n"
+			"	densityBoxBottomY = histRow + min(ss, (densityFilter->m_SuperRasH - histRow) - 1);\n"
+			"\n"
+			"	for (j = densityBoxTopY; j <= densityBoxBottomY; j++)\n"
+			"	{\n"
+			"		for (i = densityBoxLeftX; i <= densityBoxRightX; i++)\n"
+			"		{\n"
+			"			filterSelect += histogram[(j * densityFilter->m_SuperRasW) + i].w;\n"
+			"		}\n"
+			"	}\n"
+			"\n";
+
+	if (doScf)
+		os << "	filterSelect *= scfact;\n";
+	}
+	else
+	{
+	os
+		<< "	filterSelect = bucket.w;\n";
+	}
+
+	os <<
+		"\n"
+		"		if (filterSelect > densityFilter->m_MaxFilteredCounts)\n"
+		"			filterSelectInt = densityFilter->m_MaxFilterIndex;\n"
+		"		else if (filterSelect <= DE_THRESH)\n"
+		"			filterSelectInt = (int)ceil(filterSelect) - 1;\n"
+		"		else if (filterSelect != 0)\n"
+		"			filterSelectInt = (int)DE_THRESH + (int)floor(pow((real_t)(filterSelect - DE_THRESH), densityFilter->m_Curve));\n"
+		"		else\n"
+		"			filterSelectInt = 0;\n"
+		"\n"
+		"		if (filterSelectInt > densityFilter->m_MaxFilterIndex)\n"
+		"			filterSelectInt = densityFilter->m_MaxFilterIndex;\n"
+		"\n"
+		"		filterCoefIndex = filterSelectInt * densityFilter->m_KernelSize;\n"
+		"\n"
+		//With this new method, only accumulate to the temp local buffer first. Write to the final accumulator last.
+		//For each loop through, note that there is a local memory barrier call inside of each call to AddToAccumNoCheck().
+		//If this isn't done, pixel errors occur and even an out of resources error occurs because too many writes are done to the same place in memory at once.
+		"		jmin = min(k, histRow);\n"
+		"		jmax = (int)min((densityFilter->m_SuperRasH - 1) - histRow, densityFilter->m_FilterWidth);\n"
+		"\n"
+		"		for (j = -jmin; j <= jmax; j++)\n"
+		"		{\n"
+		"			for (i = 0; i < colsToZero && (colsToZeroOffset + i) < fullTempBoxWidth; i++)\n"//Each thread zeroizes a few columns.
+		"			{\n"
+		"				filterBox[colsToZeroOffset + i].m_Real4 = 0;\n"
+		"			}\n"
+		"\n"
+		"			barrier(CLK_LOCAL_MEM_FENCE);\n"
+		"\n"
+		"			if (bucket.w != 0)\n"
+		"			{\n"
+		"				filterRow = abs(j) * (densityFilter->m_FilterWidth + 1);\n"
+		"\n"
+		"				for (i = -k; i <= k; i++)\n"
+		"				{\n"
+		"					filterSelectInt = filterCoefIndex + coefIndices[filterRow + abs(i)];\n"//Really is filterCoeffIndexPlusOffset, but reusing a variable to save space.
+		"					filterBox[i + boxCol].m_Real4 += (bucket * (filterCoefs[filterSelectInt] * cacheLog));\n"
+		"				}\n"
+		"			}\n"
+		"\n"
+		"			barrier(CLK_LOCAL_MEM_FENCE);\n"
+		"\n"
+		//At this point, all threads in this block have applied the filter to their surrounding pixels and stored the results in the temp local box.
+		//Add the cells of it that are in bounds to the global accumulator.
+		//Compute offsets in local box to read from, and offsets into global accumulator to write to.
+		//Use a method here that is similar to the zeroization above: Each thread (column) in the first row iterates through all of the
+		//rows and adds a few columns to the accumulator.
+		//"			if (THREAD_ID_X == 0)\n"
+		//"			{\n"
+		//"				for (int kk = boxReadStartCol, i = 0; kk < boxReadEndCol; kk++, i++)\n"//Each thread writes a few columns.//Could do away with kk//TODO//OPT
+		//"				{\n"
+		//"					accumulator[((histRow + j) * densityFilter->m_SuperRasW) + (accumWriteStartCol + i)].m_Real4 += filterBox[kk].m_Real4;\n"
+		//"				}\n"
+		//"			}\n"
+		"			accumWriteOffset = ((histRow + j) * densityFilter->m_SuperRasW) + accumWriteStartCol;\n"
+		"\n"
+		"			for (i = 0; i < colsToWrite; i++)\n"//Each thread writes a few columns.
+		"			{\n"
+		"				colOffset = colsToWriteOffset + i;\n"
+		"\n"
+		"				if (boxReadStartCol + colOffset < boxReadEndCol)\n"
+		"					accumulator[accumWriteOffset + colOffset].m_Real4 += filterBox[boxReadStartCol + colOffset].m_Real4;\n"
+		"			}\n"
+		"		}\n"//for() filter rows.
+		"		barrier(CLK_GLOBAL_MEM_FENCE);\n"
+		"	}\n"//for() histogram rows.
+		"}\n";
+
+	return os.str();
+}
+
+#else
 /// <summary>
 /// Create the gaussian density filtering kernel string.
 /// 6 different methods of processing were tried before settling on this final and fastest 7th one.
@ -281,7 +467,7 @@ string DEOpenCLKernelCreator<T>::CreateLogScaleAssignDEKernelString()
 /// This allows writing to the global buffer without ever overlapping or using atomics.
 /// The supersample parameter will produce three different kernels.
 /// SS = 1, SS > 1 && SS even, SS > 1 && SS odd.
-/// The width of the kernl this runs in must be evenly divisible by 16 or else artifacts will occur.
+/// The width of the kernel this runs in must be evenly divisible by 16 or else artifacts will occur.
 /// Note that because this function uses so many variables and is so complex, OpenCL can easily run
 /// out of resources in some cases. Certain variables had to be reused to condense the kernel footprint
 /// down enough to be able to run a block size of 32x32.
@ -311,18 +497,15 @@ string DEOpenCLKernelCreator<T>::CreateGaussianDEKernel(size_t ss)
 		"	const __global uint* coefIndices,\n"
 		"	const uint chunkSizeW,\n"
 		"	const uint chunkSizeH,\n"
-		"	const uint rowParity,\n"
-		"	const uint colParity\n"
+		"	const uint chunkW,\n"
+		"	const uint chunkH\n"
 		"\t)\n"
 		"{\n"
-		//Parity determines if this function should execute.
-		"	if ((GLOBAL_ID_X >= densityFilter->m_SuperRasW) ||\n"
-		"	    (GLOBAL_ID_Y >= densityFilter->m_SuperRasH) ||\n"
-		"	    ((BLOCK_ID_X % chunkSizeW) != colParity)    ||\n"
-		"	    ((BLOCK_ID_Y % chunkSizeH) != rowParity))     \n"
+		"	if (((((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) ||\n"
+		"	    ((((BLOCK_ID_Y * chunkSizeH) + chunkH) * BLOCK_SIZE_Y) + THREAD_ID_Y >= densityFilter->m_SuperRasH))\n"
 		"			return;\n"
 		"\n";
- 
+
 	if (doSS)
 	{
 		os <<
@ -367,13 +550,13 @@ string DEOpenCLKernelCreator<T>::CreateGaussianDEKernel(size_t ss)
 		//Start and end values are the indices in the histogram read from
 		//and written to in the accumulator. They are not the indices for the local block of data.
 		//Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds.
-		"	blockHistStartRow = min(botBound, topBound + (BLOCK_ID_Y * BLOCK_SIZE_Y));\n"//The first histogram row this block will process.
+		"	blockHistStartRow = min(botBound, topBound + (((BLOCK_ID_Y * chunkSizeH) + chunkH) * BLOCK_SIZE_Y));\n"//The first histogram row this block will process.
 		"	blockHistEndRow = min(botBound, blockHistStartRow + BLOCK_SIZE_Y);\n"//The last histogram row this block will process, clamped to the last row.
 		"	boxReadStartRow = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartRow);\n"//The first row in the local box to read from when writing back to the final accumulator for this block.
 		"	boxReadEndRow = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + BLOCK_SIZE_Y, densityFilter->m_SuperRasH - blockHistStartRow);\n"//The last row in the local box to read from  when writing back to the final accumulator for this block.
-		"	blockHistStartCol = min(rightBound, leftBound + (BLOCK_ID_X * BLOCK_SIZE_X));\n"//The first histogram column this block will process.
-		"	boxReadStartCol = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first box row this block will read from when copying to the accumulator.
-		"	boxReadEndCol = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + BLOCK_SIZE_X, densityFilter->m_SuperRasW - blockHistStartCol);\n"//The last box row this block will read from when copying to the accumulator.
+		"	blockHistStartCol = min(rightBound, leftBound + (((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X));\n"//The first histogram column this block will process.
+		"	boxReadStartCol = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first box col this block will read from when copying to the accumulator.
+		"	boxReadEndCol = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + BLOCK_SIZE_X, densityFilter->m_SuperRasW - blockHistStartCol);\n"//The last box col this block will read from when copying to the accumulator.
 		"\n"
 		//Last, the indices in the global accumulator that the local bounds will be writing to.
 		"	accumWriteStartRow = blockHistStartRow - min(densityFilter->m_FilterWidth,  blockHistStartRow);\n"//Will be fw - 0 except for boundary columns, it will be less.
@ -496,7 +679,7 @@ string DEOpenCLKernelCreator<T>::CreateGaussianDEKernel(size_t ss)
 		"				{\n"
 		"					filterSelectInt = filterCoefIndex + coefIndices[(abs(j) * (densityFilter->m_FilterWidth + 1)) + abs(i)];\n"//Really is filterCoeffIndexPlusOffset, but reusing a variable to save space.
 		"\n"
-		"					if (filterCoefs[filterSelectInt] != 0)\n"
+		"					if (filterCoefs[filterSelectInt] != 0)\n"//This conditional actually improves speed, despite SIMT being bad at conditionals.
 		"					{\n"
 		"						filterBox[(i + boxCol) + ((j + boxRow) * fullTempBoxWidth)].m_Real4 += (bucket * (filterCoefs[filterSelectInt] * cacheLog));\n"
 		"					}\n"
@ -511,14 +694,14 @@ string DEOpenCLKernelCreator<T>::CreateGaussianDEKernel(size_t ss)
 		"\n"
 		"	if (THREAD_ID_Y == 0)\n"
 		"	{\n"
-				//At this point, all threads in this block have applied the filter to their surrounding pixel and stored the results in the temp local box.
+				//At this point, all threads in this block have applied the filter to their surrounding pixels and stored the results in the temp local box.
 				//Add the cells of it that are in bounds to the global accumulator.
 				//Compute offsets in local box to read from, and offsets into global accumulator to write to.
 				//Use a method here that is similar to the zeroization above: Each thread (column) in the first row iterates through all of the
 				//rows and adds a few columns to the accumulator.
 		"		for (i = boxReadStartRow, j = accumWriteStartRow; i < boxReadEndRow; i++, j++)\n"
 		"		{\n"
-		"			for (k = 0; k < colsToWrite; k++)\n"//Write a few columns.
+		"			for (k = 0; k < colsToWrite; k++)\n"//Each thread writes a few columns.
 		"			{\n"
 		"				boxCol = (colsToWrite * THREAD_ID_X) + k;\n"//Really is colOffset, but reusing a variable to save space.
 		"\n"
@ -532,6 +715,7 @@ string DEOpenCLKernelCreator<T>::CreateGaussianDEKernel(size_t ss)

 	return os.str();
 }
+#endif

 /// <summary>
 /// Create the gaussian density filtering kernel string, but use no local cache and perform
@ -543,7 +727,7 @@ string DEOpenCLKernelCreator<T>::CreateGaussianDEKernel(size_t ss)
 /// on the CPU because the frequent global memory access brings performance to a crawl.
 /// The supersample parameter will produce three different kernels.
 /// SS = 1, SS > 1 && SS even, SS > 1 && SS odd.
-/// The width of the kernl this runs in must be evenly divisible by 16 or else artifacts will occur.
+/// The width of the kernel this runs in must be evenly divisible by 16 or else artifacts will occur.
 /// Note that because this function uses so many variables and is so complex, OpenCL can easily run
 /// out of resources in some cases. Certain variables had to be reused to condense the kernel footprint
 /// down enough to be able to run a block size of 32x32.
@ -572,15 +756,12 @@ string DEOpenCLKernelCreator<T>::CreateGaussianDEKernelNoLocalCache(size_t ss)
 		"	const __global uint* coefIndices,\n"
 		"	const uint chunkSizeW,\n"
 		"	const uint chunkSizeH,\n"
-		"	const uint rowParity,\n"
-		"	const uint colParity\n"
+		"	const uint chunkW,\n"
+		"	const uint chunkH\n"
 		"\t)\n"
 		"{\n"
-		//Parity determines if this function should execute.
-		"	if ((GLOBAL_ID_X >= densityFilter->m_SuperRasW) ||\n"
-		"	    (GLOBAL_ID_Y >= densityFilter->m_SuperRasH) ||\n"
-		"	    ((BLOCK_ID_X % chunkSizeW) != colParity)    ||\n"
-		"	    ((BLOCK_ID_Y % chunkSizeH) != rowParity))     \n"
+		"	if (((((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) ||\n"
+		"	    ((((BLOCK_ID_Y * chunkSizeH) + chunkH) * BLOCK_SIZE_Y) + THREAD_ID_Y >= densityFilter->m_SuperRasH))\n"
 		"			return;\n"
 		"\n";
 
@ -606,10 +787,10 @@ string DEOpenCLKernelCreator<T>::CreateGaussianDEKernelNoLocalCache(size_t ss)
 		"\n"
 		//Start and end values are the indices in the histogram read from and written to in the accumulator.
 		//Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds.
-		"	uint blockHistStartRow = min(botBound, topBound + (BLOCK_ID_Y * BLOCK_SIZE_Y));\n"//The first histogram row this block will process.
+		"	uint blockHistStartRow = min(botBound, topBound + (((BLOCK_ID_Y * chunkSizeH) + chunkH) * BLOCK_SIZE_Y));\n"//The first histogram row this block will process.
 		"	uint threadHistRow = blockHistStartRow + THREAD_ID_Y;\n"//The histogram row this individual thread will be reading from.
 		"\n"
-		"	uint blockHistStartCol = min(rightBound, leftBound + (BLOCK_ID_X * BLOCK_SIZE_X));\n"//The first histogram column this block will process.
+		"	uint blockHistStartCol = min(rightBound, leftBound + (((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X));\n"//The first histogram column this block will process.
 		"	uint threadHistCol = blockHistStartCol + THREAD_ID_X;\n"//The histogram column this individual thread will be reading from.
 		"\n"
 		"	int i, j;\n"
--- a/Source/EmberCL/DEOpenCLKernelCreator.h
+++ b/Source/EmberCL/DEOpenCLKernelCreator.h
@ -8,6 +8,8 @@
 /// DEOpenCLKernelCreator class.
 /// </summary>

+//#define ROW_ONLY_DE 1
+
 namespace EmberCLns
 {
 /// <summary>
@ -35,8 +37,6 @@ public:
 	DEOpenCLKernelCreator(bool nVidia);

 	//Accessors.
-	string LogScaleSumDEKernel();
-	string LogScaleSumDEEntryPoint();
 	string LogScaleAssignDEKernel();
 	string LogScaleAssignDEEntryPoint();
 	string GaussianDEKernel(size_t ss, unsigned int filterWidth);
@ -49,14 +49,10 @@ public:

 private:
 	//Kernel creators.
-	string CreateLogScaleSumDEKernelString();
 	string CreateLogScaleAssignDEKernelString();
 	string CreateGaussianDEKernel(size_t ss);
 	string CreateGaussianDEKernelNoLocalCache(size_t ss);
-
-	string m_LogScaleSumDEKernel;
-	string m_LogScaleSumDEEntryPoint;
-
+	
 	string m_LogScaleAssignDEKernel;
 	string m_LogScaleAssignDEEntryPoint;

--- a/Source/EmberCL/EmberCLStructs.h
+++ b/Source/EmberCL/EmberCLStructs.h
@ -181,9 +181,6 @@ static const char* XformCLStructString =
 "} XformCL;\n"
 "\n";

-#define MAX_CL_XFORM 21//These must always match.
-#define MAX_CL_XFORM_STRING "21"
-
 /// <summary>
 /// A structure on the host used to hold all of the needed information for an ember used on the device to iterate in OpenCL.
 /// Template argument expected to be float or double.
@ -191,7 +188,6 @@ static const char* XformCLStructString =
 template <typename T>
 struct ALIGN EmberCL
 {
-	XformCL<T> m_Xforms[MAX_CL_XFORM];
 	T m_CamZPos;
 	T m_CamPerspective;
 	T m_CamYaw;
@ -209,7 +205,6 @@ struct ALIGN EmberCL
 static const char* EmberCLStructString =
 "typedef struct __attribute__ " ALIGN_CL " _EmberCL\n"
 "{\n"
-"	XformCL m_Xforms[" MAX_CL_XFORM_STRING "];\n"
 "	real_t m_CamZPos;\n"
 "	real_t m_CamPerspective;\n"
 "	real_t m_CamYaw;\n"
--- a/Source/EmberCL/IterOpenCLKernelCreator.cpp
+++ b/Source/EmberCL/IterOpenCLKernelCreator.cpp
@ -1,6 +1,9 @@
 #include "EmberCLPch.h"
 #include "IterOpenCLKernelCreator.h"

+//#define STRAIGHT_RAND 1
+#define USE_CASE 1
+
 namespace EmberCLns
 {
 /// <summary>
@ -233,8 +236,9 @@ string IterOpenCLKernelCreator<T>::CreateIterKernelString(Ember<T>& ember, strin
 		"__kernel void " << m_IterEntryPoint << "(\n" <<
 		"	uint iterCount,\n"
 		"	uint fuseCount,\n"
-		"	uint seed,\n"
+		"	__global uint2* seeds,\n"
 		"	__constant EmberCL* ember,\n"
+		"	__constant XformCL* xforms,\n"
 		"	__constant real_t* parVars,\n"
 		"	__global uchar* xformDistributions,\n"//Using uchar is quicker than uint. Can't be constant because the size can be too large to fit when using xaos.//FINALOPT
 		"	__constant CarToRasCL* carToRas,\n"
@ -246,13 +250,14 @@ string IterOpenCLKernelCreator<T>::CreateIterKernelString(Ember<T>& ember, strin
 		"{\n"
 		"	bool fuse, ok;\n"
 		"	uint threadIndex = INDEX_IN_BLOCK_2D;\n"
+		"	uint pointsIndex = INDEX_IN_GRID_2D;\n"
 		"	uint i, itersToDo;\n"
 		"	uint consec = 0;\n"
 		//"	int badvals = 0;\n"
 		"	uint histIndex;\n"
 		"	real_t p00, p01;\n"
 		"	Point firstPoint, secondPoint, tempPoint;\n"
-		"	uint2 mwc;\n"
+		"	uint2 mwc = seeds[pointsIndex];\n"
 		"	float4 palColor1;\n"
 		"	int2 iPaletteCoord;\n"
 		"	const sampler_t paletteSampler = CLK_NORMALIZED_COORDS_FALSE |\n"//Coords from 0 to 255.
@ -265,12 +270,11 @@ string IterOpenCLKernelCreator<T>::CreateIterKernelString(Ember<T>& ember, strin
 	
 	os <<
 		"\n"
+#ifndef STRAIGHT_RAND
 		"	__local Point swap[NTHREADS];\n"
 		"	__local uint xfsel[NWARPS];\n"
+#endif
 		"\n"
-		"	uint pointsIndex = INDEX_IN_GRID_2D;\n"
-		"	mwc.x = (pointsIndex + 1 * seed);\n"
-		"	mwc.y = ((BLOCK_ID_X + 1) * (pointsIndex + 1) * seed);\n"
 		"	iPaletteCoord.y = 0;\n"
 		"\n"
 		"	if (fuseCount > 0)\n"
@ -295,9 +299,11 @@ string IterOpenCLKernelCreator<T>::CreateIterKernelString(Ember<T>& ember, strin
 		//This along with the randomness that the point shuffle provides gives sufficient randomness
 		//to produce results identical to those produced on the CPU.
 	os <<
+#ifndef STRAIGHT_RAND
 		"	if (THREAD_ID_Y == 0 && THREAD_ID_X < NWARPS)\n"
 		"		xfsel[THREAD_ID_X] = MwcNext(&mwc) % " << CHOOSE_XFORM_GRAIN << ";\n"//It's faster to do the % here ahead of time than every time an xform is looked up to use inside the loop.
 		"\n"
+#endif
 		"	barrier(CLK_LOCAL_MEM_FENCE);\n"
 		"\n"
 		"	for (i = 0; i < itersToDo; i++)\n"
@ -309,22 +315,51 @@ string IterOpenCLKernelCreator<T>::CreateIterKernelString(Ember<T>& ember, strin
 		"		do\n"
 		"		{\n";

-		//If xaos is present, the cuburn method is effectively ceased. Every thread will be picking a random xform.
+		//If xaos is present, a hybrid of the cuburn method is used.
+		//This makes each thread in a row pick the same offset into a distribution, using xfsel.
+		//However, which distribution that offset indexes into is determined by firstPoint.m_LastXfUsed.
 		if (ember.XaosPresent())
 		{
 			os <<
+#ifdef STRAIGHT_RAND
 		"			secondPoint.m_LastXfUsed = xformDistributions[MwcNext(&mwc) % " << CHOOSE_XFORM_GRAIN << " + (" << CHOOSE_XFORM_GRAIN << " * (firstPoint.m_LastXfUsed + 1u))];\n\n";
-		//"			secondPoint.m_LastXfUsed = xformDistributions[xfsel[THREAD_ID_Y] + (" << CHOOSE_XFORM_GRAIN << " * (firstPoint.m_LastXfUsed + 1u))];\n\n";//Partial cuburn hybrid.
+#else
+		"			secondPoint.m_LastXfUsed = xformDistributions[xfsel[THREAD_ID_Y] + (" << CHOOSE_XFORM_GRAIN << " * (firstPoint.m_LastXfUsed + 1u))];\n\n";//Partial cuburn hybrid.
+#endif
 		}
 		else
 		{
 			os <<
-		//"			secondPoint.m_LastXfUsed = xformDistributions[MwcNext(&mwc) % " << CHOOSE_XFORM_GRAIN << "];\n\n";//For testing, using straight rand flam4/fractron style instead of cuburn.
+#ifdef STRAIGHT_RAND
+		"			secondPoint.m_LastXfUsed = xformDistributions[MwcNext(&mwc) % " << CHOOSE_XFORM_GRAIN << "];\n\n";//For testing, using straight rand flam4/fractron style instead of cuburn.
+#else
 		"			secondPoint.m_LastXfUsed = xformDistributions[xfsel[THREAD_ID_Y]];\n\n";
+#endif
 		}

 		for (i = 0; i < ember.XformCount(); i++)
 		{
+#ifdef USE_CASE
+			if (i == 0)
+			{
+			os <<
+		"			switch (secondPoint.m_LastXfUsed)\n"
+		"			{\n";
+			}
+			
+			os <<
+		"				case " << i << ":\n"
+		"				{\n" <<
+		"					Xform" << i << "(&(xforms[" << i << "]), parVars, &firstPoint, &secondPoint, &mwc);\n" <<
+		"					break;\n"
+		"				}\n";
+
+			if (i == ember.XformCount() - 1)
+			{
+			os <<
+		"			}\n";
+			}
+#else
 			if (i == 0)
 				os <<
 		"			if (secondPoint.m_LastXfUsed == " << i << ")\n";
@ -334,9 +369,11 @@ string IterOpenCLKernelCreator<T>::CreateIterKernelString(Ember<T>& ember, strin

 		os <<
 		"			{\n" <<
-		"				Xform" << i << "(&(ember->m_Xforms[" << i << "]), parVars, &firstPoint, &secondPoint, &mwc);\n" <<
+		"				Xform" << i << "(&(xforms[" << i << "]), parVars, &firstPoint, &secondPoint, &mwc);\n" <<
 		"			}\n";
+#endif
 		}
+
 		os <<
 		"\n"
 		"			ok = !BadVal(secondPoint.m_X) && !BadVal(secondPoint.m_Y);\n"
@ -360,6 +397,7 @@ string IterOpenCLKernelCreator<T>::CreateIterKernelString(Ember<T>& ember, strin
 		"			secondPoint.m_Y = MwcNextNeg1Pos1(&mwc);\n"
 		"			secondPoint.m_Z = 0.0;\n"
 		"		}\n"
+#ifndef STRAIGHT_RAND
 		"\n"//Rotate points between threads. This is how randomization is achieved.
 		"		uint swr = threadXY + ((i & 1u) * threadXDivRows);\n"
 		"		uint sw = (swr * THREADS_PER_WARP + THREAD_ID_X) & threadsMinus1;\n"
@ -368,16 +406,16 @@ string IterOpenCLKernelCreator<T>::CreateIterKernelString(Ember<T>& ember, strin
 		//Write to another thread's location.
 		"		swap[sw] = secondPoint;\n"
 		"\n"
-
 		//Populate randomized xform index buffer with new random values.
 		"		if (THREAD_ID_Y == 0 && THREAD_ID_X < NWARPS)\n"
 		"			xfsel[THREAD_ID_X] = MwcNext(&mwc) % " << CHOOSE_XFORM_GRAIN << ";\n"
 		"\n"
 		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
-		"\n"
-
 		//Another thread will have written to this thread's location, so read the new value and use it for accumulation below.
 		"		firstPoint = swap[threadIndex];\n"
+#else
+		"		firstPoint = secondPoint;\n"//For testing, using straight rand flam4/fractron style instead of cuburn.
+#endif
 		"\n"
 		"		if (fuse)\n"
 		"		{\n"
@ -399,14 +437,14 @@ string IterOpenCLKernelCreator<T>::CreateIterKernelString(Ember<T>& ember, strin

 			//CPU takes an extra step here to preserve the opacity of the randomly selected xform, rather than the final xform's opacity.
 			//The same thing takes place here automatically because secondPoint.m_LastXfUsed is used below to retrieve the opacity when accumulating.
-			os <<
-		"		if ((ember->m_Xforms[" << finalIndex << "].m_Opacity == 1) || (MwcNext01(&mwc) < ember->m_Xforms[" << finalIndex << "].m_Opacity))\n"
-		"		{\n"
-		"			tempPoint.m_LastXfUsed = secondPoint.m_LastXfUsed;\n"
-		"			Xform" << finalIndex << "(&(ember->m_Xforms[" << finalIndex << "]), parVars, &secondPoint, &tempPoint, &mwc);\n"
-		"			secondPoint = tempPoint;\n"
-		"		}\n"
-		"\n";
+		os <<
+			"		if ((xforms[" << finalIndex << "].m_Opacity == 1) || (MwcNext01(&mwc) < xforms[" << finalIndex << "].m_Opacity))\n"
+			"		{\n"
+			"			tempPoint.m_LastXfUsed = secondPoint.m_LastXfUsed;\n"
+			"			Xform" << finalIndex << "(&(xforms[" << finalIndex << "]), parVars, &secondPoint, &tempPoint, &mwc);\n"
+			"			secondPoint = tempPoint;\n"
+			"		}\n"
+			"\n";
 		}
 		
 		os << CreateProjectionString(ember);
@ -471,18 +509,18 @@ string IterOpenCLKernelCreator<T>::CreateIterKernelString(Ember<T>& ember, strin
 				if (typeid(T) == typeid(double))
 				{
 					os <<
-		"				AtomicAdd(&(histogram[histIndex].m_Reals[0]), (real_t)palColor1.x * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"//Always apply opacity, even though it's usually 1.
-		"				AtomicAdd(&(histogram[histIndex].m_Reals[1]), (real_t)palColor1.y * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
-		"				AtomicAdd(&(histogram[histIndex].m_Reals[2]), (real_t)palColor1.z * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
-		"				AtomicAdd(&(histogram[histIndex].m_Reals[3]), (real_t)palColor1.w * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
+		"				AtomicAdd(&(histogram[histIndex].m_Reals[0]), (real_t)palColor1.x * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"//Always apply opacity, even though it's usually 1.
+		"				AtomicAdd(&(histogram[histIndex].m_Reals[1]), (real_t)palColor1.y * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
+		"				AtomicAdd(&(histogram[histIndex].m_Reals[2]), (real_t)palColor1.z * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
+		"				AtomicAdd(&(histogram[histIndex].m_Reals[3]), (real_t)palColor1.w * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
 				}
 				else
 				{
-				os <<
-		"				AtomicAdd(&(histogram[histIndex].m_Reals[0]), palColor1.x * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"//Always apply opacity, even though it's usually 1.
-		"				AtomicAdd(&(histogram[histIndex].m_Reals[1]), palColor1.y * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
-		"				AtomicAdd(&(histogram[histIndex].m_Reals[2]), palColor1.z * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
-		"				AtomicAdd(&(histogram[histIndex].m_Reals[3]), palColor1.w * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
+					os <<
+		"				AtomicAdd(&(histogram[histIndex].m_Reals[0]), palColor1.x * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"//Always apply opacity, even though it's usually 1.
+		"				AtomicAdd(&(histogram[histIndex].m_Reals[1]), palColor1.y * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
+		"				AtomicAdd(&(histogram[histIndex].m_Reals[2]), palColor1.z * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
+		"				AtomicAdd(&(histogram[histIndex].m_Reals[3]), palColor1.w * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
 				}
 			}
 			else
@ -496,12 +534,12 @@ string IterOpenCLKernelCreator<T>::CreateIterKernelString(Ember<T>& ember, strin
 		"				realColor.y = (real_t)palColor1.y;\n"
 		"				realColor.z = (real_t)palColor1.z;\n"
 		"				realColor.w = (real_t)palColor1.w;\n"
-		"				histogram[histIndex].m_Real4 += (realColor * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
+		"				histogram[histIndex].m_Real4 += (realColor * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
 				}
 				else
 				{
-				os <<
-		"				histogram[histIndex].m_Real4 += (palColor1 * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
+					os <<
+		"				histogram[histIndex].m_Real4 += (palColor1 * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
 				}
 			}

@ -525,6 +563,7 @@ string IterOpenCLKernelCreator<T>::CreateIterKernelString(Ember<T>& ember, strin
 		"	points[pointsIndex].m_ColorX = MwcNextNeg1Pos1(&mwc);\n"
 #else
 		"	points[pointsIndex] = firstPoint;\n"
+		"	seeds[pointsIndex] = mwc;\n"
 #endif
 		"	barrier(CLK_GLOBAL_MEM_FENCE);\n"
 		"}\n";
--- a/Source/EmberCL/OpenCLWrapper.cpp
+++ b/Source/EmberCL/OpenCLWrapper.cpp
@ -1121,12 +1121,12 @@ string OpenCLWrapper::DumpInfo()
 /// <summary>
 /// OpenCL properties, getters only.
 /// </summary>
-bool OpenCLWrapper::Ok() { return m_Init; }
-bool OpenCLWrapper::Shared() { return m_Shared; }
-cl::Context OpenCLWrapper::Context() { return m_Context; }
-unsigned int OpenCLWrapper::PlatformIndex() { return m_PlatformIndex; }
-unsigned int OpenCLWrapper::DeviceIndex() { return m_DeviceIndex; }
-unsigned int OpenCLWrapper::LocalMemSize() { return m_LocalMemSize; }
+bool OpenCLWrapper::Ok() const { return m_Init; }
+bool OpenCLWrapper::Shared() const { return m_Shared; }
+cl::Context OpenCLWrapper::Context() const { return m_Context; }
+unsigned int OpenCLWrapper::PlatformIndex() const { return m_PlatformIndex; }
+unsigned int OpenCLWrapper::DeviceIndex() const { return m_DeviceIndex; }
+unsigned int OpenCLWrapper::LocalMemSize() const { return m_LocalMemSize; }

 /// <summary>
 /// Makes the even grid dims.
--- a/Source/EmberCL/OpenCLWrapper.h
+++ b/Source/EmberCL/OpenCLWrapper.h
@ -184,12 +184,12 @@ public:
 	string DumpInfo();

 	//Accessors.
-	bool Ok();
-	bool Shared();
-	cl::Context Context();
-	unsigned int PlatformIndex();
-	unsigned int DeviceIndex();
-	unsigned int LocalMemSize();
+	bool Ok() const;
+	bool Shared() const;
+	cl::Context Context() const;
+	unsigned int PlatformIndex() const;
+	unsigned int DeviceIndex() const;
+	unsigned int LocalMemSize() const;

 	static void MakeEvenGridDims(unsigned int blockW, unsigned int blockH, unsigned int& gridW, unsigned int& gridH);

--- a/Source/EmberCL/RendererCL.cpp
+++ b/Source/EmberCL/RendererCL.cpp
@ -22,7 +22,9 @@ RendererCL<T>::RendererCL(unsigned int platform, unsigned int device, bool share

 	//Buffer names.
 	m_EmberBufferName               = "Ember";
+	m_XformsBufferName				= "Xforms";
 	m_ParVarsBufferName             = "ParVars";
+	m_SeedsBufferName				= "Seeds";
 	m_DistBufferName                = "Dist";
 	m_CarToRasBufferName            = "CarToRas";
 	m_DEFilterParamsBufferName      = "DEFilterParams";
@ -50,6 +52,13 @@ RendererCL<T>::RendererCL(unsigned int platform, unsigned int device, bool share
 	m_PaletteFormat.image_channel_data_type = CL_FLOAT;
 	m_FinalFormat.image_channel_order = CL_RGBA;
 	m_FinalFormat.image_channel_data_type = CL_UNORM_INT8;//Change if this ever supports 2BPC outputs for PNG.
+	m_Seeds.resize(IterGridKernelCount());
+
+	for (size_t i = 0; i < m_Seeds.size(); i++)
+	{
+		m_Seeds[i].x = m_Rand[0].Rand();
+		m_Seeds[i].y = m_Rand[0].Rand();
+	}

 	Init(platform, device, shared, outputTexID);//Init OpenCL upon construction and create programs that will not change.
 }
@ -100,14 +109,12 @@ bool RendererCL<T>::Init(unsigned int platform, unsigned int device, bool shared
 		m_DEOpenCLKernelCreator = DEOpenCLKernelCreator<T>(m_NVidia);

 		string zeroizeProgram = m_IterOpenCLKernelCreator.ZeroizeKernel();
-		string logAssignProgram = m_DEOpenCLKernelCreator.LogScaleAssignDEKernel();
-		string logSumProgram = m_DEOpenCLKernelCreator.LogScaleSumDEKernel();//Build a couple of simple programs to ensure OpenCL is working right.
+		string logAssignProgram = m_DEOpenCLKernelCreator.LogScaleAssignDEKernel();//Build a couple of simple programs to ensure OpenCL is working right.

 		if (b && !(b = m_Wrapper.AddProgram(m_IterOpenCLKernelCreator.ZeroizeEntryPoint(),		  zeroizeProgram,	m_IterOpenCLKernelCreator.ZeroizeEntryPoint(),        m_DoublePrecision))) { m_ErrorReport.push_back(loc); }
 		if (b && !(b = m_Wrapper.AddProgram(m_DEOpenCLKernelCreator.LogScaleAssignDEEntryPoint(), logAssignProgram, m_DEOpenCLKernelCreator.LogScaleAssignDEEntryPoint(), m_DoublePrecision))) { m_ErrorReport.push_back(loc); }
-		if (b && !(b = m_Wrapper.AddProgram(m_DEOpenCLKernelCreator.LogScaleSumDEEntryPoint(),	  logSumProgram,	m_DEOpenCLKernelCreator.LogScaleSumDEEntryPoint(),    m_DoublePrecision))) { m_ErrorReport.push_back(loc); }
-		
 		if (b && !(b = m_Wrapper.AddAndWriteImage("Palette", CL_MEM_READ_ONLY, m_PaletteFormat, 256, 1, 0, NULL))) { m_ErrorReport.push_back(loc); }
+		if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_SeedsBufferName, (void*)m_Seeds.data(), SizeOf(m_Seeds)))) { m_ErrorReport.push_back(loc); }

 		//This is the maximum box dimension for density filtering which consists of (blockSize  * blockSize) + (2 * filterWidth).
 		//These blocks must be square, and ideally, 32x32.
@ -123,6 +130,11 @@ bool RendererCL<T>::Init(unsigned int platform, unsigned int device, bool shared
 	return b;
 }

+/// <summary>
+/// Set the shared output texture where final accumulation will be written to.
+/// </summary>
+/// <param name="outputTexID">The texture ID of the shared OpenGL texture if shared</param>
+/// <returns>True if success, else false.</returns>
 template <typename T>
 bool RendererCL<T>::SetOutputTexture(GLuint outputTexID)
 {
@ -149,16 +161,28 @@ bool RendererCL<T>::SetOutputTexture(GLuint outputTexID)
 /// OpenCL property accessors, getters only.
 /// </summary>

-template <typename T> unsigned int RendererCL<T>::IterCountPerKernel()   { return m_IterCountPerKernel;                 }
-template <typename T> unsigned int RendererCL<T>::IterBlocksWide()       { return m_IterBlocksWide;                     }
-template <typename T> unsigned int RendererCL<T>::IterBlocksHigh()       { return m_IterBlocksHigh;                     }
-template <typename T> unsigned int RendererCL<T>::IterBlockWidth()       { return m_IterBlockWidth;                     }
-template <typename T> unsigned int RendererCL<T>::IterBlockHeight()      { return m_IterBlockHeight;                    }
-template <typename T> unsigned int RendererCL<T>::IterGridWidth()        { return IterBlocksWide() * IterBlockWidth();  }
-template <typename T> unsigned int RendererCL<T>::IterGridHeight()       { return IterBlocksHigh() * IterBlockHeight(); }
-template <typename T> unsigned int RendererCL<T>::TotalIterKernelCount() { return IterGridWidth() * IterGridHeight();   }
-template <typename T> unsigned int RendererCL<T>::PlatformIndex()        { return m_Wrapper.PlatformIndex();            }
-template <typename T> unsigned int RendererCL<T>::DeviceIndex()          { return m_Wrapper.DeviceIndex();              }
+//Iters per kernel/block/grid.
+template <typename T> unsigned int RendererCL<T>::IterCountPerKernel() const { return m_IterCountPerKernel; }
+template <typename T> unsigned int RendererCL<T>::IterCountPerBlock()  const { return IterCountPerKernel() * IterBlockKernelCount(); }
+template <typename T> unsigned int RendererCL<T>::IterCountPerGrid()   const { return IterCountPerKernel() * IterGridKernelCount();  }
+
+//Kernels per block.
+template <typename T> unsigned int RendererCL<T>::IterBlockKernelWidth()  const { return m_IterBlockWidth;								 }
+template <typename T> unsigned int RendererCL<T>::IterBlockKernelHeight() const { return m_IterBlockHeight;								 }
+template <typename T> unsigned int RendererCL<T>::IterBlockKernelCount()  const { return IterBlockKernelWidth() * IterBlockKernelHeight(); }
+
+//Kernels per grid.
+template <typename T> unsigned int RendererCL<T>::IterGridKernelWidth()  const { return IterGridBlockWidth() * IterBlockKernelWidth();   }
+template <typename T> unsigned int RendererCL<T>::IterGridKernelHeight() const { return IterGridBlockHeight() * IterBlockKernelHeight(); }
+template <typename T> unsigned int RendererCL<T>::IterGridKernelCount()	 const { return IterGridKernelWidth() * IterGridKernelHeight();  }
+
+//Blocks per grid.
+template <typename T> unsigned int RendererCL<T>::IterGridBlockWidth()  const { return m_IterBlocksWide;							   }
+template <typename T> unsigned int RendererCL<T>::IterGridBlockHeight() const { return m_IterBlocksHigh;							   }
+template <typename T> unsigned int RendererCL<T>::IterGridBlockCount()  const { return IterGridBlockWidth() * IterGridBlockHeight(); }
+
+template <typename T> unsigned int RendererCL<T>::PlatformIndex() { return m_Wrapper.PlatformIndex(); }
+template <typename T> unsigned int RendererCL<T>::DeviceIndex()   { return m_Wrapper.DeviceIndex();   }

 /// <summary>
 /// Read the histogram into the host side CPU buffer.
@ -197,10 +221,10 @@ bool RendererCL<T>::ReadAccum()
 template <typename T>
 bool RendererCL<T>::ReadPoints(vector<PointCL<T>>& vec)
 {
-	vec.resize(TotalIterKernelCount());//Allocate the memory to read into.
+	vec.resize(IterGridKernelCount());//Allocate the memory to read into.

-	if (vec.size() >= TotalIterKernelCount())
-		return m_Wrapper.ReadBuffer(m_PointsBufferName, (void*)vec.data(), TotalIterKernelCount() * sizeof(PointCL<T>));
+	if (vec.size() >= IterGridKernelCount())
+		return m_Wrapper.ReadBuffer(m_PointsBufferName, (void*)vec.data(), IterGridKernelCount() * sizeof(PointCL<T>));

 	return false;
 }
@ -237,6 +261,26 @@ bool RendererCL<T>::WritePoints(vector<PointCL<T>>& vec)
 	return m_Wrapper.WriteBuffer(m_PointsBufferName, (void*)vec.data(), vec.size() * sizeof(vec[0]));
 }

+#ifdef TEST_CL
+template <typename T>
+bool RendererCL<T>::WriteRandomPoints()//Fill a host vector with random points and pass it to WritePoints(), returning its result.
+{
+	size_t size = IterGridKernelCount();//One point per IterGridKernelCount() entry.
+	vector<PointCL<T>> vec(size);
+
+	for (size_t i = 0; i < size; i++)//Index is size_t to match size and avoid a signed/unsigned comparison.
+	{
+		vec[i].m_X = m_Rand[0].Frand11<T>();
+		vec[i].m_Y = m_Rand[0].Frand11<T>();
+		vec[i].m_Z = 0;
+		vec[i].m_ColorX = m_Rand[0].Frand01<T>();
+		vec[i].m_LastXfUsed = 0;
+	}
+
+	return WritePoints(vec);
+}
+#endif
+
 /// <summary>
 /// Get the kernel string for the last built iter program.
 /// </summary>
@ -351,7 +395,7 @@ void RendererCL<T>::ClearErrorReport()
 template <typename T>
 size_t RendererCL<T>::SubBatchSize() const
 {
-	return m_IterBlocksWide * m_IterBlocksHigh * SQR(m_IterCountPerKernel);
+	return IterCountPerGrid();
 }

 /// <summary>
@ -483,6 +527,7 @@ bool RendererCL<T>::Alloc()
 		return false;

 	EnterResize();
+	m_XformsCL.resize(m_Ember.TotalXformCount());

 	bool b = true;
 	size_t histLength = SuperSize() * sizeof(v4T);
@ -490,6 +535,7 @@ bool RendererCL<T>::Alloc()
 	const char* loc = __FUNCTION__;

 	if (b && !(b = m_Wrapper.AddBuffer(m_EmberBufferName,               sizeof(m_EmberCL))))         { m_ErrorReport.push_back(loc); }
+	if (b && !(b = m_Wrapper.AddBuffer(m_XformsBufferName,				SizeOf(m_XformsCL))))		 { m_ErrorReport.push_back(loc); }
 	if (b && !(b = m_Wrapper.AddBuffer(m_ParVarsBufferName,             128 * sizeof(T))))           { m_ErrorReport.push_back(loc); }
 	if (b && !(b = m_Wrapper.AddBuffer(m_DistBufferName,                CHOOSE_XFORM_GRAIN)))        { m_ErrorReport.push_back(loc); }//Will be resized for xaos.
 	if (b && !(b = m_Wrapper.AddBuffer(m_CarToRasBufferName,            sizeof(m_CarToRasCL))))      { m_ErrorReport.push_back(loc); }
@ -498,7 +544,7 @@ bool RendererCL<T>::Alloc()

 	if (b && !(b = m_Wrapper.AddBuffer(m_HistBufferName,   histLength)))								  { m_ErrorReport.push_back(loc); }//Histogram. Will memset to zero later.
 	if (b && !(b = m_Wrapper.AddBuffer(m_AccumBufferName,  accumLength)))								  { m_ErrorReport.push_back(loc); }//Accum buffer.
-	if (b && !(b = m_Wrapper.AddBuffer(m_PointsBufferName, TotalIterKernelCount() * sizeof(PointCL<T>)))) { m_ErrorReport.push_back(loc); }//Points between iter calls.
+	if (b && !(b = m_Wrapper.AddBuffer(m_PointsBufferName, IterGridKernelCount() * sizeof(PointCL<T>))))  { m_ErrorReport.push_back(loc); }//Points between iter calls.

 	if (b && !(b = SetOutputTexture(m_OutputTexID))) { m_ErrorReport.push_back(loc); }
 	
@ -702,12 +748,12 @@ bool RendererCL<T>::RunIter(size_t iterCount, size_t temporalSample, size_t& ite
 {
 	Timing t;//, t2(4);
 	bool b = true;
-	unsigned int seed, fuse, argIndex;
-	unsigned int iterCountPerKernel = m_IterCountPerKernel;
-	unsigned int iterCountPerBlock = iterCountPerKernel * m_IterBlockWidth * m_IterBlockHeight;
+	unsigned int fuse, argIndex;
+	unsigned int iterCountPerKernel = IterCountPerKernel();
+	unsigned int iterCountPerBlock = IterCountPerBlock();
 	unsigned int supersize = (unsigned int)SuperSize();
 	int kernelIndex = m_Wrapper.FindKernelIndex(m_IterOpenCLKernelCreator.IterEntryPoint());
-	size_t fuseFreq = m_SubBatchSize / m_IterCountPerKernel;
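+	size_t fuseFreq = Renderer<T, T>::SubBatchSize() / m_IterCountPerKernel;//Use the base sub batch size to determine when to fuse. NOTE(review): this is 0 when that size is below m_IterCountPerKernel, and the loop below computes m_Calls % fuseFreq -- confirm a minimum is enforced.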
 	size_t itersRemaining, localIterCount = 0;
 	double percent, etaMs;
 	const char* loc = __FUNCTION__;
@ -719,12 +765,13 @@ bool RendererCL<T>::RunIter(size_t iterCount, size_t temporalSample, size_t& ite

 	if (kernelIndex != -1)
 	{
-		m_EmberCL = ConvertEmber(m_Ember);
+		ConvertEmber(m_Ember, m_EmberCL, m_XformsCL);
 		m_CarToRasCL = ConvertCarToRas(*CoordMap());

-		if (b && !(b = m_Wrapper.WriteBuffer      (m_EmberBufferName,    (void*)&m_EmberCL,           sizeof(m_EmberCL))))        { m_ErrorReport.push_back(loc); }
-		if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_DistBufferName,     (void*)XformDistributions(), XformDistributionsSize()))) { m_ErrorReport.push_back(loc); }//Will be resized for xaos.
-		if (b && !(b = m_Wrapper.WriteBuffer      (m_CarToRasBufferName, (void*)&m_CarToRasCL,        sizeof(m_CarToRasCL))))     { m_ErrorReport.push_back(loc); }
+		if (b && !(b = m_Wrapper.WriteBuffer      (m_EmberBufferName,    (void*)&m_EmberCL,           sizeof(m_EmberCL))))						   { m_ErrorReport.push_back(loc); }
+		if (b && !(b = m_Wrapper.WriteBuffer	  (m_XformsBufferName,   (void*)m_XformsCL.data(),    sizeof(m_XformsCL[0]) * m_XformsCL.size()))) { m_ErrorReport.push_back(loc); }
+		if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_DistBufferName,     (void*)XformDistributions(), XformDistributionsSize())))				   { m_ErrorReport.push_back(loc); }//Will be resized for xaos.
+		if (b && !(b = m_Wrapper.WriteBuffer      (m_CarToRasBufferName, (void*)&m_CarToRasCL,        sizeof(m_CarToRasCL))))					   { m_ErrorReport.push_back(loc); }
 		
 		if (b && !(b = m_Wrapper.AddAndWriteImage("Palette", CL_MEM_READ_ONLY, m_PaletteFormat, m_Dmap.m_Entries.size(), 1, 0, m_Dmap.m_Entries.data()))) { m_ErrorReport.push_back(loc); }
 		
@ -735,31 +782,32 @@ bool RendererCL<T>::RunIter(size_t iterCount, size_t temporalSample, size_t& ite
 		while (b && itersRan < iterCount && !m_Abort)
 		{
 			argIndex = 0;
-			seed = m_Rand[0].Rand();
 #ifdef TEST_CL
 			fuse = 0;
 #else
 			//fuse = 100;
-			fuse = ((m_Calls % fuseFreq) == 0 ? (EarlyClip() ? 100u : 15u) : 0u);
+			//fuse = ((m_Calls % fuseFreq) == 0 ? (EarlyClip() ? 100u : 15u) : 0u);
+			fuse = (unsigned int)((m_Calls % fuseFreq) == 0u ? FuseCount() : 0u);
 			//fuse = ((m_Calls % 4) == 0 ? 100u : 0u);
 #endif
 			itersRemaining = iterCount - itersRan;
-			unsigned int gridW = (unsigned int)min(ceil((double)itersRemaining / (double)iterCountPerBlock), (double)IterBlocksWide());
-			unsigned int gridH = (unsigned int)min(ceil((double)itersRemaining / ((double)gridW * iterCountPerBlock)), (double)IterBlocksHigh());
+			unsigned int gridW = (unsigned int)min(ceil((double)itersRemaining / (double)iterCountPerBlock), (double)IterGridBlockWidth());
+			unsigned int gridH = (unsigned int)min(ceil((double)itersRemaining / ((double)gridW * iterCountPerBlock)), (double)IterGridBlockHeight());
 			unsigned int iterCountThisLaunch = iterCountPerBlock * gridW * gridH;

 			//Similar to what's done in the base class.
 			//The number of iters per thread must be adjusted if they've requested less iters than is normally ran in a block (256 * 256).
 			if (iterCountThisLaunch > iterCount)
 			{
-				iterCountPerKernel = (unsigned int)ceil((double)iterCount / (double)(gridW * gridH * m_IterBlockWidth * m_IterBlockHeight));
-				iterCountThisLaunch = iterCountPerKernel * (gridW * gridH * m_IterBlockWidth * m_IterBlockHeight);
+				iterCountPerKernel = (unsigned int)ceil((double)iterCount / (double)(gridW * gridH * IterBlockKernelCount()));
+				iterCountThisLaunch = iterCountPerKernel * (gridW * gridH * IterBlockKernelCount());
 			}

 			if (b && !(b = m_Wrapper.SetArg      (kernelIndex, argIndex++, iterCountPerKernel)))   { m_ErrorReport.push_back(loc); }//Number of iters for each thread to run.
 			if (b && !(b = m_Wrapper.SetArg      (kernelIndex, argIndex++, fuse)))                 { m_ErrorReport.push_back(loc); }//Number of iters to fuse.
-			if (b && !(b = m_Wrapper.SetArg      (kernelIndex, argIndex++, seed)))                 { m_ErrorReport.push_back(loc); }//Seed.
-			if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex++, m_EmberBufferName)))    { m_ErrorReport.push_back(loc); }//Flame.
+			if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex++, m_SeedsBufferName)))    { m_ErrorReport.push_back(loc); }//Seeds.
+			if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex++, m_EmberBufferName)))    { m_ErrorReport.push_back(loc); }//Ember.
+			if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex++, m_XformsBufferName)))   { m_ErrorReport.push_back(loc); }//Xforms.
 			if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex++, m_ParVarsBufferName)))  { m_ErrorReport.push_back(loc); }//Parametric variation parameters.
 			if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex++, m_DistBufferName)))     { m_ErrorReport.push_back(loc); }//Xform distributions.
 			if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex++, m_CarToRasBufferName))) { m_ErrorReport.push_back(loc); }//Coordinate converter.
@ -769,11 +817,11 @@ bool RendererCL<T>::RunIter(size_t iterCount, size_t temporalSample, size_t& ite
 			if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex++, m_PointsBufferName)))   { m_ErrorReport.push_back(loc); }//Random start points.
 			
 			if (b && !(b = m_Wrapper.RunKernel(kernelIndex,
-									 gridW * IterBlockWidth(),//Total grid dims.
-									 gridH * IterBlockHeight(),
+									 gridW * IterBlockKernelWidth(),//Total grid dims.
+									 gridH * IterBlockKernelHeight(),
 									 1,
-									 IterBlockWidth(),//Individual block dims.
-									 IterBlockHeight(),
+									 IterBlockKernelWidth(),//Individual block dims.
+									 IterBlockKernelHeight(),
 									 1)))
 			{
 				m_Abort = true;
@ -876,7 +924,7 @@ template <typename T>
 eRenderStatus RendererCL<T>::RunDensityFilter()
 {
 	bool b = true;
-	Timing t(4);//, t2(4);
+	Timing t(4);// , t2(4);
 	m_DensityFilterCL = ConvertDensityFilter();
 	int kernelIndex = MakeAndGetDensityFilterProgram(Supersample(), m_DensityFilterCL.m_FilterWidth);
 	const char* loc = __FUNCTION__;
@ -907,26 +955,62 @@ eRenderStatus RendererCL<T>::RunDensityFilter()
 		//The other is to proces the entire image in multiple passes, and each pass processes blocks of pixels
 		//that are far enough apart such that their filters do not overlap.
 		//Do the latter.
+		//Gap is in terms of blocks: the number of blocks that must separate two blocks running at the same time.
 		unsigned int gapW = (unsigned int)ceil((m_DensityFilterCL.m_FilterWidth * 2.0) / (double)blockSizeW);
 		unsigned int chunkSizeW = gapW + 1;
 		unsigned int gapH = (unsigned int)ceil((m_DensityFilterCL.m_FilterWidth * 2.0) / (double)blockSizeH);
 		unsigned int chunkSizeH = gapH + 1;
-
 		double totalChunks = chunkSizeW * chunkSizeH;

 		if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_DEFilterParamsBufferName, (void*)&m_DensityFilterCL, sizeof(m_DensityFilterCL)))) { m_ErrorReport.push_back(loc); }

-		for (unsigned int row = 0; b && !m_Abort && row < chunkSizeH; row++)
+#ifdef ROW_ONLY_DE
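+		blockSizeW = 64;//These *must* both be divisible by 16 or else pixels will go missing. NOTE(review): blockSizeH is set to 1 on the next line, which is not divisible by 16 -- confirm the constraint for this path.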
+		blockSizeH = 1;
+		gapW = (unsigned int)ceil((m_DensityFilterCL.m_FilterWidth * 2.0) / (double)blockSizeW);
+		chunkSizeW = gapW + 1;
+		gapH = (unsigned int)ceil((m_DensityFilterCL.m_FilterWidth * 2.0) / (double)32);//Block height is 1, but iterates over 32 rows.
+		chunkSizeH = gapH + 1;
+		totalChunks = chunkSizeW * chunkSizeH;
+
+		OpenCLWrapper::MakeEvenGridDims(blockSizeW, blockSizeH, gridW, gridH);
+		gridW /= chunkSizeW;
+		gridH /= chunkSizeH;
+
+		for (unsigned int rowChunk = 0; b && !m_Abort && rowChunk < chunkSizeH; rowChunk++)
 		{
-			for (unsigned int col = 0; b && !m_Abort && col < chunkSizeW; col++)
+			for (unsigned int colChunk = 0; b && !m_Abort && colChunk < chunkSizeW; colChunk++)
 			{
 				//t2.Tic();
-				if (b && !(b = RunDensityFilterPrivate(kernelIndex, gridW, gridH, blockSizeW, blockSizeH, chunkSizeW, chunkSizeH, row, col))) { m_Abort = true; m_ErrorReport.push_back(loc); }
+				if (b && !(b = RunDensityFilterPrivate(kernelIndex, gridW, gridH, blockSizeW, blockSizeH, chunkSizeW, chunkSizeH, colChunk, rowChunk))) { m_Abort = true; m_ErrorReport.push_back(loc); }
 				//t2.Toc(loc);

 				if (b && m_Callback)
 				{
-					double percent = (double((row * chunkSizeW) + (col + 1)) / totalChunks) * 100.0;
+					double percent = (double((rowChunk * chunkSizeW) + (colChunk + 1)) / totalChunks) * 100.0;
+					double etaMs = ((100.0 - percent) / percent) * t.Toc();
+
+					if (!m_Callback->ProgressFunc(m_Ember, m_ProgressParameter, percent, 1, etaMs))
+						Abort();
+				}
+			}
+		}
+#else
+		OpenCLWrapper::MakeEvenGridDims(blockSizeW, blockSizeH, gridW, gridH);
+		gridW /= chunkSizeW;
+		gridH /= chunkSizeH;
+
+		for (unsigned int rowChunk = 0; b && !m_Abort && rowChunk < chunkSizeH; rowChunk++)
+		{
+			for (unsigned int colChunk = 0; b && !m_Abort && colChunk < chunkSizeW; colChunk++)
+			{
+				//t2.Tic();
+				if (b && !(b = RunDensityFilterPrivate(kernelIndex, gridW, gridH, blockSizeW, blockSizeH, chunkSizeW, chunkSizeH, colChunk, rowChunk))) { m_Abort = true; m_ErrorReport.push_back(loc); }
+				//t2.Toc(loc);
+
+				if (b && m_Callback)
+				{
+					double percent = (double((rowChunk * chunkSizeW) + (colChunk + 1)) / totalChunks) * 100.0;
 					double etaMs = ((100.0 - percent) / percent) * t.Toc();
 					
 					if (!m_Callback->ProgressFunc(m_Ember, m_ProgressParameter, percent, 1, etaMs))
@ -934,6 +1018,7 @@ eRenderStatus RendererCL<T>::RunDensityFilter()
 				}
 			}
 		}
+#endif

 		if (b && m_Callback)
 			m_Callback->ProgressFunc(m_Ember, m_ProgressParameter, 100.0, 1, 0.0);
@ -1084,14 +1169,15 @@ bool RendererCL<T>::ClearBuffer(const string& bufferName, unsigned int width, un
 /// <param name="gridH">Grid height</param>
 /// <param name="blockW">Block width</param>
 /// <param name="blockH">Block height</param>
-/// <param name="chunkSize">Chunk size (gap + 1)</param>
+/// <param name="chunkSizeW">Chunk size width (gapW + 1)</param>
+/// <param name="chunkSizeH">Chunk size height (gapH + 1)</param>
-/// <param name="rowParity">Row parity</param>
-/// <param name="colParity">Column parity</param>
+/// <param name="chunkW">Column chunk</param>
+/// <param name="chunkH">Row chunk</param>
 /// <returns>True if success, else false.</returns>
 template <typename T>
-bool RendererCL<T>::RunDensityFilterPrivate(unsigned int kernelIndex, unsigned int gridW, unsigned int gridH, unsigned int blockW, unsigned int blockH, unsigned int chunkSizeW, unsigned int chunkSizeH, unsigned int rowParity, unsigned int colParity)
+bool RendererCL<T>::RunDensityFilterPrivate(unsigned int kernelIndex, unsigned int gridW, unsigned int gridH, unsigned int blockW, unsigned int blockH, unsigned int chunkSizeW, unsigned int chunkSizeH, unsigned int chunkW, unsigned int chunkH)
 {
-	//Timing t;
+	//Timing t(4);
 	bool b = true;
 	unsigned int argIndex = 0;
 	const char* loc = __FUNCTION__;
@ -1104,8 +1190,8 @@ bool RendererCL<T>::RunDensityFilterPrivate(unsigned int kernelIndex, unsigned i
 	if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex, m_DECoefIndicesBufferName)))  { m_ErrorReport.push_back(loc); } argIndex++;//Coef indices.
 	if (b && !(b = m_Wrapper.SetArg(      kernelIndex, argIndex, chunkSizeW)))                 { m_ErrorReport.push_back(loc); } argIndex++;//Chunk size width (gapW + 1).
 	if (b && !(b = m_Wrapper.SetArg(      kernelIndex, argIndex, chunkSizeH)))                 { m_ErrorReport.push_back(loc); } argIndex++;//Chunk size height (gapH + 1).
-	if (b && !(b = m_Wrapper.SetArg(      kernelIndex, argIndex, rowParity)))                  { m_ErrorReport.push_back(loc); } argIndex++;//Row parity.
-	if (b && !(b = m_Wrapper.SetArg(      kernelIndex, argIndex, colParity)))                  { m_ErrorReport.push_back(loc); } argIndex++;//Col parity.
+	if (b && !(b = m_Wrapper.SetArg(      kernelIndex, argIndex, chunkW)))					   { m_ErrorReport.push_back(loc); } argIndex++;//Column chunk.
+	if (b && !(b = m_Wrapper.SetArg(      kernelIndex, argIndex, chunkH)))					   { m_ErrorReport.push_back(loc); } argIndex++;//Row chunk.
 	//t.Toc(__FUNCTION__ " set args");

 	//t.Tic();
@ -1270,60 +1356,57 @@ SpatialFilterCL<T> RendererCL<T>::ConvertSpatialFilter()

 /// <summary>
 /// Convert the host side Ember object into an EmberCL object
-/// for passing to OpenCL.
+/// and a vector of XformCL for passing to OpenCL.
 /// </summary>
 /// <param name="ember">The Ember object to convert</param>
-/// <returns>The EmberCL object</returns>
+/// <param name="emberCL">The converted EmberCL</param>
+/// <param name="xformsCL">The converted vector of XformCL. Must already be sized by the caller; only the first xformsCL.size() xforms are converted</param>
 template <typename T>
-EmberCL<T> RendererCL<T>::ConvertEmber(Ember<T>& ember)
+void RendererCL<T>::ConvertEmber(Ember<T>& ember, EmberCL<T>& emberCL, vector<XformCL<T>>& xformsCL)
 {
-	EmberCL<T> emberCL;
-
 	memset(&emberCL, 0, sizeof(EmberCL<T>));//Might not really be needed.

-	emberCL.m_RotA            = m_RotMat.A();
-	emberCL.m_RotB            = m_RotMat.B();
-	emberCL.m_RotD            = m_RotMat.D();
-	emberCL.m_RotE            = m_RotMat.E();
-	emberCL.m_CamMat		  = ember.m_CamMat;
-	emberCL.m_CenterX         = CenterX();
-	emberCL.m_CenterY		  = ember.m_RotCenterY;
-	emberCL.m_CamZPos		  = ember.m_CamZPos;
-	emberCL.m_CamPerspective  = ember.m_CamPerspective;
-	emberCL.m_CamYaw		  = ember.m_CamYaw;
-	emberCL.m_CamPitch		  = ember.m_CamPitch;
-	emberCL.m_CamDepthBlur	  = ember.m_CamDepthBlur;
-	emberCL.m_BlurCoef		  = ember.BlurCoef();
+	emberCL.m_RotA           = m_RotMat.A();
+	emberCL.m_RotB           = m_RotMat.B();
+	emberCL.m_RotD           = m_RotMat.D();
+	emberCL.m_RotE           = m_RotMat.E();
+	emberCL.m_CamMat		 = ember.m_CamMat;
+	emberCL.m_CenterX        = CenterX();
+	emberCL.m_CenterY		 = ember.m_RotCenterY;
+	emberCL.m_CamZPos		 = ember.m_CamZPos;
+	emberCL.m_CamPerspective = ember.m_CamPerspective;
+	emberCL.m_CamYaw		 = ember.m_CamYaw;
+	emberCL.m_CamPitch		 = ember.m_CamPitch;
+	emberCL.m_CamDepthBlur	 = ember.m_CamDepthBlur;
+	emberCL.m_BlurCoef		 = ember.BlurCoef();

-	for (unsigned int i = 0; i < ember.TotalXformCount() && i < MAX_CL_XFORM; i++)//Copy the relevant values for each xform, capped at the max.
+	for (unsigned int i = 0; i < ember.TotalXformCount() && i < xformsCL.size(); i++)
 	{
 		Xform<T>* xform = ember.GetTotalXform(i);

-		emberCL.m_Xforms[i].m_A = xform->m_Affine.A();
-		emberCL.m_Xforms[i].m_B = xform->m_Affine.B();
-		emberCL.m_Xforms[i].m_C = xform->m_Affine.C();
-		emberCL.m_Xforms[i].m_D = xform->m_Affine.D();
-		emberCL.m_Xforms[i].m_E = xform->m_Affine.E();
-		emberCL.m_Xforms[i].m_F = xform->m_Affine.F();
+		xformsCL[i].m_A = xform->m_Affine.A();
+		xformsCL[i].m_B = xform->m_Affine.B();
+		xformsCL[i].m_C = xform->m_Affine.C();
+		xformsCL[i].m_D = xform->m_Affine.D();
+		xformsCL[i].m_E = xform->m_Affine.E();
+		xformsCL[i].m_F = xform->m_Affine.F();

-		emberCL.m_Xforms[i].m_PostA = xform->m_Post.A();
-		emberCL.m_Xforms[i].m_PostB = xform->m_Post.B();
-		emberCL.m_Xforms[i].m_PostC = xform->m_Post.C();
-		emberCL.m_Xforms[i].m_PostD = xform->m_Post.D();
-		emberCL.m_Xforms[i].m_PostE = xform->m_Post.E();
-		emberCL.m_Xforms[i].m_PostF = xform->m_Post.F();
+		xformsCL[i].m_PostA = xform->m_Post.A();
+		xformsCL[i].m_PostB = xform->m_Post.B();
+		xformsCL[i].m_PostC = xform->m_Post.C();
+		xformsCL[i].m_PostD = xform->m_Post.D();
+		xformsCL[i].m_PostE = xform->m_Post.E();
+		xformsCL[i].m_PostF = xform->m_Post.F();

-		emberCL.m_Xforms[i].m_DirectColor = xform->m_DirectColor;
-		emberCL.m_Xforms[i].m_ColorSpeedCache = xform->ColorSpeedCache();
-		emberCL.m_Xforms[i].m_OneMinusColorCache = xform->OneMinusColorCache();
-		emberCL.m_Xforms[i].m_Opacity = xform->m_Opacity;
-		emberCL.m_Xforms[i].m_VizAdjusted = xform->VizAdjusted();
+		xformsCL[i].m_DirectColor = xform->m_DirectColor;
+		xformsCL[i].m_ColorSpeedCache = xform->ColorSpeedCache();
+		xformsCL[i].m_OneMinusColorCache = xform->OneMinusColorCache();
+		xformsCL[i].m_Opacity = xform->m_Opacity;
+		xformsCL[i].m_VizAdjusted = xform->VizAdjusted();

 		for (unsigned int varIndex = 0; varIndex < xform->TotalVariationCount() && varIndex < MAX_CL_VARS; varIndex++)//Assign all variation weights for this xform, with a max of MAX_CL_VARS.
-			emberCL.m_Xforms[i].m_VariationWeights[varIndex] = xform->GetVariation(varIndex)->m_Weight;
+			xformsCL[i].m_VariationWeights[varIndex] = xform->GetVariation(varIndex)->m_Weight;
 	}
-
-	return emberCL;
 }

 /// <summary>
--- a/Source/EmberCL/RendererCL.h
+++ b/Source/EmberCL/RendererCL.h
@ -42,14 +42,27 @@ public:
 	//Non-virtual member functions for OpenCL specific tasks.
 	bool Init(unsigned int platform, unsigned int device, bool shared, GLuint outputTexID);
 	bool SetOutputTexture(GLuint outputTexID);
-	inline unsigned int IterCountPerKernel();
-	inline unsigned int IterBlocksWide();
-	inline unsigned int IterBlocksHigh();
-	inline unsigned int IterBlockWidth();
-	inline unsigned int IterBlockHeight();
-	inline unsigned int IterGridWidth();
-	inline unsigned int IterGridHeight();
-	inline unsigned int TotalIterKernelCount();
+
+	//Iters per kernel/block/grid.
+	inline unsigned int IterCountPerKernel() const;
+	inline unsigned int IterCountPerBlock() const;
+	inline unsigned int IterCountPerGrid() const;
+
+	//Kernels per block.
+	inline unsigned int IterBlockKernelWidth() const;
+	inline unsigned int IterBlockKernelHeight() const;
+	inline unsigned int IterBlockKernelCount() const;
+
+	//Kernels per grid.
+	inline unsigned int IterGridKernelWidth() const;
+	inline unsigned int IterGridKernelHeight() const;
+	inline unsigned int IterGridKernelCount() const;
+
+	//Blocks per grid.
+	inline unsigned int IterGridBlockWidth() const;
+	inline unsigned int IterGridBlockHeight() const;
+	inline unsigned int IterGridBlockCount() const;
+
 	unsigned int PlatformIndex();
 	unsigned int DeviceIndex();
 	bool ReadHist();
@ -58,6 +71,9 @@ public:
 	bool ClearHist();
 	bool ClearAccum();
 	bool WritePoints(vector<PointCL<T>>& vec);
+#ifdef TEST_CL
+	bool WriteRandomPoints();
+#endif
 	string IterKernel();

 	//Virtual functions overridden from RendererCLBase.
@ -98,7 +114,7 @@ private:
 	eRenderStatus RunDensityFilter();
 	eRenderStatus RunFinalAccum();
 	bool ClearBuffer(const string& bufferName, unsigned int width, unsigned int height, unsigned int elementSize);
-	bool RunDensityFilterPrivate(unsigned int kernelIndex, unsigned int gridW, unsigned int gridH, unsigned int blockW, unsigned int blockH, unsigned int chunkSizeW, unsigned int chunkSizeH, unsigned int rowParity, unsigned int colParity);
+	bool RunDensityFilterPrivate(unsigned int kernelIndex, unsigned int gridW, unsigned int gridH, unsigned int blockW, unsigned int blockH, unsigned int chunkSizeW, unsigned int chunkSizeH, unsigned int chunkW, unsigned int chunkH);
 	int MakeAndGetDensityFilterProgram(size_t ss, unsigned int filterWidth);
 	int MakeAndGetFinalAccumProgram(T& alphaBase, T& alphaScale);
 	int MakeAndGetGammaCorrectionProgram();
@ -106,7 +122,7 @@ private:
 	//Private functions passing data to OpenCL programs.
 	DensityFilterCL<T> ConvertDensityFilter();
 	SpatialFilterCL<T> ConvertSpatialFilter();
-	EmberCL<T> ConvertEmber(Ember<T>& ember);
+	void ConvertEmber(Ember<T>& ember, EmberCL<T>& emberCL, vector<XformCL<T>>& xformsCL);
 	static CarToRasCL<T> ConvertCarToRas(const CarToRas<T>& carToRas);

 	bool m_Init;
@ -122,7 +138,9 @@ private:

 	//Buffer names.
 	string m_EmberBufferName;
+	string m_XformsBufferName;
 	string m_ParVarsBufferName;
+	string m_SeedsBufferName;
 	string m_DistBufferName;
 	string m_CarToRasBufferName;
 	string m_DEFilterParamsBufferName;
@ -146,6 +164,8 @@ private:
 	IMAGEGL2D m_AccumImage;
 	GLuint m_OutputTexID;
 	EmberCL<T> m_EmberCL;
+	vector<XformCL<T>> m_XformsCL;
+	vector<glm::highp_uvec2> m_Seeds;
 	Palette<float> m_Dmap;//Used instead of the base class' m_Dmap because OpenCL only supports float textures.
 	CarToRasCL<T> m_CarToRasCL;
 	DensityFilterCL<T> m_DensityFilterCL;