Initial source commit

2025-07-01 05:46:06 -04:00 · 2014-07-08 00:11:14 -07:00
parent 1493c22925
commit ef56c16b2b
214 changed files with 108754 additions and 0 deletions
--- a/Source/EmberCL/DEOpenCLKernelCreator.cpp
+++ b/Source/EmberCL/DEOpenCLKernelCreator.cpp
@ -0,0 +1,698 @@
+#include "EmberCLPch.h"
+#include "DEOpenCLKernelCreator.h"
+
+namespace EmberCLns	
+{
+/// <summary>
+/// Empty constructor that does nothing. The user must call the one which takes a bool
+/// argument before using this class.
+/// This constructor only exists so the class can be a member of a class.
+/// </summary>
+template <typename T>
+DEOpenCLKernelCreator<T>::DEOpenCLKernelCreator()
+{
+}
+
+/// <summary>
+/// Constructor for float template type that sets all kernel entry points as well as composes
+/// all kernel source strings.
+/// No program compilation is done here, the user must explicitly do it.
+/// The caller must specify whether they are using an nVidia or AMD card because it changes
+/// the amount of local memory available.
+/// </summary>
+/// <param name="nVidia">True if running on an nVidia card, else false.</param>
+template <>
+DEOpenCLKernelCreator<float>::DEOpenCLKernelCreator(bool nVidia)
+{
+	m_NVidia = nVidia;
+	m_LogScaleSumDEEntryPoint                 = "LogScaleSumDensityFilterKernel";
+	m_LogScaleAssignDEEntryPoint              = "LogScaleAssignDensityFilterKernel";
+	m_GaussianDEWithoutSsEntryPoint           = "GaussianDEWithoutSsKernel";
+	m_GaussianDESsWithScfEntryPoint           = "GaussianDESsWithScfKernel";
+	m_GaussianDESsWithoutScfEntryPoint        = "GaussianDESsWithoutScfKernel";
+	m_GaussianDEWithoutSsNoCacheEntryPoint    = "GaussianDEWithoutSsNoCacheKernel";
+	m_GaussianDESsWithScfNoCacheEntryPoint    = "GaussianDESsWithScfNoCacheKernel";
+	m_GaussianDESsWithoutScfNoCacheEntryPoint = "GaussianDESsWithoutScfNoCacheKernel";
+	m_LogScaleSumDEKernel                     = CreateLogScaleSumDEKernelString();
+	m_LogScaleAssignDEKernel                  = CreateLogScaleAssignDEKernelString();
+	m_GaussianDEWithoutSsKernel               = CreateGaussianDEKernel(1);
+	m_GaussianDESsWithScfKernel               = CreateGaussianDEKernel(2);
+	m_GaussianDESsWithoutScfKernel            = CreateGaussianDEKernel(3);
+	m_GaussianDEWithoutSsNoCacheKernel        = CreateGaussianDEKernelNoLocalCache(1);
+	m_GaussianDESsWithScfNoCacheKernel        = CreateGaussianDEKernelNoLocalCache(2);
+	m_GaussianDESsWithoutScfNoCacheKernel     = CreateGaussianDEKernelNoLocalCache(3);
+}
+
+/// <summary>
+/// Constructor for double template type that sets all kernel entry points as well as composes
+/// all kernel source strings.
+/// Note that no versions of kernels that use the cache are compiled because
+/// the cache is not big enough to hold double4.
+/// No program compilation is done here, the user must explicitly do it.
+/// Specifying true or false for the bool parameter has no effect since no local memory
+/// is used when instantiated with type double.
+/// </summary>
+/// <param name="nVidia">True if running on an nVidia card, else false. Ignored.</param>
+template <>
+DEOpenCLKernelCreator<double>::DEOpenCLKernelCreator(bool nVidia)
+{
+	m_NVidia = nVidia;
+	m_LogScaleSumDEEntryPoint                 = "LogScaleSumDensityFilterKernel";
+	m_LogScaleAssignDEEntryPoint              = "LogScaleAssignDensityFilterKernel";
+	m_GaussianDEWithoutSsNoCacheEntryPoint    = "GaussianDEWithoutSsNoCacheKernel";
+	m_GaussianDESsWithScfNoCacheEntryPoint    = "GaussianDESsWithScfNoCacheKernel";
+	m_GaussianDESsWithoutScfNoCacheEntryPoint = "GaussianDESsWithoutScfNoCacheKernel";
+	m_LogScaleSumDEKernel                     = CreateLogScaleSumDEKernelString();
+	m_LogScaleAssignDEKernel                  = CreateLogScaleAssignDEKernelString();
+	m_GaussianDEWithoutSsNoCacheKernel        = CreateGaussianDEKernelNoLocalCache(1);
+	m_GaussianDESsWithScfNoCacheKernel        = CreateGaussianDEKernelNoLocalCache(2);
+	m_GaussianDESsWithoutScfNoCacheKernel     = CreateGaussianDEKernelNoLocalCache(3);
+}
+
+/// <summary>
+/// Kernel source and entry point properties, getters only.
+/// </summary>
+
+template <typename T> string DEOpenCLKernelCreator<T>::LogScaleSumDEKernel() { return m_LogScaleSumDEKernel; }
+template <typename T> string DEOpenCLKernelCreator<T>::LogScaleSumDEEntryPoint() { return m_LogScaleSumDEEntryPoint; }
+template <typename T> string DEOpenCLKernelCreator<T>::LogScaleAssignDEKernel() { return m_LogScaleAssignDEKernel; }
+template <typename T> string DEOpenCLKernelCreator<T>::LogScaleAssignDEEntryPoint() { return m_LogScaleAssignDEEntryPoint; }
+
+/// <summary>
+/// Get the kernel source for the specified supersample and filterWidth.
+/// </summary>
+/// <param name="ss">The supersample being used</param>
+/// <param name="filterWidth">Filter width</param>
+/// <returns>The kernel source</returns>
+template <typename T>
+string DEOpenCLKernelCreator<T>::GaussianDEKernel(unsigned int ss, unsigned int filterWidth)
+{
+	if ((typeid(T) == typeid(double)) || (filterWidth > MaxDEFilterSize()))//Type double does not use cache.
+	{
+		if (ss > 1)
+		{
+			if (!(ss & 1))
+				return m_GaussianDESsWithScfNoCacheKernel;
+			else
+				return m_GaussianDESsWithoutScfNoCacheKernel;
+		}
+		else
+			return m_GaussianDEWithoutSsNoCacheKernel;
+	}
+	else
+	{
+		if (ss > 1)
+		{
+			if (!(ss & 1))
+				return m_GaussianDESsWithScfKernel;
+			else
+				return m_GaussianDESsWithoutScfKernel;
+		}
+		else
+			return m_GaussianDEWithoutSsKernel;
+	}
+}
+
+/// <summary>
+/// Get the kernel entry point for the specified supersample and filterWidth.
+/// </summary>
+/// <param name="ss">The supersample being used</param>
+/// <param name="filterWidth">Filter width</param>
+/// <returns>The name of the density estimation filtering entry point kernel function</returns>
+template <typename T>
+string DEOpenCLKernelCreator<T>::GaussianDEEntryPoint(unsigned int ss, unsigned int filterWidth)
+{
+	if ((typeid(T) == typeid(double)) || (filterWidth > MaxDEFilterSize()))//Type double does not use cache.
+	{
+		if (ss > 1)
+		{
+			if (!(ss & 1))
+				return m_GaussianDESsWithScfNoCacheEntryPoint;
+			else
+				return m_GaussianDESsWithoutScfNoCacheEntryPoint;
+		}
+		else
+			return m_GaussianDEWithoutSsNoCacheEntryPoint;
+	}
+	else
+	{
+		if (ss > 1)
+		{
+			if (!(ss & 1))
+				return m_GaussianDESsWithScfEntryPoint;
+			else
+				return m_GaussianDESsWithoutScfEntryPoint;
+		}
+		else
+			return m_GaussianDEWithoutSsEntryPoint;
+	}
+}
+
+/// <summary>
+/// Get the maximum filter size allowed for running the local memory version of density filtering
+/// Filters larger than this value will run the version without local memory caching.
+/// </summary>
+/// <returns>The maximum filter size allowed for running the local memory version of density filtering</returns>
+template <typename T>
+unsigned int DEOpenCLKernelCreator<T>::MaxDEFilterSize() { return 9; }//The true max would be (maxBoxSize - 1) / 2, but that's impractical because it can give us a tiny block size.
+
+/// <summary>
+/// Solve for the maximum filter radius.
+/// The final filter width is calculated by: (unsigned int)(ceil(m_MaxRad) * (T)m_Supersample) + (m_Supersample - 1);
+/// Must solve for what max rad should be in order to give a maximum final width of (maxBoxSize - 1) / 2, assuming
+/// a minimum block size of 1 which processes 1 pixel.
+/// Example: If a box size of 20 was allowed, a filter
+/// size of up to 9: (20 - 1) / 2 == (19 / 2) == 9 could be supported.
+/// This function is deprecated, the appropriate kernels take care of this problem now.
+/// </summary>
+/// <param name="maxBoxSize">Maximum size of the box.</param>
+/// <param name="desiredFilterSize">Size of the desired filter.</param>
+/// <param name="ss">The supersample being used</param>
+/// <returns>The maximum filter radius allowed</returns>
+template <typename T>
+T DEOpenCLKernelCreator<T>::SolveMaxDERad(unsigned int maxBoxSize, T desiredFilterSize, T ss)
+{
+	unsigned int finalFilterSize = (unsigned int)((ceil(desiredFilterSize) * ss) + (ss - 1.0));
+
+	//Return the desired size if the final size of it will fit.
+	if (finalFilterSize <= MaxDEFilterSize())
+		return desiredFilterSize;
+
+	//The final size doesn't fit, so scale the original down until it fits.
+	return (T)floor((MaxDEFilterSize() - (ss - 1.0)) / ss);
+}
+
+/// <summary>
+/// Determine the maximum filter box size based on the amount of local memory available
+/// to each block.
+/// </summary>
+/// <param name="localMem">The local memory available to a block</param>
+/// <returns>The maximum filter box size allowed</returns>
+template <typename T>
+unsigned int DEOpenCLKernelCreator<T>::SolveMaxBoxSize(unsigned int localMem)
+{
+	return (unsigned int)floor(sqrt(floor((T)localMem / 16.0)));//Divide by 16 because each element is float4.
+}
+
+/// <summary>
+/// Create the log scale kernel string, using summation.
+/// This means each cell will be added to, rather than just assigned.
+/// Since adding is slower than assigning, this should only be used when Passes > 1,
+/// otherwise use the kernel created from CreateLogScaleAssignDEKernelString().
+/// </summary>
+/// <returns>The kernel string</returns>
+template <typename T>
+string DEOpenCLKernelCreator<T>::CreateLogScaleSumDEKernelString()
+{
+	ostringstream os;
+ 
+	os	<<
+		ConstantDefinesString(typeid(T) == typeid(double)) <<
+		DensityFilterCLStructString <<
+		"__kernel void " << m_LogScaleSumDEEntryPoint << "(\n"
+		"	const __global real4* histogram,\n"
+		"	__global real4* accumulator,\n"
+		"	__constant DensityFilterCL* logFilter\n"
+		"\t)\n"
+		"{\n"
+		"	if ((GLOBAL_ID_X < logFilter->m_SuperRasW) && (GLOBAL_ID_Y < logFilter->m_SuperRasH))\n"
+		"	{\n"
+		"		uint index = (GLOBAL_ID_Y * logFilter->m_SuperRasW) + GLOBAL_ID_X;\n"
+		"\n"
+		"		if (histogram[index].w != 0)\n"
+		"		{\n"
+		"			real_t logScale = (logFilter->m_K1 * log(1.0 + histogram[index].w * logFilter->m_K2)) / histogram[index].w;\n"
+		"\n"
+		"			accumulator[index] += histogram[index] * logScale;\n"//Using a single real4 vector operation doubles the speed from doing each component individually.
+		"		}\n"
+		"\n"
+		"		barrier(CLK_GLOBAL_MEM_FENCE);\n"//Just to be safe. Makes no speed difference to do all of the time or only when there's a hit.
+		"	}\n"
+		"}\n";
+ 
+	return os.str();
+}
+
+/// <summary>
+/// Create the log scale kernel string, using assignment.
+/// Use this when Passes == 1.
+/// </summary>
+/// <returns>The kernel string</returns>
+template <typename T>
+string DEOpenCLKernelCreator<T>::CreateLogScaleAssignDEKernelString()
+{
+	ostringstream os;
+ 
+	os	<<
+		ConstantDefinesString(typeid(T) == typeid(double)) <<
+		DensityFilterCLStructString <<
+		"__kernel void " << m_LogScaleAssignDEEntryPoint << "(\n"
+		"	const __global real4* histogram,\n"
+		"	__global real4* accumulator,\n"
+		"	__constant DensityFilterCL* logFilter\n"
+		"\t)\n"
+		"{\n"
+		"	if ((GLOBAL_ID_X < logFilter->m_SuperRasW) && (GLOBAL_ID_Y < logFilter->m_SuperRasH))\n"
+		"	{\n"
+		"		uint index = (GLOBAL_ID_Y * logFilter->m_SuperRasW) + GLOBAL_ID_X;\n"
+		"\n"
+		"		if (histogram[index].w != 0)\n"
+		"		{\n"
+		"			real_t logScale = (logFilter->m_K1 * log(1.0 + histogram[index].w * logFilter->m_K2)) / histogram[index].w;\n"
+		"\n"
+		"			accumulator[index] = histogram[index] * logScale;\n"//Using a single real4 vector operation doubles the speed from doing each component individually.
+		"		}\n"
+		"\n"
+		"		barrier(CLK_GLOBAL_MEM_FENCE);\n"//Just to be safe. Makes no speed difference to do all of the time or only when there's a hit.
+		"	}\n"
+		"}\n";
+ 
+	return os.str();
+}
+
+/// <summary>
+/// Create the gaussian density filtering kernel string.
+/// 6 different methods of processing were tried before settling on this final and fastest 7th one.
+/// Each block processes a box and exits. No column or row advancements happen.
+/// The block accumulates to a temporary box and writes the contents to the global density filter buffer when done.
+/// Note this applies the filter from top to bottom row and not from the center outward like the CPU version does.
+/// This allows the image to be filtered without suffering from pixel loss due to race conditions.
+/// It is run in multiple passes that are spaced far enough apart on the image so as to not overlap.
+/// This allows writing to the global buffer without ever overlapping or using atomics.
+/// The supersample parameter will produce three different kernels.
+/// SS = 1, SS > 1 && SS even, SS > 1 && SS odd.
+/// The width of the kernl this runs in must be evenly divisible by 16 or else artifacts will occur.
+/// Note that because this function uses so many variables and is so complex, OpenCL can easily run
+/// out of resources in some cases. Certain variables had to be reused to condense the kernel footprint
+/// down enough to be able to run a block size of 32x32.
+/// For double precision, or for SS > 1, a size of 32x30 is used.
+/// Box width = (BLOCK_SIZE_X + (fw * 2)).
+/// Box height = (BLOCK_SIZE_Y + (fw * 2)).
+/// </summary>
+/// <param name="ss">The supersample being used</param>
+/// <returns>The kernel string</returns>
+template <typename T>
+string DEOpenCLKernelCreator<T>::CreateGaussianDEKernel(unsigned int ss)
+{
+	bool doSS = ss > 1;
+	bool doScf = !(ss & 1);
+	ostringstream os;
+
+	os <<
+		ConstantDefinesString(typeid(T) == typeid(double)) <<
+		DensityFilterCLStructString <<
+		UnionCLStructString <<
+		"__kernel void " << GaussianDEEntryPoint(ss, MaxDEFilterSize()) << "(\n" <<
+		"	const __global real4* histogram,\n"
+		"	__global real4reals* accumulator,\n"
+		"	__constant DensityFilterCL* densityFilter,\n"
+		"	const __global real_t* filterCoefs,\n"
+		"	const __global real_t* filterWidths,\n"
+		"	const __global uint* coefIndices,\n"
+		"	const uint chunkSizeW,\n"
+		"	const uint chunkSizeH,\n"
+		"	const uint rowParity,\n"
+		"	const uint colParity\n"
+		"\t)\n"
+		"{\n"
+		//Parity determines if this function should execute.
+		"	if ((GLOBAL_ID_X >= densityFilter->m_SuperRasW) ||\n"
+		"	    (GLOBAL_ID_Y >= densityFilter->m_SuperRasH) ||\n"
+		"	    ((BLOCK_ID_X % chunkSizeW) != colParity)    ||\n"
+		"	    ((BLOCK_ID_Y % chunkSizeH) != rowParity))     \n"
+		"			return;\n"
+		"\n";
+ 
+	if (doSS)
+	{
+		os <<
+		"	uint ss = (uint)floor((real_t)densityFilter->m_Supersample / 2.0);\n"
+		"	int densityBoxLeftX;\n"
+		"	int densityBoxRightX;\n"
+		"	int densityBoxTopY;\n"
+		"	int densityBoxBottomY;\n"
+		"\n";
+
+		if (doScf)
+		os <<
+		"	real_t scfact = pow(densityFilter->m_Supersample / (densityFilter->m_Supersample + 1.0), 2.0);\n";
+	}
+
+	//Compute the size of the temporary box which is the block width + 2 * filter width x block height + 2 * filter width.
+	//Ideally the block width and height are both 32. However, the height might be smaller if there isn't enough memory.
+	os <<
+		"	uint fullTempBoxWidth, fullTempBoxHeight;\n"
+		"	uint leftBound, rightBound, topBound, botBound;\n"
+		"	uint blockHistStartRow, blockHistEndRow, boxReadStartRow, boxReadEndRow;\n"
+		"	uint blockHistStartCol, boxReadStartCol, boxReadEndCol;\n"
+		"	uint accumWriteStartRow, accumWriteStartCol, colsToWrite;\n"
+
+		//If any of the variables above end up being made __local, init them here.
+		//At the moment, it's slower even though it's more memory efficient.
+		//"	if (THREAD_ID_X == 0 && THREAD_ID_Y == 0)\n"
+		//"	{\n"
+		//Init local vars here.
+		//"	}\n"
+		//"\n"
+		//"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+		"\n"
+		"	fullTempBoxWidth = BLOCK_SIZE_X + (densityFilter->m_FilterWidth * 2);\n"
+		"	fullTempBoxHeight = BLOCK_SIZE_Y + (densityFilter->m_FilterWidth * 2);\n"
+		//Compute the bounds of the area to be sampled, which is just the ends minus the super sample minus 1.
+		"	leftBound = densityFilter->m_Supersample - 1;\n"
+		"	rightBound = densityFilter->m_SuperRasW - (densityFilter->m_Supersample - 1);\n"
+		"	topBound = densityFilter->m_Supersample - 1;\n"
+		"	botBound = densityFilter->m_SuperRasH - (densityFilter->m_Supersample - 1);\n"
+		"\n"
+		//Start and end values are the indices in the histogram read from
+		//and written to in the accumulator. They are not the indices for the local block of data.
+		//Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds.
+		"	blockHistStartRow = min(botBound, topBound + (BLOCK_ID_Y * BLOCK_SIZE_Y));\n"//The first histogram row this block will process.
+		"	blockHistEndRow = min(botBound, blockHistStartRow + BLOCK_SIZE_Y);\n"//The last histogram row this block will process, clamped to the last row.
+		"	boxReadStartRow = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartRow);\n"//The first row in the local box to read from when writing back to the final accumulator for this block.
+		"	boxReadEndRow = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + BLOCK_SIZE_Y, densityFilter->m_SuperRasH - blockHistStartRow);\n"//The last row in the local box to read from  when writing back to the final accumulator for this block.
+		"	blockHistStartCol = min(rightBound, leftBound + (BLOCK_ID_X * BLOCK_SIZE_X));\n"//The first histogram column this block will process.
+		"	boxReadStartCol = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first box row this block will read from when copying to the accumulator.
+		"	boxReadEndCol = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + BLOCK_SIZE_X, densityFilter->m_SuperRasW - blockHistStartCol);\n"//The last box row this block will read from when copying to the accumulator.
+		"\n"
+		//Last, the indices in the global accumulator that the local bounds will be writing to.
+		"	accumWriteStartRow = blockHistStartRow - min(densityFilter->m_FilterWidth,  blockHistStartRow);\n"//Will be fw - 0 except for boundary columns, it will be less.
+		"	accumWriteStartCol = blockHistStartCol - min(densityFilter->m_FilterWidth,  blockHistStartCol);\n"
+		"	colsToWrite = ceil((real_t)(boxReadEndCol - boxReadStartCol) / (real_t)BLOCK_SIZE_X);\n"
+		"\n"
+		"	uint threadHistRow = blockHistStartRow + THREAD_ID_Y;\n"//The histogram row this individual thread will be reading from.
+		"	uint threadHistCol = blockHistStartCol + THREAD_ID_X;\n"//The histogram column this individual thread will be reading from.
+		"\n"
+
+		//Compute the center position in this local box to serve as the center position
+		//from which filter application offsets are computed.
+		//These are the local indices for the local data that are temporarily accumulated to before
+		//writing out to the global accumulator.
+		"	uint boxRow = densityFilter->m_FilterWidth + THREAD_ID_Y;\n"
+		"	uint boxCol = densityFilter->m_FilterWidth + THREAD_ID_X;\n"
+		"	uint colElementsToZero = ceil((real_t)fullTempBoxWidth / (real_t)(BLOCK_SIZE_X));\n"//Usually is 2.
+		"	int i, j, k;\n"
+		"	uint filterSelectInt, filterCoefIndex;\n"
+		"	real_t cacheLog;\n"
+		"	real_t filterSelect;\n"
+		"	real4 bucket;\n"
+		;
+
+		//This will be treated as having dimensions of (BLOCK_SIZE_X + (fw * 2)) x (BLOCK_SIZE_Y + (fw * 2)).
+	if (m_NVidia)
+		os << "	__local real4reals filterBox[3000];\n";
+	else
+		os << "	__local real4reals filterBox[1200];\n";
+
+	os <<
+		//Zero the temp buffers first. This splits the zeroization evenly across all threads (columns) in the first block row.
+		//This is a middle ground solution. Previous methods tried:
+		//Thread (0, 0) does all init. This works, but is the slowest.
+		//Init is divided among all threads. This is the fastest but exposes a severe flaw in OpenCL,
+		//in that it will not get executed by all threads before proceeding, despite the barrier statement
+		//below. As a result, strange artifacts will get left around because filtering gets executed on a temp
+		//box that has not been properly zeroized.
+		//The only way to do it and still achieve reasonable speed is to have the first row do it. This is
+		//most likely because the first row gets executed first, ensuring zeroization is done when the rest
+		//of the threads execute.
+		"\n"//Dummy test zeroization for debugging.
+		//"	if (THREAD_ID_Y == 0 && THREAD_ID_X == 0)\n"//First thread of the block takes the responsibility of zeroizing.
+		//"	{\n"
+		//"		for (k = 0; k < 2 * 1024; k++)\n"
+		//"		{\n"
+		//"			filterBox[k].m_Real4 = 0;\n"
+		//"		}\n"
+		//"	}\n"
+		"	if (THREAD_ID_Y == 0)\n"//First row of the block takes the responsibility of zeroizing.
+		"	{\n"
+		"		for (i = 0; i < fullTempBoxHeight; i++)\n"//Each column in the row iterates through all rows.
+		"		{\n"
+		"			for (j = 0; j < colElementsToZero && ((colElementsToZero * THREAD_ID_X) + j) < fullTempBoxWidth; j++)\n"//And zeroizes a few columns from that row.
+		"			{\n"
+		"				filterBox[(i * fullTempBoxWidth) + ((colElementsToZero * THREAD_ID_X) + j)].m_Real4 = 0;\n"
+		"			}\n"
+		"		}\n"
+		"	}\n"
+		"\n"
+		"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+		"\n"
+		"	if (threadHistRow < botBound && threadHistCol < rightBound)\n"
+		"	{\n"
+		"		bucket = histogram[(threadHistRow * densityFilter->m_SuperRasW) + threadHistCol];\n"
+		"\n"
+		"		if (bucket.w != 0)\n"
+		"		{\n"
+		"			cacheLog = (densityFilter->m_K1 * log(1.0 + bucket.w * densityFilter->m_K2)) / bucket.w;\n";
+ 
+	if (doSS)
+	{
+		os <<
+		"			filterSelect = 0;\n"
+		"			densityBoxLeftX = threadHistCol - min(threadHistCol, ss);\n"
+		"			densityBoxRightX = threadHistCol + min(ss, (densityFilter->m_SuperRasW - threadHistCol) - 1);\n"
+		"			densityBoxTopY = threadHistRow - min(threadHistRow, ss);\n"
+		"			densityBoxBottomY = threadHistRow + min(ss, (densityFilter->m_SuperRasH - threadHistRow) - 1);\n"
+		"\n"
+		"			for (j = densityBoxTopY; j <= densityBoxBottomY; j++)\n"
+		"			{\n"
+		"				for (i = densityBoxLeftX; i <= densityBoxRightX; i++)\n"
+		"				{\n"
+		"					filterSelect += histogram[i + (j * densityFilter->m_SuperRasW)].w;\n"
+		"				}\n"
+		"			}\n"
+		"\n";
+			
+		if (doScf)
+			os << "	filterSelect *= scfact;\n";
+	}
+	else
+	{
+		os
+			<< "	filterSelect = bucket.w;\n";
+	}
+ 
+	os <<
+		"\n"
+		"			if (filterSelect > densityFilter->m_MaxFilteredCounts)\n"
+		"				filterSelectInt = densityFilter->m_MaxFilterIndex;\n"
+		"			else if (filterSelect <= DE_THRESH)\n"
+		"				filterSelectInt = (int)ceil(filterSelect) - 1;\n"
+		"			else\n"
+		"				filterSelectInt = (int)DE_THRESH + (int)floor(pow((real_t)(filterSelect - DE_THRESH), densityFilter->m_Curve));\n"
+		"\n"
+		"			if (filterSelectInt > densityFilter->m_MaxFilterIndex)\n"
+		"				filterSelectInt = densityFilter->m_MaxFilterIndex;\n"
+		"\n"
+		"			filterCoefIndex = filterSelectInt * densityFilter->m_KernelSize;\n"
+		"\n"
+		//With this new method, only accumulate to the temp local buffer first. Write to the final accumulator last.
+		//For each loop through, note that there is a local memory barrier call inside of each call to AddToAccumNoCheck().
+		//If this isn't done, pixel errors occurr and even an out of resources error occurrs because too many writes are done to the same place in memory at once.
+		"			k = (int)densityFilter->m_FilterWidth;\n"//Need a signed int to use below, really is filter width, but reusing a variable to save space.
+		"\n"
+		"			for (j = -k; j <= k; j++)\n"
+		"			{\n"
+		"				for (i = -k; i <= k; i++)\n"
+		"				{\n"
+		"					filterSelectInt = filterCoefIndex + coefIndices[(abs(j) * (densityFilter->m_FilterWidth + 1)) + abs(i)];\n"//Really is filterCoeffIndexPlusOffset, but reusing a variable to save space.
+		"\n"
+		"					if (filterCoefs[filterSelectInt] != 0)\n"
+		"					{\n"
+		"						filterBox[(i + boxCol) + ((j + boxRow) * fullTempBoxWidth)].m_Real4 += (bucket * (filterCoefs[filterSelectInt] * cacheLog));\n"
+		"					}\n"
+		"				}\n"
+		"				barrier(CLK_LOCAL_MEM_FENCE);\n"//If this is the only barrier and the block size is exactly 16, it works perfectly. Otherwise, no chunks occur, but a many streaks.
+		"			}\n"
+		"		}\n"//bucket.w != 0.
+		"	}\n"//In bounds.
+		"\n"
+		"\n"
+		"	barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);\n"
+		"\n"
+		"	if (THREAD_ID_Y == 0)\n"
+		"	{\n"
+				//At this point, all threads in this block have applied the filter to their surrounding pixel and stored the results in the temp local box.
+				//Add the cells of it that are in bounds to the global accumulator.
+				//Compute offsets in local box to read from, and offsets into global accumulator to write to.
+				//Use a method here that is similar to the zeroization above: Each thread (column) in the first row iterates through all of the
+				//rows and adds a few columns to the accumulator.
+		"		for (i = boxReadStartRow, j = accumWriteStartRow; i < boxReadEndRow; i++, j++)\n"
+		"		{\n"
+		"			for (k = 0; k < colsToWrite; k++)\n"//Write a few columns.
+		"			{\n"
+		"				boxCol = (colsToWrite * THREAD_ID_X) + k;\n"//Really is colOffset, but reusing a variable to save space.
+		"\n"
+		"				if (boxReadStartCol + boxCol < boxReadEndCol)\n"
+		"					accumulator[(j * densityFilter->m_SuperRasW) + (accumWriteStartCol + boxCol)].m_Real4 += filterBox[(i * fullTempBoxWidth) + (boxReadStartCol + boxCol)].m_Real4;\n"
+		"			}\n"
+		"			barrier(CLK_GLOBAL_MEM_FENCE);\n"//This must be here or else chunks will go missing.
+		"		}\n"
+		"	}\n"
+		"}\n";
+
+	return os.str();
+}
+
+/// <summary>
+/// Create the gaussian density filtering kernel string, but use no local cache and perform
+/// all writes directly to the global density filtering buffer.
+/// Note this applies the filter from top to bottom row and not from the center outward like the CPU version does.
+/// This allows the image to be filtered without suffering from pixel loss due to race conditions.
+/// This is used for when the filter box is greater than can fit in the local cache.
+/// While the cached version is incredibly fast, this version offers no real gain over doing it
+/// on the CPU because the frequent global memory access brings performance to a crawl.
+/// The supersample parameter will produce three different kernels.
+/// SS = 1, SS > 1 && SS even, SS > 1 && SS odd.
+/// The width of the kernl this runs in must be evenly divisible by 16 or else artifacts will occur.
+/// Note that because this function uses so many variables and is so complex, OpenCL can easily run
+/// out of resources in some cases. Certain variables had to be reused to condense the kernel footprint
+/// down enough to be able to run a block size of 32x32.
+/// For double precision, or for SS > 1, a size of 32x30 is used.
+/// </summary>
+/// <param name="ss">The supersample being used</param>
+/// <returns>The kernel string</returns>
+template <typename T>
+string DEOpenCLKernelCreator<T>::CreateGaussianDEKernelNoLocalCache(unsigned int ss)
+{
+	bool doSS = ss > 1;
+	bool doScf = !(ss & 1);
+	ostringstream os;
+
+	os <<
+		ConstantDefinesString(typeid(T) == typeid(double)) <<
+		DensityFilterCLStructString <<
+		UnionCLStructString <<
+		AddToAccumWithCheckFunctionString <<
+		"__kernel void " << GaussianDEEntryPoint(ss, MaxDEFilterSize() + 1) << "(\n" <<
+		"	const __global real4* histogram,\n"
+		"	__global real4reals* accumulator,\n"
+		"	__constant DensityFilterCL* densityFilter,\n"
+		"	const __global real_t* filterCoefs,\n"
+		"	const __global real_t* filterWidths,\n"
+		"	const __global uint* coefIndices,\n"
+		"	const uint chunkSizeW,\n"
+		"	const uint chunkSizeH,\n"
+		"	const uint rowParity,\n"
+		"	const uint colParity\n"
+		"\t)\n"
+		"{\n"
+		//Parity determines if this function should execute.
+		"	if ((GLOBAL_ID_X >= densityFilter->m_SuperRasW) ||\n"
+		"	    (GLOBAL_ID_Y >= densityFilter->m_SuperRasH) ||\n"
+		"	    ((BLOCK_ID_X % chunkSizeW) != colParity)    ||\n"
+		"	    ((BLOCK_ID_Y % chunkSizeH) != rowParity))     \n"
+		"			return;\n"
+		"\n";
+ 
+	if (doSS)
+	{
+	os <<
+		"	uint ss = (uint)floor((real_t)densityFilter->m_Supersample / 2.0);\n"
+		"	int densityBoxLeftX;\n"
+		"	int densityBoxRightX;\n"
+		"	int densityBoxTopY;\n"
+		"	int densityBoxBottomY;\n";
+
+	if (doScf)
+	os << "	real_t scfact = pow((real_t)densityFilter->m_Supersample / ((real_t)densityFilter->m_Supersample + 1.0), 2.0);\n";
+	}
+ 
+	os <<
+		//Compute the bounds of the area to be sampled, which is just the ends minus the super sample minus 1.
+		"	uint leftBound = densityFilter->m_Supersample - 1;\n"
+		"	uint rightBound = densityFilter->m_SuperRasW - (densityFilter->m_Supersample - 1);\n"
+		"	uint topBound = densityFilter->m_Supersample - 1;\n"
+		"	uint botBound = densityFilter->m_SuperRasH - (densityFilter->m_Supersample - 1);\n"
+		"\n"
+		//Start and end values are the indices in the histogram read from and written to in the accumulator.
+		//Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds.
+		"	uint blockHistStartRow = min(botBound, topBound + (BLOCK_ID_Y * BLOCK_SIZE_Y));\n"//The first histogram row this block will process.
+		"	uint threadHistRow = blockHistStartRow + THREAD_ID_Y;\n"//The histogram row this individual thread will be reading from.
+		"\n"
+		"	uint blockHistStartCol = min(rightBound, leftBound + (BLOCK_ID_X * BLOCK_SIZE_X));\n"//The first histogram column this block will process.
+		"	uint threadHistCol = blockHistStartCol + THREAD_ID_X;\n"//The histogram column this individual thread will be reading from.
+		"\n"
+		"	int i, j;\n"
+		"	uint filterSelectInt, filterCoefIndex;\n"
+		"	real_t cacheLog;\n"
+		"	real_t logScale;\n"
+		"	real_t filterSelect;\n"
+		"	real4 bucket;\n"
+		"\n"
+		"	if (threadHistRow < botBound && threadHistCol < rightBound)\n"
+		"	{\n"
+		"		bucket = histogram[(threadHistRow * densityFilter->m_SuperRasW) + threadHistCol];\n"
+		"\n"
+		"		if (bucket.w != 0)\n"
+		"		{\n"
+		"			cacheLog = (densityFilter->m_K1 * log(1.0 + bucket.w * densityFilter->m_K2)) / bucket.w;\n";
+ 
+	if (doSS)
+	{
+		os <<
+		"			filterSelect = 0;\n"
+		"			densityBoxLeftX = threadHistCol - min(threadHistCol, ss);\n"
+		"			densityBoxRightX = threadHistCol + min(ss, (densityFilter->m_SuperRasW - threadHistCol) - 1);\n"
+		"			densityBoxTopY = threadHistRow - min(threadHistRow, ss);\n"
+		"			densityBoxBottomY = threadHistRow + min(ss, (densityFilter->m_SuperRasH - threadHistRow) - 1);\n"
+		"\n"
+		"			for (j = densityBoxTopY; j <= densityBoxBottomY; j++)\n"
+		"			{\n"
+		"				for (i = densityBoxLeftX; i <= densityBoxRightX; i++)\n"
+		"				{\n"
+		"					filterSelect += histogram[i + (j * densityFilter->m_SuperRasW)].w;\n"
+		"				}\n"
+		"			}\n"
+		"\n";
+			
+		if (doScf)
+		os << "		filterSelect *= scfact;\n";
+	}
+	else
+	{
+		os
+	<< "			filterSelect = bucket.w;\n";
+	}
+ 
+	os <<
+		"\n"
+		"			if (filterSelect > densityFilter->m_MaxFilteredCounts)\n"
+		"				filterSelectInt = densityFilter->m_MaxFilterIndex;\n"
+		"			else if (filterSelect <= DE_THRESH)\n"
+		"				filterSelectInt = (int)ceil(filterSelect) - 1;\n"
+		"			else\n"
+		"				filterSelectInt = (int)DE_THRESH + (int)floor(pow((real_t)(filterSelect - DE_THRESH), densityFilter->m_Curve));\n"
+		"\n"
+		"			if (filterSelectInt > densityFilter->m_MaxFilterIndex)\n"
+		"				filterSelectInt = densityFilter->m_MaxFilterIndex;\n"
+		"\n"
+		"			filterCoefIndex = filterSelectInt * densityFilter->m_KernelSize;\n"
+		"\n"
+		"			int fw = (int)densityFilter->m_FilterWidth;\n"//Need a signed int to use below.
+		"\n"
+		"			for (j = -fw; j <= fw; j++)\n"
+		"			{\n"
+		"				for (i = -fw; i <= fw; i++)\n"
+		"				{\n"
+		"					if (AccumCheck(densityFilter->m_SuperRasW, densityFilter->m_SuperRasH, threadHistCol, i, threadHistRow, j))\n"
+		"					{\n"
+		"						filterSelectInt = filterCoefIndex + coefIndices[(abs(j) * (densityFilter->m_FilterWidth + 1)) + abs(i)];\n"//Really is filterCoeffIndexPlusOffset, but reusing a variable to save space.
+		"\n"
+		"						if (filterCoefs[filterSelectInt] != 0)\n"
+		"						{\n"
+		"							accumulator[(i + threadHistCol) + ((j + threadHistRow) * densityFilter->m_SuperRasW)].m_Real4 += (bucket * (filterCoefs[filterSelectInt] * cacheLog));\n"
+		"						}\n"
+		"					}\n"
+		"\n"
+		"					barrier(CLK_GLOBAL_MEM_FENCE);\n"//Required to avoid streaks.
+		"				}\n"
+		"			}\n"
+		"		}\n"//bucket.w != 0.
+		"	}\n"//In bounds.
+		"\n"
+		//"	barrier(CLK_GLOBAL_MEM_FENCE);\n"//Just to be safe.
+		"}\n";
+
+	return os.str();
+}
+}
--- a/Source/EmberCL/DEOpenCLKernelCreator.h
+++ b/Source/EmberCL/DEOpenCLKernelCreator.h
@ -0,0 +1,89 @@
+#pragma once
+ 
+#include "EmberCLPch.h"
+#include "EmberCLStructs.h"
+#include "EmberCLFunctions.h"
+
+/// <summary>
+/// DEOpenCLKernelCreator class.
+/// </summary>
+
+namespace EmberCLns
+{
+/// <summary>
+/// Kernel creator for density filtering.
+/// This implements both basic log scale filtering
+/// as well as the full flam3 density estimation filtering
+/// in OpenCL.
+/// Several conditionals are present in the CPU version. They
+/// are stripped out of the kernels and instead a separate kernel
+/// is created for every possible case.
+/// If the filter width is 9 or less, then the entire process can be
+/// done in shared memory which is very fast.
+/// However, if the filter width is greater than 9, shared memory is not
+/// used and all filtering is done directly with main global VRAM. This
+/// ends up being not much faster than doing it on the CPU.
+/// String members are kept for the program source and entry points
+/// for each version of the program.
+/// Template argument expected to be float or double.
+/// </summary>
+template <typename T>
+class EMBERCL_API DEOpenCLKernelCreator
+{
+public:
+	DEOpenCLKernelCreator();
+	DEOpenCLKernelCreator(bool nVidia);
+
+	//Accessors.
+	string LogScaleSumDEKernel();
+	string LogScaleSumDEEntryPoint();
+	string LogScaleAssignDEKernel();
+	string LogScaleAssignDEEntryPoint();
+	string GaussianDEKernel(unsigned int ss, unsigned int filterWidth);
+	string GaussianDEEntryPoint(unsigned int ss, unsigned int filterWidth);
+
+	//Miscellaneous static functions.
+	static unsigned int MaxDEFilterSize();
+	static T SolveMaxDERad(unsigned int maxBoxSize, T desiredFilterSize, T ss);
+	static unsigned int SolveMaxBoxSize(unsigned int localMem);
+
+private:
+	//Kernel creators.
+	string CreateLogScaleSumDEKernelString();
+	string CreateLogScaleAssignDEKernelString();
+	string CreateGaussianDEKernel(unsigned int ss);
+	string CreateGaussianDEKernelNoLocalCache(unsigned int ss);
+
+	string m_LogScaleSumDEKernel;
+	string m_LogScaleSumDEEntryPoint;
+
+	string m_LogScaleAssignDEKernel;
+	string m_LogScaleAssignDEEntryPoint;
+
+	string m_GaussianDEWithoutSsKernel;
+	string m_GaussianDEWithoutSsEntryPoint;
+
+	string m_GaussianDESsWithScfKernel;
+	string m_GaussianDESsWithScfEntryPoint;
+
+	string m_GaussianDESsWithoutScfKernel;
+	string m_GaussianDESsWithoutScfEntryPoint;
+
+	string m_GaussianDEWithoutSsNoCacheKernel;
+	string m_GaussianDEWithoutSsNoCacheEntryPoint;
+
+	string m_GaussianDESsWithScfNoCacheKernel;
+	string m_GaussianDESsWithScfNoCacheEntryPoint;
+
+	string m_GaussianDESsWithoutScfNoCacheKernel;
+	string m_GaussianDESsWithoutScfNoCacheEntryPoint;
+
+	bool m_NVidia;
+};
+
+template EMBERCL_API class DEOpenCLKernelCreator<float>;
+
+#ifdef DO_DOUBLE
+	template EMBERCL_API class DEOpenCLKernelCreator<double>;
+#endif
+}
--- a/Source/EmberCL/DllMain.cpp
+++ b/Source/EmberCL/DllMain.cpp
@ -0,0 +1,20 @@
+#include "EmberCLPch.h"
+
+/// <summary>
+/// Generated by Visual Studio to make the DLL run properly.
+/// </summary>
+BOOL APIENTRY DllMain( HMODULE hModule,
+					   DWORD  ul_reason_for_call,
+					   LPVOID lpReserved
+					 )
+{
+	switch (ul_reason_for_call)
+	{
+	case DLL_PROCESS_ATTACH:
+	case DLL_THREAD_ATTACH:
+	case DLL_THREAD_DETACH:
+	case DLL_PROCESS_DETACH:
+		break;
+	}
+	return TRUE;
+}
--- a/Source/EmberCL/EmberCLFunctions.h
+++ b/Source/EmberCL/EmberCLFunctions.h
@ -0,0 +1,413 @@
+#pragma once
+
+#include "EmberCLPch.h"
+#include "EmberCLStructs.h"
+
+/// <summary>
+/// OpenCL global function strings.
+/// </summary>
+
+namespace EmberCLns
+{
+/// <summary>
+/// OpenCL equivalent of Palette::RgbToHsv().
+/// </summary>
+static const char* RgbToHsvFunctionString = 
+	//rgb 0 - 1,
+	//h 0 - 6, s 0 - 1, v 0 - 1
+	"static inline void RgbToHsv(real4* rgb, real4* hsv)\n"
+	"{\n"
+	"	real_t max, min, del, rc, gc, bc;\n"
+	"\n"
+	//Compute maximum of r, g, b.
+	"	if ((*rgb).x >= (*rgb).y)\n"
+	"	{\n"
+	"		if ((*rgb).x >= (*rgb).z)\n"
+	"			max = (*rgb).x;\n"
+	"		else\n"
+	"			max = (*rgb).z;\n"
+	"	}\n"
+	"	else\n"
+	"	{\n"
+	"		if ((*rgb).y >= (*rgb).z)\n"
+	"			max = (*rgb).y;\n"
+	"		else\n"
+	"			max = (*rgb).z;\n"
+	"	}\n"
+	"\n"
+	//Compute minimum of r, g, b.
+	"	if ((*rgb).x <= (*rgb).y)\n"
+	"	{\n"
+	"		if ((*rgb).x <= (*rgb).z)\n"
+	"			min = (*rgb).x;\n"
+	"		else\n"
+	"			min = (*rgb).z;\n"
+	"	}\n"
+	"	else\n"
+	"	{\n"
+	"		if ((*rgb).y <= (*rgb).z)\n"
+	"			min = (*rgb).y;\n"
+	"		else\n"
+	"			min = (*rgb).z;\n"
+	"	}\n"
+	"\n"
+	"	del = max - min;\n"
+	"	(*hsv).z = max;\n"
+	"\n"
+	"	if (max != 0)\n"
+	"		(*hsv).y = del / max;\n"
+	"	else\n"
+	"		(*hsv).y = 0;\n"
+	"\n"
+	"	(*hsv).x = 0;\n"
+	"	if ((*hsv).y != 0)\n"
+	"	{\n"
+	"		rc = (max - (*rgb).x) / del;\n"
+	"		gc = (max - (*rgb).y) / del;\n"
+	"		bc = (max - (*rgb).z) / del;\n"
+	"\n"
+	"		if ((*rgb).x == max)\n"
+	"			(*hsv).x = bc - gc;\n"
+	"		else if ((*rgb).y == max)\n"
+	"			(*hsv).x = 2 + rc - bc;\n"
+	"		else if ((*rgb).z == max)\n"
+	"			(*hsv).x = 4 + gc - rc;\n"
+	"\n"
+	"		if ((*hsv).x < 0)\n"
+	"			(*hsv).x += 6;\n"
+	"	}\n"
+	"}\n"
+	"\n";
+
+/// <summary>
+/// OpenCL equivalent of Palette::HsvToRgb().
+/// </summary>
+static const char* HsvToRgbFunctionString = 
+	//h 0 - 6, s 0 - 1, v 0 - 1
+	//rgb 0 - 1 
+	"static inline void HsvToRgb(real4* hsv, real4* rgb)\n"
+	"{\n"
+	"	int j;\n"
+	"	real_t f, p, q, t;\n"
+	"\n"
+	"	while ((*hsv).x >= 6)\n"
+	"		(*hsv).x = (*hsv).x - 6;\n"
+	"\n"
+	"	while ((*hsv).x <  0)\n"
+	"		(*hsv).x = (*hsv).x + 6;\n"
+	"\n"
+	"	j = (int)floor((*hsv).x);\n"
+	"	f = (*hsv).x - j;\n"
+	"	p = (*hsv).z * (1 - (*hsv).y);\n"
+	"	q = (*hsv).z * (1 - ((*hsv).y * f));\n"
+	"	t = (*hsv).z * (1 - ((*hsv).y * (1 - f)));\n"
+	"\n"
+	"	switch (j)\n"
+	"	{\n"
+	"		case 0:  (*rgb).x = (*hsv).z; (*rgb).y = t;		   (*rgb).z = p;	    break;\n"
+	"		case 1:  (*rgb).x = q;		  (*rgb).y = (*hsv).z; (*rgb).z = p;	    break;\n"
+	"		case 2:  (*rgb).x = p;		  (*rgb).y = (*hsv).z; (*rgb).z = t;	    break;\n"
+	"		case 3:  (*rgb).x = p;		  (*rgb).y = q;		   (*rgb).z = (*hsv).z; break;\n"
+	"		case 4:  (*rgb).x = t;		  (*rgb).y = p;		   (*rgb).z = (*hsv).z; break;\n"
+	"		case 5:  (*rgb).x = (*hsv).z; (*rgb).y = p;		   (*rgb).z = q;	    break;\n"
+	"		default: (*rgb).x = (*hsv).z; (*rgb).y = t;		   (*rgb).z = p;	    break;\n"
+	"	}\n"
+	"}\n"
+	"\n";
+
+/// <summary>
+/// OpenCL equivalent of Palette::CalcAlpha().
+/// </summary>
+static const char* CalcAlphaFunctionString = 
+	"static inline real_t CalcAlpha(real_t density, real_t gamma, real_t linrange)\n"//Not the slightest clue what this is doing.//DOC
+	"{\n"
+	"	real_t frac, alpha, funcval = pow(linrange, gamma);\n"
+		"\n"
+	"	if (density > 0)\n"
+	"	{\n"
+	"		if (density < linrange)\n"
+	"		{\n"
+	"			frac = density / linrange;\n"
+	"			alpha = (1.0 - frac) * density * (funcval / linrange) + frac * pow(density, gamma);\n"
+	"		}\n"
+	"		else\n"
+	"			alpha = pow(density, gamma);\n"
+	"	}\n"
+	"	else\n"
+	"		alpha = 0;\n"
+	"\n"
+	"	return alpha;\n"
+	"}\n"
+	"\n";
+
+/// <summary>
+/// Use MWC 64 from David Thomas at the Imperial College of London for
+/// random numbers in OpenCL, instead of ISAAC which was used
+/// for CPU rendering.
+/// </summary>
+static const char* RandFunctionString =
+	"enum { MWC64X_A = 4294883355u };\n\n"
+	"inline uint MwcNext(uint2* s)\n"
+	"{\n"
+	"	uint res = (*s).x ^ (*s).y;			\n"//Calculate the result.
+	"	uint hi = mul_hi((*s).x, MWC64X_A); \n"//Step the RNG.
+	"	(*s).x = (*s).x * MWC64X_A + (*s).y;\n"//Pack the state back up.
+	"	(*s).y = hi + ((*s).x < (*s).y);	\n"
+	"	return res;							\n"//Return the next result.
+	"}\n"
+	"\n"
+	"inline uint MwcNextRange(uint2* s, uint val)\n"
+	"{\n"
+	"	return (val == 0) ? MwcNext(s) : (MwcNext(s) % val);\n"
+	"}\n"
+	"\n"
+	"inline real_t MwcNext01(uint2* s)\n"
+	"{\n"
+	"	return MwcNext(s) * (1.0 / 4294967296.0);\n"
+	"}\n"
+	"\n"
+	"inline real_t MwcNextNeg1Pos1(uint2* s)\n"
+	"{\n"
+	"	real_t f = (real_t)MwcNext(s) / UINT_MAX;\n"
+	"	return -1.0 + (f * (1.0 - (-1.0)));\n"
+	"}\n"
+	"\n";
+
+/// <summary>
+/// OpenCL equivalent of the global ClampRef().
+/// </summary>
+static const char* ClampRealFunctionString =
+	"inline real_t Clamp(real_t val, real_t min, real_t max)\n"
+	"{\n"
+	"	if (val < min)\n"
+	"		return min;\n"
+	"	else if (val > max)\n"
+	"		return max;\n"
+	"	else\n"
+	"		return val;\n"
+	"}\n"
+	"\n"
+	"inline void ClampRef(real_t* val, real_t min, real_t max)\n"
+	"{\n"
+	"	if (*val < min)\n"
+	"		*val = min;\n"
+	"	else if (*val > max)\n"
+	"		*val = max;\n"
+	"}\n"
+	"\n"
+	"inline real_t ClampGte(real_t val, real_t gte)\n"
+	"{\n"
+	"	return (val < gte) ? gte : val;\n"
+	"}\n"
+	"\n";
+
+/// <summary>
+/// OpenCL equivalent of the global LRint().
+/// </summary>
+static const char* InlineMathFunctionsString = 
+	"inline real_t LRint(real_t x)\n"
+	"{\n"
+	"    intPrec temp = (x >= 0.0 ? (intPrec)(x + 0.5) : (intPrec)(x - 0.5));\n"
+	"    return (real_t)temp;\n"
+	"}\n"
+	"\n"
+	"inline real_t Round(real_t r)\n"
+	"{\n"
+	"	return (r > 0.0) ? floor(r + 0.5) : ceil(r - 0.5);\n"
+	"}\n"
+	"\n"
+	"inline real_t Sign(real_t v)\n"
+	"{\n"
+	"	return (v < 0.0) ? -1 : (v > 0.0) ? 1 : 0.0;\n"
+	"}\n"
+	"\n"
+	"inline real_t SignNz(real_t v)\n"
+	"{\n"
+	"	return (v < 0.0) ? -1.0 : 1.0;\n"
+	"}\n"
+	"\n"
+	"inline real_t Sqr(real_t v)\n"
+	"{\n"
+	"	return v * v;\n"
+	"}\n"
+	"\n"
+	"inline real_t SafeSqrt(real_t x)\n"
+	"{\n"
+	"	if (x <= 0.0)\n"
+	"		return 0.0;\n"
+	"\n"
+	"	return sqrt(x);\n"
+	"}\n"
+	"\n"
+	"inline real_t Cube(real_t v)\n"
+	"{\n"
+	"	return v * v * v;\n"
+	"}\n"
+	"\n"
+	"inline real_t Hypot(real_t x, real_t y)\n"
+	"{\n"
+	"	return sqrt(SQR(x) + SQR(y));\n"
+	"}\n"
+	"\n"
+	"inline real_t Spread(real_t x, real_t y)\n"
+	"{\n"
+	"	return Hypot(x, y) * ((x) > 0.0 ? 1.0 : -1.0);\n"
+	"}\n"
+	"\n"
+	"inline real_t Powq4(real_t x, real_t y)\n"
+	"{\n"
+	"	return pow(fabs(x), y) * SignNz(x);\n"
+	"}\n"
+	"\n"
+	"inline real_t Powq4c(real_t x, real_t y)\n"
+	"{\n"
+	"	return y == 1.0 ? x : Powq4(x, y);\n"
+	"}\n"
+	"\n"
+	"inline real_t Zeps(real_t x)\n"
+	"{\n"
+	"	return x == 0.0 ? EPS6 : x;\n"
+	"}\n"
+	"\n"
+	"inline real_t Lerp(real_t a, real_t b, real_t p)\n"
+	"{\n"
+	"	return a + (b - a) * p;\n"
+	"}\n"
+	"\n"
+	"inline real_t Fabsmod(real_t v)\n"
+	"{\n"
+	"	real_t dummy;\n"
+	"\n"
+	"	return modf(v, &dummy);\n"
+	"}\n"
+	"\n"
+	"inline real_t Fosc(real_t p, real_t amp, real_t ph)\n"
+	"{\n"
+	"	return 0.5 - cos(p * amp + ph) * 0.5;\n"
+	"}\n"
+	"\n"
+	"inline real_t Foscn(real_t p, real_t ph)\n"
+	"{\n"
+	"	return 0.5 - cos(p + ph) * 0.5;\n"
+	"}\n"
+	"\n"
+	"inline real_t LogScale(real_t x)\n"
+	"{\n"
+	"	return x == 0.0 ? 0.0 : log((fabs(x) + 1) * M_E) * SignNz(x) / M_E;\n"
+	"}\n"
+	"\n"
+	"inline real_t LogMap(real_t x)\n"
+	"{\n"
+	"	return x == 0.0 ? 0.0 : (M_E + log(x * M_E)) * 0.25 * SignNz(x);\n"
+	"}\n"
+	"\n";
+
+/// <summary>
+/// OpenCL equivalent Renderer::AddToAccum().
+/// </summary>
+static const char* AddToAccumWithCheckFunctionString = 
+	"inline bool AccumCheck(int superRasW, int superRasH, int i, int ii, int j, int jj)\n"
+	"{\n"
+	"	return (j + jj >= 0 && j + jj < superRasH && i + ii >= 0 && i + ii < superRasW);\n"
+	"}\n"
+	"\n";
+
+/// <summary>
+/// OpenCL equivalent various CarToRas member functions.
+/// </summary>
+static const char* CarToRasFunctionString = 
+	"inline void CarToRasConvertPointToSingle(__constant CarToRasCL* carToRas, Point* point, unsigned int* singleBufferIndex)\n"
+	"{\n"
+	"	*singleBufferIndex = (unsigned int)(carToRas->m_PixPerImageUnitW * point->m_X - carToRas->m_RasLlX) + (carToRas->m_RasWidth * (unsigned int)(carToRas->m_PixPerImageUnitH * point->m_Y - carToRas->m_RasLlY));\n"
+	"}\n"
+	"\n"
+	"inline bool CarToRasInBounds(__constant CarToRasCL* carToRas, Point* point)\n"
+	"{\n"
+	"	return point->m_X >= carToRas->m_CarLlX &&\n"
+	"		point->m_X < carToRas->m_CarUrX &&\n"
+	"		point->m_Y < carToRas->m_CarUrY &&\n"
+	"		point->m_Y >= carToRas->m_CarLlY;\n"
+	"}\n"
+	"\n";
+
+static string AtomicString(bool doublePrecision, bool dp64AtomicSupport)
+{
+	ostringstream os;
+
+	//If they want single precision, or if they want double precision and have dp atomic support.
+	if (!doublePrecision || dp64AtomicSupport)
+	{
+		os <<
+		"void AtomicAdd(volatile __global real_t* source, const real_t operand)\n"
+		"{\n"
+		"	union\n"
+		"	{\n"
+		"		atomi intVal;\n"
+		"		real_t realVal;\n"
+		"	} newVal;\n"
+		"\n"
+		"	union\n"
+		"	{\n"
+		"		atomi intVal;\n"
+		"		real_t realVal;\n"
+		"	} prevVal;\n"
+		"\n"
+		"	do\n"
+		"	{\n"
+		"		prevVal.realVal = *source;\n"
+		"		newVal.realVal = prevVal.realVal + operand;\n"
+		"	} while (atomic_cmpxchg((volatile __global atomi*)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);\n"
+		"}\n";
+	}
+	else//They want double precision and do not have dp atomic support.
+	{
+		os <<
+		"void AtomicAdd(volatile __global real_t* source, const real_t operand)\n"
+		"{\n"
+		"	union\n"
+		"	{\n"
+		"		uint intVal[2];\n"
+		"		real_t realVal;\n"
+		"	} newVal;\n"
+		"\n"
+		"	union\n"
+		"	{\n"
+		"		uint intVal[2];\n"
+		"		real_t realVal;\n"
+		"	} prevVal;\n"
+		"\n"
+		"	do\n"
+		"	{\n"
+		"		prevVal.realVal = *source;\n"
+		"		newVal.realVal = prevVal.realVal + operand;\n"
+		"	} while ((atomic_cmpxchg((volatile __global uint*)source, prevVal.intVal[0], newVal.intVal[0])     != prevVal.intVal[0]) ||\n"
+		"			 (atomic_cmpxchg((volatile __global uint*)source + 1, prevVal.intVal[1], newVal.intVal[1]) != prevVal.intVal[1]));\n"
+		"}\n";
+	}
+
+	return os.str();
+}
+
+#ifdef GRAVEYARD
+/*"void AtomicLocalAdd(volatile __local real_t* source, const real_t operand)\n"
+	"{\n"
+	"	union\n"
+	"	{\n"
+	"		atomi intVal;\n"
+	"		real_t realVal;\n"
+	"	} newVal;\n"
+	"\n"
+	"	union\n"
+	"	{\n"
+	"		atomi intVal;\n"
+	"		real_t realVal;\n"
+	"	} prevVal;\n"
+	"\n"
+	"	do\n"
+	"	{\n"
+	"		prevVal.realVal = *source;\n"
+	"		newVal.realVal = prevVal.realVal + operand;\n"
+	"	} while (atomic_cmpxchg((volatile __local atomi*)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);\n"
+	"}\n"*/
+#endif
+}
--- a/Source/EmberCL/EmberCLPch.h
+++ b/Source/EmberCL/EmberCLPch.h
@ -0,0 +1,39 @@
+#pragma once
+
+/// <summary>
+/// Precompiled header file. Place all system includes here with appropriate #defines for different operating systems and compilers.
+/// </summary>
+
+#define NOMINMAX
+#define WIN32_LEAN_AND_MEAN//Exclude rarely-used stuff from Windows headers.
+#define _USE_MATH_DEFINES
+
+#ifdef _WIN32
+	#include <windows.h>
+	#include <SDKDDKVer.h>
+#endif
+
+#include <utility>
+#include <CL/cl.hpp>
+
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <iterator>
+#include <time.h>
+
+#include "Timing.h"
+#include "Renderer.h"
+
+#if defined(BUILDING_EMBERCL)
+#define EMBERCL_API __declspec(dllexport)
+#else
+#define EMBERCL_API __declspec(dllimport)
+#endif
+
+using namespace std;
+using namespace EmberNs;
+//#define TEST_CL 1
--- a/Source/EmberCL/EmberCLStructs.h
+++ b/Source/EmberCL/EmberCLStructs.h
@ -0,0 +1,383 @@
+#pragma once
+
+#include "EmberCLPch.h"
+
+/// <summary>
+/// Various data structures defined for the CPU and OpenCL.
+/// These are stripped down versions of THE classes in Ember, for use with OpenCL.
+/// Their sole purpose is to pass values from the host to the device.
+/// They retain most of the member variables, but do not contain the functions.
+/// Visual Studio defaults to alighment of 16, but it's made explicit in case another compiler is used.
+/// This must match the alignment specified in the kernel.
+/// </summary>
+
+namespace EmberCLns
+{
+#define ALIGN __declspec(align(16))//These two must always match.
+#define ALIGN_CL "((aligned (16)))"//The extra parens are necessary.
+
+/// <summary>
+/// Various constants needed for rendering.
+/// </summary>
+static string ConstantDefinesString(bool doublePrecision)
+{
+	ostringstream os;
+
+	if (doublePrecision)
+	{
+		os << "#if defined(cl_amd_fp64)\n"//AMD extension available?
+		   << "	#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n"
+		   << "#endif\n"
+		   << "#if defined(cl_khr_fp64)\n"//Khronos extension available?
+		   << "	#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
+		   << "#endif\n"
+		   << "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n"//Only supported on nVidia.
+		   << "typedef long intPrec;\n"
+		   << "typedef ulong atomi;\n"
+		   << "typedef double real_t;\n"
+		   << "typedef double4 real4;\n";
+	}
+	else
+	{
+		os << "typedef int intPrec;\n"
+			  "typedef unsigned int atomi;\n"
+			  "typedef float real_t;\n"
+			  "typedef float4 real4;\n";
+	}
+
+	os <<
+		"typedef          long int int64;\n"
+		"typedef unsigned long int uint64;\n"
+		"\n"
+		"#define EPS ((1e-10))\n"//May need to change this, it might not be enough in some cases. Maybe try 1e-9 if things look funny when close to zero.
+		"#define EPS6 ((1e-6))\n"
+		"\n"
+		"//The number of threads per block used in the iteration function. Don't change\n"
+		"//it lightly; the block size is hard coded to be exactly 32 x 8.\n"
+		"#define NTHREADS 256u\n"
+		"#define THREADS_PER_WARP 32u\n"
+		"#define NWARPS (NTHREADS / THREADS_PER_WARP)\n"
+		"#define COLORMAP_LENGTH 256u\n"
+		"#define COLORMAP_LENGTH_MINUS_1 255u\n"
+		"#define DE_THRESH 100u\n"
+		"#define BadVal(x) (((x) != (x)) || ((x) > 1e10) || ((x) < -1e10))\n"
+		"#define Rint(A) floor((A) + (((A) < 0) ? -0.5 : 0.5))\n"
+		"#define SQR(x) ((x) * (x))\n"
+		"#define CUBE(x) ((x) * (x) * (x))\n"
+		"#define M_2PI (M_PI * 2)\n"
+		"#define M_3PI (M_PI * 3)\n"
+		"#define SQRT5 2.2360679774997896964091736687313\n"
+		"#define M_PHI 1.61803398874989484820458683436563\n"
+		"#define DEG_2_RAD (M_PI / 180)\n"
+		"\n"
+		"//Index in each dimension of a thread within a block.\n"
+		"#define THREAD_ID_X   (get_local_id(0))\n"
+		"#define THREAD_ID_Y   (get_local_id(1))\n"
+		"#define THREAD_ID_Z   (get_local_id(2))\n"
+		"\n"
+		"//Index in each dimension of a block within a grid.\n"
+		"#define BLOCK_ID_X    (get_group_id(0))\n"
+		"#define BLOCK_ID_Y    (get_group_id(1))\n"
+		"#define BLOCK_ID_Z    (get_group_id(2))\n"
+		"\n"
+		"//Absolute index in each dimension of a thread within a grid.\n"
+		"#define GLOBAL_ID_X   (get_global_id(0))\n"
+		"#define GLOBAL_ID_Y   (get_global_id(1))\n"
+		"#define GLOBAL_ID_Z   (get_global_id(2))\n"
+		"\n"
+		"//Dimensions of a block.\n"
+		"#define BLOCK_SIZE_X  (get_local_size(0))\n"
+		"#define BLOCK_SIZE_Y  (get_local_size(1))\n"
+		"#define BLOCK_SIZE_Z  (get_local_size(2))\n"
+		"\n"
+		"//Dimensions of a grid, in terms of blocks.\n"
+		"#define GRID_SIZE_X   (get_num_groups(0))\n"
+		"#define GRID_SIZE_Y   (get_num_groups(1))\n"
+		"#define GRID_SIZE_Z   (get_num_groups(2))\n"
+		"\n"
+		"//Dimensions of a grid, in terms of threads.\n"
+		"#define GLOBAL_SIZE_X (get_global_size(0))\n"
+		"#define GLOBAL_SIZE_Y (get_global_size(1))\n"
+		"#define GLOBAL_SIZE_Z (get_global_size(2))\n"
+		"\n"
+		"#define INDEX_IN_BLOCK_2D (THREAD_ID_Y * BLOCK_SIZE_X + THREAD_ID_X)\n"
+		"#define INDEX_IN_BLOCK_3D ((BLOCK_SIZE_X * BLOCK_SIZE_Y * THREAD_ID_Z) + INDEX_IN_BLOCK_2D)\n"
+		"\n"
+		"#define INDEX_IN_GRID_2D (GLOBAL_ID_Y * GLOBAL_SIZE_X + GLOBAL_ID_X)\n"
+		"#define INDEX_IN_GRID_3D ((GLOBAL_SIZE_X * GLOBAL_SIZE_Y * GLOBAL_ID_Z) + INDEX_IN_GRID_2D)\n"
+		"\n";
+
+	return os.str();
+}
+
+/// <summary>
+/// A point structure on the host that maps to the one used on the device to iterate in OpenCL.
+/// It might seem better to use vec4, however 2D palettes and even 3D coordinates may eventually
+/// be supported, which will make it more than 4 members.
+/// </summary>
+template <typename T>
+struct ALIGN PointCL
+{
+	T m_X;
+	T m_Y;
+	T m_Z;
+	T m_ColorX;
+	T m_LastXfUsed;
+};
+
+/// <summary>
+/// The point structure used to iterate in OpenCL.
+/// It might seem better to use float4, however 2D palettes and even 3D coordinates may eventually
+/// be supported, which will make it more than 4 members.
+/// </summary>
+static const char* PointCLStructString =
+"typedef struct __attribute__ " ALIGN_CL " _Point\n"
+"{\n"
+"	real_t m_X;\n"
+"	real_t m_Y;\n"
+"	real_t m_Z;\n"
+"	real_t m_ColorX;\n"
+"	uint m_LastXfUsed;\n"
+"} Point;\n"
+"\n";
+
+#define MAX_CL_VARS 8//These must always match.
+#define MAX_CL_VARS_STRING "8"
+
+/// <summary>
+/// A structure on the host used to hold all of the needed information for an xform used on the device to iterate in OpenCL.
+/// Template argument expected to be float or double.
+/// </summary>
+template <typename T>
+struct ALIGN XformCL
+{
+	T m_A, m_B, m_C, m_D, m_E, m_F;//24 (48)
+	T m_VariationWeights[MAX_CL_VARS];//56 (112)
+	T m_PostA, m_PostB, m_PostC, m_PostD, m_PostE, m_PostF;//80 (160)
+	T m_DirectColor;//84 (168)
+	T m_ColorSpeedCache;//88 (176)
+	T m_OneMinusColorCache;//92 (184)
+	T m_Opacity;//96 (192)
+	T m_VizAdjusted;//100 (200)
+};
+
+/// <summary>
+/// The xform structure used to iterate in OpenCL.
+/// </summary>
+static const char* XformCLStructString =
+"typedef struct __attribute__ " ALIGN_CL " _XformCL\n"
+"{\n"
+"	real_t m_A, m_B, m_C, m_D, m_E, m_F;\n"
+"	real_t m_VariationWeights[" MAX_CL_VARS_STRING "];\n"
+"	real_t m_PostA, m_PostB, m_PostC, m_PostD, m_PostE, m_PostF;\n"
+"	real_t m_DirectColor;\n"
+"	real_t m_ColorSpeedCache;\n"
+"	real_t m_OneMinusColorCache;\n"
+"	real_t m_Opacity;\n"
+"	real_t m_VizAdjusted;\n"
+"} XformCL;\n"
+"\n";
+
+#define MAX_CL_XFORM 21//These must always match.
+#define MAX_CL_XFORM_STRING "21"
+
+/// <summary>
+/// A structure on the host used to hold all of the needed information for an ember used on the device to iterate in OpenCL.
+/// Template argument expected to be float or double.
+/// </summary>
+template <typename T>
+struct ALIGN EmberCL
+{
+	unsigned int m_FinalXformIndex;
+	XformCL<T> m_Xforms[MAX_CL_XFORM];
+	T m_CamZPos;
+	T m_CamPerspective;
+	T m_CamYaw;
+	T m_CamPitch;
+	T m_CamDepthBlur;
+	T m_BlurCoef;
+	m3T m_CamMat;
+	T m_CenterX, m_CenterY;
+	T m_RotA, m_RotB, m_RotD, m_RotE;
+};
+
+/// <summary>
+/// The ember structure used to iterate in OpenCL.
+/// </summary>
+static const char* EmberCLStructString =
+"typedef struct __attribute__ " ALIGN_CL " _EmberCL\n"
+"{\n"
+"	uint m_FinalXformIndex;\n"
+"	XformCL m_Xforms[" MAX_CL_XFORM_STRING "];\n"
+"	real_t m_CamZPos;\n"
+"	real_t m_CamPerspective;\n"
+"	real_t m_CamYaw;\n"
+"	real_t m_CamPitch;\n"
+"	real_t m_CamDepthBlur;\n"
+"	real_t m_BlurCoef;\n"
+"	real_t m_C00;\n"
+"	real_t m_C01;\n"
+"	real_t m_C02;\n"
+"	real_t m_C10;\n"
+"	real_t m_C11;\n"
+"	real_t m_C12;\n"
+"	real_t m_C20;\n"
+"	real_t m_C21;\n"
+"	real_t m_C22;\n"
+"	real_t m_CenterX, m_CenterY;\n"
+"	real_t m_RotA, m_RotB, m_RotD, m_RotE;\n"
+"} EmberCL;\n"
+"\n";
+
+/// <summary>
+/// A structure on the host used to hold all of the needed information for cartesian to raster mapping used on the device to iterate in OpenCL.
+/// Template argument expected to be float or double.
+/// </summary>
+template <typename T>
+struct ALIGN CarToRasCL
+{
+	T m_PixPerImageUnitW, m_RasLlX;
+	unsigned int m_RasWidth;
+	T m_PixPerImageUnitH, m_RasLlY;
+	T m_CarLlX, m_CarUrX, m_CarUrY, m_CarLlY;
+};
+
+/// <summary>
+/// The cartesian to raster structure used to iterate in OpenCL.
+/// </summary>
+static const char* CarToRasCLStructString =
+"typedef struct __attribute__ " ALIGN_CL " _CarToRasCL\n"
+"{\n"
+"	real_t m_PixPerImageUnitW, m_RasLlX;\n"
+"	uint m_RasWidth;\n"
+"	real_t m_PixPerImageUnitH, m_RasLlY;\n"
+"	real_t m_CarLlX, m_CarUrX, m_CarUrY, m_CarLlY;\n"
+"} CarToRasCL;\n"
+"\n";
+
+/// <summary>
+/// A structure on the host used to hold all of the needed information for density filtering used on the device to iterate in OpenCL.
+/// Note that the actual filter buffer is held elsewhere.
+/// Template argument expected to be float or double.
+/// </summary>
+template <typename T>
+struct ALIGN DensityFilterCL
+{
+	T m_Curve;
+	T m_K1;
+	T m_K2;
+	unsigned int m_Supersample;
+	unsigned int m_SuperRasW;
+	unsigned int m_SuperRasH;
+	unsigned int m_KernelSize;
+	unsigned int m_MaxFilterIndex;
+	unsigned int m_MaxFilteredCounts;
+	unsigned int m_FilterWidth;
+};
+
+/// <summary>
+/// The density filtering structure used to iterate in OpenCL.
+/// Note that the actual filter buffer is held elsewhere.
+/// </summary>
+static const char* DensityFilterCLStructString =
+"typedef struct __attribute__ " ALIGN_CL " _DensityFilterCL\n"
+"{\n"
+"	real_t m_Curve;\n"
+"	real_t m_K1;\n"
+"	real_t m_K2;\n"
+"	uint m_Supersample;\n"
+"	uint m_SuperRasW;\n"
+"	uint m_SuperRasH;\n"
+"	uint m_KernelSize;\n"
+"	uint m_MaxFilterIndex;\n"
+"	uint m_MaxFilteredCounts;\n"
+"	uint m_FilterWidth;\n"
+"} DensityFilterCL;\n"
+"\n";
+
+/// <summary>
+/// A structure on the host used to hold all of the needed information for spatial filtering used on the device to iterate in OpenCL.
+/// Note that the actual filter buffer is held elsewhere.
+/// </summary>
+template <typename T>
+struct ALIGN SpatialFilterCL
+{
+	unsigned int m_SuperRasW;
+	unsigned int m_SuperRasH;
+	unsigned int m_FinalRasW;
+	unsigned int m_FinalRasH;
+	unsigned int m_Supersample;
+	unsigned int m_FilterWidth;
+	unsigned int m_NumChannels;
+	unsigned int m_BytesPerChannel;
+	unsigned int m_DensityFilterOffset;
+	unsigned int m_Transparency;
+	T m_Vibrancy;
+	T m_HighlightPower;
+	T m_Gamma;
+	T m_LinRange;
+	Color<T> m_Background;
+};
+
+/// <summary>
+/// The spatial filtering structure used to iterate in OpenCL.
+/// Note that the actual filter buffer is held elsewhere.
+/// </summary>
+static const char* SpatialFilterCLStructString =
+"typedef struct __attribute__ ((aligned (16))) _SpatialFilterCL\n"
+"{\n"
+"	uint m_SuperRasW;\n"
+"	uint m_SuperRasH;\n"
+"	uint m_FinalRasW;\n"
+"	uint m_FinalRasH;\n"
+"	uint m_Supersample;\n"
+"	uint m_FilterWidth;\n"
+"	uint m_NumChannels;\n"
+"	uint m_BytesPerChannel;\n"
+"	uint m_DensityFilterOffset;\n"
+"	uint m_Transparency;\n"
+"	real_t m_Vibrancy;\n"
+"	real_t m_HighlightPower;\n"
+"	real_t m_Gamma;\n"
+"	real_t m_LinRange;\n"
+"	real_t m_Background[4];\n"//For some reason, using float4/double4 here does not align no matter what. So just use an array of 4.
+"} SpatialFilterCL;\n"
+"\n";
+
+/// <summary>
+/// EmberCL makes extensive use of the build in vector types, however accessing
+/// their members as a buffer is not natively supported.
+/// Declaring them in a union with a buffer resolves this problem.
+/// </summary>
+static const char* UnionCLStructString =
+"typedef union\n"
+"{\n"
+"	uchar3 m_Uchar3;\n"
+"	uchar m_Uchars[3];\n"
+"} uchar3uchars;\n"
+"\n"
+"typedef union\n"
+"{\n"
+"	uchar4 m_Uchar4;\n"
+"	uchar m_Uchars[4];\n"
+"} uchar4uchars;\n"
+"\n"
+"typedef union\n"
+"{\n"
+"	uint4 m_Uint4;\n"
+"	uint m_Uints[4];\n"
+"} uint4uints;\n"
+"\n"
+"typedef union\n"//Use in places where float is required.
+"{\n"
+"	float4 m_Float4;\n"
+"	float m_Floats[4];\n"
+"} float4floats;\n"
+"\n"
+"typedef union\n"//Use in places where float or double can be used depending on the template type.
+"{\n"
+"	real4 m_Real4;\n"
+"	real_t m_Reals[4];\n"
+"} real4reals;\n"
+"\n";
+}
--- a/Source/EmberCL/FinalAccumOpenCLKernelCreator.cpp
+++ b/Source/EmberCL/FinalAccumOpenCLKernelCreator.cpp
@ -0,0 +1,517 @@
+#include "EmberCLPch.h"
+#include "FinalAccumOpenCLKernelCreator.h"
+
+namespace EmberCLns
+{
+/// <summary>
+/// Constructor that creates all kernel strings.
+/// The caller will access these strings through the accessor functions.
+/// </summary>
+template <typename T>
+FinalAccumOpenCLKernelCreator<T>::FinalAccumOpenCLKernelCreator()
+{
+	m_GammaCorrectionWithAlphaCalcEntryPoint    = "GammaCorrectionWithAlphaCalcKernel";
+	m_GammaCorrectionWithoutAlphaCalcEntryPoint = "GammaCorrectionWithoutAlphaCalcKernel";
+
+	m_GammaCorrectionWithAlphaCalcKernel    = CreateGammaCorrectionKernelString(true);
+	m_GammaCorrectionWithoutAlphaCalcKernel = CreateGammaCorrectionKernelString(false);
+
+	m_FinalAccumEarlyClipEntryPoint                               = "FinalAccumEarlyClipKernel";
+	m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint    = "FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel";
+	m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint = "FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel";
+
+	m_FinalAccumEarlyClipKernel                               = CreateFinalAccumKernelString(true, false, false);
+	m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel    = CreateFinalAccumKernelString(true, true,  true);
+	m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel = CreateFinalAccumKernelString(true, false, true);
+
+	m_FinalAccumLateClipEntryPoint                               = "FinalAccumLateClipKernel";
+	m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint    = "FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel";
+	m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint = "FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel";
+
+	m_FinalAccumLateClipKernel                               = CreateFinalAccumKernelString(false, false, false);
+	m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel    = CreateFinalAccumKernelString(false, true,  true);
+	m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel = CreateFinalAccumKernelString(false, false, true);
+}
+
+/// <summary>
+/// Kernel source and entry point properties, getters only.
+/// </summary>
+
+template <typename T> string FinalAccumOpenCLKernelCreator<T>::GammaCorrectionWithAlphaCalcKernel()        { return m_GammaCorrectionWithAlphaCalcKernel;	    }
+template <typename T> string FinalAccumOpenCLKernelCreator<T>::GammaCorrectionWithAlphaCalcEntryPoint()    { return m_GammaCorrectionWithAlphaCalcEntryPoint;    }
+template <typename T> string FinalAccumOpenCLKernelCreator<T>::GammaCorrectionWithoutAlphaCalcKernel()     { return m_GammaCorrectionWithoutAlphaCalcKernel;     }
+template <typename T> string FinalAccumOpenCLKernelCreator<T>::GammaCorrectionWithoutAlphaCalcEntryPoint() { return m_GammaCorrectionWithoutAlphaCalcEntryPoint; }
+
+template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumEarlyClipKernel()                                   { return m_FinalAccumEarlyClipKernel;                                   }
+template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumEarlyClipEntryPoint()                               { return m_FinalAccumEarlyClipEntryPoint;                               }
+template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel()        { return m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel;        }
+template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint()    { return m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint;    }
+template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel()     { return m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel;     }
+template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint() { return m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint; }
+
+template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumLateClipKernel()                                   { return m_FinalAccumLateClipKernel;                                   }
+template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumLateClipEntryPoint()                               { return m_FinalAccumLateClipEntryPoint;                               }
+template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel()        { return m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel;        }
+template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint()    { return m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint;    }
+template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel()     { return m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel;     }
+template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint() { return m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint; }
+
+/// <summary>
+/// Get the gamma correction entry point.
+/// </summary>
+/// <param name="channels">The number of channels used, 3 or 4.</param>
+/// <param name="transparency">True if channels equals 4 and using transparency, else false.</param>
+/// <returns>The name of the gamma correction entry point kernel function</returns>
+template <typename T>
+string FinalAccumOpenCLKernelCreator<T>::GammaCorrectionEntryPoint(unsigned int channels, bool transparency)
+{
+	bool alphaCalc = (channels > 3 && transparency);
+	return alphaCalc ? m_GammaCorrectionWithAlphaCalcEntryPoint : m_GammaCorrectionWithoutAlphaCalcEntryPoint;
+}
+
+/// <summary>
+/// Get the gamma correction kernel string.
+/// </summary>
+/// <param name="channels">The number of channels used, 3 or 4.</param>
+/// <param name="transparency">True if channels equals 4 and using transparency, else false.</param>
+/// <returns>The gamma correction kernel string</returns>
+template <typename T>
+string FinalAccumOpenCLKernelCreator<T>::GammaCorrectionKernel(unsigned int channels, bool transparency)
+{
+	bool alphaCalc = (channels > 3 && transparency);
+	return alphaCalc ? m_GammaCorrectionWithAlphaCalcKernel : m_GammaCorrectionWithoutAlphaCalcKernel;
+}
+
+/// <summary>
+/// Get the final accumulation entry point.
+/// </summary>
+/// <param name="earlyClip">True if early clip is desired, else false.</param>
+/// <param name="channels">The number of channels used, 3 or 4.</param>
+/// <param name="transparency">True if channels equals 4 and using transparency, else false.</param>
+/// <param name="alphaBase">Storage for the alpha base value used in the kernel. 0 if transparency is true, else 255.</param>
+/// <param name="alphaScale">Storage for the alpha scale value used in the kernel. 255 if transparency is true, else 0.</param>
+/// <returns>The name of the final accumulation entry point kernel function</returns>
+template <typename T>
+string FinalAccumOpenCLKernelCreator<T>::FinalAccumEntryPoint(bool earlyClip, unsigned int channels, bool transparency, T& alphaBase, T& alphaScale)
+{
+	bool alphaCalc = (channels > 3 && transparency);
+	bool alphaAccum = channels > 3;
+
+	if (alphaAccum)
+	{
+		alphaBase = transparency ? 0.0f : 255.0f;//See the table below.
+		alphaScale = transparency ? 255.0f : 0.0f;
+	}
+
+	if (earlyClip)
+	{
+		if (!alphaCalc && !alphaAccum)//Rgb output, the most common case.
+			return FinalAccumEarlyClipEntryPoint();
+		else if (alphaCalc && alphaAccum)//Rgba output and Transparency.
+			return FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint();
+		else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency.
+			return FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint();
+		else
+			return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense.
+	}
+	else
+	{
+		if (!alphaCalc && !alphaAccum)//Rgb output, the most common case.
+			return FinalAccumLateClipEntryPoint();
+		else if (alphaCalc && alphaAccum)//Rgba output and Transparency.
+			return FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint();
+		else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency.
+			return FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint();
+		else
+			return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense.
+	}
+}
+
+/// <summary>
+/// Get the final accumulation kernel string.
+/// </summary>
+/// <param name="earlyClip">True if early clip is desired, else false.</param>
+/// <param name="channels">The number of channels used, 3 or 4.</param>
+/// <param name="transparency">True if channels equals 4 and using transparency, else false.</param>
+/// <returns>The final accumulation kernel string</returns>
+template <typename T>
+string FinalAccumOpenCLKernelCreator<T>::FinalAccumKernel(bool earlyClip, unsigned int channels, bool transparency)
+{
+	bool alphaCalc = (channels > 3 && transparency);
+	bool alphaAccum = channels > 3;
+
+	if (earlyClip)
+	{
+		if (!alphaCalc && !alphaAccum)//Rgb output, the most common case.
+			return FinalAccumEarlyClipKernel();
+		else if (alphaCalc && alphaAccum)//Rgba output and Transparency.
+			return FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel();
+		else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency.
+			return FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel();
+		else
+			return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense.
+	}
+	else
+	{
+		if (!alphaCalc && !alphaAccum)//Rgb output, the most common case.
+			return FinalAccumLateClipKernel();
+		else if (alphaCalc && alphaAccum)//Rgba output and Transparency.
+			return FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel();
+		else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency.
+			return FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel();
+		else
+			return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense.
+	}
+}
+
+/// <summary>
+/// Wrapper around CreateFinalAccumKernelString().
+/// </summary>
+/// <param name="earlyClip">True if early clip is desired, else false.</param>
+/// <param name="channels">The number of channels used, 3 or 4.</param>
+/// <param name="transparency">True if channels equals 4 and using transparency, else false.</param>
+/// <returns>The final accumulation kernel string</returns>
+template <typename T>
+string FinalAccumOpenCLKernelCreator<T>::CreateFinalAccumKernelString(bool earlyClip, unsigned int channels, bool transparency)
+{
+	return CreateFinalAccumKernelString(earlyClip, (channels > 3 && transparency), channels > 3);
+}
+
+/// <summary>
+/// Create the final accumulation kernel string
+/// </summary>
+/// <param name="earlyClip">True if early clip is desired, else false.</param>
+/// <param name="alphaCalc">True if channels equals 4 and transparency is desired, else false.</param>
+/// <param name="alphaAccum">True if channels equals 4</param>
+/// <returns>The final accumulation kernel string</returns>
+template <typename T>
+string FinalAccumOpenCLKernelCreator<T>::CreateFinalAccumKernelString(bool earlyClip, bool alphaCalc, bool alphaAccum)
+{
+	ostringstream os;
+	string channels = alphaAccum ? "4" : "3";
+
+	os <<
+		ConstantDefinesString(typeid(T) == typeid(double)) <<
+		ClampRealFunctionString <<
+		UnionCLStructString <<
+		RgbToHsvFunctionString <<
+		HsvToRgbFunctionString <<
+		CalcAlphaFunctionString <<
+		SpatialFilterCLStructString;
+
+	if (earlyClip)
+	{
+		if (!alphaCalc && !alphaAccum)//Rgb output, the most common case.
+			os << "__kernel void " << m_FinalAccumEarlyClipEntryPoint << "(\n";
+		else if (alphaCalc && alphaAccum)//Rgba output and Transparency.
+			os << "__kernel void " << m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint << "(\n";
+		else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency.
+			os << "__kernel void " << m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint << "(\n";
+		else
+			return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense.
+	}
+	else
+	{
+		os <<
+			CreateCalcNewRgbFunctionString(false) <<
+			CreateGammaCorrectionFunctionString(false, alphaCalc, alphaAccum, true);
+
+		if (!alphaCalc && !alphaAccum)//Rgb output, the most common case.
+			os << "__kernel void " << m_FinalAccumLateClipEntryPoint << "(\n";
+		else if (alphaCalc && alphaAccum)//Rgba output and Transparency.
+			os << "__kernel void " << m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint << "(\n";
+		else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency.
+			os << "__kernel void " << m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint << "(\n";
+		else
+			return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense.
+	}
+
+	os <<
+		"	const __global real4reals* accumulator,\n"
+		"	__write_only image2d_t pixels,\n"
+		"	__constant SpatialFilterCL* spatialFilter,\n"
+		"	__constant real_t* filterCoefs,\n"
+		"	const real_t alphaBase,\n"
+		"	const real_t alphaScale\n"
+		"\t)\n"
+		"{\n"
+		"\n"
+		"	if ((GLOBAL_ID_Y >= spatialFilter->m_FinalRasH) || (GLOBAL_ID_X >= spatialFilter->m_FinalRasW))\n"
+		"		return;\n"
+		"\n"
+		"	unsigned int accumX = spatialFilter->m_DensityFilterOffset + (GLOBAL_ID_X * spatialFilter->m_Supersample);\n"
+		"	unsigned int accumY = spatialFilter->m_DensityFilterOffset + (GLOBAL_ID_Y * spatialFilter->m_Supersample);\n"
+
+		"	int2 finalCoord;\n"
+		"	finalCoord.x = GLOBAL_ID_X;\n"
+		"	finalCoord.y = GLOBAL_ID_Y;\n"
+		"	float4floats finalColor;\n"
+		"	real_t alpha, ls;\n"
+		"	int ii, jj;\n"
+		"	unsigned int filterKRowIndex;\n"
+		"	const __global real4reals* accumBucket;\n"
+		"	real4reals newBucket;\n"
+		"	newBucket.m_Real4 = 0;\n"
+		"	real4reals newRgb;\n"
+		"	newRgb.m_Real4 = 0;\n"
+		"\n"
+		"	for (jj = 0; jj < spatialFilter->m_FilterWidth; jj++)\n"
+		"	{\n"
+		"		filterKRowIndex = jj * spatialFilter->m_FilterWidth;\n"
+		"\n"
+		"		for (ii = 0; ii < spatialFilter->m_FilterWidth; ii++)\n"
+		"		{\n"
+		"			real_t k = filterCoefs[ii + filterKRowIndex];\n"
+		"\n"
+		"			accumBucket = accumulator + (accumX + ii) + ((accumY + jj) * spatialFilter->m_SuperRasW);\n"
+		"			newBucket.m_Real4 += (k * accumBucket->m_Real4);\n"
+		"		}\n"
+		"	}\n"
+		"\n";
+
+	//Not supporting 2 bytes per channel on the GPU. If the user wants it, run on the CPU.
+	if (earlyClip)//If early clip, simply assign values directly to the temp uint4 since they've been gamma corrected already, then write it straight to the output image below.
+	{
+		os <<
+		"	finalColor.m_Float4.x = (float)newBucket.m_Real4.x;\n"//CPU side clamps, skip here because write_imagef() does the clamping for us.
+		"	finalColor.m_Float4.y = (float)newBucket.m_Real4.y;\n"
+		"	finalColor.m_Float4.z = (float)newBucket.m_Real4.z;\n";
+	
+		if (alphaAccum)
+		{
+			if (alphaCalc)
+				os << "	finalColor.m_Float4.w = (float)newBucket.m_Real4.w * 255.0f;\n";
+			else
+				os << "	finalColor.m_Float4.w = 255;\n";
+		}
+	}
+	else
+	{
+		//Late clip, so must gamma correct from the temp new bucket to temp float4.
+		if (typeid(T) == typeid(double))
+		{
+			os <<
+		"	real4reals realFinal;\n"
+		"\n"
+		"	GammaCorrectionFloats(&newBucket, &(spatialFilter->m_Background[0]), spatialFilter->m_Gamma, spatialFilter->m_LinRange, spatialFilter->m_Vibrancy, spatialFilter->m_HighlightPower, alphaBase, alphaScale, &(realFinal.m_Reals[0]));\n"
+		"	finalColor.m_Float4.x = (float)realFinal.m_Real4.x;\n"
+		"	finalColor.m_Float4.y = (float)realFinal.m_Real4.y;\n"
+		"	finalColor.m_Float4.z = (float)realFinal.m_Real4.z;\n"
+		"	finalColor.m_Float4.w = (float)realFinal.m_Real4.w;\n"
+		;
+		}
+		else
+		{
+			os <<
+		"	GammaCorrectionFloats(&newBucket, &(spatialFilter->m_Background[0]), spatialFilter->m_Gamma, spatialFilter->m_LinRange, spatialFilter->m_Vibrancy, spatialFilter->m_HighlightPower, alphaBase, alphaScale, &(finalColor.m_Floats[0]));\n";
+		}
+	}
+
+	os <<
+		"	finalColor.m_Float4 /= 255.0f;\n"
+		"	write_imagef(pixels, finalCoord, finalColor.m_Float4);\n"//Use write_imagef instead of write_imageui because only the former works when sharing with an OpenGL texture.
+		"	barrier(CLK_GLOBAL_MEM_FENCE);\n"//Required, or else page tearing will occur during interactive rendering.
+		"}\n"
+		;
+
+	return os.str();
+}
+
+/// <summary>
+/// Creates the gamma correction function string.
+/// This is not a full kernel, just a function that is used in the kernels.
+/// </summary>
+/// <param name="globalBucket">True if writing to a global buffer (early clip), else false (late clip).</param>
+/// <param name="alphaCalc">True if channels equals 4 and transparency is desired, else false.</param>
+/// <param name="alphaAccum">True if channels equals 4</param>
+/// <param name="finalOut">True if writing to global buffer (late clip), else false (early clip).</param>
+/// <returns>The gamma correction function string</returns>
+template <typename T>
+string FinalAccumOpenCLKernelCreator<T>::CreateGammaCorrectionFunctionString(bool globalBucket, bool alphaCalc, bool alphaAccum, bool finalOut)
+{
+	ostringstream os;
+	string dataType;
+	string unionMember;
+	dataType = "real_t";
+
+	//Use real_t for all cases, early clip and final accum.
+	os << "void GammaCorrectionFloats(" << (globalBucket ? "__global " : "") << "real4reals* bucket, __constant real_t* background, real_t g, real_t linRange, real_t vibrancy, real_t highlightPower, real_t alphaBase, real_t alphaScale, " << (finalOut ? "" : "__global") << " real_t* correctedChannels)\n";
+
+	os
+	<< "{\n"
+	<< "	real_t alpha, ls, tmp, a;\n"
+	<< "	real4reals newRgb;\n"
+	<< "\n"
+	<< "	if (bucket->m_Reals[3] <= 0)\n"
+	<< "	{\n"
+	<< "		alpha = 0;\n"
+	<< "		ls = 0;\n"
+	<< "	}\n"
+	<< "	else\n"
+	<< "	{\n"
+	<< "		tmp = bucket->m_Reals[3];\n"
+	<< "		alpha = CalcAlpha(tmp, g, linRange);\n"
+	<< "		ls = vibrancy * 256.0 * alpha / tmp;\n"
+	<< "		ClampRef(&alpha, 0.0, 1.0);\n"
+	<< "	}\n"
+	<< "\n"
+	<< "	CalcNewRgb(bucket, ls, highlightPower, &newRgb);\n"
+	<< "\n"
+	<< "	for (unsigned int rgbi = 0; rgbi < 3; rgbi++)\n"
+	<< "	{\n"
+	<< "		a = newRgb.m_Reals[rgbi] + ((1.0 - vibrancy) * 256.0 * pow(bucket->m_Reals[rgbi], g));\n"
+	<< "\n";
+
+	if (!alphaCalc)
+	{
+		os <<
+		"		a += ((1.0 - alpha) * background[rgbi]);\n";
+	}
+	else
+	{
+		os
+	<< "		if (alpha > 0)\n"
+	<< "			a /= alpha;\n"
+	<< "		else\n"
+	<< "			a = 0;\n";
+	}
+	
+	os <<
+	"\n"
+	"			correctedChannels[rgbi] = (" << dataType << ")clamp(a, 0.0, 255.0);\n"
+	"		}\n"
+	"\n";
+	
+	//The CPU code has 3 cases for assigning alpha:
+	//[3] = alpha.//Early clip.
+	//[3] = alpha * 255.//Final Rgba with transparency.
+	//[3] = 255.//Final Rgba without transparency.
+	//Putting conditionals in GPU code is to be avoided. So do base + alpha * scale which will
+	//work for all 3 cases without using a conditional, which should be faster on a GPU. This gives:
+	//Base = 0,   scale = 1.   [3] = (0 +   (alpha * 1)).   [3] = alpha.
+	//Base = 0,   scale = 255. [3] = (0 +   (alpha * 255)). [3] = alpha * 255.
+	//Base = 255, scale = 0.   [3] = (255 + (alpha * 0)).   [3] = 255.
+	if (alphaAccum)
+	{
+		os
+	<< "	correctedChannels[3] = (" << dataType << ")(alphaBase + (alpha * alphaScale));\n";
+	}
+
+	os <<
+	"}\n"
+	"\n";
+
+	return os.str();
+}
+
+/// <summary>
+/// OpenCL equivalent of Palette::CalcNewRgb().
+/// </summary>
+/// <param name="globalBucket">True if writing the corrected value to a global buffer (early clip), else false (late clip).</param>
+/// <returns>The CalcNewRgb function string</returns>
+template <typename T>
+string FinalAccumOpenCLKernelCreator<T>::CreateCalcNewRgbFunctionString(bool globalBucket)
+{
+	ostringstream os;
+
+	os <<
+	"static void CalcNewRgb(" << (globalBucket ? "__global " : "") << "real4reals* oldRgb, real_t ls, real_t highPow, real4reals* newRgb)\n"
+	"{\n"
+	"	int rgbi;\n"
+	"	real_t newls, lsratio;\n"
+	"	real4reals newHsv;\n"
+	"	real_t maxa, maxc;\n"
+	"	real_t adjhlp;\n"
+	"\n"
+	"	if (ls == 0 || (oldRgb->m_Real4.x == 0 && oldRgb->m_Real4.y == 0 && oldRgb->m_Real4.z == 0))\n"//Can't do a vector compare to zero.
+	"	{\n"
+	"		newRgb->m_Real4 = 0;\n"
+	"		return;\n"
+	"	}\n"
+	"\n"
+	//Identify the most saturated channel.
+	"	maxc = max(max(oldRgb->m_Reals[0], oldRgb->m_Reals[1]), oldRgb->m_Reals[2]);\n"
+	"	maxa = ls * maxc;\n"
+	"\n"
+	//If a channel is saturated and highlight power is non-negative
+	//modify the color to prevent hue shift.
+	"	if (maxa > 255 && highPow >= 0)\n"
+	"	{\n"
+	"		newls = 255.0 / maxc;\n"
+	"		lsratio = pow(newls / ls, highPow);\n"
+	"\n"
+	//Calculate the max-value color (ranged 0 - 1).
+	"		for (rgbi = 0; rgbi < 3; rgbi++)\n"
+	"			newRgb->m_Reals[rgbi] = newls * oldRgb->m_Reals[rgbi] / 255.0;\n"
+	"\n"
+	//Reduce saturation by the lsratio.
+	"		RgbToHsv(&(newRgb->m_Real4), &(newHsv.m_Real4));\n"
+	"		newHsv.m_Real4.y *= lsratio;\n"
+	"		HsvToRgb(&(newHsv.m_Real4), &(newRgb->m_Real4));\n"
+	"\n"
+	"		for (rgbi = 0; rgbi < 3; rgbi++)\n"//Unrolling and vectorizing makes no difference.
+	"			newRgb->m_Reals[rgbi] *= 255.0;\n"
+	"	}\n"
+	"	else\n"
+	"	{\n"
+	"		newls = 255.0 / maxc;\n"
+	"		adjhlp = -highPow;\n"
+	"\n"
+	"		if (adjhlp > 1)\n"
+	"			adjhlp = 1;\n"
+	"\n"
+	"		if (maxa <= 255)\n"
+	"			adjhlp = 1;\n"
+	"\n"
+	//Calculate the max-value color (ranged 0 - 1) interpolated with the old behavior.
+	"		for (rgbi = 0; rgbi < 3; rgbi++)\n"//Unrolling, caching and vectorizing makes no difference.
+	"			newRgb->m_Reals[rgbi] = ((1.0 - adjhlp) * newls + adjhlp * ls) * oldRgb->m_Reals[rgbi];\n"
+	"	}\n"
+	"}\n"
+	"\n";
+
+	return os.str();
+}
+
+/// <summary>
+/// Create the gamma correction kernel string used for early clipping.
+/// </summary>
+/// <param name="alphaCalc">True if channels equals 4 and transparency is desired, else false.</param>
+/// <returns>The gamma correction kernel string used for early clipping</returns>
+template <typename T>
+string FinalAccumOpenCLKernelCreator<T>::CreateGammaCorrectionKernelString(bool alphaCalc)
+{
+	ostringstream os;
+	string dataType;
+
+	os <<
+		ConstantDefinesString(typeid(T) == typeid(double)) <<
+		ClampRealFunctionString <<
+		UnionCLStructString <<
+		RgbToHsvFunctionString <<
+		HsvToRgbFunctionString <<
+		CalcAlphaFunctionString <<
+		CreateCalcNewRgbFunctionString(true) <<
+		SpatialFilterCLStructString <<
+		CreateGammaCorrectionFunctionString(true, alphaCalc, true, false);//Will only be used with float in this case, early clip. Will always alpha accum.
+
+		os << "__kernel void " << (alphaCalc ? m_GammaCorrectionWithAlphaCalcEntryPoint : m_GammaCorrectionWithoutAlphaCalcEntryPoint) << "(\n" <<
+			"	__global real4reals* accumulator,\n"
+			"	__constant SpatialFilterCL* spatialFilter\n"
+			")\n"
+			"{\n"
+			"	int testGutter = 0;\n"
+			"\n"
+			"	if (GLOBAL_ID_Y >= (spatialFilter->m_SuperRasH - testGutter) || GLOBAL_ID_X >= (spatialFilter->m_SuperRasW - testGutter))\n"
+			"		return;\n"
+			"\n"
+			"	unsigned int superIndex = (GLOBAL_ID_Y * spatialFilter->m_SuperRasW) + GLOBAL_ID_X;\n"
+			"	__global real4reals* bucket = accumulator + superIndex;\n"
+			//Pass in an alphaBase and alphaScale of 0, 1 which means to just directly assign the computed alpha value.
+			"	GammaCorrectionFloats(bucket, &(spatialFilter->m_Background[0]), spatialFilter->m_Gamma, spatialFilter->m_LinRange, spatialFilter->m_Vibrancy, spatialFilter->m_HighlightPower, 0.0, 1.0, &(bucket->m_Reals[0]));\n"
+			"}\n"
+		;
+
+	return os.str();
+}
+}
--- a/Source/EmberCL/FinalAccumOpenCLKernelCreator.h
+++ b/Source/EmberCL/FinalAccumOpenCLKernelCreator.h
@ -0,0 +1,87 @@
+#pragma once
+ 
+#include "EmberCLPch.h"
+#include "EmberCLStructs.h"
+#include "EmberCLFunctions.h"
+
+/// <summary>
+/// FinalAccumOpenCLKernelCreator class.
+/// </summary>
+
+namespace EmberCLns
+{
+/// <summary>
+/// Class for creating the final accumulation code in OpenCL.
+/// There are many conditionals in the CPU code to create the
+/// final output image. This class creates many different kernels
+/// with all conditionals and unnecessary calculations stripped out.
+/// The conditionals are:
+/// Early clip/late clip
+/// Alpha channel, no alpha channel
+/// Alpha with/without transparency
+/// Template argument expected to be float or double.
+/// </summary>
+template <typename T>
+class EMBERCL_API FinalAccumOpenCLKernelCreator
+{
+public:
+	FinalAccumOpenCLKernelCreator();
+	
+	string GammaCorrectionWithAlphaCalcKernel();
+	string GammaCorrectionWithAlphaCalcEntryPoint();
+
+	string GammaCorrectionWithoutAlphaCalcKernel();
+	string GammaCorrectionWithoutAlphaCalcEntryPoint();
+
+	string FinalAccumEarlyClipKernel();
+	string FinalAccumEarlyClipEntryPoint();
+	string FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel();
+	string FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint();
+	string FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel();
+	string FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint();
+
+	string FinalAccumLateClipKernel();
+	string FinalAccumLateClipEntryPoint();
+	string FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel();
+	string FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint();
+	string FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel();
+	string FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint();
+	string GammaCorrectionEntryPoint(unsigned int channels, bool transparency);
+	string GammaCorrectionKernel(unsigned int channels, bool transparency);
+	string FinalAccumEntryPoint(bool earlyClip, unsigned int channels, bool transparency, T& alphaBase, T& alphaScale);
+	string FinalAccumKernel(bool earlyClip, unsigned int channels, bool transparency);
+
+private:
+	string CreateFinalAccumKernelString(bool earlyClip, unsigned int channels, bool transparency);
+	string CreateGammaCorrectionKernelString(bool alphaCalc);
+
+	string CreateFinalAccumKernelString(bool earlyClip, bool alphaCalc, bool alphaAccum);
+	string CreateGammaCorrectionFunctionString(bool globalBucket, bool alphaCalc, bool alphaAccum, bool finalOut);
+	string CreateCalcNewRgbFunctionString(bool globalBucket);
+	string m_GammaCorrectionWithAlphaCalcKernel;
+	string m_GammaCorrectionWithAlphaCalcEntryPoint;
+
+	string m_GammaCorrectionWithoutAlphaCalcKernel;
+	string m_GammaCorrectionWithoutAlphaCalcEntryPoint;
+
+	string m_FinalAccumEarlyClipKernel;//False, false.
+	string m_FinalAccumEarlyClipEntryPoint;
+	string m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel;//True, true.
+	string m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint;
+	string m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel;//False, true.
+	string m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint;
+
+	string m_FinalAccumLateClipKernel;//False, false.
+	string m_FinalAccumLateClipEntryPoint;
+	string m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel;//True, true.
+	string m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint;
+	string m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel;//False, true.
+	string m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint;
+};
+
+template EMBERCL_API class FinalAccumOpenCLKernelCreator<float>;
+
+#ifdef DO_DOUBLE
+	template EMBERCL_API class FinalAccumOpenCLKernelCreator<double>;
+#endif
+}
--- a/Source/EmberCL/IterOpenCLKernelCreator.cpp
+++ b/Source/EmberCL/IterOpenCLKernelCreator.cpp
@ -0,0 +1,785 @@
+#include "EmberCLPch.h"
+#include "IterOpenCLKernelCreator.h"
+
+namespace EmberCLns
+{
+/// <summary>
+/// Empty constructor that does nothing. The user must call the one which takes a bool
+/// argument before using this class.
+/// This constructor only exists so the class can be a member of a class.
+/// </summary>
+template <typename T>
+IterOpenCLKernelCreator<T>::IterOpenCLKernelCreator()
+{
+}
+
+/// <summary>
+/// Constructor that sets up some basic entry point strings and creates
+/// the zeroization kernel string since it requires no conditional inputs.
+/// </summary>
+template <typename T>
+IterOpenCLKernelCreator<T>::IterOpenCLKernelCreator(bool nVidia)
+{
+	m_NVidia = nVidia;
+	m_IterEntryPoint = "IterateKernel";
+	m_ZeroizeEntryPoint = "ZeroizeKernel";
+	m_ZeroizeKernel = CreateZeroizeKernelString();
+}
+
+/// <summary>
+/// Accessors.
+/// </summary>
+
+template <typename T> string IterOpenCLKernelCreator<T>::ZeroizeKernel() { return m_ZeroizeKernel; }
+template <typename T> string IterOpenCLKernelCreator<T>::ZeroizeEntryPoint() { return m_ZeroizeEntryPoint; }
+template <typename T> string IterOpenCLKernelCreator<T>::IterEntryPoint() { return m_IterEntryPoint; }
+
+/// <summary>
+/// Create the iteration kernel string using the Cuburn method.
+/// Template argument expected to be float or double.
+/// </summary>
+/// <param name="ember">The ember to create the kernel string for</param>
+/// <param name="params">The parametric variation #define string</param>
+/// <param name="doAccum">Debugging parameter to include or omit accumulating to the histogram. Default: true.</param>
+/// <returns>The kernel string</returns>
+template <typename T>
+string IterOpenCLKernelCreator<T>::CreateIterKernelString(Ember<T>& ember, string& parVarDefines, bool lockAccum, bool doAccum)
+{
+	bool doublePrecision = typeid(T) == typeid(double);
+	unsigned int i, v, varIndex, varCount, totalXformCount = ember.TotalXformCount();
+	ostringstream kernelIterBody, xformFuncs, os;
+	vector<Variation<T>*> variations;
+
+	xformFuncs << "\n" << parVarDefines << endl;
+	ember.GetPresentVariations(variations);
+	std::for_each(variations.begin(), variations.end(), [&](Variation<T>* var) { if (var) xformFuncs << var->OpenCLFuncsString(); });
+
+	for (i = 0; i < totalXformCount; i++)
+	{
+		Xform<T>* xform = ember.GetTotalXform(i);
+		unsigned int totalVarCount = xform->TotalVariationCount();
+		bool needPrecalcSumSquares = false;
+		bool needPrecalcSqrtSumSquares = false;
+		bool needPrecalcAngles = false;
+		bool needPrecalcAtanXY = false;
+		bool needPrecalcAtanYX = false;
+
+		v = varIndex = varCount = 0;
+		xformFuncs <<
+			"void Xform" << i << "(__constant XformCL* xform, __constant real_t* parVars, Point* inPoint, Point* outPoint, uint2* mwc)\n" <<
+			"{\n"
+			"	real_t transX, transY, transZ;\n"
+			"	real4 vIn, vOut = 0.0;\n";
+		
+		//Determine if any variations, regular, pre, or post need precalcs.
+		while (Variation<T>* var = xform->GetVariation(v++))
+		{
+			needPrecalcSumSquares     |= var->NeedPrecalcSumSquares();
+			needPrecalcSqrtSumSquares |= var->NeedPrecalcSqrtSumSquares();
+			needPrecalcAngles         |= var->NeedPrecalcAngles();
+			needPrecalcAtanXY         |= var->NeedPrecalcAtanXY();
+			needPrecalcAtanYX         |= var->NeedPrecalcAtanYX();
+		}
+
+		if (needPrecalcSumSquares)
+			xformFuncs << "\treal_t precalcSumSquares;\n";
+
+		if (needPrecalcSqrtSumSquares)
+			xformFuncs << "\treal_t precalcSqrtSumSquares;\n";
+
+		if (needPrecalcAngles)
+		{
+			xformFuncs << "\treal_t precalcSina;\n";
+			xformFuncs << "\treal_t precalcCosa;\n";
+		}
+
+		if (needPrecalcAtanXY)
+			xformFuncs << "\treal_t precalcAtanxy;\n";
+
+		if (needPrecalcAtanYX)
+			xformFuncs << "\treal_t precalcAtanyx;\n";
+
+		xformFuncs << "\treal_t tempColor = outPoint->m_ColorX = xform->m_ColorSpeedCache + (xform->m_OneMinusColorCache * inPoint->m_ColorX);\n";
+
+		if (xform->PreVariationCount() + xform->VariationCount() == 0)
+		{
+			xformFuncs <<
+			"	outPoint->m_X = (xform->m_A * inPoint->m_X) + (xform->m_B * inPoint->m_Y) + xform->m_C;\n" <<
+			"	outPoint->m_Y = (xform->m_D * inPoint->m_X) + (xform->m_E * inPoint->m_Y) + xform->m_F;\n" <<
+			"	outPoint->m_Z = inPoint->m_Z;\n";
+		}
+		else
+		{
+			xformFuncs <<
+			"	transX = (xform->m_A * inPoint->m_X) + (xform->m_B * inPoint->m_Y) + xform->m_C;\n" <<
+			"	transY = (xform->m_D * inPoint->m_X) + (xform->m_E * inPoint->m_Y) + xform->m_F;\n" <<
+			"	transZ = inPoint->m_Z;\n";
+
+			varCount = xform->PreVariationCount();
+
+			if (varCount > 0)
+			{
+				xformFuncs << "\n\t//Apply each of the " << varCount << " pre variations in this xform.\n";
+
+				//Output the code for each pre variation in this xform.
+				for (varIndex = 0; varIndex < varCount; varIndex++)
+				{
+					if (Variation<T>* var = xform->GetVariation(varIndex))
+					{
+						xformFuncs << "\n\t//" << var->Name() << ".\n";
+						xformFuncs << var->PrecalcOpenCLString();
+						xformFuncs << xform->ReadOpenCLString(VARTYPE_PRE) << endl;
+						xformFuncs << var->OpenCLString() << endl;
+						xformFuncs << xform->WriteOpenCLString(VARTYPE_PRE, var->AssignType()) << endl;
+					}
+				}
+			}
+
+			if (xform->VariationCount() > 0)
+			{
+				if (xform->NeedPrecalcSumSquares())
+					xformFuncs << "\tprecalcSumSquares = SQR(transX) + SQR(transY);\n";
+
+				if (xform->NeedPrecalcSqrtSumSquares())
+					xformFuncs << "\tprecalcSqrtSumSquares = sqrt(precalcSumSquares);\n";
+
+				if (xform->NeedPrecalcAngles())
+				{
+					xformFuncs << "\tprecalcSina = transX / precalcSqrtSumSquares;\n";
+					xformFuncs << "\tprecalcCosa = transY / precalcSqrtSumSquares;\n";
+				}
+
+				if (xform->NeedPrecalcAtanXY())
+					xformFuncs << "\tprecalcAtanxy = atan2(transX, transY);\n";
+
+				if (xform->NeedPrecalcAtanYX())
+					xformFuncs << "\tprecalcAtanyx = atan2(transY, transX);\n";
+
+				xformFuncs << "\n\toutPoint->m_X = 0;";
+				xformFuncs << "\n\toutPoint->m_Y = 0;";
+				xformFuncs << "\n\toutPoint->m_Z = 0;\n";
+				xformFuncs << "\n\t//Apply each of the " << xform->VariationCount() << " regular variations in this xform.\n\n";
+				xformFuncs << xform->ReadOpenCLString(VARTYPE_REG);
+
+				varCount += xform->VariationCount();
+
+				//Output the code for each regular variation in this xform.
+				for (; varIndex < varCount; varIndex++)
+				{
+					if (Variation<T>* var = xform->GetVariation(varIndex))
+					{
+						xformFuncs << "\n\t//" << var->Name() << ".\n"
+							<< var->OpenCLString() << (varIndex == varCount - 1 ? "\n" : "\n\n")
+							<< xform->WriteOpenCLString(VARTYPE_REG, ASSIGNTYPE_SUM);
+					}
+				}
+			}
+			else
+			{
+				xformFuncs <<
+				"	outPoint->m_X = transX;\n"
+				"	outPoint->m_Y = transY;\n"
+				"	outPoint->m_Z = transZ;\n";
+			}
+		}
+
+		if (xform->PostVariationCount() > 0)
+		{
+			varCount += xform->PostVariationCount();
+			xformFuncs << "\n\t//Apply each of the " << xform->PostVariationCount() << " post variations in this xform.\n";
+
+			//Output the code for each post variation in this xform.
+			for (; varIndex < varCount; varIndex++)
+			{
+				if (Variation<T>* var = xform->GetVariation(varIndex))
+				{
+					xformFuncs << "\n\t//" << var->Name() << ".\n";
+					xformFuncs << var->PrecalcOpenCLString();
+					xformFuncs << xform->ReadOpenCLString(VARTYPE_POST) << endl;
+					xformFuncs << var->OpenCLString() << endl;
+					xformFuncs << xform->WriteOpenCLString(VARTYPE_POST, var->AssignType()) << (varIndex == varCount - 1 ? "\n" : "\n\n");
+				}
+			}
+		}
+		
+		if (xform->HasPost())
+		{
+			xformFuncs <<
+				"\n\t//Apply post affine transform.\n"
+				"\treal_t tempX = outPoint->m_X;\n"
+				"\n"
+				"\toutPoint->m_X = (xform->m_PostA * tempX) + (xform->m_PostB * outPoint->m_Y) + xform->m_PostC;\n" <<
+				"\toutPoint->m_Y = (xform->m_PostD * tempX) + (xform->m_PostE * outPoint->m_Y) + xform->m_PostF;\n";
+		}
+
+		xformFuncs << "\toutPoint->m_ColorX = outPoint->m_ColorX + xform->m_DirectColor * (tempColor - outPoint->m_ColorX);\n";
+		xformFuncs << "}\n"
+				   << "\n";
+	}
+
+	os <<
+		ConstantDefinesString(doublePrecision) <<
+		InlineMathFunctionsString <<
+		ClampRealFunctionString <<
+		RandFunctionString <<
+		PointCLStructString <<
+		XformCLStructString <<
+		EmberCLStructString <<
+		UnionCLStructString <<
+		CarToRasCLStructString <<
+		CarToRasFunctionString <<
+		AtomicString(doublePrecision, m_NVidia) <<
+		xformFuncs.str() <<
+		"__kernel void " << m_IterEntryPoint << "(\n" <<
+		"	uint iterCount,\n"
+		"	uint fuseCount,\n"
+		"	uint seed,\n"
+		"	__constant EmberCL* ember,\n"
+		"	__constant real_t* parVars,\n"
+		"	__global uchar* xformDistributions,\n"//Using uchar is quicker than uint. Can't be constant because the size can be too large to fit when using xaos.//FINALOPT
+		"	__constant CarToRasCL* carToRas,\n"
+		"	__global real4reals* histogram,\n"
+		"	uint histSize,\n"
+		"	__read_only image2d_t palette,\n"
+		"	__global Point* points\n"
+		"\t)\n"
+		"{\n"
+		"	bool fuse, ok;\n"
+		"	uint threadIndex = INDEX_IN_BLOCK_2D;\n"
+		"	uint i, itersToDo;\n"
+		"	uint consec = 0;\n"
+		//"	int badvals = 0;\n"
+		"	uint histIndex;\n"
+		"	real_t p00, p01;\n"
+		"	Point firstPoint, secondPoint, tempPoint;\n"
+		"	uint2 mwc;\n"
+		"	float4 palColor1;\n"
+		"	int2 iPaletteCoord;\n"
+		"	const sampler_t paletteSampler = CLK_NORMALIZED_COORDS_FALSE |\n"//Coords from 0 to 255.
+		"		CLK_ADDRESS_CLAMP_TO_EDGE |\n"//Clamp to edge
+		"		CLK_FILTER_NEAREST;\n"//Don't interpolate
+		"	uint threadXY = (THREAD_ID_X + THREAD_ID_Y);\n"
+		"	uint threadXDivRows = (THREAD_ID_X / (NTHREADS / THREADS_PER_WARP));\n"
+		"	uint threadsMinus1 = NTHREADS - 1;\n"
+		;
+	
+	os <<
+		"\n"
+		"	__local Point swap[NTHREADS];\n"
+		"	__local uint xfsel[NWARPS];\n"
+		"\n"
+		"	unsigned int pointsIndex = INDEX_IN_GRID_2D;\n"
+		"	mwc.x = (pointsIndex + 1 * seed) & 0x7FFFFFFF;\n"
+		"	mwc.y = ((BLOCK_ID_X + 1) + (pointsIndex + 1) * seed) & 0x7FFFFFFF;\n"
+		"	iPaletteCoord.y = 0;\n"
+		"\n"
+		"	if (fuseCount > 0)\n"
+		"	{\n"
+		"		fuse = true;\n"
+		"		itersToDo = fuseCount;\n"
+		"		firstPoint.m_X = MwcNextNeg1Pos1(&mwc);\n"
+		"		firstPoint.m_Y = MwcNextNeg1Pos1(&mwc);\n"
+		"		firstPoint.m_Z = 0.0;\n"
+		"		firstPoint.m_ColorX = MwcNext01(&mwc);\n"
+		"		firstPoint.m_LastXfUsed = 0;\n"
+		"	}\n"
+		"	else\n"
+		"	{\n"
+		"		fuse = false;\n"
+		"		itersToDo = iterCount;\n"
+		"		firstPoint = points[pointsIndex];\n"
+		"	}\n"
+		"\n";
+	
+		//This is done once initially here and then again after each swap-sync in the main loop.
+		//This along with the randomness that the point shuffle provides gives sufficient randomness
+		//to produce results identical to those produced on the CPU.
+	os <<
+		"	if (THREAD_ID_Y == 0 && THREAD_ID_X < NWARPS)\n"
+		"		xfsel[THREAD_ID_X] = MwcNext(&mwc) % " << CHOOSE_XFORM_GRAIN << ";\n"//It's faster to do the % here ahead of time than every time an xform is looked up to use inside the loop.
+		"\n"
+		"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+		"\n"
+		"	for (i = 0; i < itersToDo; i++)\n"
+		"	{\n";
+
+		os <<
+		"		consec = 0;\n"
+		"\n"
+		"		do\n"
+		"		{\n";
+		if (ember.XaosPresent())
+		{
+			os <<
+		"			secondPoint.m_LastXfUsed = xformDistributions[xfsel[THREAD_ID_Y] + (" << CHOOSE_XFORM_GRAIN << " * (firstPoint.m_LastXfUsed + 1u))];\n\n";
+		}
+		else
+		{
+			os <<
+		"			secondPoint.m_LastXfUsed = xformDistributions[xfsel[THREAD_ID_Y]];\n\n";
+		}
+
+		for (i = 0; i < ember.XformCount(); i++)
+		{
+			if (i == 0)
+				os <<
+		"			if (secondPoint.m_LastXfUsed == " << i << ")\n";
+			else
+				os <<
+		"			else if (secondPoint.m_LastXfUsed == " << i << ")\n";
+
+		os <<
+		"			{\n" <<
+		"				Xform" << i << "(&(ember->m_Xforms[" << i << "]), parVars, &firstPoint, &secondPoint, &mwc);\n" <<
+		"			}\n";
+		}
+		os <<
+		"\n"
+		"			ok = !BadVal(secondPoint.m_X) && !BadVal(secondPoint.m_Y);\n"
+		//"			ok = !BadVal(secondPoint.m_X) && !BadVal(secondPoint.m_Y) && !BadVal(secondPoint.m_Z);\n"
+		"\n"
+		"			if (!ok)\n"
+		"			{\n"
+		"				firstPoint.m_X = MwcNextNeg1Pos1(&mwc);\n"
+		"				firstPoint.m_Y = MwcNextNeg1Pos1(&mwc);\n"
+		"				firstPoint.m_Z = 0.0;\n"
+		"				firstPoint.m_ColorX = secondPoint.m_ColorX;\n"
+		"				consec++;\n"
+		//"				badvals++;\n"
+		"			}\n"
+		"		}\n"
+		"		while (!ok && consec < 5);\n"
+		"\n"
+		"		if (!ok)\n"
+		"		{\n"
+		"			secondPoint.m_X = MwcNextNeg1Pos1(&mwc);\n"
+		"			secondPoint.m_Y = MwcNextNeg1Pos1(&mwc);\n"
+		"			secondPoint.m_Z = 0.0;\n"
+		"		}\n"
+		"\n"//Rotate points between threads. This is how randomization is achieved.
+		"		uint swr = threadXY + ((i & 1u) * threadXDivRows);\n"
+		"		uint sw = (swr * THREADS_PER_WARP + THREAD_ID_X) & threadsMinus1;\n"
+		"\n"
+
+		//Write to another thread's location.
+		"		swap[sw] = secondPoint;\n"
+		"\n"
+
+		//Populate randomized xform index buffer with new random values.
+		"		if (THREAD_ID_Y == 0 && THREAD_ID_X < NWARPS)\n"
+		"			xfsel[THREAD_ID_X] = MwcNext(&mwc) % " << CHOOSE_XFORM_GRAIN << ";\n"
+		"\n"
+		"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+		"\n"
+
+		//Another thread will have written to this thread's location, so read the new value and use it for accumulation below.
+		"		firstPoint = swap[threadIndex];\n"
+		"\n"
+		"		if (fuse)\n"
+		"		{\n"
+		"			if (i >= fuseCount - 1)\n"
+		"			{\n"
+		"				i = 0;\n"
+		"				fuse = false;\n"
+		"				itersToDo = iterCount;\n"
+		"				barrier(CLK_LOCAL_MEM_FENCE);\n"//Sort of seems necessary, sort of doesn't. Makes no speed difference.
+		"			}\n"
+		"\n"
+		"			continue;\n"
+		"		}\n"
+		"\n";
+		
+		if (ember.UseFinalXform())
+		{
+			//CPU takes an extra step here to preserve the opacity of the randomly selected xform, rather than the final xform's opacity.
+			//The same thing takes place here automatically because secondPoint.m_LastXfUsed is used below to retrieve the opacity when accumulating.
+			os <<
+		"		if ((ember->m_Xforms[ember->m_FinalXformIndex].m_Opacity == 1) || (MwcNext01(&mwc) < ember->m_Xforms[ember->m_FinalXformIndex].m_Opacity))\n"
+		"		{\n"
+		"			Xform" << (ember.TotalXformCount() - 1) << "(&(ember->m_Xforms[ember->m_FinalXformIndex]), parVars, &secondPoint, &tempPoint, &mwc);\n"
+		"			secondPoint = tempPoint;\n"
+		"		}\n"
+		"\n";
+		}
+		
+		os << CreateProjectionString(ember);
+
+		if (doAccum)
+		{
+			os <<
+		"		p00 = secondPoint.m_X - ember->m_CenterX;\n"
+		"		p01 = secondPoint.m_Y - ember->m_CenterY;\n"
+		"		tempPoint.m_X = (p00 * ember->m_RotA) + (p01 * ember->m_RotB) + ember->m_CenterX;\n"
+		"		tempPoint.m_Y = (p00 * ember->m_RotD) + (p01 * ember->m_RotE) + ember->m_CenterY;\n"
+		"\n"
+		//Add this point to the appropriate location in the histogram.
+		"		if (CarToRasInBounds(carToRas, &tempPoint))\n"
+		"		{\n"
+		"			CarToRasConvertPointToSingle(carToRas, &tempPoint, &histIndex);\n"
+		"\n"
+		"			if (histIndex < histSize)\n"//Provides an extra level of safety and makes no speed difference.
+		"			{\n";
+
+			//Basic texture index interoplation does not produce identical results
+			//to the CPU. So the code here must explicitly do the same thing and not
+			//rely on the GPU texture coordinate lookup.
+			if (ember.m_PaletteMode == PALETTE_LINEAR)
+			{
+			os <<
+		"				real_t colorIndexFrac;\n"
+		"				real_t colorIndex = secondPoint.m_ColorX * COLORMAP_LENGTH;\n"
+		"				int intColorIndex = (int)colorIndex;\n"
+		"				float4 palColor2;\n"
+		"\n"
+		"				if (intColorIndex < 0)\n"
+		"				{\n"
+		"					intColorIndex = 0;\n"
+		"					colorIndexFrac = 0;\n"
+		"				}\n"
+		"				else if (intColorIndex >= COLORMAP_LENGTH_MINUS_1)\n"
+		"				{\n"
+		"					intColorIndex = COLORMAP_LENGTH_MINUS_1 - 1;\n"
+		"					colorIndexFrac = 1.0;\n"
+		"				}\n"
+		"				else\n"
+		"				{\n"
+		"					colorIndexFrac = colorIndex - (real_t)intColorIndex;\n"//Interpolate between intColorIndex and intColorIndex + 1.
+		"				}\n"
+		"\n"
+		"				iPaletteCoord.x = intColorIndex;\n"//Palette operations are strictly float because OpenCL does not support dp64 textures.
+		"				palColor1 = read_imagef(palette, paletteSampler, iPaletteCoord);\n"
+		"				iPaletteCoord.x += 1;\n"
+		"				palColor2 = read_imagef(palette, paletteSampler, iPaletteCoord);\n"
+		"				palColor1 = (palColor1 * (1.0f - (float)colorIndexFrac)) + (palColor2 * (float)colorIndexFrac);\n";//The 1.0f here *must* have the 'f' suffix at the end to compile.
+			}
+			else if (ember.m_PaletteMode == PALETTE_STEP)
+			{
+				os <<
+		"				iPaletteCoord.x = (int)(secondPoint.m_ColorX * COLORMAP_LENGTH);\n"
+		"				palColor1 = read_imagef(palette, paletteSampler, iPaletteCoord);\n";
+			}
+
+			if (lockAccum)
+			{
+				if (typeid(T) == typeid(double))
+				{
+					os <<
+		"				AtomicAdd(&(histogram[histIndex].m_Reals[0]), (real_t)palColor1.x * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"//Always apply opacity, even though it's usually 1.
+		"				AtomicAdd(&(histogram[histIndex].m_Reals[1]), (real_t)palColor1.y * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
+		"				AtomicAdd(&(histogram[histIndex].m_Reals[2]), (real_t)palColor1.z * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
+		"				AtomicAdd(&(histogram[histIndex].m_Reals[3]), (real_t)palColor1.w * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
+				}
+				else
+				{
+				os <<
+		"				AtomicAdd(&(histogram[histIndex].m_Reals[0]), palColor1.x * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"//Always apply opacity, even though it's usually 1.
+		"				AtomicAdd(&(histogram[histIndex].m_Reals[1]), palColor1.y * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
+		"				AtomicAdd(&(histogram[histIndex].m_Reals[2]), palColor1.z * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
+		"				AtomicAdd(&(histogram[histIndex].m_Reals[3]), palColor1.w * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
+				}
+			}
+			else
+			{
+				if (typeid(T) == typeid(double))
+				{
+					os <<
+		"				real4 realColor;\n"
+		"\n"
+		"				realColor.x = (real_t)palColor1.x;\n"
+		"				realColor.y = (real_t)palColor1.y;\n"
+		"				realColor.z = (real_t)palColor1.z;\n"
+		"				realColor.w = (real_t)palColor1.w;\n"
+		"				histogram[histIndex].m_Real4 += (realColor * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
+				}
+				else
+				{
+				os <<
+		"				histogram[histIndex].m_Real4 += (palColor1 * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
+				}
+			}
+
+			os <<
+		"			}\n"//histIndex < histSize.
+		"		}\n"//CarToRasInBounds.
+		"\n"
+		"		barrier(CLK_GLOBAL_MEM_FENCE);\n";//Barrier every time, whether or not the point was in bounds, else artifacts will occur when doing strips.
+		}
+		
+	os <<
+		"	}\n"//Main for loop.
+		"\n"
+		//At this point, iterating for this round is done, so write the final points back out
+		//to the global points buffer to be used as inputs for the next round. This preserves point trajectory
+		//between kernel calls.
+		"	points[pointsIndex] = firstPoint;\n"
+		"	barrier(CLK_GLOBAL_MEM_FENCE);\n"
+		"}\n";
+
+	return os.str();
+}
+
+/// <summary>
+/// Create an OpenCL string of #defines and a corresponding host side vector for parametric variation values.
+/// Parametric variations present a special problem in the iteration code.
+/// The values can't be passed in with the array of other xform values because
+/// the length of the parametric values is unknown.
+/// This is solved by passing a separate buffer of values dedicated specifically
+/// to parametric variations.
+/// In OpenCL, a series of #define constants are declared which specify the indices in 
+/// the buffer where the various values are stored.
+/// The possibility of a parametric variation type being present in multiple xforms is taken
+/// into account by appending the xform index to the #define, thus making each unique.
+/// The kernel creator then uses these to retrieve the values in the iteration code.
+/// Example:
+/// Xform1: Curl (curl_c1: 1.1, curl_c2: 2.2)
+/// Xform2: Curl (curl_c1: 4.4, curl_c2: 5.5)
+/// Xform3: Blob (blob_low: 1, blob_high: 2, blob_waves: 3)
+///
+/// Host vector to be passed as arg to the iter kernel call:
+/// [1.1][2.2][4.4][5.5][1][2][3]
+///
+/// #defines in OpenCL to access the buffer:
+///
+/// #define CURL_C1_1 0
+/// #define CURL_C2_1 1
+/// #define CURL_C1_2 2
+/// #define CURL_C2_2 3
+/// #define BLOB_LOW_3 4
+/// #define BLOB_HIGH_3 5
+/// #define BLOB_WAVES_ 6
+///
+/// The variations the use these #defines by first looking up the index of the
+/// xform they belong to in the parent ember and generating the OpenCL string based on that
+/// in their overriden OpenCLString() functions.
+/// Template argument expected to be float or double.
+/// </summary>
+/// <param name="ember">The ember to create the values from</param>
+/// <param name="params">The string,vector pair to store the values in</param>
+/// <param name="doVals">True if the vector should be populated, else false. Default: true.</param>
+/// <param name="doString">True if the string should be populated, else false. Default: true.</param>
+template <typename T>
+void IterOpenCLKernelCreator<T>::ParVarIndexDefines(Ember<T>& ember, pair<string, vector<T>>& params, bool doVals, bool doString)
+{
+	unsigned int i, j, k, size = 0, xformCount = ember.TotalXformCount();
+	Xform<T>* xform;
+	ParametricVariation<T>* parVar;
+	ostringstream os;
+
+	if (doVals)
+		params.second.clear();
+
+	for (i = 0; i < xformCount; i++)
+	{
+		if (xform = ember.GetTotalXform(i))
+		{
+			unsigned int varCount = xform->TotalVariationCount();
+
+			for (j = 0; j < varCount; j++)
+			{
+				if (parVar = dynamic_cast<ParametricVariation<T>*>(xform->GetVariation(j)))
+				{
+					for (k = 0; k < parVar->ParamCount(); k++)
+					{
+						if (doString)
+							os << "#define " << ToUpper(parVar->Params()[k].Name()) << "_" << i << " " << size << endl;//Uniquely identify this param in this variation in this xform.
+
+						if (doVals)
+							params.second.push_back(parVar->Params()[k].ParamVal());
+
+						size++;
+					}
+				}
+			}
+		}
+	}
+
+	if (doString)
+	{
+		os << "\n";
+		params.first = os.str();
+	}
+}
+
+/// <summary>
+/// Determine whether the two embers passed in differ enough
+/// to require a rebuild of the iteration code.
+/// A rebuild is required if they differ in the following ways:
+/// Xform count
+/// Final xform presence
+/// Xaos presence
+/// Palette accumulation mode
+/// Xform post affine presence
+/// Variation count
+/// Variation type
+/// Template argument expected to be float or double.
+/// </summary>
+/// <param name="ember1">The first ember to compare</param>
+/// <param name="ember2">The second ember to compare</param>
+/// <returns>True if a rebuild is required, else false</returns>
+template <typename T>
+bool IterOpenCLKernelCreator<T>::IsBuildRequired(Ember<T>& ember1, Ember<T>& ember2)
+{
+	unsigned int i, j, xformCount = ember1.TotalXformCount();
+	
+	if (xformCount != ember2.TotalXformCount())
+		return true;
+
+	if (ember1.UseFinalXform() != ember2.UseFinalXform())
+		return true;
+
+	if (ember1.XaosPresent() != ember2.XaosPresent())
+		return true;
+
+	if (ember1.m_PaletteMode != ember2.m_PaletteMode)
+		return true;
+
+	if (ember1.ProjBits() != ember2.ProjBits())
+		return true;
+
+	for (i = 0; i < xformCount; i++)
+	{
+		Xform<T>* xform1 = ember1.GetTotalXform(i);
+		Xform<T>* xform2 = ember2.GetTotalXform(i);
+		unsigned int varCount = xform1->TotalVariationCount();
+
+		if (xform1->HasPost() != xform2->HasPost())
+			return true;
+
+		if (varCount != xform2->TotalVariationCount())
+			return true;
+
+		for (j = 0; j < varCount; j++)
+			if (xform1->GetVariation(j)->VariationId() != xform2->GetVariation(j)->VariationId())
+				return true;
+	}
+
+	return false;
+}
+
+/// <summary>
+/// Create the zeroize kernel string.
+/// OpenCL comes with no way to zeroize a buffer like memset()
+/// would do on the CPU. So a special kernel must be ran to set a range
+/// of memory addresses to zero.
+/// </summary>
+/// <returns>The kernel string</returns>
+template <typename T>
+string IterOpenCLKernelCreator<T>::CreateZeroizeKernelString()
+{
+	ostringstream os;
+
+	os <<
+		ConstantDefinesString(typeid(T) == typeid(double)) <<//Double precision doesn't matter here since it's not used.
+		"__kernel void " << m_ZeroizeEntryPoint << "(__global uchar* buffer, uint width, uint height)\n"
+		"{\n"
+		"	if (GLOBAL_ID_X >= width || GLOBAL_ID_Y >= height)\n"
+		"		return;\n"
+		"\n"
+		"	buffer[(GLOBAL_ID_Y * width) + GLOBAL_ID_X] = 0;\n"//Can't use INDEX_IN_GRID_2D here because the grid might be larger than the buffer to make even dimensions.
+		"	barrier(CLK_GLOBAL_MEM_FENCE);\n"//Just to be safe.
+		"}\n"
+		"\n";
+
+	return os.str();
+}
+
+/// <summary>
+/// Create the string for 3D projection based on the 3D values of the ember.
+/// Projection is done on the second point.
+/// If any of these fields toggle between 0 and nonzero between runs, a recompile is triggered.
+/// </summary>
+/// <param name="ember">The ember to create the projection string for</param>
+/// <returns>The kernel string</returns>
+template <typename T>
+string IterOpenCLKernelCreator<T>::CreateProjectionString(Ember<T>& ember)
+{
+	unsigned int projBits = ember.ProjBits();
+	ostringstream os;
+
+	if (projBits)
+	{
+		if (projBits & PROJBITS_BLUR)
+		{
+			if (projBits & PROJBITS_YAW)
+			{
+				os <<
+	"		real_t dsin, dcos;\n"
+	"		real_t t = MwcNext01(&mwc) * M_2PI;\n"
+	"		real_t z = secondPoint.m_Z - ember->m_CamZPos;\n"
+	"		real_t x = ember->m_C00 * secondPoint.m_X + ember->m_C10 * secondPoint.m_Y;\n"
+	"		real_t y = ember->m_C01 * secondPoint.m_X + ember->m_C11 * secondPoint.m_Y + ember->m_C21 * z;\n"
+	"\n"
+	"		z = ember->m_C02 * secondPoint.m_X + ember->m_C12 * secondPoint.m_Y + ember->m_C22 * z;\n"
+	"\n"
+	"		real_t zr = 1 - ember->m_CamPerspective * z;\n"
+	"		real_t dr = MwcNext01(&mwc) * ember->m_BlurCoef * z;\n"
+	"\n"
+	"		dsin = sin(t);\n"
+	"		dcos = cos(t);\n"
+	"\n"
+	"		secondPoint.m_X  = (x + dr * dcos) / zr;\n"
+	"		secondPoint.m_Y  = (y + dr * dsin) / zr;\n"
+	"		secondPoint.m_Z -= ember->m_CamZPos;\n";
+			}
+			else
+			{
+				os <<
+	"		real_t y, z, zr;\n"
+	"		real_t dsin, dcos;\n"
+	"		real_t t = MwcNext01(&mwc) * M_2PI;\n"
+	"\n"
+	"		z = secondPoint.m_Z - ember->m_CamZPos;\n"
+	"		y = ember->m_C11 * secondPoint.m_Y + ember->m_C21 * z;\n"
+	"		z = ember->m_C12 * secondPoint.m_Y + ember->m_C22 * z;\n"
+	"		zr = 1 - ember->m_CamPerspective * z;\n"
+	"\n"
+	"		dsin = sin(t);\n"
+	"		dcos = cos(t);\n"
+	"\n"
+	"		real_t dr = MwcNext01(&mwc) * ember->m_BlurCoef * z;\n"
+	"\n"
+	"		secondPoint.m_X = (secondPoint.m_X + dr * dcos) / zr;\n"
+	"		secondPoint.m_Y = (y + dr * dsin) / zr;\n"
+	"		secondPoint.m_Z -= ember->m_CamZPos;\n";
+			}
+		}
+		else if ((projBits & PROJBITS_PITCH) || (projBits & PROJBITS_YAW))
+		{
+			if (projBits & PROJBITS_YAW)
+			{
+				os <<
+	"		real_t z  = secondPoint.m_Z - ember->m_CamZPos;\n"
+	"		real_t x  = ember->m_C00 * secondPoint.m_X + ember->m_C10 * secondPoint.m_Y;\n"
+	"		real_t y  = ember->m_C01 * secondPoint.m_X + ember->m_C11 * secondPoint.m_Y + ember->m_C21 * z;\n"
+	"		real_t zr = 1 - ember->m_CamPerspective * (ember->m_C02 * secondPoint.m_X + ember->m_C12 * secondPoint.m_Y + ember->m_C22 * z);\n"
+	"\n"
+	"		secondPoint.m_X = x / zr;\n"
+	"		secondPoint.m_Y = y / zr;\n"
+	"		secondPoint.m_Z -= ember->m_CamZPos;\n";
+			}
+			else
+			{
+				os <<
+	"		real_t z  = secondPoint.m_Z - ember->m_CamZPos;\n"
+	"		real_t y  = ember->m_C11 * secondPoint.m_Y + ember->m_C21 * z;\n"
+	"		real_t zr = 1 - ember->m_CamPerspective * (ember->m_C12 * secondPoint.m_Y + ember->m_C22 * z);\n"
+	"\n"
+	"		secondPoint.m_X /= zr;\n"
+	"		secondPoint.m_Y  = y / zr;\n"
+	"		secondPoint.m_Z -= ember->m_CamZPos;\n";
+			}
+		}
+		else
+		{
+			os <<
+	"		real_t zr = 1 - ember->m_CamPerspective * (secondPoint.m_Z - ember->m_CamZPos);\n"
+	"\n"
+	"		secondPoint.m_X /= zr;\n"
+	"		secondPoint.m_Y /= zr;\n"
+	"		secondPoint.m_Z -= ember->m_CamZPos;\n";
+		}
+	}
+
+	return os.str();
+}
+}
--- a/Source/EmberCL/IterOpenCLKernelCreator.h
+++ b/Source/EmberCL/IterOpenCLKernelCreator.h
@ -0,0 +1,89 @@
+#pragma once
+
+#include "EmberCLPch.h"
+#include "EmberCLStructs.h"
+#include "EmberCLFunctions.h"
+
+/// <summary>
+/// IterOpenCLKernelCreator class.
+/// </summary>
+
+namespace EmberCLns
+{
+/// <summary>
+/// Class for creating the main iteration code in OpenCL.
+/// It uses the Cuburn method of iterating where all conditionals
+/// are stripped out and a specific kernel is compiled at run-time.
+/// It uses a very sophisticated method for randomization that avoids
+/// the problem of warp/wavefront divergence that would occur if every
+/// thread selected a random xform to apply.
+/// This only works with embers of type float, double is not supported.
+/// </summary>
+template <typename T>
+class EMBERCL_API IterOpenCLKernelCreator
+{
+public:
+	IterOpenCLKernelCreator();
+	IterOpenCLKernelCreator(bool nVidia);
+	string ZeroizeKernel();
+	string ZeroizeEntryPoint();
+	string IterEntryPoint();
+	string CreateIterKernelString(Ember<T>& ember, string& parVarDefines, bool lockAccum = false, bool doAccum = true);
+	static void ParVarIndexDefines(Ember<T>& ember, pair<string, vector<T>>& params, bool doVals = true, bool doString = true);
+	static bool IsBuildRequired(Ember<T>& ember1, Ember<T>& ember2);
+
+private:
+	string CreateZeroizeKernelString();
+	string CreateProjectionString(Ember<T>& ember);
+
+	string m_IterEntryPoint;
+	string m_ZeroizeKernel;
+	string m_ZeroizeEntryPoint;
+	bool m_NVidia;
+};
+
+template EMBERCL_API class IterOpenCLKernelCreator<float>;
+
+#ifdef DO_DOUBLE
+	template EMBERCL_API class IterOpenCLKernelCreator<double>;
+#endif
+
+//
+//template EMBERCL_API string IterOpenCLKernelCreator::CreateIterKernelString<float>(Ember<float>& ember, string& parVarDefines, bool lockAccum, bool doAccum);
+//template EMBERCL_API string IterOpenCLKernelCreator::CreateIterKernelString<double>(Ember<double>& ember, string& parVarDefines, bool lockAccum, bool doAccum);
+//
+//template EMBERCL_API void IterOpenCLKernelCreator::ParVarIndexDefines<float>(Ember<float>& ember, pair<string, vector<float>>& params, bool doVals, bool doString);
+//template EMBERCL_API void IterOpenCLKernelCreator::ParVarIndexDefines<double>(Ember<double>& ember, pair<string, vector<double>>& params, bool doVals, bool doString);
+//
+//template EMBERCL_API bool IterOpenCLKernelCreator::IsBuildRequired<float>(Ember<float>& ember1, Ember<float>& ember2);
+//template EMBERCL_API bool IterOpenCLKernelCreator::IsBuildRequired<double>(Ember<double>& ember1, Ember<double>& ember2);
+
+#ifdef OPEN_CL_TEST_AREA
+typedef void (*KernelFuncPointer) (unsigned int gridWidth, unsigned int gridHeight, unsigned int blockWidth, unsigned int blockHeight,
+								   unsigned int BLOCK_ID_X, unsigned int BLOCK_ID_Y, unsigned int THREAD_ID_X, unsigned int THREAD_ID_Y);
+
+static void OpenCLSim(unsigned int gridWidth, unsigned int gridHeight, unsigned int blockWidth, unsigned int blockHeight, KernelFuncPointer func)
+{
+	cout << "OpenCLSim(): " << endl;
+	cout << "	Params: " << endl;
+	cout << "		gridW: " << gridWidth << endl;
+	cout << "		gridH: " << gridHeight << endl;
+	cout << "		blockW: " << blockWidth << endl;
+	cout << "		blockH: " << blockHeight << endl;
+
+	for (unsigned int i = 0; i < gridHeight; i += blockHeight)
+	{
+		for (unsigned int j = 0; j < gridWidth; j += blockWidth)
+		{
+			for (unsigned int k = 0; k < blockHeight; k++)
+			{
+				for (unsigned int l = 0; l < blockWidth; l++)
+				{
+					func(gridWidth, gridHeight, blockWidth, blockHeight, j / blockWidth, i / blockHeight, l, k);
+				}
+			}
+		}
+	}
+}
+#endif
+}
--- a/Source/EmberCL/OpenCLWrapper.cpp
+++ b/Source/EmberCL/OpenCLWrapper.cpp
--- a/Source/EmberCL/OpenCLWrapper.h
+++ b/Source/EmberCL/OpenCLWrapper.h
@ -0,0 +1,219 @@
+#pragma once
+
+#include "EmberCLPch.h"
+
+/// <summary>
+/// OpenCLWrapper, Spk, NamedBuffer, NamedImage2D, NamedImage2DGL classes.
+/// </summary>
+
+namespace EmberCLns
+{
+#if CL_VERSION_1_2
+#define IMAGEGL2D cl::ImageGL
+#else
+#define IMAGEGL2D cl::Image2DGL
+#endif
+
+/// <summary>
+/// Class to contain all of the things needed to store an OpenCL program.
+/// The name of it, the source, the compiled program object and the kernel.
+/// </summary>
+class EMBERCL_API Spk
+{
+public:
+	string m_Name;
+	cl::Program::Sources m_Source;
+	cl::Program m_Program;
+	cl::Kernel m_Kernel;
+};
+
+/// <summary>
+/// Class to hold an OpenCL buffer with a name to identify it by.
+/// </summary>
+class EMBERCL_API NamedBuffer
+{
+public:
+	NamedBuffer()
+	{
+	}
+
+	NamedBuffer(cl::Buffer& buff, string name)
+	{
+		m_Buffer = buff;
+		m_Name = name;
+	}
+
+	cl::Buffer m_Buffer;
+	string m_Name;
+};
+
+/// <summary>
+/// Class to hold a 2D image with a name to identify it by.
+/// </summary>
+class EMBERCL_API NamedImage2D
+{
+public:
+	NamedImage2D()
+	{
+	}
+
+	NamedImage2D(cl::Image2D& image, string name)
+	{
+		m_Image = image;
+		m_Name = name;
+	}
+
+	cl::Image2D m_Image;
+	string m_Name;
+};
+
+/// <summary>
+/// Class to hold a 2D image that is mapped to an OpenGL texture
+/// and a name to identify it by.
+/// </summary>
+class EMBERCL_API NamedImage2DGL
+{
+public:
+	NamedImage2DGL()
+	{
+	}
+
+	NamedImage2DGL(IMAGEGL2D& image, string name)
+	{
+		m_Image = image;
+		m_Name = name;
+	}
+
+	IMAGEGL2D m_Image;
+	string m_Name;
+};
+
+/// <summary>
+/// Running kernels in OpenCL can require quite a bit of setup, tear down and
+/// general housekeeping. This class helps shield the user from such hassles.
+/// It's main utility is in holding collections of programs, buffers and images
+/// all identified by names. That way, a user can access them as needed without
+/// having to pollute their code.
+/// In addition, writing to an existing object by name determines if the object
+/// can be overwritten, or if it needs to be deleted and replaced by the new one.
+/// This class derives from EmberReport, so the caller is able
+/// to retrieve a text dump of error information if any errors occur.
+/// </summary>
+class EMBERCL_API OpenCLWrapper : public EmberReport
+{
+public:
+	OpenCLWrapper();
+	bool CheckOpenCL();
+	bool Init(unsigned int platform, unsigned int device, bool shared = false);
+
+	//Programs.
+	bool AddProgram(std::string name, std::string& program, std::string& entryPoint, bool doublePrecision);
+	void ClearPrograms();
+
+	//Buffers.
+	bool AddBuffer(string name, size_t size, cl_mem_flags flags = CL_MEM_READ_WRITE);
+	bool AddAndWriteBuffer(string name, void* data, size_t size);
+	bool WriteBuffer(string name, void* data, size_t size);
+	bool WriteBuffer(unsigned int bufferIndex, void* data, size_t size);
+	bool ReadBuffer(string name, void* data, size_t size);
+	bool ReadBuffer(unsigned int bufferIndex, void* data, size_t size);
+	int FindBufferIndex(string name);
+	unsigned int GetBufferSize(string name);
+	unsigned int GetBufferSize(unsigned int bufferIndex);
+	void ClearBuffers();
+
+	//Images.
+	bool AddAndWriteImage(string name, cl_mem_flags flags, const cl::ImageFormat& format, ::size_t width, ::size_t height, ::size_t row_pitch, void* data = NULL, bool shared = false, GLuint texName = 0);
+	bool WriteImage2D(unsigned int index, bool shared, ::size_t width, ::size_t height, ::size_t row_pitch, void* data);
+	bool ReadImage(string name, ::size_t width, ::size_t height, ::size_t row_pitch, bool shared, void* data);
+	bool ReadImage(unsigned int imageIndex, ::size_t width, ::size_t height, ::size_t row_pitch, bool shared, void* data);
+	int FindImageIndex(string name, bool shared);
+	unsigned int GetImageSize(string name, bool shared);
+	unsigned int GetImageSize(unsigned int imageIndex, bool shared);
+	bool CompareImageParams(cl::Image& image, cl_mem_flags flags, const cl::ImageFormat& format, ::size_t width, ::size_t height, ::size_t row_pitch);
+	void ClearImages(bool shared);
+	bool CreateImage2D(cl::Image2D& image2D, cl_mem_flags flags, cl::ImageFormat format, ::size_t width, ::size_t height, ::size_t row_pitch = 0, void* data = NULL);
+	bool CreateImage2DGL(IMAGEGL2D& image2DGL, cl_mem_flags flags, GLenum target, GLint miplevel, GLuint texobj);
+	bool EnqueueAcquireGLObjects(string name);
+	bool EnqueueAcquireGLObjects(IMAGEGL2D& image);
+	bool EnqueueReleaseGLObjects(string name);
+	bool EnqueueReleaseGLObjects(IMAGEGL2D& image);
+	bool EnqueueAcquireGLObjects(const VECTOR_CLASS<cl::Memory>* memObjects = NULL);
+	bool EnqueueReleaseGLObjects(const VECTOR_CLASS<cl::Memory>* memObjects = NULL);
+	bool CreateSampler(cl::Sampler& sampler, cl_bool normalizedCoords, cl_addressing_mode addressingMode, cl_filter_mode filterMode);
+	
+	//Arguments.
+	bool SetBufferArg(unsigned int kernelIndex, unsigned int argIndex, string name);
+	bool SetBufferArg(unsigned int kernelIndex, unsigned int argIndex, unsigned int bufferIndex);
+	bool SetImageArg(unsigned int kernelIndex, unsigned int argIndex, bool shared, string name);
+	bool SetImageArg(unsigned int kernelIndex, unsigned int argIndex, bool shared, unsigned int imageIndex);
+	
+	/// <summary>
+	/// Set an argument in the specified kernel, at the specified argument index.
+	/// Must keep this here in the .h because it's templated.
+	/// </summary>
+	/// <param name="kernelIndex">Index of the kernel whose argument will be set</param>
+	/// <param name="argIndex">Index of the argument to set</param>
+	/// <param name="arg">The argument value to set</param>
+	/// <returns>True if success, else false</returns>
+	template <typename T>
+	bool SetArg(unsigned int kernelIndex, unsigned int argIndex, T arg)
+	{
+		if (m_Init && kernelIndex < m_Programs.size())
+		{
+			cl_int err = m_Programs[kernelIndex].m_Kernel.setArg(argIndex, arg);
+
+			return CheckCL(err, "cl::Kernel::setArg()");
+		}
+
+		return false;
+	}
+
+	//Kernels.
+	int FindKernelIndex(string name);
+	bool RunKernel(unsigned int kernelIndex, unsigned int totalGridWidth, unsigned int totalGridHeight, unsigned int totalGridDepth, unsigned int blockWidth, unsigned int blockHeight, unsigned int blockDepth);
+
+	//Info.
+	template<typename T>
+	T GetInfo(size_t platform, size_t device, cl_device_info name);
+	string PlatformName(size_t platform);
+	vector<string> PlatformNames();
+	string DeviceName(size_t platform, size_t device);
+	vector<string> DeviceNames(size_t platform);
+	string DeviceAndPlatformNames();
+	string DumpInfo();
+
+	//Accessors.
+	bool Ok();
+	bool Shared();
+	cl::Context Context();
+	unsigned int PlatformIndex();
+	unsigned int DeviceIndex();
+	unsigned int LocalMemSize();
+
+	static void MakeEvenGridDims(unsigned int blockW, unsigned int blockH, unsigned int& gridW, unsigned int& gridH);
+
+private:
+	bool CreateContext(bool shared);
+	bool CreateSPK(std::string& name, std::string& program, std::string& entryPoint, Spk& spk, bool doublePrecision);
+	bool CheckCL(cl_int err, const char* name);
+	std::string ErrorToStringCL(cl_int err);
+
+	bool m_Init;
+	bool m_Shared;
+	unsigned int m_PlatformIndex;
+	unsigned int m_DeviceIndex;
+	unsigned int m_LocalMemSize;
+	cl::Platform m_Platform;
+	cl::Context m_Context;
+	cl::Device m_Device;
+	cl::CommandQueue m_Queue;
+	std::vector<cl::Platform> m_Platforms;
+	std::vector<std::vector<cl::Device>> m_Devices;
+	std::vector<cl::Device> m_DeviceVec;
+	std::vector<Spk> m_Programs;
+	std::vector<NamedBuffer> m_Buffers;
+	std::vector<NamedImage2D> m_Images;
+	std::vector<NamedImage2DGL> m_GLImages;
+};
+}
--- a/Source/EmberCL/RendererCL.cpp
+++ b/Source/EmberCL/RendererCL.cpp
--- a/Source/EmberCL/RendererCL.h
+++ b/Source/EmberCL/RendererCL.h
@ -0,0 +1,156 @@
+#pragma once
+
+#include "EmberCLPch.h"
+#include "OpenCLWrapper.h"
+#include "IterOpenCLKernelCreator.h"
+#include "DEOpenCLKernelCreator.h"
+#include "FinalAccumOpenCLKernelCreator.h"
+
+/// <summary>
+/// RendererCL class.
+/// </summary>
+
+namespace EmberCLns
+{
+class EMBERCL_API RendererCLBase
+{
+public:
+	virtual bool ReadFinal(unsigned char* pixels) { return false; }
+	virtual bool ClearFinal() { return false; }
+};
+
+/// <summary>
+/// RendererCL is a derivation of the basic CPU renderer which
+/// overrides various functions to render on the GPU using OpenCL.
+/// Since this class derives from EmberReport and also contains an
+/// OpenCLWrapper member which also derives from EmberReport, the
+/// reporting functions are overridden to aggregate the errors from
+/// both sources.
+/// It does not support different types for T and bucketT, so it only has one template argument
+/// and uses both for the base.
+/// </summary>
+template <typename T>
+class EMBERCL_API RendererCL : public RendererCLBase, public Renderer<T, T>
+{
+public:
+	RendererCL(unsigned int platform = 0, unsigned int device = 0, bool shared = false, GLuint outputTexID = 0);
+	~RendererCL();
+
+	//Ordinary member functions for OpenCL specific tasks.
+	bool Init(unsigned int platform, unsigned int device, bool shared, GLuint outputTexID);
+	inline unsigned int IterBlocksWide();
+	inline unsigned int IterBlocksHigh();
+	inline unsigned int IterBlockWidth();
+	inline unsigned int IterBlockHeight();
+	inline unsigned int IterGridWidth();
+	inline unsigned int IterGridHeight();
+	inline unsigned int TotalIterKernelCount();
+	unsigned int PlatformIndex();
+	unsigned int DeviceIndex();
+	bool ReadHist();
+	bool ReadAccum();
+	bool ReadPoints(vector<PointCL<T>>& vec);
+	virtual bool ReadFinal(unsigned char* pixels);
+	virtual bool ClearFinal();
+	bool ClearHist();
+	bool ClearAccum();
+	bool WritePoints(vector<PointCL<T>>& vec);
+	string IterKernel();
+
+	//Public virtual functions overriden from Renderer.
+	virtual unsigned __int64 MemoryAvailable();
+	virtual bool Ok() const;
+	virtual void NumChannels(unsigned int numChannels);
+	virtual void DumpErrorReport();
+	virtual void ClearErrorReport();
+	virtual unsigned int SubBatchSize() const;
+	virtual unsigned int ThreadCount() const;
+	virtual void ThreadCount(unsigned int threads, const char* seedString = NULL);
+	virtual bool CreateDEFilter(bool& newAlloc);
+	virtual bool CreateSpatialFilter(bool& newAlloc);
+	virtual eRendererType RendererType() const;
+	virtual string ErrorReportString();
+	virtual vector<string> ErrorReport();
+
+#ifndef TEST_CL
+protected:
+#endif
+	//Protected virtual functions overriden from Renderer.
+	virtual void MakeDmap(T colorScalar);
+	virtual bool Alloc();
+	virtual bool ResetBuckets(bool resetHist = true, bool resetAccum = true);
+	virtual eRenderStatus LogScaleDensityFilter();
+	virtual eRenderStatus GaussianDensityFilter();
+	virtual eRenderStatus AccumulatorToFinalImage(unsigned char* pixels, size_t finalOffset);
+	virtual EmberStats Iterate(unsigned __int64 iterCount, unsigned int pass, unsigned int temporalSample);
+
+private:
+	//Private functions for making and running OpenCL programs.
+	bool BuildIterProgramForEmber(bool doAccum = true);
+	bool RunIter(unsigned __int64 iterCount, unsigned int pass, unsigned int temporalSample, unsigned __int64& itersRan);
+	eRenderStatus RunLogScaleFilter();
+	eRenderStatus RunDensityFilter();
+	eRenderStatus RunFinalAccum();
+	bool ClearBuffer(string bufferName, unsigned int width, unsigned int height, unsigned int elementSize);
+	bool RunDensityFilterPrivate(unsigned int kernelIndex, unsigned int gridW, unsigned int gridH, unsigned int blockW, unsigned int blockH, unsigned int chunkSizeW, unsigned int chunkSizeH, unsigned int rowParity, unsigned int colParity);
+	int MakeAndGetDensityFilterProgram(unsigned int ss, unsigned int filterWidth);
+	int MakeAndGetFinalAccumProgram(T& alphaBase, T& alphaScale);
+	int MakeAndGetGammaCorrectionProgram();
+
+	//Private functions passing data to OpenCL programs.
+	DensityFilterCL<T> ConvertDensityFilter();
+	SpatialFilterCL<T> ConvertSpatialFilter();
+	EmberCL<T> ConvertEmber(Ember<T>& ember);
+	static CarToRasCL<T> ConvertCarToRas(const CarToRas<T>& carToRas);
+
+	bool m_Init;
+	bool m_NVidia;
+	bool m_DoublePrecision;
+	unsigned int m_IterBlocksWide, m_IterBlockWidth;
+	unsigned int m_IterBlocksHigh, m_IterBlockHeight;
+	unsigned int m_MaxDEBlockSizeW;
+	unsigned int m_MaxDEBlockSizeH;
+	unsigned int m_WarpSize;
+	unsigned int m_Calls;
+
+	string m_EmberBufferName;
+	string m_ParVarsBufferName;
+	string m_DistBufferName;
+	string m_CarToRasBufferName;
+	string m_DEFilterParamsBufferName;
+	string m_SpatialFilterParamsBufferName;
+	string m_DECoefsBufferName;
+	string m_DEWidthsBufferName;
+	string m_DECoefIndicesBufferName;
+	string m_SpatialFilterCoefsBufferName;
+	string m_HistBufferName;
+	string m_AccumBufferName;
+	string m_FinalImageName;
+	string m_PointsBufferName;
+
+	string m_IterKernel;
+	
+	OpenCLWrapper m_Wrapper;
+	cl::ImageFormat m_PaletteFormat;
+	cl::ImageFormat m_FinalFormat;
+	cl::Image2D m_Palette;
+	IMAGEGL2D m_AccumImage;
+	GLuint m_OutputTexID;
+	EmberCL<T> m_EmberCL;
+	Palette<float> m_Dmap;//Used instead of the base class' m_Dmap because OpenCL only supports float textures.
+	CarToRasCL<T> m_CarToRasCL;
+	DensityFilterCL<T> m_DensityFilterCL;
+	SpatialFilterCL<T> m_SpatialFilterCL;
+	IterOpenCLKernelCreator<T> m_IterOpenCLKernelCreator;
+	DEOpenCLKernelCreator<T> m_DEOpenCLKernelCreator;
+	FinalAccumOpenCLKernelCreator<T> m_FinalAccumOpenCLKernelCreator;
+	pair<string, vector<T>> m_Params;
+	Ember<T> m_LastBuiltEmber;
+};
+
+template EMBERCL_API class RendererCL<float>;
+
+#ifdef DO_DOUBLE
+	template EMBERCL_API class RendererCL<double>;
+#endif
+}