#include "EmberCLPch.h"
#include "FinalAccumOpenCLKernelCreator.h"
namespace EmberCLns
{
/// 
/// Constructor that creates all kernel strings.
/// The caller will access these strings through the accessor functions.
/// 
template 
FinalAccumOpenCLKernelCreator::FinalAccumOpenCLKernelCreator()
{
	m_GammaCorrectionWithAlphaCalcEntryPoint    = "GammaCorrectionWithAlphaCalcKernel";
	m_GammaCorrectionWithoutAlphaCalcEntryPoint = "GammaCorrectionWithoutAlphaCalcKernel";
	m_GammaCorrectionWithAlphaCalcKernel    = CreateGammaCorrectionKernelString(true);
	m_GammaCorrectionWithoutAlphaCalcKernel = CreateGammaCorrectionKernelString(false);
	m_FinalAccumEarlyClipEntryPoint                               = "FinalAccumEarlyClipKernel";
	m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint    = "FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel";
	m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint = "FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel";
	m_FinalAccumEarlyClipKernel                               = CreateFinalAccumKernelString(true, false, false);
	m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel    = CreateFinalAccumKernelString(true, true,  true);
	m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel = CreateFinalAccumKernelString(true, false, true);
	m_FinalAccumLateClipEntryPoint                               = "FinalAccumLateClipKernel";
	m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint    = "FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel";
	m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint = "FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel";
	m_FinalAccumLateClipKernel                               = CreateFinalAccumKernelString(false, false, false);
	m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel    = CreateFinalAccumKernelString(false, true,  true);
	m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel = CreateFinalAccumKernelString(false, false, true);
}
/// 
/// Kernel source and entry point properties, getters only.
/// 
template  string FinalAccumOpenCLKernelCreator::GammaCorrectionWithAlphaCalcKernel()        { return m_GammaCorrectionWithAlphaCalcKernel;	    }
template  string FinalAccumOpenCLKernelCreator::GammaCorrectionWithAlphaCalcEntryPoint()    { return m_GammaCorrectionWithAlphaCalcEntryPoint;    }
template  string FinalAccumOpenCLKernelCreator::GammaCorrectionWithoutAlphaCalcKernel()     { return m_GammaCorrectionWithoutAlphaCalcKernel;     }
template  string FinalAccumOpenCLKernelCreator::GammaCorrectionWithoutAlphaCalcEntryPoint() { return m_GammaCorrectionWithoutAlphaCalcEntryPoint; }
template  string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipKernel()                                   { return m_FinalAccumEarlyClipKernel;                                   }
template  string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipEntryPoint()                               { return m_FinalAccumEarlyClipEntryPoint;                               }
template  string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel()        { return m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel;        }
template  string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint()    { return m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint;    }
template  string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel()     { return m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel;     }
template  string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint() { return m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint; }
template  string FinalAccumOpenCLKernelCreator::FinalAccumLateClipKernel()                                   { return m_FinalAccumLateClipKernel;                                   }
template  string FinalAccumOpenCLKernelCreator::FinalAccumLateClipEntryPoint()                               { return m_FinalAccumLateClipEntryPoint;                               }
template  string FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel()        { return m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel;        }
template  string FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint()    { return m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint;    }
template  string FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel()     { return m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel;     }
template  string FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint() { return m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint; }
/// 
/// Get the gamma correction entry point.
/// 
/// The number of channels used, 3 or 4.
/// True if channels equals 4 and using transparency, else false.
/// The name of the gamma correction entry point kernel function
template 
string FinalAccumOpenCLKernelCreator::GammaCorrectionEntryPoint(size_t channels, bool transparency)
{
	bool alphaCalc = ((channels > 3) && transparency);
	return alphaCalc ? m_GammaCorrectionWithAlphaCalcEntryPoint : m_GammaCorrectionWithoutAlphaCalcEntryPoint;
}
/// 
/// Get the gamma correction kernel string.
/// 
/// The number of channels used, 3 or 4.
/// True if channels equals 4 and using transparency, else false.
/// The gamma correction kernel string
template 
string FinalAccumOpenCLKernelCreator::GammaCorrectionKernel(size_t channels, bool transparency)
{
	bool alphaCalc = ((channels > 3) && transparency);
	return alphaCalc ? m_GammaCorrectionWithAlphaCalcKernel : m_GammaCorrectionWithoutAlphaCalcKernel;
}
/// 
/// Get the final accumulation entry point.
/// 
/// True if early clip is desired, else false.
/// The number of channels used, 3 or 4.
/// True if channels equals 4 and using transparency, else false.
/// Storage for the alpha base value used in the kernel. 0 if transparency is true, else 255.
/// Storage for the alpha scale value used in the kernel. 255 if transparency is true, else 0.
/// The name of the final accumulation entry point kernel function
template 
string FinalAccumOpenCLKernelCreator::FinalAccumEntryPoint(bool earlyClip, size_t channels, bool transparency, T& alphaBase, T& alphaScale)
{
	bool alphaCalc = ((channels > 3) && transparency);
	bool alphaAccum = channels > 3;
	if (alphaAccum)
	{
		alphaBase = transparency ? 0.0f : 255.0f;//See the table below.
		alphaScale = transparency ? 255.0f : 0.0f;
	}
	if (earlyClip)
	{
		if (!alphaCalc && !alphaAccum)//Rgb output, the most common case.
			return FinalAccumEarlyClipEntryPoint();
		else if (alphaCalc && alphaAccum)//Rgba output and Transparency.
			return FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint();
		else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency.
			return FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint();
		else
			return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense.
	}
	else
	{
		if (!alphaCalc && !alphaAccum)//Rgb output, the most common case.
			return FinalAccumLateClipEntryPoint();
		else if (alphaCalc && alphaAccum)//Rgba output and Transparency.
			return FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint();
		else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency.
			return FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint();
		else
			return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense.
	}
}
/// 
/// Get the final accumulation kernel string.
/// 
/// True if early clip is desired, else false.
/// The number of channels used, 3 or 4.
/// True if channels equals 4 and using transparency, else false.
/// The final accumulation kernel string
template 
string FinalAccumOpenCLKernelCreator::FinalAccumKernel(bool earlyClip, size_t channels, bool transparency)
{
	bool alphaCalc = (channels > 3 && transparency);
	bool alphaAccum = channels > 3;
	if (earlyClip)
	{
		if (!alphaCalc && !alphaAccum)//Rgb output, the most common case.
			return FinalAccumEarlyClipKernel();
		else if (alphaCalc && alphaAccum)//Rgba output and Transparency.
			return FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel();
		else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency.
			return FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel();
		else
			return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense.
	}
	else
	{
		if (!alphaCalc && !alphaAccum)//Rgb output, the most common case.
			return FinalAccumLateClipKernel();
		else if (alphaCalc && alphaAccum)//Rgba output and Transparency.
			return FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel();
		else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency.
			return FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel();
		else
			return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense.
	}
}
/// 
/// Wrapper around CreateFinalAccumKernelString().
/// 
/// True if early clip is desired, else false.
/// The number of channels used, 3 or 4.
/// True if channels equals 4 and using transparency, else false.
/// The final accumulation kernel string
template 
string FinalAccumOpenCLKernelCreator::CreateFinalAccumKernelString(bool earlyClip, size_t channels, bool transparency)
{
	return CreateFinalAccumKernelString(earlyClip, (channels > 3 && transparency), channels > 3);
}
/// 
/// Create the final accumulation kernel string
/// 
/// True if early clip is desired, else false.
/// True if channels equals 4 and transparency is desired, else false.
/// True if channels equals 4
/// The final accumulation kernel string
template 
string FinalAccumOpenCLKernelCreator::CreateFinalAccumKernelString(bool earlyClip, bool alphaCalc, bool alphaAccum)
{
	ostringstream os;
	string channels = alphaAccum ? "4" : "3";
	os <<
		ConstantDefinesString(typeid(T) == typeid(double)) <<
		ClampRealFunctionString <<
		UnionCLStructString <<
		RgbToHsvFunctionString <<
		HsvToRgbFunctionString <<
		CalcAlphaFunctionString <<
		CurveAdjustFunctionString <<
		SpatialFilterCLStructString;
	if (earlyClip)
	{
		if (!alphaCalc && !alphaAccum)//Rgb output, the most common case.
			os << "__kernel void " << m_FinalAccumEarlyClipEntryPoint << "(\n";
		else if (alphaCalc && alphaAccum)//Rgba output and Transparency.
			os << "__kernel void " << m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint << "(\n";
		else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency.
			os << "__kernel void " << m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint << "(\n";
		else
			return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense.
	}
	else
	{
		os <<
			CreateCalcNewRgbFunctionString(false) <<
			CreateGammaCorrectionFunctionString(false, alphaCalc, alphaAccum, true);
		if (!alphaCalc && !alphaAccum)//Rgb output, the most common case.
			os << "__kernel void " << m_FinalAccumLateClipEntryPoint << "(\n";
		else if (alphaCalc && alphaAccum)//Rgba output and Transparency.
			os << "__kernel void " << m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint << "(\n";
		else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency.
			os << "__kernel void " << m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint << "(\n";
		else
			return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense.
	}
	os <<
		"	const __global real4reals* accumulator,\n"
		"	__write_only image2d_t pixels,\n"
		"	__constant SpatialFilterCL* spatialFilter,\n"
		"	__constant real_t* filterCoefs,\n"
		"	__constant real4reals* csa,\n"
		"	const uint doCurves,\n"
		"	const real_t alphaBase,\n"
		"	const real_t alphaScale\n"
		"\t)\n"
		"{\n"
		"\n"
		"	if ((GLOBAL_ID_Y >= spatialFilter->m_FinalRasH) || (GLOBAL_ID_X >= spatialFilter->m_FinalRasW))\n"
		"		return;\n"
		"\n"
		"	uint accumX = spatialFilter->m_DensityFilterOffset + (GLOBAL_ID_X * spatialFilter->m_Supersample);\n"
		"	uint accumY = spatialFilter->m_DensityFilterOffset + (GLOBAL_ID_Y * spatialFilter->m_Supersample);\n"
		"	int2 finalCoord;\n"
		"	finalCoord.x = GLOBAL_ID_X;\n"
		"	finalCoord.y = (int)((spatialFilter->m_YAxisUp == 1) ? ((spatialFilter->m_FinalRasH - GLOBAL_ID_Y) - 1) : GLOBAL_ID_Y);\n"
		"	float4floats finalColor;\n"
		"	int ii, jj;\n"
		"	uint filterKRowIndex;\n"
		"	const __global real4reals* accumBucket;\n"
		"	real4reals newBucket;\n"
		"	newBucket.m_Real4 = 0;\n"
		"\n"
		"	for (jj = 0; jj < spatialFilter->m_FilterWidth; jj++)\n"
		"	{\n"
		"		filterKRowIndex = jj * spatialFilter->m_FilterWidth;\n"
		"\n"
		"		for (ii = 0; ii < spatialFilter->m_FilterWidth; ii++)\n"
		"		{\n"
		"			real_t k = filterCoefs[ii + filterKRowIndex];\n"
		"\n"
		"			accumBucket = accumulator + (accumX + ii) + ((accumY + jj) * spatialFilter->m_SuperRasW);\n"
		"			newBucket.m_Real4 += (k * accumBucket->m_Real4);\n"
		"		}\n"
		"	}\n"
		"\n";
	//Not supporting 2 bytes per channel on the GPU. If the user wants it, run on the CPU.
	if (earlyClip)//If early clip, simply assign values directly to the temp float4 since they've been gamma corrected already, then write it straight to the output image below.
	{
		os <<
		"	finalColor.m_Float4.x = (float)newBucket.m_Real4.x;\n"//CPU side clamps, skip here because write_imagef() does the clamping for us.
		"	finalColor.m_Float4.y = (float)newBucket.m_Real4.y;\n"
		"	finalColor.m_Float4.z = (float)newBucket.m_Real4.z;\n";
		if (alphaAccum)
		{
			if (alphaCalc)
				os << "	finalColor.m_Float4.w = (float)newBucket.m_Real4.w * 255.0f;\n";
			else
				os << "	finalColor.m_Float4.w = 255;\n";
		}
	}
	else
	{
		//Late clip, so must gamma correct from the temp new bucket to temp float4.
		if (typeid(T) == typeid(double))
		{
			os <<
		"	real4reals realFinal;\n"
		"\n"
		"	GammaCorrectionFloats(&newBucket, &(spatialFilter->m_Background[0]), spatialFilter->m_Gamma, spatialFilter->m_LinRange, spatialFilter->m_Vibrancy, spatialFilter->m_HighlightPower, alphaBase, alphaScale, &(realFinal.m_Reals[0]));\n"
		"	finalColor.m_Float4.x = (float)realFinal.m_Real4.x;\n"
		"	finalColor.m_Float4.y = (float)realFinal.m_Real4.y;\n"
		"	finalColor.m_Float4.z = (float)realFinal.m_Real4.z;\n"
		"	finalColor.m_Float4.w = (float)realFinal.m_Real4.w;\n"
		;
		}
		else
		{
			os <<
		"	GammaCorrectionFloats(&newBucket, &(spatialFilter->m_Background[0]), spatialFilter->m_Gamma, spatialFilter->m_LinRange, spatialFilter->m_Vibrancy, spatialFilter->m_HighlightPower, alphaBase, alphaScale, &(finalColor.m_Floats[0]));\n";
		}
	}
	os <<
		"\n"
		"	if (doCurves)\n"
		"	{\n"
		"		CurveAdjust(csa, &(finalColor.m_Floats[0]), 1);\n"
		"		CurveAdjust(csa, &(finalColor.m_Floats[1]), 2);\n"
		"		CurveAdjust(csa, &(finalColor.m_Floats[2]), 3);\n"
		"	}\n"
		"\n"
		"	finalColor.m_Float4 /= 255.0f;\n"
		"	write_imagef(pixels, finalCoord, finalColor.m_Float4);\n"//Use write_imagef instead of write_imageui because only the former works when sharing with an OpenGL texture.
		"	barrier(CLK_GLOBAL_MEM_FENCE);\n"//Required, or else page tearing will occur during interactive rendering.
		"}\n"
		;
	return os.str();
}
/// 
/// Creates the gamma correction function string.
/// This is not a full kernel, just a function that is used in the kernels.
/// 
/// True if writing to a global buffer (early clip), else false (late clip).
/// True if channels equals 4 and transparency is desired, else false.
/// True if channels equals 4
/// True if writing to global buffer (late clip), else false (early clip).
/// The gamma correction function string
template 
string FinalAccumOpenCLKernelCreator::CreateGammaCorrectionFunctionString(bool globalBucket, bool alphaCalc, bool alphaAccum, bool finalOut)
{
	ostringstream os;
	string dataType;
	string unionMember;
	dataType = "real_t";
	//Use real_t for all cases, early clip and final accum.
	os << "void GammaCorrectionFloats(" << (globalBucket ? "__global " : "") << "real4reals* bucket, __constant real_t* background, real_t g, real_t linRange, real_t vibrancy, real_t highlightPower, real_t alphaBase, real_t alphaScale, " << (finalOut ? "" : "__global") << " real_t* correctedChannels)\n";
	os
	<< "{\n"
	<< "	real_t alpha, ls, tmp, a;\n"
	<< "	real4reals newRgb;\n"
	<< "\n"
	<< "	if (bucket->m_Reals[3] <= 0)\n"
	<< "	{\n"
	<< "		alpha = 0;\n"
	<< "		ls = 0;\n"
	<< "	}\n"
	<< "	else\n"
	<< "	{\n"
	<< "		tmp = bucket->m_Reals[3];\n"
	<< "		alpha = CalcAlpha(tmp, g, linRange);\n"
	<< "		ls = vibrancy * 256.0 * alpha / tmp;\n"
	<< "		ClampRef(&alpha, 0.0, 1.0);\n"
	<< "	}\n"
	<< "\n"
	<< "	CalcNewRgb(bucket, ls, highlightPower, &newRgb);\n"
	<< "\n"
	<< "	for (uint rgbi = 0; rgbi < 3; rgbi++)\n"
	<< "	{\n"
	<< "		a = newRgb.m_Reals[rgbi] + ((1.0 - vibrancy) * 256.0 * pow(bucket->m_Reals[rgbi], g));\n"
	<< "\n";
	if (!alphaCalc)
	{
		os <<
		"		a += ((1.0 - alpha) * background[rgbi]);\n";
	}
	else
	{
		os
	<< "		if (alpha > 0)\n"
	<< "			a /= alpha;\n"
	<< "		else\n"
	<< "			a = 0;\n";
	}
	os <<
	"\n"
	"			correctedChannels[rgbi] = (" << dataType << ")clamp(a, 0.0, 255.0);\n"
	"		}\n"
	"\n";
	//The CPU code has 3 cases for assigning alpha:
	//[3] = alpha.//Early clip.
	//[3] = alpha * 255.//Final Rgba with transparency.
	//[3] = 255.//Final Rgba without transparency.
	//Putting conditionals in GPU code is to be avoided. So do base + alpha * scale which will
	//work for all 3 cases without using a conditional, which should be faster on a GPU. This gives:
	//Base = 0,   scale = 1.   [3] = (0 +   (alpha * 1)).   [3] = alpha.
	//Base = 0,   scale = 255. [3] = (0 +   (alpha * 255)). [3] = alpha * 255.
	//Base = 255, scale = 0.   [3] = (255 + (alpha * 0)).   [3] = 255.
	if (alphaAccum)
	{
		os
	<< "	correctedChannels[3] = (" << dataType << ")(alphaBase + (alpha * alphaScale));\n";
	}
	os <<
	"}\n"
	"\n";
	return os.str();
}
/// 
/// OpenCL equivalent of Palette::CalcNewRgb().
/// 
/// True if writing the corrected value to a global buffer (early clip), else false (late clip).
/// The CalcNewRgb function string
template 
string FinalAccumOpenCLKernelCreator::CreateCalcNewRgbFunctionString(bool globalBucket)
{
	ostringstream os;
	os <<
	"static void CalcNewRgb(" << (globalBucket ? "__global " : "") << "real4reals* oldRgb, real_t ls, real_t highPow, real4reals* newRgb)\n"
	"{\n"
	"	int rgbi;\n"
	"	real_t newls, lsratio;\n"
	"	real4reals newHsv;\n"
	"	real_t maxa, maxc;\n"
	"	real_t adjhlp;\n"
	"\n"
	"	if (ls == 0 || (oldRgb->m_Real4.x == 0 && oldRgb->m_Real4.y == 0 && oldRgb->m_Real4.z == 0))\n"//Can't do a vector compare to zero.
	"	{\n"
	"		newRgb->m_Real4 = 0;\n"
	"		return;\n"
	"	}\n"
	"\n"
	//Identify the most saturated channel.
	"	maxc = max(max(oldRgb->m_Reals[0], oldRgb->m_Reals[1]), oldRgb->m_Reals[2]);\n"
	"	maxa = ls * maxc;\n"
	"\n"
	//If a channel is saturated and highlight power is non-negative
	//modify the color to prevent hue shift.
	"	if (maxa > 255 && highPow >= 0)\n"
	"	{\n"
	"		newls = 255.0 / maxc;\n"
	"		lsratio = pow(newls / ls, highPow);\n"
	"\n"
	//Calculate the max-value color (ranged 0 - 1).
	"		for (rgbi = 0; rgbi < 3; rgbi++)\n"
	"			newRgb->m_Reals[rgbi] = newls * oldRgb->m_Reals[rgbi] / 255.0;\n"
	"\n"
	//Reduce saturation by the lsratio.
	"		RgbToHsv(&(newRgb->m_Real4), &(newHsv.m_Real4));\n"
	"		newHsv.m_Real4.y *= lsratio;\n"
	"		HsvToRgb(&(newHsv.m_Real4), &(newRgb->m_Real4));\n"
	"\n"
	"		for (rgbi = 0; rgbi < 3; rgbi++)\n"//Unrolling and vectorizing makes no difference.
	"			newRgb->m_Reals[rgbi] *= 255.0;\n"
	"	}\n"
	"	else\n"
	"	{\n"
	"		newls = 255.0 / maxc;\n"
	"		adjhlp = -highPow;\n"
	"\n"
	"		if (adjhlp > 1)\n"
	"			adjhlp = 1;\n"
	"\n"
	"		if (maxa <= 255)\n"
	"			adjhlp = 1;\n"
	"\n"
	//Calculate the max-value color (ranged 0 - 1) interpolated with the old behavior.
	"		for (rgbi = 0; rgbi < 3; rgbi++)\n"//Unrolling, caching and vectorizing makes no difference.
	"			newRgb->m_Reals[rgbi] = ((1.0 - adjhlp) * newls + adjhlp * ls) * oldRgb->m_Reals[rgbi];\n"
	"	}\n"
	"}\n"
	"\n";
	return os.str();
}
/// 
/// Create the gamma correction kernel string used for early clipping.
/// 
/// True if channels equals 4 and transparency is desired, else false.
/// The gamma correction kernel string used for early clipping
template 
string FinalAccumOpenCLKernelCreator::CreateGammaCorrectionKernelString(bool alphaCalc)
{
	ostringstream os;
	string dataType;
	os <<
		ConstantDefinesString(typeid(T) == typeid(double)) <<
		ClampRealFunctionString <<
		UnionCLStructString <<
		RgbToHsvFunctionString <<
		HsvToRgbFunctionString <<
		CalcAlphaFunctionString <<
		CreateCalcNewRgbFunctionString(true) <<
		SpatialFilterCLStructString <<
		CreateGammaCorrectionFunctionString(true, alphaCalc, true, false);//Will only be used with float in this case, early clip. Will always alpha accum.
		os << "__kernel void " << (alphaCalc ? m_GammaCorrectionWithAlphaCalcEntryPoint : m_GammaCorrectionWithoutAlphaCalcEntryPoint) << "(\n" <<
			"	__global real4reals* accumulator,\n"
			"	__constant SpatialFilterCL* spatialFilter\n"
			")\n"
			"{\n"
			"	int testGutter = 0;\n"
			"\n"
			"	if (GLOBAL_ID_Y >= (spatialFilter->m_SuperRasH - testGutter) || GLOBAL_ID_X >= (spatialFilter->m_SuperRasW - testGutter))\n"
			"		return;\n"
			"\n"
			"	uint superIndex = (GLOBAL_ID_Y * spatialFilter->m_SuperRasW) + GLOBAL_ID_X;\n"
			"	__global real4reals* bucket = accumulator + superIndex;\n"
			//Pass in an alphaBase and alphaScale of 0, 1 which means to just directly assign the computed alpha value.
			"	GammaCorrectionFloats(bucket, &(spatialFilter->m_Background[0]), spatialFilter->m_Gamma, spatialFilter->m_LinRange, spatialFilter->m_Vibrancy, spatialFilter->m_HighlightPower, 0.0, 1.0, &(bucket->m_Reals[0]));\n"
			"}\n"
		;
	return os.str();
}
//Explicit instantiations so the member definitions in this file get compiled and exported.
//The double version is only built when DO_DOUBLE is defined.
template EMBERCL_API class FinalAccumOpenCLKernelCreator<float>;
#ifdef DO_DOUBLE
	template EMBERCL_API class FinalAccumOpenCLKernelCreator<double>;
#endif
}