--Bug fixes

-Attempt to fix several OpenCL build errors that were occurring on Mac.

--Code changes
-Improve the coloring code during final accum by getting rid of the last remnants of unnecessary scaling to 255 from flam3.
-Begin work on supporting 16bpc on the GPU.
2025-07-01 13:56:06 -04:00 · 2017-07-04 23:08:06 -07:00
parent 562ab9b01a
commit 2adf9f3ff9
16 changed files with 292 additions and 232 deletions
--- a/Source/EmberCL/EmberCLFunctions.h
+++ b/Source/EmberCL/EmberCLFunctions.h
@ -12,7 +12,7 @@ namespace EmberCLns
 /// <summary>
 /// OpenCL equivalent of Palette::RgbToHsv().
 /// </summary>
-static const char* RgbToHsvFunctionString = 
+static const char* RgbToHsvFunctionString =
 	//rgb 0 - 1,
 	//h 0 - 6, s 0 - 1, v 0 - 1
 	"static inline void RgbToHsv(real4_bucket* rgb, real4_bucket* hsv)\n"
@ -82,9 +82,9 @@ static const char* RgbToHsvFunctionString =
 /// <summary>
 /// OpenCL equivalent of Palette::HsvToRgb().
 /// </summary>
-static const char* HsvToRgbFunctionString = 
+static const char* HsvToRgbFunctionString =
 	//h 0 - 6, s 0 - 1, v 0 - 1
-	//rgb 0 - 1 
+	//rgb 0 - 1
 	"static inline void HsvToRgb(real4_bucket* hsv, real4_bucket* rgb)\n"
 	"{\n"
 	"	int j;\n"
@ -118,11 +118,11 @@ static const char* HsvToRgbFunctionString =
 /// <summary>
 /// OpenCL equivalent of Palette::CalcAlpha().
 /// </summary>
-static const char* CalcAlphaFunctionString = 
+static const char* CalcAlphaFunctionString =
 	"static inline real_t CalcAlpha(real_bucket_t density, real_bucket_t gamma, real_bucket_t linrange)\n"//Not the slightest clue what this is doing.//DOC
 	"{\n"
 	"	real_bucket_t frac, alpha, funcval = pow(linrange, gamma);\n"
-		"\n"
+	"\n"
 	"	if (density > 0)\n"
 	"	{\n"
 	"		if (density < linrange)\n"
@ -147,13 +147,13 @@ static const char* CalcAlphaFunctionString =
 /// during final accumulation, which only takes floats.
 /// </summary>
 static const char* CurveAdjustFunctionString =
-"static inline void CurveAdjust(__constant real4reals_bucket* csa, float* a, uint index)\n"
-"{\n"
-"	uint tempIndex = (uint)clamp(*a, (float)0.0, (float)COLORMAP_LENGTH_MINUS_1);\n"
-"	uint tempIndex2 = (uint)clamp((float)csa[tempIndex].m_Real4.x, (float)0.0, (float)COLORMAP_LENGTH_MINUS_1);\n"
-"\n"
-"	*a = (float)round(csa[tempIndex2].m_Reals[index]);\n"
-"}\n";
+	"static inline void CurveAdjust(__constant real4reals_bucket* csa, float* a, uint index)\n"
+	"{\n"
+	"	uint tempIndex = (uint)clamp(*a * (float)COLORMAP_LENGTH_MINUS_1, (float)0.0, (float)COLORMAP_LENGTH_MINUS_1);\n"
+	"	uint tempIndex2 = (uint)clamp((float)csa[tempIndex].m_Real4.x * (float)COLORMAP_LENGTH_MINUS_1, (float)0.0, (float)COLORMAP_LENGTH_MINUS_1);\n"
+	"\n"
+	"	*a = (float)csa[tempIndex2].m_Reals[index];\n"
+	"}\n";

 /// <summary>
 /// Use MWC 64 from David Thomas at the Imperial College of London for
@ -197,7 +197,7 @@ static const char* RandFunctionString =
 /// <summary>
 /// OpenCL equivalent Renderer::AddToAccum().
 /// </summary>
-static const char* AddToAccumWithCheckFunctionString = 
+static const char* AddToAccumWithCheckFunctionString =
 	"inline bool AccumCheck(int superRasW, int superRasH, int i, int ii, int j, int jj)\n"
 	"{\n"
 	"	return (j + jj >= 0 && j + jj < superRasH && i + ii >= 0 && i + ii < superRasW);\n"
@ -207,7 +207,7 @@ static const char* AddToAccumWithCheckFunctionString =
 /// <summary>
 /// OpenCL equivalent various CarToRas member functions.
 /// </summary>
-static const char* CarToRasFunctionString = 
+static const char* CarToRasFunctionString =
 	"inline void CarToRasConvertPointToSingle(__constant CarToRasCL* carToRas, Point* point, uint* singleBufferIndex)\n"
 	"{\n"
 	"	*singleBufferIndex = (uint)(carToRas->m_PixPerImageUnitW * point->m_X - carToRas->m_RasLlX) + (carToRas->m_RasWidth * (uint)(carToRas->m_PixPerImageUnitH * point->m_Y - carToRas->m_RasLlY));\n"
@ -225,29 +225,27 @@ static const char* CarToRasFunctionString =
 static string AtomicString()
 {
 	ostringstream os;
-
 	os <<
-		"void AtomicAdd(volatile __global real_bucket_t* source, const real_bucket_t operand)\n"
-		"{\n"
-		"	union\n"
-		"	{\n"
-		"		atomi intVal;\n"
-		"		real_bucket_t realVal;\n"
-		"	} newVal;\n"
-		"\n"
-		"	union\n"
-		"	{\n"
-		"		atomi intVal;\n"
-		"		real_bucket_t realVal;\n"
-		"	} prevVal;\n"
-		"\n"
-		"	do\n"
-		"	{\n"
-		"		prevVal.realVal = *source;\n"
-		"		newVal.realVal = prevVal.realVal + operand;\n"
-		"	} while (atomic_cmpxchg((volatile __global atomi*)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);\n"
-		"}\n";
-
+	   "void AtomicAdd(volatile __global real_bucket_t* source, const real_bucket_t operand)\n"
+	   "{\n"
+	   "	union\n"
+	   "	{\n"
+	   "		atomi intVal;\n"
+	   "		real_bucket_t realVal;\n"
+	   "	} newVal;\n"
+	   "\n"
+	   "	union\n"
+	   "	{\n"
+	   "		atomi intVal;\n"
+	   "		real_bucket_t realVal;\n"
+	   "	} prevVal;\n"
+	   "\n"
+	   "	do\n"
+	   "	{\n"
+	   "		prevVal.realVal = *source;\n"
+	   "		newVal.realVal = prevVal.realVal + operand;\n"
+	   "	} while (atomic_cmpxchg((volatile __global atomi*)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);\n"
+	   "}\n";
 	return os.str();
 }
 }
--- a/Source/EmberCL/EmberCLStructs.h
+++ b/Source/EmberCL/EmberCLStructs.h
@ -70,16 +70,21 @@ static string ConstantDefinesString(bool doublePrecision)
 	   "#define THREADS_PER_WARP 32u\n"
 	   "#define NWARPS (NTHREADS / THREADS_PER_WARP)\n"
 	   "#define COLORMAP_LENGTH 256u\n"
-	   "#define COLORMAP_LENGTH_MINUS_1 255u\n"
+	   "#define COLORMAP_LENGTH_MINUS_1 255\n"
 	   "#define DE_THRESH 100u\n"
 	   "#define BadVal(x) (((x) != (x)) || ((x) > 1e10) || ((x) < -1e10))\n"
 	   "#define SQR(x) ((x) * (x))\n"
 	   "#define CUBE(x) ((x) * (x) * (x))\n"
-	   "#define M_2PI (M_PI * 2)\n"
-	   "#define M_3PI (M_PI * 3)\n"
+	   "#define MPI ((real_t)M_PI)\n"
+	   "#define MPI2 ((real_t)M_PI_2)\n"
+	   "#define MPI4 ((real_t)M_PI_4)\n"
+	   "#define M1PI ((real_t)M_1_PI)\n"
+	   "#define M2PI ((real_t)M_2_PI)\n"
+	   "#define M_2PI (MPI * 2)\n"
+	   "#define M_3PI (MPI * 3)\n"
 	   "#define SQRT5 2.2360679774997896964091736687313\n"
 	   "#define M_PHI 1.61803398874989484820458683436563\n"
-	   "#define DEG_2_RAD (M_PI / 180)\n"
+	   "#define DEG_2_RAD (MPI / 180)\n"
 	   "\n"
 	   "//Index in each dimension of a thread within a block.\n"
 	   "#define THREAD_ID_X   (get_local_id(0))\n"
--- a/Source/EmberCL/FinalAccumOpenCLKernelCreator.cpp
+++ b/Source/EmberCL/FinalAccumOpenCLKernelCreator.cpp
@ -83,8 +83,8 @@ const string& FinalAccumOpenCLKernelCreator::FinalAccumEntryPoint(bool earlyClip

 	if (alphaAccum)
 	{
-		alphaBase = transparency ? 0 : 255;//See the table below.
-		alphaScale = transparency ? 255 : 0;
+		alphaBase = transparency ? 0 : 1;//See the table below.
+		alphaScale = transparency ? 1 : 0;
 	}

 	if (earlyClip)
@ -260,9 +260,9 @@ string FinalAccumOpenCLKernelCreator::CreateFinalAccumKernelString(bool earlyCli
 		if (alphaAccum)
 		{
 			if (alphaCalc)
-				os << "	finalColor.m_Float4.w = (float)newBucket.m_Real4.w * 255.0f;\n";
+				os << "	finalColor.m_Float4.w = (float)newBucket.m_Real4.w;\n";
 			else
-				os << "	finalColor.m_Float4.w = 255.0f;\n";
+				os << "	finalColor.m_Float4.w = 1.0f;\n";
 		}
 	}
 	else
@ -296,7 +296,6 @@ string FinalAccumOpenCLKernelCreator::CreateFinalAccumKernelString(bool earlyCli
 	   "		CurveAdjust(csa, &(finalColor.m_Floats[2]), 3);\n"
 	   "	}\n"
 	   "\n"
-	   "	finalColor.m_Float4 /= 255.0f;\n"
 	   "	write_imagef(pixels, finalCoord, finalColor.m_Float4);\n"//Use write_imagef instead of write_imageui because only the former works when sharing with an OpenGL texture.
 	   "	barrier(CLK_GLOBAL_MEM_FENCE);\n"//Required, or else page tearing will occur during interactive rendering.
 	   "}\n"
@ -335,7 +334,7 @@ string FinalAccumOpenCLKernelCreator::CreateGammaCorrectionFunctionString(bool g
 			<< "	{\n"
 			<< "		tmp = bucket->m_Reals[3];\n"
 			<< "		alpha = CalcAlpha(tmp, g, linRange);\n"
-			<< "		ls = vibrancy * 256.0 * alpha / tmp;\n"
+			<< "		ls = vibrancy * alpha / tmp;\n"
 			<< "		alpha = clamp(alpha, (real_bucket_t)0.0, (real_bucket_t)1.0);\n"
 			<< "	}\n"
 			<< "\n"
@ -343,7 +342,7 @@ string FinalAccumOpenCLKernelCreator::CreateGammaCorrectionFunctionString(bool g
 			<< "\n"
 			<< "	for (uint rgbi = 0; rgbi < 3; rgbi++)\n"
 			<< "	{\n"
-			<< "		a = newRgb.m_Reals[rgbi] + ((1.0 - vibrancy) * 256.0 * pow(fabs(bucket->m_Reals[rgbi]), g));\n"
+			<< "		a = newRgb.m_Reals[rgbi] + ((1.0 - vibrancy) * pow(fabs(bucket->m_Reals[rgbi]), g));\n"
 			<< "\n";

 	if (!alphaCalc)
@ -362,7 +361,7 @@ string FinalAccumOpenCLKernelCreator::CreateGammaCorrectionFunctionString(bool g

 	os <<
 	   "\n"
-	   "			correctedChannels[rgbi] = (" << dataType << ")clamp(a, (real_bucket_t)0.0, (real_bucket_t)255.0);\n"
+	   "			correctedChannels[rgbi] = (" << dataType << ")clamp(a, (real_bucket_t)0.0, (real_bucket_t)1.0);\n"
 	   "		}\n"
 	   "\n";

@ -399,9 +398,9 @@ string FinalAccumOpenCLKernelCreator::CreateCalcNewRgbFunctionString(bool global
 	   "static void CalcNewRgb(" << (globalBucket ? "__global " : "") << "real4reals_bucket* oldRgb, real_bucket_t ls, real_bucket_t highPow, real4reals_bucket* newRgb)\n"
 	   "{\n"
 	   "	int rgbi;\n"
-	   "	real_bucket_t newls, lsratio;\n"
+	   "	real_bucket_t lsratio;\n"
 	   "	real4reals_bucket newHsv;\n"
-	   "	real_bucket_t maxa, maxc;\n"
+	   "	real_bucket_t maxa, maxc, newls;\n"
 	   "	real_bucket_t adjhlp;\n"
 	   "\n"
 	   "	if (ls == 0 || (oldRgb->m_Real4.x == 0 && oldRgb->m_Real4.y == 0 && oldRgb->m_Real4.z == 0))\n"//Can't do a vector compare to zero.
@ -413,35 +412,31 @@ string FinalAccumOpenCLKernelCreator::CreateCalcNewRgbFunctionString(bool global
 	   //Identify the most saturated channel.
 	   "	maxc = max(max(oldRgb->m_Reals[0], oldRgb->m_Reals[1]), oldRgb->m_Reals[2]);\n"
 	   "	maxa = ls * maxc;\n"
+	   "	newls = 1 / maxc;\n"
 	   "\n"
 	   //If a channel is saturated and highlight power is non-negative
 	   //modify the color to prevent hue shift.
-	   "	if (maxa > 255 && highPow >= 0)\n"
+	   "	if (maxa > 1 && highPow >= 0)\n"
 	   "	{\n"
-	   "		newls = 255.0 / maxc;\n"
 	   "		lsratio = pow(newls / ls, highPow);\n"
 	   "\n"
 	   //Calculate the max-value color (ranged 0 - 1).
 	   "		for (rgbi = 0; rgbi < 3; rgbi++)\n"
-	   "			newRgb->m_Reals[rgbi] = newls * oldRgb->m_Reals[rgbi] / 255.0;\n"
+	   "			newRgb->m_Reals[rgbi] = newls * oldRgb->m_Reals[rgbi];\n"
 	   "\n"
 	   //Reduce saturation by the lsratio.
 	   "		RgbToHsv(&(newRgb->m_Real4), &(newHsv.m_Real4));\n"
 	   "		newHsv.m_Real4.y *= lsratio;\n"
 	   "		HsvToRgb(&(newHsv.m_Real4), &(newRgb->m_Real4));\n"
-	   "\n"
-	   "		for (rgbi = 0; rgbi < 3; rgbi++)\n"//Unrolling and vectorizing makes no difference.
-	   "			newRgb->m_Reals[rgbi] *= 255.0;\n"
 	   "	}\n"
 	   "	else\n"
 	   "	{\n"
-	   "		newls = 255.0 / maxc;\n"
 	   "		adjhlp = -highPow;\n"
 	   "\n"
 	   "		if (adjhlp > 1)\n"
 	   "			adjhlp = 1;\n"
 	   "\n"
-	   "		if (maxa <= 255)\n"
+	   "		if (maxa <= 1)\n"
 	   "			adjhlp = 1;\n"
 	   "\n"
 	   //Calculate the max-value color (ranged 0 - 1) interpolated with the old behavior.
--- a/Source/EmberCL/RendererCL.cpp
+++ b/Source/EmberCL/RendererCL.cpp
@ -669,7 +669,7 @@ bool RendererCL<T, bucketT>::Alloc(bool histOnly)

 	if (b && !(b = wrapper.AddBuffer(m_SpatialFilterParamsBufferName, sizeof(m_SpatialFilterCL))))	 { AddToReport(loc); }

-	if (b && !(b = wrapper.AddBuffer(m_CurvesCsaName, SizeOf(m_Csa.m_Entries))))					 { AddToReport(loc); }
+	if (b && !(b = wrapper.AddBuffer(m_CurvesCsaName, SizeOf(m_Csa))))					             { AddToReport(loc); }

 	if (b && !(b = wrapper.AddBuffer(m_AccumBufferName, size)))										 { AddToReport(loc); }//Accum buffer.

@ -877,6 +877,17 @@ EmberStats RendererCL<T, bucketT>::Iterate(size_t iterCount, size_t temporalSamp
 	return stats;
 }

+/// <summary>
+/// Override which just passes false to the base.
+/// This is because curves are scaled from 0-1 to 0-255 or 0-65535 on the CPU, but need to be kept as 0-1 for OpenCL because the texture expects normalized values.
+/// </summary>
+/// <param name="scale">Ignored</param>
+template <typename T, typename bucketT>
+void RendererCL<T, bucketT>::ComputeCurves(bool scale)
+{
+	Renderer<T, bucketT>::ComputeCurves(false);
+}
+
 /// <summary>
 /// Private functions for making and running OpenCL programs.
 /// </summary>
@ -1312,7 +1323,7 @@ eRenderStatus RendererCL<T, bucketT>::RunFinalAccum()

 		if (b && !(b = wrapper.AddAndWriteBuffer(m_SpatialFilterParamsBufferName, reinterpret_cast<void*>(&m_SpatialFilterCL), sizeof(m_SpatialFilterCL)))) { AddToReport(loc); }

-		if (b && !(b = wrapper.AddAndWriteBuffer(m_CurvesCsaName,				  m_Csa.m_Entries.data(),					   SizeOf(m_Csa.m_Entries))))   { AddToReport(loc); }
+		if (b && !(b = wrapper.AddAndWriteBuffer(m_CurvesCsaName,				  m_Csa.data(),					   SizeOf(m_Csa))))   { AddToReport(loc); }

 		//Since early clip requires gamma correcting the entire accumulator first,
 		//it can't be done inside of the normal final accumulation kernel, so
--- a/Source/EmberCL/RendererCL.h
+++ b/Source/EmberCL/RendererCL.h
@ -168,6 +168,7 @@ protected:
 	virtual eRenderStatus GaussianDensityFilter() override;
 	virtual eRenderStatus AccumulatorToFinalImage(byte* pixels, size_t finalOffset) override;
 	virtual EmberStats Iterate(size_t iterCount, size_t temporalSample) override;
+	virtual void ComputeCurves(bool scale) override;

 #ifndef TEST_CL
 private: