mirror of
https://bitbucket.org/mfeemster/fractorium.git
synced 2025-07-02 06:16:17 -04:00
--Code changes
-Remove all dependencies on Intel Threading Building Blocks.
-Write our own version of parallel_for().
This commit is contained in:
@ -12,9 +12,29 @@
|
||||
/// <summary>
|
||||
/// Ember class.
|
||||
/// </summary>
|
||||
|
||||
namespace EmberNs
|
||||
{
|
||||
static void parallel_for(size_t start, size_t end, size_t parlevel, std::function<void(size_t)> func)
|
||||
{
|
||||
const auto ct = parlevel == 0 ? EmberNs::Timing::ProcessorCount() : parlevel;
|
||||
std::vector<std::thread> threads(ct);
|
||||
const auto chunkSize = (end - start) / ct;
|
||||
|
||||
for (size_t i = 0; i < ct; i++)
|
||||
{
|
||||
threads.push_back(std::thread([&, i]
|
||||
{
|
||||
const auto chunkStart = chunkSize* i;
|
||||
const auto chunkEnd = std::min(chunkStart + chunkSize, end);
|
||||
|
||||
for (size_t j = chunkStart; j < chunkEnd; j++)
|
||||
func(j);
|
||||
}));
|
||||
}
|
||||
|
||||
EmberNs::Join(threads);
|
||||
}
|
||||
|
||||
template <typename T> class Interpolater;
|
||||
|
||||
/// <summary>
|
||||
|
@ -66,12 +66,6 @@
|
||||
#include "libxml2/libxml/parser.h"
|
||||
#endif
|
||||
|
||||
#if !defined(Q_MOC_RUN)
|
||||
//Intel's Threading Building Blocks is what's used for all threading.
|
||||
#include <tbb/parallel_for.h>
|
||||
#include <tbb/task_scheduler_init.h>
|
||||
#endif
|
||||
|
||||
#define GLM_FORCE_RADIANS 1
|
||||
#define GLM_ENABLE_EXPERIMENTAL 1
|
||||
|
||||
@ -88,7 +82,6 @@
|
||||
#include <glm/gtc/type_ptr.hpp>
|
||||
#include <glm/gtx/string_cast.hpp>
|
||||
|
||||
using namespace tbb;
|
||||
using namespace std;
|
||||
using namespace std::chrono;
|
||||
using namespace glm;
|
||||
|
@ -406,9 +406,9 @@ public:
|
||||
{
|
||||
for (size_t j = 0; j < width; j++)
|
||||
{
|
||||
v[(width * 3 * i) + (j * 3)] = static_cast<byte>(m_Entries[j][0] * T(255));//Palettes are as [0..1], so convert to [0..255] here since it's for GUI display.
|
||||
v[(width * 3 * i) + (j * 3) + 1] = static_cast<byte>(m_Entries[j][1] * T(255));
|
||||
v[(width * 3 * i) + (j * 3) + 2] = static_cast<byte>(m_Entries[j][2] * T(255));
|
||||
v[(width * 3 * i) + (j * 3)] = static_cast<byte>(m_Entries[j][0] * static_cast<T>(255));//Palettes are as [0..1], so convert to [0..255] here since it's for GUI display.
|
||||
v[(width * 3 * i) + (j * 3) + 1] = static_cast<byte>(m_Entries[j][1] * static_cast<T>(255));
|
||||
v[(width * 3 * i) + (j * 3) + 2] = static_cast<byte>(m_Entries[j][2] * static_cast<T>(255));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -871,19 +871,12 @@ bool Renderer<T, bucketT>::Alloc(bool histOnly)
|
||||
template <typename T, typename bucketT>
|
||||
bool Renderer<T, bucketT>::ResetBuckets(bool resetHist, bool resetAccum)
|
||||
{
|
||||
//parallel_invoke(
|
||||
//[&]
|
||||
//{
|
||||
if (resetHist && !m_HistBuckets.empty())
|
||||
Memset(m_HistBuckets);
|
||||
|
||||
//},
|
||||
//[&]
|
||||
//{
|
||||
if (resetAccum && !m_AccumulatorBuckets.empty())
|
||||
Memset(m_AccumulatorBuckets);
|
||||
|
||||
//});
|
||||
return resetHist || resetAccum;
|
||||
}
|
||||
|
||||
@ -930,7 +923,7 @@ eRenderStatus Renderer<T, bucketT>::LogScaleDensityFilter(bool forceOutput)
|
||||
//Timing t(4);
|
||||
//Original didn't parallelize this, doing so gives a 50-75% speedup.
|
||||
//The value can be directly assigned, which is quicker than summing.
|
||||
parallel_for(startRow, endRow, static_cast<size_t>(1), [&](size_t j)
|
||||
parallel_for(startRow, endRow, m_ThreadsToUse, [&](size_t j)
|
||||
{
|
||||
size_t row = j * m_SuperRasW;
|
||||
size_t rowEnd = row + endCol;
|
||||
@ -954,11 +947,7 @@ eRenderStatus Renderer<T, bucketT>::LogScaleDensityFilter(bool forceOutput)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#if defined(_WIN32) || defined(__APPLE__)
|
||||
, tbb::static_partitioner()
|
||||
#endif
|
||||
);
|
||||
});
|
||||
|
||||
if (m_Callback && !m_Abort)
|
||||
if (!m_Callback->ProgressFunc(m_Ember, m_ProgressParameter, 100.0, 1, 0))
|
||||
@ -982,14 +971,13 @@ eRenderStatus Renderer<T, bucketT>::GaussianDensityFilter()
|
||||
bool scf = !(Supersample() & 1);
|
||||
intmax_t ss = Floor<T>(Supersample() / static_cast<T>(2));
|
||||
T scfact = std::pow(Supersample() / (Supersample() + static_cast<T>(1)), static_cast<T>(2));
|
||||
size_t threads = m_ThreadsToUse;
|
||||
size_t startRow = Supersample() - 1;
|
||||
size_t endRow = m_SuperRasH - (Supersample() - 1);//Original did + which is most likely wrong.
|
||||
intmax_t startCol = Supersample() - 1;
|
||||
intmax_t endCol = m_SuperRasW - (Supersample() - 1);
|
||||
size_t chunkSize = static_cast<size_t>(std::ceil(static_cast<double>(endRow - startRow) / static_cast<double>(threads)));
|
||||
size_t chunkSize = static_cast<size_t>(std::ceil(static_cast<double>(endRow - startRow) / static_cast<double>(m_ThreadsToUse)));
|
||||
//parallel_for scales very well, dividing the work almost perfectly among all processors.
|
||||
parallel_for(static_cast<size_t>(0), threads, static_cast<size_t>(1), [&] (size_t threadIndex)
|
||||
parallel_for(static_cast<size_t>(0), m_ThreadsToUse, m_ThreadsToUse, [&] (size_t threadIndex)
|
||||
{
|
||||
size_t pixelNumber = 0;
|
||||
const auto localStartRow = static_cast<intmax_t>(std::min<size_t>(startRow + (threadIndex * chunkSize), endRow - 1));
|
||||
@ -1123,11 +1111,7 @@ eRenderStatus Renderer<T, bucketT>::GaussianDensityFilter()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#if defined(_WIN32) || defined(__APPLE__)
|
||||
, tbb::static_partitioner()
|
||||
#endif
|
||||
);
|
||||
});
|
||||
|
||||
if (m_Callback && !m_Abort)
|
||||
m_Callback->ProgressFunc(m_Ember, m_ProgressParameter, 100.0, 1, 0);
|
||||
@ -1166,7 +1150,7 @@ eRenderStatus Renderer<T, bucketT>::AccumulatorToFinalImage(vector<v4F>& pixels,
|
||||
//The original does it this way as well and it's roughly 11 times faster to do it this way than inline below with each pixel.
|
||||
if (EarlyClip())
|
||||
{
|
||||
parallel_for(static_cast<size_t>(0), m_SuperRasH, static_cast<size_t>(1), [&](size_t j)
|
||||
parallel_for(static_cast<size_t>(0), m_SuperRasH, m_ThreadsToUse, [&](size_t j)
|
||||
{
|
||||
auto rowStart = m_AccumulatorBuckets.data() + (j * m_SuperRasW);//Pull out of inner loop for optimization.
|
||||
const auto rowEnd = rowStart + m_SuperRasW;
|
||||
@ -1176,11 +1160,7 @@ eRenderStatus Renderer<T, bucketT>::AccumulatorToFinalImage(vector<v4F>& pixels,
|
||||
GammaCorrection(*rowStart, background, g, linRange, vibrancy, false, glm::value_ptr(*rowStart));//Write back in place.
|
||||
rowStart++;
|
||||
}
|
||||
}
|
||||
#if defined(_WIN32) || defined(__APPLE__)
|
||||
, tbb::static_partitioner()
|
||||
#endif
|
||||
);
|
||||
});
|
||||
}
|
||||
|
||||
if (m_Abort)
|
||||
@ -1193,7 +1173,7 @@ eRenderStatus Renderer<T, bucketT>::AccumulatorToFinalImage(vector<v4F>& pixels,
|
||||
//otherwise artifacts that resemble page tearing will occur in an interactive run. It's
|
||||
//critical to never exit this loop prematurely.
|
||||
//for (size_t j = 0; j < FinalRasH(); j++)//Keep around for debugging.
|
||||
parallel_for(static_cast<size_t>(0), FinalRasH(), static_cast<size_t>(1), [&](size_t j)
|
||||
parallel_for(static_cast<size_t>(0), FinalRasH(), m_ThreadsToUse, [&](size_t j)
|
||||
{
|
||||
Color<bucketT> newBucket;
|
||||
size_t pixelsRowStart = (m_YAxisUp ? ((FinalRasH() - j) - 1) : j) * FinalRasW();//Pull out of inner loop for optimization.
|
||||
@ -1226,11 +1206,7 @@ eRenderStatus Renderer<T, bucketT>::AccumulatorToFinalImage(vector<v4F>& pixels,
|
||||
auto pf = reinterpret_cast<float*>(pv4T);
|
||||
GammaCorrection(*(reinterpret_cast<tvec4<bucketT, glm::defaultp>*>(&newBucket)), background, g, linRange, vibrancy, true, pf);
|
||||
}
|
||||
}
|
||||
#if defined(_WIN32) || defined(__APPLE__)
|
||||
, tbb::static_partitioner()
|
||||
#endif
|
||||
);
|
||||
});
|
||||
|
||||
//Insert the palette into the image for debugging purposes. Not implemented on the GPU.
|
||||
if (m_InsertPalette)
|
||||
@ -1288,7 +1264,7 @@ EmberStats Renderer<T, bucketT>::Iterate(size_t iterCount, size_t temporalSample
|
||||
m_ThreadEmbers.insert(m_ThreadEmbers.begin(), m_ThreadsToUse, m_Ember);
|
||||
}
|
||||
|
||||
parallel_for(static_cast<size_t>(0), m_ThreadsToUse, static_cast<size_t>(1), [&] (size_t threadIndex)
|
||||
parallel_for(static_cast<size_t>(0), m_ThreadsToUse, m_ThreadsToUse, [&] (size_t threadIndex)
|
||||
{
|
||||
#if defined(_WIN32)
|
||||
SetThreadPriority(GetCurrentThread(), static_cast<int>(m_Priority));
|
||||
@ -1375,11 +1351,7 @@ EmberStats Renderer<T, bucketT>::Iterate(size_t iterCount, size_t temporalSample
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#if defined(_WIN32) || defined(__APPLE__)
|
||||
, tbb::static_partitioner()
|
||||
#endif
|
||||
);
|
||||
});
|
||||
stats.m_Iters = std::accumulate(m_SubBatch.begin(), m_SubBatch.end(), 0ULL);//Sum of iter count of all threads.
|
||||
stats.m_Badvals = std::accumulate(m_BadVals.begin(), m_BadVals.end(), 0ULL);
|
||||
stats.m_IterMs = m_IterTimer.Toc();
|
||||
|
Reference in New Issue
Block a user