--User changes

-Remove some warnings about interpolation type on first and last flames.

--Code changes
-Make DE block size always be 16x16; this should help stability on some Nvidia cards. No changes for AMD cards since they were that size already.
-Since the block size is now so small, do not reduce it further when supersampling.
-Clean up some variable names and documentation around OpenCL DE to be clearer.
2016-03-12 19:25:19 -08:00
parent 65be0143ff
commit 19cb27b83a
6 changed files with 530 additions and 553 deletions
@@ -437,16 +437,12 @@ public:
 		{
 			if (i1 == 0)
 			{
 				//fprintf(stderr, "error: cannot use smooth interpolation on first segment.\n");
 				//fprintf(stderr, "reverting to linear interpolation.\n");
 				Align(&embers[i1], &localEmbers[0], 2);
 				smoothFlag = false;
 			}
 			if (i2 == size - 1)
 			{
 				//fprintf(stderr, "error: cannot use smooth interpolation on last segment.\n");
 				//fprintf(stderr, "reverting to linear interpolation.\n");
 				Align(&embers[i1], &localEmbers[0], 2);
 				smoothFlag = false;
 			}
@@ -344,16 +344,10 @@ public:
 		if (emberSize > 0)
 		{
 			if (embers[0].m_Interp == eInterp::EMBER_INTERP_SMOOTH)
 			{
 				cout << "Warning: smooth interpolation cannot be used for first segment.\n         switching to linear.\n";
 				embers[0].m_Interp = eInterp::EMBER_INTERP_LINEAR;
 			}
 			if (emberSize >= 2 && embers[emberSize - 2].m_Interp == eInterp::EMBER_INTERP_SMOOTH)
 			{
 				cout << "Warning: smooth interpolation cannot be used for last segment.\n         switching to linear.\n";
 				embers[emberSize - 2].m_Interp = eInterp::EMBER_INTERP_LINEAR;
 			}
 		}
 		//Finally, ensure that consecutive 'rotate' parameters never exceed
@@ -940,12 +940,20 @@ size_t OpenCLWrapper::GlobalMemSize() const { return m_GlobalMemSize; }
 size_t OpenCLWrapper::MaxAllocSize() const { return m_MaxAllocSize; }
 /// <summary>
-/// Makes the even grid dims.
+/// Make even grid dimensions.
 /// The size of the blocks in terms of threads must divide evenly into the total number of threads in the grid.
 /// In the case of a remainder, expand the width and height of the grid to the next highest evenly divisible value.
 /// Ex:
 ///		blockW = 5, blockH = 5
 ///		gridW = 18, gridH = 27
 ///
 /// To make these even:
 ///		gridW = 20, gridH = 30
 /// </summary>
-/// <param name="blockW">The block w.</param>
+/// <param name="blockW">The width of each block in terms of threads.</param>
-/// <param name="blockH">The block h.</param>
+/// <param name="blockH">The height of each block in terms of threads.</param>
-/// <param name="gridW">The grid w.</param>
+/// <param name="gridW">The width of the entire grid in terms of threads.</param>
-/// <param name="gridH">The grid h.</param>
+/// <param name="gridH">The height of the entire grid in terms of threads.</param>
 void OpenCLWrapper::MakeEvenGridDims(size_t blockW, size_t blockH, size_t& gridW, size_t& gridH)
 {
 	if (gridW % blockW != 0)
@@ -165,13 +165,14 @@ bool RendererCL<T, bucketT>::Init(const vector<pair<size_t, size_t>>& devices, b
 		if (b)
 		{
-			//This is the maximum box dimension for density filtering which consists of (blockSize  * blockSize) + (2 * filterWidth).
+			//This is the maximum box dimension for density filtering which consists of (blockSize * blockSize) + (2 * filterWidth).
-			//These blocks must be square, and ideally, 32x32.
+			//These blocks should be square, and ideally, 32x32.
-			//Sadly, at the moment, Fermi runs out of resources at that block size because the DE filter function is so complex.
+			//Sadly, at the moment, the GPU runs out of resources at that block size because the DE filter function is so complex.
 			//The next best block size seems to be 24x24.
 			//AMD is further limited because of less local memory so these have to be 16 on AMD.
-			m_MaxDEBlockSizeW = m_Devices[0]->Nvidia() ? 24 : 16;//These *must* both be divisible by 8 or else pixels will go missing.
+			//Users have reported crashes on Nvidia cards even at size 24, so just to be safe, make them both 16 for all manufacturers.
-			m_MaxDEBlockSizeH = m_Devices[0]->Nvidia() ? 24 : 16;
+			m_MaxDEBlockSizeW = 16;
 			m_MaxDEBlockSizeH = 16;
 			FillSeeds();
 			for (size_t device = 0; device < m_Devices.size(); device++)
@@ -1191,22 +1192,18 @@ eRenderStatus RendererCL<T, bucketT>::RunDensityFilter()
 	if (kernelIndex != -1)
 	{
-		uint leftBound  = m_DensityFilterCL.m_Supersample - 1;
+		uint ssm1		  = m_DensityFilterCL.m_Supersample - 1;
-		uint rightBound = m_DensityFilterCL.m_SuperRasW - (m_DensityFilterCL.m_Supersample - 1);
+		uint leftBound    = ssm1;
-		uint topBound   = leftBound;
+		uint rightBound   = m_DensityFilterCL.m_SuperRasW - ssm1;
-		uint botBound   = m_DensityFilterCL.m_SuperRasH - (m_DensityFilterCL.m_Supersample - 1);
+		uint topBound     = leftBound;
 		uint botBound     = m_DensityFilterCL.m_SuperRasH - ssm1;
 		size_t gridW      = rightBound - leftBound;
 		size_t gridH      = botBound - topBound;
-		size_t blockSizeW = m_MaxDEBlockSizeW;//These *must* both be divisible by 16 or else pixels will go missing.
+		size_t blockSizeW = m_MaxDEBlockSizeW;
 		size_t blockSizeH = m_MaxDEBlockSizeH;
-		auto& wrapper = m_Devices[0]->m_Wrapper;
+		double fw2        = m_DensityFilterCL.m_FilterWidth * 2.0;
-
+		auto& wrapper     = m_Devices[0]->m_Wrapper;
-		//OpenCL runs out of resources when using double or a supersample of 2.
+		//Can't just blindly pass dimension in vals. Must adjust them first to evenly divide the thread count
 		//Remedy this by reducing the height of the block by 2.
 		if (m_DoublePrecision || m_DensityFilterCL.m_Supersample > 1)
 			blockSizeH -= 2;
 		//Can't just blindly pass dimension in vals. Must adjust them first to evenly divide the block count
 		//into the total grid dimensions.
 		OpenCLWrapper::MakeEvenGridDims(blockSizeW, blockSizeH, gridW, gridH);
 		//t.Tic();
@@ -1215,11 +1212,11 @@ eRenderStatus RendererCL<T, bucketT>::RunDensityFilter()
 		//The other is to process the entire image in multiple passes, and each pass processes blocks of pixels
 		//that are far enough apart such that their filters do not overlap.
 		//Do the latter.
-		//Gap is in terms of blocks. How many blocks must separate two blocks running at the same time.
+		//Gap is in terms of blocks and specifies how many blocks must separate two blocks running at the same time.
-		uint gapW = uint(ceil((m_DensityFilterCL.m_FilterWidth * 2.0) / double(blockSizeW)));
+		uint gapW = uint(ceil(fw2 / blockSizeW));
-		uint chunkSizeW = gapW + 1;
+		uint chunkSizeW = gapW + 1;//Chunk size is also in terms of blocks and is one block (the one running) plus the gap to the right of it.
-		uint gapH = uint(ceil((m_DensityFilterCL.m_FilterWidth * 2.0) / double(blockSizeH)));
+		uint gapH = uint(ceil(fw2 / blockSizeH));
-		uint chunkSizeH = gapH + 1;
+		uint chunkSizeH = gapH + 1;//Chunk size is also in terms of blocks and is one block (the one running) plus the gap below it.
 		double totalChunks = chunkSizeW * chunkSizeH;
 		if (b && !(b = wrapper.AddAndWriteBuffer(m_DEFilterParamsBufferName, reinterpret_cast<void*>(&m_DensityFilterCL), sizeof(m_DensityFilterCL)))) { AddToReport(loc); }
@@ -1257,22 +1254,22 @@ eRenderStatus RendererCL<T, bucketT>::RunDensityFilter()
 		}
 #else
-		gridW /= chunkSizeW;
+		gridW /= chunkSizeW;//Grid must be scaled down by number of chunks.
 		gridH /= chunkSizeH;
 		OpenCLWrapper::MakeEvenGridDims(blockSizeW, blockSizeH, gridW, gridH);
-		for (uint rowChunk = 0; b && !m_Abort && rowChunk < chunkSizeH; rowChunk++)
+		for (uint rowChunkPass = 0; b && !m_Abort && rowChunkPass < chunkSizeH; rowChunkPass++)//Number of vertical passes.
 		{
-			for (uint colChunk = 0; b && !m_Abort && colChunk < chunkSizeW; colChunk++)
+			for (uint colChunkPass = 0; b && !m_Abort && colChunkPass < chunkSizeW; colChunkPass++)//Number of horizontal passes.
 			{
 				//t2.Tic();
-				if (b && !(b = RunDensityFilterPrivate(kernelIndex, gridW, gridH, blockSizeW, blockSizeH, chunkSizeW, chunkSizeH, colChunk, rowChunk))) { m_Abort = true; AddToReport(loc); }
+				if (b && !(b = RunDensityFilterPrivate(kernelIndex, gridW, gridH, blockSizeW, blockSizeH, chunkSizeW, chunkSizeH, colChunkPass, rowChunkPass))) { m_Abort = true; AddToReport(loc); }
 				//t2.Toc(loc);
 				if (b && m_Callback)
 				{
-					double percent = (double((rowChunk * chunkSizeW) + (colChunk + 1)) / totalChunks) * 100.0;
+					double percent = (double((rowChunkPass * chunkSizeW) + (colChunkPass + 1)) / totalChunks) * 100.0;
 					double etaMs = ((100.0 - percent) / percent) * t.Toc();
 					if (!m_Callback->ProgressFunc(m_Ember, m_ProgressParameter, percent, 1, etaMs))
@@ -1456,11 +1453,11 @@ bool RendererCL<T, bucketT>::ClearBuffer(size_t device, const string& bufferName
 /// <param name="blockH">Block height</param>
 /// <param name="chunkSizeW">Chunk size width (gapW + 1)</param>
 /// <param name="chunkSizeH">Chunk size height (gapH + 1)</param>
-/// <param name="rowParity">Row parity</param>
+/// <param name="colChunkPass">The current horizontal pass index</param>
-/// <param name="colParity">Column parity</param>
+/// <param name="rowChunkPass">The current vertical pass index</param>
 /// <returns>True if success, else false.</returns>
 template <typename T, typename bucketT>
-bool RendererCL<T, bucketT>::RunDensityFilterPrivate(size_t kernelIndex, size_t gridW, size_t gridH, size_t blockW, size_t blockH, uint chunkSizeW, uint chunkSizeH, uint chunkW, uint chunkH)
+bool RendererCL<T, bucketT>::RunDensityFilterPrivate(size_t kernelIndex, size_t gridW, size_t gridH, size_t blockW, size_t blockH, uint chunkSizeW, uint chunkSizeH, uint colChunkPass, uint rowChunkPass)
 {
 	//Timing t(4);
 	bool b = true;
@@ -1487,9 +1484,9 @@ bool RendererCL<T, bucketT>::RunDensityFilterPrivate(size_t kernelIndex, size_t
 		if (b && !(b = wrapper.SetArg(kernelIndex, argIndex, chunkSizeH)))						 { AddToReport(loc); } argIndex++;//Chunk size height (gapH + 1).
-		if (b && !(b = wrapper.SetArg(kernelIndex, argIndex, chunkW)))							 { AddToReport(loc); } argIndex++;//Column chunk.
+		if (b && !(b = wrapper.SetArg(kernelIndex, argIndex, colChunkPass)))					 { AddToReport(loc); } argIndex++;//Column chunk, horizontal pass.
-		if (b && !(b = wrapper.SetArg(kernelIndex, argIndex, chunkH)))							 { AddToReport(loc); } argIndex++;//Row chunk.
+		if (b && !(b = wrapper.SetArg(kernelIndex, argIndex, rowChunkPass)))					 { AddToReport(loc); } argIndex++;//Row chunk, vertical pass.
 		//t.Toc(__FUNCTION__ " set args");
@@ -178,7 +178,7 @@ private:
 	eRenderStatus RunDensityFilter();
 	eRenderStatus RunFinalAccum();
 	bool ClearBuffer(size_t device, const string& bufferName, uint width, uint height, uint elementSize);
-	bool RunDensityFilterPrivate(size_t kernelIndex, size_t gridW, size_t gridH, size_t blockW, size_t blockH, uint chunkSizeW, uint chunkSizeH, uint chunkW, uint chunkH);
+	bool RunDensityFilterPrivate(size_t kernelIndex, size_t gridW, size_t gridH, size_t blockW, size_t blockH, uint chunkSizeW, uint chunkSizeH, uint colChunkPass, uint rowChunkPass);
 	int MakeAndGetDensityFilterProgram(size_t ss, uint filterWidth);
 	int MakeAndGetFinalAccumProgram(double& alphaBase, double& alphaScale);
 	int MakeAndGetGammaCorrectionProgram();