diff --git a/Source/Ember/Interpolate.h b/Source/Ember/Interpolate.h index 7d56def..e89bbec 100644 --- a/Source/Ember/Interpolate.h +++ b/Source/Ember/Interpolate.h @@ -437,16 +437,12 @@ public: { if (i1 == 0) { - //fprintf(stderr, "error: cannot use smooth interpolation on first segment.\n"); - //fprintf(stderr, "reverting to linear interpolation.\n"); Align(&embers[i1], &localEmbers[0], 2); smoothFlag = false; } if (i2 == size - 1) { - //fprintf(stderr, "error: cannot use smooth interpolation on last segment.\n"); - //fprintf(stderr, "reverting to linear interpolation.\n"); Align(&embers[i1], &localEmbers[0], 2); smoothFlag = false; } diff --git a/Source/Ember/XmlToEmber.h b/Source/Ember/XmlToEmber.h index da69128..7043dfb 100644 --- a/Source/Ember/XmlToEmber.h +++ b/Source/Ember/XmlToEmber.h @@ -344,16 +344,10 @@ public: if (emberSize > 0) { if (embers[0].m_Interp == eInterp::EMBER_INTERP_SMOOTH) - { - cout << "Warning: smooth interpolation cannot be used for first segment.\n switching to linear.\n"; embers[0].m_Interp = eInterp::EMBER_INTERP_LINEAR; - } if (emberSize >= 2 && embers[emberSize - 2].m_Interp == eInterp::EMBER_INTERP_SMOOTH) - { - cout << "Warning: smooth interpolation cannot be used for last segment.\n switching to linear.\n"; embers[emberSize - 2].m_Interp = eInterp::EMBER_INTERP_LINEAR; - } } //Finally, ensure that consecutive 'rotate' parameters never exceed diff --git a/Source/EmberCL/DEOpenCLKernelCreator.cpp b/Source/EmberCL/DEOpenCLKernelCreator.cpp index f1a164c..2c2910a 100644 --- a/Source/EmberCL/DEOpenCLKernelCreator.cpp +++ b/Source/EmberCL/DEOpenCLKernelCreator.cpp @@ -6,8 +6,6 @@ namespace EmberCLns /// /// Constructor that sets all kernel entry points as well as composes /// all kernel source strings. -/// Note that no versions of kernels that use the cache are compiled because -/// the cache is not big enough to hold double4. /// No program compilation is done here, the user must explicitly do it. /// The caller must specify whether they are using an nVidia or AMD card because it changes /// the amount of local memory available. @@ -18,7 +16,6 @@ DEOpenCLKernelCreator::DEOpenCLKernelCreator(bool doublePrecision, bool nVidia) { m_DoublePrecision = doublePrecision; m_NVidia = nVidia; - #ifdef ROW_ONLY_DE m_LogScaleAssignDEEntryPoint = "LogScaleAssignDensityFilterKernel"; m_GaussianDEWithoutSsEntryPoint = "GaussianDEWithoutSsKernel"; @@ -49,7 +46,7 @@ DEOpenCLKernelCreator::DEOpenCLKernelCreator(bool doublePrecision, bool nVidia) m_GaussianDEWithoutSsNoCacheKernel = CreateGaussianDEKernelNoLocalCache(1); m_GaussianDESsWithScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache(2); m_GaussianDESsWithoutScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache(3); -#endif +#endif } /// @@ -68,30 +65,31 @@ const string& DEOpenCLKernelCreator::LogScaleAssignDEEntryPoint() const { return const string& DEOpenCLKernelCreator::GaussianDEKernel(size_t ss, uint filterWidth) const { #ifndef ROW_ONLY_DE + if (filterWidth > MaxDEFilterSize()) { if (ss > 1) { if (!(ss & 1)) - return m_GaussianDESsWithScfNoCacheKernel; + return m_GaussianDESsWithScfNoCacheKernel;//SS 2 or 4. else - return m_GaussianDESsWithoutScfNoCacheKernel; + return m_GaussianDESsWithoutScfNoCacheKernel;//SS 3. } else - return m_GaussianDEWithoutSsNoCacheKernel; + return m_GaussianDEWithoutSsNoCacheKernel;//SS 1; } - else + else//Use cache. #endif { if (ss > 1) { if (!(ss & 1)) - return m_GaussianDESsWithScfKernel; + return m_GaussianDESsWithScfKernel;//SS 2 or 4. else - return m_GaussianDESsWithoutScfKernel; + return m_GaussianDESsWithoutScfKernel;//SS 3. } else - return m_GaussianDEWithoutSsKernel; + return m_GaussianDEWithoutSsKernel;//SS 1; } } @@ -104,6 +102,7 @@ const string& DEOpenCLKernelCreator::GaussianDEKernel(size_t ss, uint filterWidt const string& DEOpenCLKernelCreator::GaussianDEEntryPoint(size_t ss, uint filterWidth) const { #ifndef ROW_ONLY_DE + if (filterWidth > MaxDEFilterSize()) { if (ss > 1) @@ -181,7 +180,6 @@ uint DEOpenCLKernelCreator::SolveMaxBoxSize(uint localMem) string DEOpenCLKernelCreator::CreateLogScaleAssignDEKernelString() { ostringstream os; - os << ConstantDefinesString(m_DoublePrecision) << DensityFilterCLStructString << @@ -205,7 +203,6 @@ string DEOpenCLKernelCreator::CreateLogScaleAssignDEKernelString() " barrier(CLK_GLOBAL_MEM_FENCE);\n"//Just to be safe. Makes no speed difference to do all of the time or only when there's a hit. " }\n" "}\n"; - return os.str(); } @@ -215,204 +212,200 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss) bool doSS = ss > 1; bool doScf = !(ss & 1); ostringstream os; - os << - ConstantDefinesString(m_DoublePrecision) << - DensityFilterCLStructString << - UnionCLStructString << - "__kernel void " << GaussianDEEntryPoint(ss, MaxDEFilterSize()) << "(\n" << - " const __global real4_bucket* histogram,\n" - " __global real4reals_bucket* accumulator,\n" - " __constant DensityFilterCL* densityFilter,\n" - " const __global real_bucket_t* filterCoefs,\n" - " const __global real_bucket_t* filterWidths,\n" - " const __global uint* coefIndices,\n" - " const uint chunkSizeW,\n" - " const uint chunkSizeH,\n" - " const uint chunkW,\n" - " const uint chunkH\n" - "\t)\n" - "{\n" - " uint rowsToProcess = 32;\n"//Rows to process. - "\n" - " if (((((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) ||\n" - " ((((BLOCK_ID_Y * chunkSizeH) + chunkH) * rowsToProcess) + THREAD_ID_Y >= densityFilter->m_SuperRasH))\n" - " return;\n" - "\n"; + ConstantDefinesString(m_DoublePrecision) << + DensityFilterCLStructString << + UnionCLStructString << + "__kernel void " << GaussianDEEntryPoint(ss, MaxDEFilterSize()) << "(\n" << + " const __global real4_bucket* histogram,\n" + " __global real4reals_bucket* accumulator,\n" + " __constant DensityFilterCL* densityFilter,\n" + " const __global real_bucket_t* filterCoefs,\n" + " const __global real_bucket_t* filterWidths,\n" + " const __global uint* coefIndices,\n" + " const uint chunkSizeW,\n" + " const uint chunkSizeH,\n" + " const uint colChunkPass,\n" + " const uint rowChunkPass\n" + "\t)\n" + "{\n" + " uint rowsToProcess = 32;\n"//Rows to process. + "\n" + " if (((((BLOCK_ID_X * chunkSizeW) + colChunkPass) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) ||\n" + " ((((BLOCK_ID_Y * chunkSizeH) + rowChunkPass) * rowsToProcess) + THREAD_ID_Y >= densityFilter->m_SuperRasH))\n" + " return;\n" + "\n"; if (doSS) { os << - " uint ss = (uint)floor((real_bucket_t)densityFilter->m_Supersample / 2.0);\n" - " int densityBoxLeftX;\n" - " int densityBoxRightX;\n" - " int densityBoxTopY;\n" - " int densityBoxBottomY;\n" - "\n"; + " uint ss = (uint)floor((real_bucket_t)densityFilter->m_Supersample / 2.0);\n" + " int densityBoxLeftX;\n" + " int densityBoxRightX;\n" + " int densityBoxTopY;\n" + " int densityBoxBottomY;\n" + "\n"; if (doScf) os << - " real_bucket_t scfact = pow(densityFilter->m_Supersample / (densityFilter->m_Supersample + (real_bucket_t)1.0), (real_bucket_t)2.0);\n"; + " real_bucket_t scfact = pow(densityFilter->m_Supersample / (densityFilter->m_Supersample + (real_bucket_t)1.0), (real_bucket_t)2.0);\n"; } os << - " uint fullTempBoxWidth;\n" - " uint leftBound, rightBound, topBound, botBound;\n" - " uint blockHistStartRow, blockHistEndRow, histCol;\n" - " uint blockHistStartCol, boxReadStartCol, boxReadEndCol;\n" - " uint accumWriteStartCol, colsToWrite, colOffset, colsToWriteOffset;\n" - " int histRow, filterRow, accumWriteOffset;\n" - "\n" - " fullTempBoxWidth = BLOCK_SIZE_X + (densityFilter->m_FilterWidth * 2);\n" - //Compute the bounds of the area to be sampled, which is just the ends minus the super sample minus 1. - " leftBound = densityFilter->m_Supersample - 1;\n" - " rightBound = densityFilter->m_SuperRasW - (densityFilter->m_Supersample - 1);\n" - " topBound = densityFilter->m_Supersample - 1;\n" - " botBound = densityFilter->m_SuperRasH - (densityFilter->m_Supersample - 1);\n" - "\n" - //Start and end values are the indices in the histogram read from - //and written to in the accumulator. They are not the indices for the local block of data. - //Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds. - " blockHistStartRow = min(botBound, topBound + (((BLOCK_ID_Y * chunkSizeH) + chunkH) * rowsToProcess));\n"//The first histogram row this block will process. - " blockHistEndRow = min(botBound, blockHistStartRow + rowsToProcess);\n"//The last histogram row this block will process, clamped to the last row. - " blockHistStartCol = min(rightBound, leftBound + (((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X));\n"//The first histogram column this block will process. - " boxReadStartCol = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first box col this block will read from when copying to the accumulator. - " boxReadEndCol = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + BLOCK_SIZE_X, densityFilter->m_SuperRasW - blockHistStartCol);\n"//The last box col this block will read from when copying to the accumulator. - "\n" - //Last, the indices in the global accumulator that the local bounds will be writing to. - " accumWriteStartCol = blockHistStartCol - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first column in the accumulator this block will write to. - " colsToWrite = ceil((real_bucket_t)(boxReadEndCol - boxReadStartCol) / (real_bucket_t)BLOCK_SIZE_X);\n"//Elements per thread to be written to the accumulator. - " histCol = blockHistStartCol + THREAD_ID_X;\n"//The histogram column this individual thread will be reading from. - "\n" - " if (histCol >= rightBound)\n" - " return;\n" - "\n" - //Compute the col position in this local box to serve as the center position - //from which filter application offsets are computed. - //These are the local indices for the local data that are temporarily accumulated to before - //writing out to the global accumulator. - " uint boxCol = densityFilter->m_FilterWidth + THREAD_ID_X;\n" - " uint colsToZeroOffset, colsToZero = ceil((real_bucket_t)fullTempBoxWidth / (real_bucket_t)(BLOCK_SIZE_X));\n"//Usually is 2. - " int i, j, k, jmin, jmax;\n" - " uint filterSelectInt, filterCoefIndex;\n" - " real_bucket_t cacheLog;\n" - " real_bucket_t filterSelect;\n" - " real4_bucket bucket;\n" - ; - + " uint fullTempBoxWidth;\n" + " uint leftBound, rightBound, topBound, botBound;\n" + " uint blockHistStartRow, blockHistEndRow, histCol;\n" + " uint blockHistStartCol, boxReadStartCol, boxReadEndCol;\n" + " uint accumWriteStartCol, colsToWrite, colOffset, colsToWriteOffset;\n" + " int histRow, filterRow, accumWriteOffset;\n" + "\n" + " fullTempBoxWidth = BLOCK_SIZE_X + (densityFilter->m_FilterWidth * 2);\n" + //Compute the bounds of the area to be sampled, which is just the ends minus the super sample minus 1. + " leftBound = densityFilter->m_Supersample - 1;\n" + " rightBound = densityFilter->m_SuperRasW - (densityFilter->m_Supersample - 1);\n" + " topBound = densityFilter->m_Supersample - 1;\n" + " botBound = densityFilter->m_SuperRasH - (densityFilter->m_Supersample - 1);\n" + "\n" + //Start and end values are the indices in the histogram read from + //and written to in the accumulator. They are not the indices for the local block of data. + //Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds. + " blockHistStartRow = min(botBound, topBound + (((BLOCK_ID_Y * chunkSizeH) + rowChunkPass) * rowsToProcess));\n"//The first histogram row this block will process. + " blockHistEndRow = min(botBound, blockHistStartRow + rowsToProcess);\n"//The last histogram row this block will process, clamped to the last row. + " blockHistStartCol = min(rightBound, leftBound + (((BLOCK_ID_X * chunkSizeW) + colChunkPass) * BLOCK_SIZE_X));\n"//The first histogram column this block will process. + " boxReadStartCol = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first box col this block will read from when copying to the accumulator. + " boxReadEndCol = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + BLOCK_SIZE_X, densityFilter->m_SuperRasW - blockHistStartCol);\n"//The last box col this block will read from when copying to the accumulator. + "\n" + //Last, the indices in the global accumulator that the local bounds will be writing to. + " accumWriteStartCol = blockHistStartCol - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first column in the accumulator this block will write to. + " colsToWrite = ceil((real_bucket_t)(boxReadEndCol - boxReadStartCol) / (real_bucket_t)BLOCK_SIZE_X);\n"//Elements per thread to be written to the accumulator. + " histCol = blockHistStartCol + THREAD_ID_X;\n"//The histogram column this individual thread will be reading from. + "\n" + " if (histCol >= rightBound)\n" + " return;\n" + "\n" + //Compute the col position in this local box to serve as the center position + //from which filter application offsets are computed. + //These are the local indices for the local data that are temporarily accumulated to before + //writing out to the global accumulator. + " uint boxCol = densityFilter->m_FilterWidth + THREAD_ID_X;\n" + " uint colsToZeroOffset, colsToZero = ceil((real_bucket_t)fullTempBoxWidth / (real_bucket_t)(BLOCK_SIZE_X));\n"//Usually is 2. + " int i, j, k, jmin, jmax;\n" + " uint filterSelectInt, filterCoefIndex;\n" + " real_bucket_t cacheLog;\n" + " real_bucket_t filterSelect;\n" + " real4_bucket bucket;\n" + ; os << " __local real4reals_bucket filterBox[192];\n";//Must be >= fullTempBoxWidth. - os << - "\n" - " colsToZeroOffset = colsToZero * THREAD_ID_X;\n" - " colsToWriteOffset = colsToWrite * THREAD_ID_X;\n" - " k = (int)densityFilter->m_FilterWidth;\n"//Need a signed int to use below, really is filter width, but reusing a variable to save space. - "\n" - " for (histRow = blockHistStartRow; histRow < blockHistEndRow; histRow++)\n"//Process pixels by row, for 32 rows. - " {\n" - " bucket = histogram[(histRow * densityFilter->m_SuperRasW) + histCol];\n" - "\n" - " if (bucket.w != 0)\n" - " cacheLog = (densityFilter->m_K1 * log(1.0 + bucket.w * densityFilter->m_K2)) / bucket.w;\n" - "\n"; + "\n" + " colsToZeroOffset = colsToZero * THREAD_ID_X;\n" + " colsToWriteOffset = colsToWrite * THREAD_ID_X;\n" + " k = (int)densityFilter->m_FilterWidth;\n"//Need a signed int to use below, really is filter width, but reusing a variable to save space. + "\n" + " for (histRow = blockHistStartRow; histRow < blockHistEndRow; histRow++)\n"//Process pixels by row, for 32 rows. + " {\n" + " bucket = histogram[(histRow * densityFilter->m_SuperRasW) + histCol];\n" + "\n" + " if (bucket.w != 0)\n" + " cacheLog = (densityFilter->m_K1 * log(1.0 + bucket.w * densityFilter->m_K2)) / bucket.w;\n" + "\n"; if (doSS) { os << - " filterSelect = 0;\n" - " densityBoxLeftX = histCol - min(histCol, ss);\n" - " densityBoxRightX = histCol + min(ss, (densityFilter->m_SuperRasW - histCol) - 1);\n" - " densityBoxTopY = histRow - min((uint)histRow, ss);\n" - " densityBoxBottomY = histRow + min(ss, (densityFilter->m_SuperRasH - histRow) - 1);\n" - "\n" - " for (j = densityBoxTopY; j <= densityBoxBottomY; j++)\n" - " {\n" - " for (i = densityBoxLeftX; i <= densityBoxRightX; i++)\n" - " {\n" - " filterSelect += histogram[(j * densityFilter->m_SuperRasW) + i].w;\n" - " }\n" - " }\n" - "\n"; + " filterSelect = 0;\n" + " densityBoxLeftX = histCol - min(histCol, ss);\n" + " densityBoxRightX = histCol + min(ss, (densityFilter->m_SuperRasW - histCol) - 1);\n" + " densityBoxTopY = histRow - min((uint)histRow, ss);\n" + " densityBoxBottomY = histRow + min(ss, (densityFilter->m_SuperRasH - histRow) - 1);\n" + "\n" + " for (j = densityBoxTopY; j <= densityBoxBottomY; j++)\n" + " {\n" + " for (i = densityBoxLeftX; i <= densityBoxRightX; i++)\n" + " {\n" + " filterSelect += histogram[(j * densityFilter->m_SuperRasW) + i].w;\n" + " }\n" + " }\n" + "\n"; - if (doScf) - os << " filterSelect *= scfact;\n"; + if (doScf) + os << " filterSelect *= scfact;\n"; } else { - os - << " filterSelect = bucket.w;\n"; + os + << " filterSelect = bucket.w;\n"; } os << - "\n" - " if (filterSelect > densityFilter->m_MaxFilteredCounts)\n" - " filterSelectInt = densityFilter->m_MaxFilterIndex;\n" - " else if (filterSelect <= DE_THRESH)\n" - " filterSelectInt = (int)ceil(filterSelect) - 1;\n" - " else if (filterSelect != 0)\n" - " filterSelectInt = (int)DE_THRESH + (int)floor(pow((real_bucket_t)(filterSelect - DE_THRESH), densityFilter->m_Curve));\n" - " else\n" - " filterSelectInt = 0;\n" - "\n" - " if (filterSelectInt > densityFilter->m_MaxFilterIndex)\n" - " filterSelectInt = densityFilter->m_MaxFilterIndex;\n" - "\n" - " filterCoefIndex = filterSelectInt * densityFilter->m_KernelSize;\n" - "\n" - //With this new method, only accumulate to the temp local buffer first. Write to the final accumulator last. - //For each loop through, note that there is a local memory barrier call inside of each call to AddToAccumNoCheck(). - //If this isn't done, pixel errors occurr and even an out of resources error occurrs because too many writes are done to the same place in memory at once. - " jmin = min(k, histRow);\n" - " jmax = (int)min((densityFilter->m_SuperRasH - 1) - histRow, densityFilter->m_FilterWidth);\n" - "\n" - " for (j = -jmin; j <= jmax; j++)\n" - " {\n" - " for (i = 0; i < colsToZero && (colsToZeroOffset + i) < fullTempBoxWidth; i++)\n"//Each thread zeroizes a few columns. - " {\n" - " filterBox[colsToZeroOffset + i].m_Real4 = 0;\n" - " }\n" - "\n" - " barrier(CLK_LOCAL_MEM_FENCE);\n" - "\n" - " if (bucket.w != 0)\n" - " {\n" - " filterRow = abs(j) * (densityFilter->m_FilterWidth + 1);\n" - "\n" - " for (i = -k; i <= k; i++)\n" - " {\n" - " filterSelectInt = filterCoefIndex + coefIndices[filterRow + abs(i)];\n"//Really is filterCoeffIndexPlusOffset, but reusing a variable to save space. - " filterBox[i + boxCol].m_Real4 += (bucket * (filterCoefs[filterSelectInt] * cacheLog));\n" - " }\n" - " }\n" - "\n" - " barrier(CLK_LOCAL_MEM_FENCE);\n" - "\n" - //At this point, all threads in this block have applied the filter to their surrounding pixels and stored the results in the temp local box. - //Add the cells of it that are in bounds to the global accumulator. - //Compute offsets in local box to read from, and offsets into global accumulator to write to. - //Use a method here that is similar to the zeroization above: Each thread (column) in the first row iterates through all of the - //rows and adds a few columns to the accumulator. - //" if (THREAD_ID_X == 0)\n" - //" {\n" - //" for (int kk = boxReadStartCol, i = 0; kk < boxReadEndCol; kk++, i++)\n"//Each thread writes a few columns.//Could do away with kk//TODO//OPT - //" {\n" - //" accumulator[((histRow + j) * densityFilter->m_SuperRasW) + (accumWriteStartCol + i)].m_Real4 += filterBox[kk].m_Real4;\n" - //" }\n" - //" }\n" - " accumWriteOffset = ((histRow + j) * densityFilter->m_SuperRasW) + accumWriteStartCol;\n" - "\n" - " for (i = 0; i < colsToWrite; i++)\n"//Each thread writes a few columns. - " {\n" - " colOffset = colsToWriteOffset + i;\n" - "\n" - " if (boxReadStartCol + colOffset < boxReadEndCol)\n" - " accumulator[accumWriteOffset + colOffset].m_Real4 += filterBox[boxReadStartCol + colOffset].m_Real4;\n" - " }\n" - " }\n"//for() filter rows. - " barrier(CLK_GLOBAL_MEM_FENCE);\n" - " }\n"//for() histogram rows. - "}\n"; - + "\n" + " if (filterSelect > densityFilter->m_MaxFilteredCounts)\n" + " filterSelectInt = densityFilter->m_MaxFilterIndex;\n" + " else if (filterSelect <= DE_THRESH)\n" + " filterSelectInt = (int)ceil(filterSelect) - 1;\n" + " else if (filterSelect != 0)\n" + " filterSelectInt = (int)DE_THRESH + (int)floor(pow((real_bucket_t)(filterSelect - DE_THRESH), densityFilter->m_Curve));\n" + " else\n" + " filterSelectInt = 0;\n" + "\n" + " if (filterSelectInt > densityFilter->m_MaxFilterIndex)\n" + " filterSelectInt = densityFilter->m_MaxFilterIndex;\n" + "\n" + " filterCoefIndex = filterSelectInt * densityFilter->m_KernelSize;\n" + "\n" + //With this new method, only accumulate to the temp local buffer first. Write to the final accumulator last. + //For each loop through, note that there is a local memory barrier call inside of each call to AddToAccumNoCheck(). + //If this isn't done, pixel errors occurr and even an out of resources error occurrs because too many writes are done to the same place in memory at once. + " jmin = min(k, histRow);\n" + " jmax = (int)min((densityFilter->m_SuperRasH - 1) - histRow, densityFilter->m_FilterWidth);\n" + "\n" + " for (j = -jmin; j <= jmax; j++)\n" + " {\n" + " for (i = 0; i < colsToZero && (colsToZeroOffset + i) < fullTempBoxWidth; i++)\n"//Each thread zeroizes a few columns. + " {\n" + " filterBox[colsToZeroOffset + i].m_Real4 = 0;\n" + " }\n" + "\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + "\n" + " if (bucket.w != 0)\n" + " {\n" + " filterRow = abs(j) * (densityFilter->m_FilterWidth + 1);\n" + "\n" + " for (i = -k; i <= k; i++)\n" + " {\n" + " filterSelectInt = filterCoefIndex + coefIndices[filterRow + abs(i)];\n"//Really is filterCoeffIndexPlusOffset, but reusing a variable to save space. + " filterBox[i + boxCol].m_Real4 += (bucket * (filterCoefs[filterSelectInt] * cacheLog));\n" + " }\n" + " }\n" + "\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + "\n" + //At this point, all threads in this block have applied the filter to their surrounding pixels and stored the results in the temp local box. + //Add the cells of it that are in bounds to the global accumulator. + //Compute offsets in local box to read from, and offsets into global accumulator to write to. + //Use a method here that is similar to the zeroization above: Each thread (column) in the first row iterates through all of the + //rows and adds a few columns to the accumulator. + //" if (THREAD_ID_X == 0)\n" + //" {\n" + //" for (int kk = boxReadStartCol, i = 0; kk < boxReadEndCol; kk++, i++)\n"//Each thread writes a few columns.//Could do away with kk//TODO//OPT + //" {\n" + //" accumulator[((histRow + j) * densityFilter->m_SuperRasW) + (accumWriteStartCol + i)].m_Real4 += filterBox[kk].m_Real4;\n" + //" }\n" + //" }\n" + " accumWriteOffset = ((histRow + j) * densityFilter->m_SuperRasW) + accumWriteStartCol;\n" + "\n" + " for (i = 0; i < colsToWrite; i++)\n"//Each thread writes a few columns. + " {\n" + " colOffset = colsToWriteOffset + i;\n" + "\n" + " if (boxReadStartCol + colOffset < boxReadEndCol)\n" + " accumulator[accumWriteOffset + colOffset].m_Real4 += filterBox[boxReadStartCol + colOffset].m_Real4;\n" + " }\n" + " }\n"//for() filter rows. + " barrier(CLK_GLOBAL_MEM_FENCE);\n" + " }\n"//for() histogram rows. + "}\n"; return os.str(); } @@ -443,237 +436,228 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss) bool doSS = ss > 1; bool doScf = !(ss & 1); ostringstream os; - os << - ConstantDefinesString(m_DoublePrecision) << - DensityFilterCLStructString << - UnionCLStructString << - "__kernel void " << GaussianDEEntryPoint(ss, MaxDEFilterSize()) << "(\n" << - " const __global real4_bucket* histogram,\n" - " __global real4reals_bucket* accumulator,\n" - " __constant DensityFilterCL* densityFilter,\n" - " const __global real_bucket_t* filterCoefs,\n" - " const __global real_bucket_t* filterWidths,\n" - " const __global uint* coefIndices,\n" - " const uint chunkSizeW,\n" - " const uint chunkSizeH,\n" - " const uint chunkW,\n" - " const uint chunkH\n" - "\t)\n" - "{\n" - " if (((((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) ||\n" - " ((((BLOCK_ID_Y * chunkSizeH) + chunkH) * BLOCK_SIZE_Y) + THREAD_ID_Y >= densityFilter->m_SuperRasH))\n" - " return;\n" - "\n"; + ConstantDefinesString(m_DoublePrecision) << + DensityFilterCLStructString << + UnionCLStructString << + "__kernel void " << GaussianDEEntryPoint(ss, MaxDEFilterSize()) << "(\n" << + " const __global real4_bucket* histogram,\n" + " __global real4reals_bucket* accumulator,\n" + " __constant DensityFilterCL* densityFilter,\n" + " const __global real_bucket_t* filterCoefs,\n" + " const __global real_bucket_t* filterWidths,\n" + " const __global uint* coefIndices,\n" + " const uint chunkSizeW,\n" + " const uint chunkSizeH,\n" + " const uint colChunkPass,\n" + " const uint rowChunkPass\n" + "\t)\n" + "{\n" + " if (((((BLOCK_ID_X * chunkSizeW) + colChunkPass) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) ||\n" + " ((((BLOCK_ID_Y * chunkSizeH) + rowChunkPass) * BLOCK_SIZE_Y) + THREAD_ID_Y >= densityFilter->m_SuperRasH))\n" + " return;\n" + "\n"; if (doSS) { os << - " uint ss = (uint)floor((real_bucket_t)densityFilter->m_Supersample / 2.0);\n" - " int densityBoxLeftX;\n" - " int densityBoxRightX;\n" - " int densityBoxTopY;\n" - " int densityBoxBottomY;\n" - "\n"; + " uint ss = (uint)floor((real_bucket_t)densityFilter->m_Supersample / 2.0);\n" + " int densityBoxLeftX;\n" + " int densityBoxRightX;\n" + " int densityBoxTopY;\n" + " int densityBoxBottomY;\n" + "\n"; if (doScf) - os << - " real_bucket_t scfact = pow(densityFilter->m_Supersample / (densityFilter->m_Supersample + (real_bucket_t)1.0), (real_bucket_t)2.0);\n"; + os << + " real_bucket_t scfact = pow(densityFilter->m_Supersample / (densityFilter->m_Supersample + (real_bucket_t)1.0), (real_bucket_t)2.0);\n"; } //Compute the size of the temporary box which is the block width + 2 * filter width x block height + 2 * filter width. //Ideally the block width and height are both 32. However, the height might be smaller if there isn't enough memory. os << - " uint fullTempBoxWidth, fullTempBoxHeight;\n" - " uint leftBound, rightBound, topBound, botBound;\n" - " uint blockHistStartRow, blockHistEndRow, boxReadStartRow, boxReadEndRow;\n" - " uint blockHistStartCol, boxReadStartCol, boxReadEndCol;\n" - " uint accumWriteStartRow, accumWriteStartCol, colsToWrite;\n" - - //If any of the variables above end up being made __local, init them here. - //At the moment, it's slower even though it's more memory efficient. - //" if (THREAD_ID_X == 0 && THREAD_ID_Y == 0)\n" - //" {\n" - //Init local vars here. - //" }\n" - //"\n" - //" barrier(CLK_LOCAL_MEM_FENCE);\n" - "\n" - " fullTempBoxWidth = BLOCK_SIZE_X + (densityFilter->m_FilterWidth * 2);\n" - " fullTempBoxHeight = BLOCK_SIZE_Y + (densityFilter->m_FilterWidth * 2);\n" - //Compute the bounds of the area to be sampled, which is just the ends minus the super sample minus 1. - " leftBound = densityFilter->m_Supersample - 1;\n" - " rightBound = densityFilter->m_SuperRasW - (densityFilter->m_Supersample - 1);\n" - " topBound = densityFilter->m_Supersample - 1;\n" - " botBound = densityFilter->m_SuperRasH - (densityFilter->m_Supersample - 1);\n" - "\n" - //Start and end values are the indices in the histogram read from - //and written to in the accumulator. They are not the indices for the local block of data. - //Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds. - " blockHistStartRow = min(botBound, (uint)(topBound + (((BLOCK_ID_Y * chunkSizeH) + chunkH) * BLOCK_SIZE_Y)));\n"//The first histogram row this block will process. - " blockHistEndRow = min(botBound, (uint)(blockHistStartRow + BLOCK_SIZE_Y));\n"//The last histogram row this block will process, clamped to the last row. - " boxReadStartRow = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartRow);\n"//The first row in the local box to read from when writing back to the final accumulator for this block. - " boxReadEndRow = densityFilter->m_FilterWidth + min((uint)(densityFilter->m_FilterWidth + BLOCK_SIZE_Y), densityFilter->m_SuperRasH - blockHistStartRow);\n"//The last row in the local box to read from when writing back to the final accumulator for this block. - " blockHistStartCol = min(rightBound, leftBound + (uint)(((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X));\n"//The first histogram column this block will process. - " boxReadStartCol = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first box col this block will read from when copying to the accumulator. - " boxReadEndCol = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + (uint)BLOCK_SIZE_X, densityFilter->m_SuperRasW - blockHistStartCol);\n"//The last box col this block will read from when copying to the accumulator. - "\n" - //Last, the indices in the global accumulator that the local bounds will be writing to. - " accumWriteStartRow = blockHistStartRow - min(densityFilter->m_FilterWidth, blockHistStartRow);\n"//Will be fw - 0 except for boundary columns, it will be less. - " accumWriteStartCol = blockHistStartCol - min(densityFilter->m_FilterWidth, blockHistStartCol);\n" - " colsToWrite = ceil((real_bucket_t)(boxReadEndCol - boxReadStartCol) / (real_bucket_t)BLOCK_SIZE_X);\n" - "\n" - " uint threadHistRow = blockHistStartRow + THREAD_ID_Y;\n"//The histogram row this individual thread will be reading from. - " uint threadHistCol = blockHistStartCol + THREAD_ID_X;\n"//The histogram column this individual thread will be reading from. - "\n" - - //Compute the center position in this local box to serve as the center position - //from which filter application offsets are computed. - //These are the local indices for the local data that are temporarily accumulated to before - //writing out to the global accumulator. - " uint boxRow = densityFilter->m_FilterWidth + THREAD_ID_Y;\n" - " uint boxCol = densityFilter->m_FilterWidth + THREAD_ID_X;\n" - " uint colElementsToZero = ceil((real_bucket_t)fullTempBoxWidth / (real_bucket_t)(BLOCK_SIZE_X));\n"//Usually is 2. - " int i, j, k;\n" - " uint filterSelectInt, filterCoefIndex;\n" - " real_bucket_t cacheLog;\n" - " real_bucket_t filterSelect;\n" - " real4_bucket bucket;\n" - ; - - //This will be treated as having dimensions of (BLOCK_SIZE_X + (fw * 2)) x (BLOCK_SIZE_Y + (fw * 2)). - if (m_NVidia) - os << " __local real4reals_bucket filterBox[3000];\n"; - else - os << " __local real4reals_bucket filterBox[1200];\n"; - + " uint fullTempBoxWidth, fullTempBoxHeight;\n" + " uint leftBound, rightBound, topBound, botBound;\n" + " uint blockHistStartRow, blockHistEndRow, boxReadStartRow, boxReadEndRow;\n" + " uint blockHistStartCol, boxReadStartCol, boxReadEndCol;\n" + " uint accumWriteStartRow, accumWriteStartCol, colsToWrite;\n" + //If any of the variables above end up being made __local, init them here. + //At the moment, it's slower even though it's more memory efficient. + //" if (THREAD_ID_X == 0 && THREAD_ID_Y == 0)\n" + //" {\n" + //Init local vars here. + //" }\n" + //"\n" + //" barrier(CLK_LOCAL_MEM_FENCE);\n" + "\n" + " fullTempBoxWidth = BLOCK_SIZE_X + (densityFilter->m_FilterWidth * 2);\n" + " fullTempBoxHeight = BLOCK_SIZE_Y + (densityFilter->m_FilterWidth * 2);\n" + //Compute the bounds of the area to be sampled, which is just the ends minus the super sample minus 1. + " leftBound = densityFilter->m_Supersample - 1;\n" + " rightBound = densityFilter->m_SuperRasW - (densityFilter->m_Supersample - 1);\n" + " topBound = densityFilter->m_Supersample - 1;\n" + " botBound = densityFilter->m_SuperRasH - (densityFilter->m_Supersample - 1);\n" + "\n" + //Start and end values are the indices in the histogram read from + //and written to in the accumulator. They are not the indices for the local block of data. + //Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds. + " blockHistStartRow = min(botBound, (uint)(topBound + (((BLOCK_ID_Y * chunkSizeH) + rowChunkPass) * BLOCK_SIZE_Y)));\n"//The first histogram row this block will process. + " blockHistEndRow = min(botBound, (uint)(blockHistStartRow + BLOCK_SIZE_Y));\n"//The last histogram row this block will process, clamped to the last row. + " boxReadStartRow = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartRow);\n"//The first row in the local box to read from when writing back to the final accumulator for this block. + " boxReadEndRow = densityFilter->m_FilterWidth + min((uint)(densityFilter->m_FilterWidth + BLOCK_SIZE_Y), densityFilter->m_SuperRasH - blockHistStartRow);\n"//The last row in the local box to read from when writing back to the final accumulator for this block. + " blockHistStartCol = min(rightBound, leftBound + (uint)(((BLOCK_ID_X * chunkSizeW) + colChunkPass) * BLOCK_SIZE_X));\n"//The first histogram column this block will process. + " boxReadStartCol = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first box col this block will read from when copying to the accumulator. + " boxReadEndCol = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + (uint)BLOCK_SIZE_X, densityFilter->m_SuperRasW - blockHistStartCol);\n"//The last box col this block will read from when copying to the accumulator. + "\n" + //Last, the indices in the global accumulator that the local bounds will be writing to. + " accumWriteStartRow = blockHistStartRow - min(densityFilter->m_FilterWidth, blockHistStartRow);\n"//Will be fw - 0 except for boundary columns, it will be less. + " accumWriteStartCol = blockHistStartCol - min(densityFilter->m_FilterWidth, blockHistStartCol);\n" + " colsToWrite = ceil((real_bucket_t)(boxReadEndCol - boxReadStartCol) / (real_bucket_t)BLOCK_SIZE_X);\n" + "\n" + " uint threadHistRow = blockHistStartRow + THREAD_ID_Y;\n"//The histogram row this individual thread will be reading from. + " uint threadHistCol = blockHistStartCol + THREAD_ID_X;\n"//The histogram column this individual thread will be reading from. + "\n" + //Compute the center position in this local box to serve as the center position + //from which filter application offsets are computed. + //These are the local indices for the local data that are temporarily accumulated to before + //writing out to the global accumulator. + " uint boxRow = densityFilter->m_FilterWidth + THREAD_ID_Y;\n" + " uint boxCol = densityFilter->m_FilterWidth + THREAD_ID_X;\n" + " uint colElementsToZero = ceil((real_bucket_t)fullTempBoxWidth / (real_bucket_t)(BLOCK_SIZE_X));\n"//Usually is 2. + " int i, j, k;\n" + " uint filterSelectInt, filterCoefIndex;\n" + " real_bucket_t cacheLog;\n" + " real_bucket_t filterSelect;\n" + " real4_bucket bucket;\n" + ; + //This will be treated as having dimensions of (BLOCK_SIZE_X + (fw * 2)) x (BLOCK_SIZE_Y + (fw * 2)). + os << " __local real4reals_bucket filterBox[1200];\n";//Really only need 1156 os << - //Zero the temp buffers first. This splits the zeroization evenly across all threads (columns) in the first block row. - //This is a middle ground solution. Previous methods tried: - //Thread (0, 0) does all init. This works, but is the slowest. - //Init is divided among all threads. This is the fastest but exposes a severe flaw in OpenCL, - //in that it will not get executed by all threads before proceeding, despite the barrier statement - //below. As a result, strange artifacts will get left around because filtering gets executed on a temp - //box that has not been properly zeroized. - //The only way to do it and still achieve reasonable speed is to have the first row do it. This is - //most likely because the first row gets executed first, ensuring zeroization is done when the rest - //of the threads execute. - "\n"//Dummy test zeroization for debugging. - //" if (THREAD_ID_Y == 0 && THREAD_ID_X == 0)\n"//First thread of the block takes the responsibility of zeroizing. - //" {\n" - //" for (k = 0; k < 2 * 1024; k++)\n" - //" {\n" - //" filterBox[k].m_Real4 = 0;\n" - //" }\n" - //" }\n" - " if (THREAD_ID_Y == 0)\n"//First row of the block takes the responsibility of zeroizing. - " {\n" - " for (i = 0; i < fullTempBoxHeight; i++)\n"//Each column in the row iterates through all rows. - " {\n" - " for (j = 0; j < colElementsToZero && ((colElementsToZero * THREAD_ID_X) + j) < fullTempBoxWidth; j++)\n"//And zeroizes a few columns from that row. - " {\n" - " filterBox[(i * fullTempBoxWidth) + ((colElementsToZero * THREAD_ID_X) + j)].m_Real4 = 0;\n" - " }\n" - " }\n" - " }\n" - "\n" - " barrier(CLK_LOCAL_MEM_FENCE);\n" - "\n" - " if (threadHistRow < botBound && threadHistCol < rightBound)\n" - " {\n" - " bucket = histogram[(threadHistRow * densityFilter->m_SuperRasW) + threadHistCol];\n" - "\n" - " if (bucket.w != 0)\n" - " {\n" - " cacheLog = (densityFilter->m_K1 * log(1.0 + bucket.w * densityFilter->m_K2)) / bucket.w;\n"; + //Zero the temp buffers first. This splits the zeroization evenly across all threads (columns) in the first block row. + //This is a middle ground solution. Previous methods tried: + //Thread (0, 0) does all init. This works, but is the slowest. + //Init is divided among all threads. This is the fastest but exposes a severe flaw in OpenCL, + //in that it will not get executed by all threads before proceeding, despite the barrier statement + //below. As a result, strange artifacts will get left around because filtering gets executed on a temp + //box that has not been properly zeroized. + //The only way to do it and still achieve reasonable speed is to have the first row do it. This is + //most likely because the first row gets executed first, ensuring zeroization is done when the rest + //of the threads execute. + "\n"//Dummy test zeroization for debugging. + //" if (THREAD_ID_Y == 0 && THREAD_ID_X == 0)\n"//First thread of the block takes the responsibility of zeroizing. + //" {\n" + //" for (k = 0; k < 2 * 1024; k++)\n" + //" {\n" + //" filterBox[k].m_Real4 = 0;\n" + //" }\n" + //" }\n" + " if (THREAD_ID_Y == 0)\n"//First row of the block takes the responsibility of zeroizing. + " {\n" + " for (i = 0; i < fullTempBoxHeight; i++)\n"//Each column in the row iterates through all rows. + " {\n" + " for (j = 0; j < colElementsToZero && ((colElementsToZero * THREAD_ID_X) + j) < fullTempBoxWidth; j++)\n"//And zeroizes a few columns from that row. + " {\n" + " filterBox[(i * fullTempBoxWidth) + ((colElementsToZero * THREAD_ID_X) + j)].m_Real4 = 0;\n" + " }\n" + " }\n" + " }\n" + "\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + "\n" + " if (threadHistRow < botBound && threadHistCol < rightBound)\n" + " {\n" + " bucket = histogram[(threadHistRow * densityFilter->m_SuperRasW) + threadHistCol];\n" + "\n" + " if (bucket.w != 0)\n" + " {\n" + " cacheLog = (densityFilter->m_K1 * log(1.0 + bucket.w * densityFilter->m_K2)) / bucket.w;\n"; if (doSS) { os << - " filterSelect = 0;\n" - " densityBoxLeftX = threadHistCol - min(threadHistCol, ss);\n" - " densityBoxRightX = threadHistCol + min(ss, (densityFilter->m_SuperRasW - threadHistCol) - 1);\n" - " densityBoxTopY = threadHistRow - min(threadHistRow, ss);\n" - " densityBoxBottomY = threadHistRow + min(ss, (densityFilter->m_SuperRasH - threadHistRow) - 1);\n" - "\n" - " for (j = densityBoxTopY; j <= densityBoxBottomY; j++)\n" - " {\n" - " for (i = densityBoxLeftX; i <= densityBoxRightX; i++)\n" - " {\n" - " filterSelect += histogram[i + (j * densityFilter->m_SuperRasW)].w;\n" - " }\n" - " }\n" - "\n"; + " filterSelect = 0;\n" + " densityBoxLeftX = threadHistCol - min(threadHistCol, ss);\n" + " densityBoxRightX = threadHistCol + min(ss, (densityFilter->m_SuperRasW - threadHistCol) - 1);\n" + " densityBoxTopY = threadHistRow - min(threadHistRow, ss);\n" + " densityBoxBottomY = threadHistRow + min(ss, (densityFilter->m_SuperRasH - threadHistRow) - 1);\n" + "\n" + " for (j = densityBoxTopY; j <= densityBoxBottomY; j++)\n" + " {\n" + " for (i = densityBoxLeftX; i <= densityBoxRightX; i++)\n" + " {\n" + " filterSelect += histogram[i + (j * densityFilter->m_SuperRasW)].w;\n" + " }\n" + " }\n" + "\n"; if (doScf) - os << - " filterSelect *= scfact;\n"; + os << + " filterSelect *= scfact;\n"; } else { os << - " filterSelect = bucket.w;\n"; + " filterSelect = bucket.w;\n"; } os << - "\n" - " if (filterSelect > densityFilter->m_MaxFilteredCounts)\n" - " filterSelectInt = densityFilter->m_MaxFilterIndex;\n" - " else if (filterSelect <= DE_THRESH)\n" - " filterSelectInt = (int)ceil(filterSelect) - 1;\n" - " else\n" - " filterSelectInt = (int)DE_THRESH + (int)floor(pow((real_bucket_t)(filterSelect - DE_THRESH), densityFilter->m_Curve));\n" - "\n" - " if (filterSelectInt > densityFilter->m_MaxFilterIndex)\n" - " filterSelectInt = densityFilter->m_MaxFilterIndex;\n" - "\n" - " filterCoefIndex = filterSelectInt * densityFilter->m_KernelSize;\n" - "\n" - //With this new method, only accumulate to the temp local buffer first. Write to the final accumulator last. - //For each loop through, note that there is a local memory barrier call inside of each call to AddToAccumNoCheck(). - //If this isn't done, pixel errors occurr and even an out of resources error occurrs because too many writes are done to the same place in memory at once. - " k = (int)densityFilter->m_FilterWidth;\n"//Need a signed int to use below, really is filter width, but reusing a variable to save space. - "\n" - " for (j = -k; j <= k; j++)\n" - " {\n" - " for (i = -k; i <= k; i++)\n" - " {\n" - " filterSelectInt = filterCoefIndex + coefIndices[(abs(j) * (densityFilter->m_FilterWidth + 1)) + abs(i)];\n"//Really is filterCoeffIndexPlusOffset, but reusing a variable to save space. - "\n" - " if (filterCoefs[filterSelectInt] != 0)\n"//This conditional actually improves speed, despite SIMT being bad at conditionals. - " {\n" - " filterBox[(i + boxCol) + ((j + boxRow) * fullTempBoxWidth)].m_Real4 += (bucket * (filterCoefs[filterSelectInt] * cacheLog));\n" - " }\n" - " }\n" - " barrier(CLK_LOCAL_MEM_FENCE);\n"//If this is the only barrier and the block size is exactly 16, it works perfectly. Otherwise, no chunks occur, but a many streaks. - " }\n" - " }\n"//bucket.w != 0. - " }\n"//In bounds. - "\n" - "\n" - " barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);\n" - "\n" - " if (THREAD_ID_Y == 0)\n" - " {\n" - //At this point, all threads in this block have applied the filter to their surrounding pixels and stored the results in the temp local box. - //Add the cells of it that are in bounds to the global accumulator. - //Compute offsets in local box to read from, and offsets into global accumulator to write to. - //Use a method here that is similar to the zeroization above: Each thread (column) in the first row iterates through all of the - //rows and adds a few columns to the accumulator. - " for (i = boxReadStartRow, j = accumWriteStartRow; i < boxReadEndRow; i++, j++)\n" - " {\n" - " for (k = 0; k < colsToWrite; k++)\n"//Each thread writes a few columns. - " {\n" - " boxCol = (colsToWrite * THREAD_ID_X) + k;\n"//Really is colOffset, but reusing a variable to save space. - "\n" - " if (boxReadStartCol + boxCol < boxReadEndCol)\n" - " accumulator[(j * densityFilter->m_SuperRasW) + (accumWriteStartCol + boxCol)].m_Real4 += filterBox[(i * fullTempBoxWidth) + (boxReadStartCol + boxCol)].m_Real4;\n" - " }\n" - " barrier(CLK_GLOBAL_MEM_FENCE);\n"//This must be here or else chunks will go missing. - " }\n" - " }\n" - "}\n"; - + "\n" + " if (filterSelect > densityFilter->m_MaxFilteredCounts)\n" + " filterSelectInt = densityFilter->m_MaxFilterIndex;\n" + " else if (filterSelect <= DE_THRESH)\n" + " filterSelectInt = (int)ceil(filterSelect) - 1;\n" + " else\n" + " filterSelectInt = (int)DE_THRESH + (int)floor(pow((real_bucket_t)(filterSelect - DE_THRESH), densityFilter->m_Curve));\n" + "\n" + " if (filterSelectInt > densityFilter->m_MaxFilterIndex)\n" + " filterSelectInt = densityFilter->m_MaxFilterIndex;\n" + "\n" + " filterCoefIndex = filterSelectInt * densityFilter->m_KernelSize;\n" + "\n" + //With this new method, only accumulate to the temp local buffer first. Write to the final accumulator last. + //For each loop through, note that there is a local memory barrier call inside of each call to AddToAccumNoCheck(). + //If this isn't done, pixel errors occurr and even an out of resources error occurrs because too many writes are done to the same place in memory at once. + " k = (int)densityFilter->m_FilterWidth;\n"//Need a signed int to use below, really is filter width, but reusing a variable to save space. + "\n" + " for (j = -k; j <= k; j++)\n" + " {\n" + " for (i = -k; i <= k; i++)\n" + " {\n" + " filterSelectInt = filterCoefIndex + coefIndices[(abs(j) * (densityFilter->m_FilterWidth + 1)) + abs(i)];\n"//Really is filterCoeffIndexPlusOffset, but reusing a variable to save space. + "\n" + " if (filterCoefs[filterSelectInt] != 0)\n"//This conditional actually improves speed, despite SIMT being bad at conditionals. + " {\n" + " filterBox[(i + boxCol) + ((j + boxRow) * fullTempBoxWidth)].m_Real4 += (bucket * (filterCoefs[filterSelectInt] * cacheLog));\n" + " }\n" + " }\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n"//If this is the only barrier and the block size is exactly 16, it works perfectly. Otherwise, no chunks occur, but a many streaks. + " }\n" + " }\n"//bucket.w != 0. + " }\n"//In bounds. + "\n" + "\n" + " barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);\n" + "\n" + " if (THREAD_ID_Y == 0)\n" + " {\n" + //At this point, all threads in this block have applied the filter to their surrounding pixels and stored the results in the temp local box. + //Add the cells of it that are in bounds to the global accumulator. + //Compute offsets in local box to read from, and offsets into global accumulator to write to. + //Use a method here that is similar to the zeroization above: Each thread (column) in the first row iterates through all of the + //rows and adds a few columns to the accumulator. + " for (i = boxReadStartRow, j = accumWriteStartRow; i < boxReadEndRow; i++, j++)\n" + " {\n" + " for (k = 0; k < colsToWrite; k++)\n"//Each thread writes a few columns. + " {\n" + " boxCol = (colsToWrite * THREAD_ID_X) + k;\n"//Really is colOffset, but reusing a variable to save space. + "\n" + " if (boxReadStartCol + boxCol < boxReadEndCol)\n" + " accumulator[(j * densityFilter->m_SuperRasW) + (accumWriteStartCol + boxCol)].m_Real4 += filterBox[(i * fullTempBoxWidth) + (boxReadStartCol + boxCol)].m_Real4;\n" + " }\n" + " barrier(CLK_GLOBAL_MEM_FENCE);\n"//This must be here or else chunks will go missing. + " }\n" + " }\n" + "}\n"; return os.str(); } #endif @@ -701,139 +685,137 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernelNoLocalCache(size_t ss) bool doSS = ss > 1; bool doScf = !(ss & 1); ostringstream os; - os << - ConstantDefinesString(m_DoublePrecision) << - DensityFilterCLStructString << - UnionCLStructString << - AddToAccumWithCheckFunctionString << - "__kernel void " << GaussianDEEntryPoint(ss, MaxDEFilterSize() + 1) << "(\n" << - " const __global real4_bucket* histogram,\n" - " __global real4reals_bucket* accumulator,\n" - " __constant DensityFilterCL* densityFilter,\n" - " const __global real_bucket_t* filterCoefs,\n" - " const __global real_bucket_t* filterWidths,\n" - " const __global uint* coefIndices,\n" - " const uint chunkSizeW,\n" - " const uint chunkSizeH,\n" - " const uint chunkW,\n" - " const uint chunkH\n" - "\t)\n" - "{\n" - " if (((((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) ||\n" - " ((((BLOCK_ID_Y * chunkSizeH) + chunkH) * BLOCK_SIZE_Y) + THREAD_ID_Y >= densityFilter->m_SuperRasH))\n" - " return;\n" - "\n"; + ConstantDefinesString(m_DoublePrecision) << + DensityFilterCLStructString << + UnionCLStructString << + AddToAccumWithCheckFunctionString << + "__kernel void " << GaussianDEEntryPoint(ss, MaxDEFilterSize() + 1) << "(\n" << + " const __global real4_bucket* histogram,\n" + " __global real4reals_bucket* accumulator,\n" + " __constant DensityFilterCL* densityFilter,\n" + " const __global real_bucket_t* filterCoefs,\n" + " const __global real_bucket_t* filterWidths,\n" + " const __global uint* coefIndices,\n" + " const uint chunkSizeW,\n" + " const uint chunkSizeH,\n" + " const uint colChunkPass,\n" + " const uint rowChunkPass\n" + "\t)\n" + "{\n" + " if (((((BLOCK_ID_X * chunkSizeW) + colChunkPass) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) ||\n" + " ((((BLOCK_ID_Y * chunkSizeH) + rowChunkPass) * BLOCK_SIZE_Y) + THREAD_ID_Y >= densityFilter->m_SuperRasH))\n" + " return;\n" + "\n"; if (doSS) { - os << - " uint ss = (uint)floor((real_bucket_t)densityFilter->m_Supersample / 2.0);\n" - " int densityBoxLeftX;\n" - " int densityBoxRightX;\n" - " int densityBoxTopY;\n" - " int densityBoxBottomY;\n"; + os << + " uint ss = (uint)floor((real_bucket_t)densityFilter->m_Supersample / 2.0);\n" + " int densityBoxLeftX;\n" + " int densityBoxRightX;\n" + " int densityBoxTopY;\n" + " int densityBoxBottomY;\n"; - if (doScf) - os << " real_bucket_t scfact = pow((real_bucket_t)densityFilter->m_Supersample / ((real_bucket_t)densityFilter->m_Supersample + (real_bucket_t)1.0), (real_bucket_t)2.0);\n"; + if (doScf) + os << " real_bucket_t scfact = pow((real_bucket_t)densityFilter->m_Supersample / ((real_bucket_t)densityFilter->m_Supersample + (real_bucket_t)1.0), (real_bucket_t)2.0);\n"; } os << - //Compute the bounds of the area to be sampled, which is just the ends minus the super sample minus 1. - " uint leftBound = densityFilter->m_Supersample - 1;\n" - " uint rightBound = densityFilter->m_SuperRasW - (densityFilter->m_Supersample - 1);\n" - " uint topBound = densityFilter->m_Supersample - 1;\n" - " uint botBound = densityFilter->m_SuperRasH - (densityFilter->m_Supersample - 1);\n" - "\n" - //Start and end values are the indices in the histogram read from and written to in the accumulator. - //Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds. - " uint blockHistStartRow = min(botBound, (uint)(topBound + (((BLOCK_ID_Y * chunkSizeH) + chunkH) * BLOCK_SIZE_Y)));\n"//The first histogram row this block will process. - " uint threadHistRow = blockHistStartRow + THREAD_ID_Y;\n"//The histogram row this individual thread will be reading from. - "\n" - " uint blockHistStartCol = min(rightBound, leftBound + (uint)(((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X));\n"//The first histogram column this block will process. - " uint threadHistCol = blockHistStartCol + THREAD_ID_X;\n"//The histogram column this individual thread will be reading from. - "\n" - " int i, j;\n" - " uint filterSelectInt, filterCoefIndex;\n" - " real_bucket_t cacheLog;\n" - " real_bucket_t filterSelect;\n" - " real4_bucket bucket;\n" - "\n" - " if (threadHistRow < botBound && threadHistCol < rightBound)\n" - " {\n" - " bucket = histogram[(threadHistRow * densityFilter->m_SuperRasW) + threadHistCol];\n" - "\n" - " if (bucket.w != 0)\n" - " {\n" - " cacheLog = (densityFilter->m_K1 * log(1.0 + bucket.w * densityFilter->m_K2)) / bucket.w;\n"; + //Compute the bounds of the area to be sampled, which is just the ends minus the super sample minus 1. + " uint leftBound = densityFilter->m_Supersample - 1;\n" + " uint rightBound = densityFilter->m_SuperRasW - (densityFilter->m_Supersample - 1);\n" + " uint topBound = densityFilter->m_Supersample - 1;\n" + " uint botBound = densityFilter->m_SuperRasH - (densityFilter->m_Supersample - 1);\n" + "\n" + //Start and end values are the indices in the histogram read from and written to in the accumulator. + //Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds. + " uint blockHistStartRow = min(botBound, (uint)(topBound + (((BLOCK_ID_Y * chunkSizeH) + rowChunkPass) * BLOCK_SIZE_Y)));\n"//The first histogram row this block will process. + " uint threadHistRow = blockHistStartRow + THREAD_ID_Y;\n"//The histogram row this individual thread will be reading from. + "\n" + " uint blockHistStartCol = min(rightBound, leftBound + (uint)(((BLOCK_ID_X * chunkSizeW) + colChunkPass) * BLOCK_SIZE_X));\n"//The first histogram column this block will process. + " uint threadHistCol = blockHistStartCol + THREAD_ID_X;\n"//The histogram column this individual thread will be reading from. + "\n" + " int i, j;\n" + " uint filterSelectInt, filterCoefIndex;\n" + " real_bucket_t cacheLog;\n" + " real_bucket_t filterSelect;\n" + " real4_bucket bucket;\n" + "\n" + " if (threadHistRow < botBound && threadHistCol < rightBound)\n" + " {\n" + " bucket = histogram[(threadHistRow * densityFilter->m_SuperRasW) + threadHistCol];\n" + "\n" + " if (bucket.w != 0)\n" + " {\n" + " cacheLog = (densityFilter->m_K1 * log(1.0 + bucket.w * densityFilter->m_K2)) / bucket.w;\n"; if (doSS) { os << - " filterSelect = 0;\n" - " densityBoxLeftX = threadHistCol - min(threadHistCol, ss);\n" - " densityBoxRightX = threadHistCol + min(ss, (densityFilter->m_SuperRasW - threadHistCol) - 1);\n" - " densityBoxTopY = threadHistRow - min(threadHistRow, ss);\n" - " densityBoxBottomY = threadHistRow + min(ss, (densityFilter->m_SuperRasH - threadHistRow) - 1);\n" - "\n" - " for (j = densityBoxTopY; j <= densityBoxBottomY; j++)\n" - " {\n" - " for (i = densityBoxLeftX; i <= densityBoxRightX; i++)\n" - " {\n" - " filterSelect += histogram[i + (j * densityFilter->m_SuperRasW)].w;\n" - " }\n" - " }\n" - "\n"; + " filterSelect = 0;\n" + " densityBoxLeftX = threadHistCol - min(threadHistCol, ss);\n" + " densityBoxRightX = threadHistCol + min(ss, (densityFilter->m_SuperRasW - threadHistCol) - 1);\n" + " densityBoxTopY = threadHistRow - min(threadHistRow, ss);\n" + " densityBoxBottomY = threadHistRow + min(ss, (densityFilter->m_SuperRasH - threadHistRow) - 1);\n" + "\n" + " for (j = densityBoxTopY; j <= densityBoxBottomY; j++)\n" + " {\n" + " for (i = densityBoxLeftX; i <= densityBoxRightX; i++)\n" + " {\n" + " filterSelect += histogram[i + (j * densityFilter->m_SuperRasW)].w;\n" + " }\n" + " }\n" + "\n"; if (doScf) - os << - " filterSelect *= scfact;\n"; + os << + " filterSelect *= scfact;\n"; } else { os - << " filterSelect = bucket.w;\n"; + << " filterSelect = bucket.w;\n"; } os << - "\n" - " if (filterSelect > densityFilter->m_MaxFilteredCounts)\n" - " filterSelectInt = densityFilter->m_MaxFilterIndex;\n" - " else if (filterSelect <= DE_THRESH)\n" - " filterSelectInt = (int)ceil(filterSelect) - 1;\n" - " else\n" - " filterSelectInt = (int)DE_THRESH + (int)floor(pow((real_bucket_t)(filterSelect - DE_THRESH), densityFilter->m_Curve));\n" - "\n" - " if (filterSelectInt > densityFilter->m_MaxFilterIndex)\n" - " filterSelectInt = densityFilter->m_MaxFilterIndex;\n" - "\n" - " filterCoefIndex = filterSelectInt * densityFilter->m_KernelSize;\n" - "\n" - " int fw = (int)densityFilter->m_FilterWidth;\n"//Need a signed int to use below. - "\n" - " for (j = -fw; j <= fw; j++)\n" - " {\n" - " for (i = -fw; i <= fw; i++)\n" - " {\n" - " if (AccumCheck(densityFilter->m_SuperRasW, densityFilter->m_SuperRasH, threadHistCol, i, threadHistRow, j))\n" - " {\n" - " filterSelectInt = filterCoefIndex + coefIndices[(abs(j) * (densityFilter->m_FilterWidth + 1)) + abs(i)];\n"//Really is filterCoeffIndexPlusOffset, but reusing a variable to save space. - "\n" - " if (filterCoefs[filterSelectInt] != 0)\n" - " {\n" - " accumulator[(i + threadHistCol) + ((j + threadHistRow) * densityFilter->m_SuperRasW)].m_Real4 += (bucket * (filterCoefs[filterSelectInt] * cacheLog));\n" - " }\n" - " }\n" - "\n" - " barrier(CLK_GLOBAL_MEM_FENCE);\n"//Required to avoid streaks. - " }\n" - " }\n" - " }\n"//bucket.w != 0. - " }\n"//In bounds. - //"\n" - //" barrier(CLK_GLOBAL_MEM_FENCE);\n"//Just to be safe. - "}\n"; - + "\n" + " if (filterSelect > densityFilter->m_MaxFilteredCounts)\n" + " filterSelectInt = densityFilter->m_MaxFilterIndex;\n" + " else if (filterSelect <= DE_THRESH)\n" + " filterSelectInt = (int)ceil(filterSelect) - 1;\n" + " else\n" + " filterSelectInt = (int)DE_THRESH + (int)floor(pow((real_bucket_t)(filterSelect - DE_THRESH), densityFilter->m_Curve));\n" + "\n" + " if (filterSelectInt > densityFilter->m_MaxFilterIndex)\n" + " filterSelectInt = densityFilter->m_MaxFilterIndex;\n" + "\n" + " filterCoefIndex = filterSelectInt * densityFilter->m_KernelSize;\n" + "\n" + " int fw = (int)densityFilter->m_FilterWidth;\n"//Need a signed int to use below. + "\n" + " for (j = -fw; j <= fw; j++)\n" + " {\n" + " for (i = -fw; i <= fw; i++)\n" + " {\n" + " if (AccumCheck(densityFilter->m_SuperRasW, densityFilter->m_SuperRasH, threadHistCol, i, threadHistRow, j))\n" + " {\n" + " filterSelectInt = filterCoefIndex + coefIndices[(abs(j) * (densityFilter->m_FilterWidth + 1)) + abs(i)];\n"//Really is filterCoeffIndexPlusOffset, but reusing a variable to save space. + "\n" + " if (filterCoefs[filterSelectInt] != 0)\n" + " {\n" + " accumulator[(i + threadHistCol) + ((j + threadHistRow) * densityFilter->m_SuperRasW)].m_Real4 += (bucket * (filterCoefs[filterSelectInt] * cacheLog));\n" + " }\n" + " }\n" + "\n" + " barrier(CLK_GLOBAL_MEM_FENCE);\n"//Required to avoid streaks. + " }\n" + " }\n" + " }\n"//bucket.w != 0. + " }\n"//In bounds. + //"\n" + //" barrier(CLK_GLOBAL_MEM_FENCE);\n"//Just to be safe. + "}\n"; return os.str(); } } diff --git a/Source/EmberCL/OpenCLWrapper.cpp b/Source/EmberCL/OpenCLWrapper.cpp index e976f0a..c53cf12 100644 --- a/Source/EmberCL/OpenCLWrapper.cpp +++ b/Source/EmberCL/OpenCLWrapper.cpp @@ -940,12 +940,20 @@ size_t OpenCLWrapper::GlobalMemSize() const { return m_GlobalMemSize; } size_t OpenCLWrapper::MaxAllocSize() const { return m_MaxAllocSize; } /// -/// Makes the even grid dims. +/// Make even grid dimensions. +/// The size of the blocks in terms of threads must divide evenly into the total number of threads in the grid. +/// In the case of a remainder, expand the width and height of the grid to the next highest evenly divisible value. +/// Ex: +/// blockW = 5, blockH = 5 +/// gridW = 18, gridH = 27 +/// +/// To make these even: +/// gridW = 20, gridH = 30 /// -/// The block w. -/// The block h. -/// The grid w. -/// The grid h. +/// The width of each block in terms of threads. +/// The height of each block in terms of threads. +/// The width of the entire grid in terms of threads. +/// The width of the entire grid in terms of threads. void OpenCLWrapper::MakeEvenGridDims(size_t blockW, size_t blockH, size_t& gridW, size_t& gridH) { if (gridW % blockW != 0) diff --git a/Source/EmberCL/RendererCL.cpp b/Source/EmberCL/RendererCL.cpp index d3ff379..8fc059b 100644 --- a/Source/EmberCL/RendererCL.cpp +++ b/Source/EmberCL/RendererCL.cpp @@ -165,13 +165,14 @@ bool RendererCL::Init(const vector>& devices, b if (b) { - //This is the maximum box dimension for density filtering which consists of (blockSize * blockSize) + (2 * filterWidth). - //These blocks must be square, and ideally, 32x32. - //Sadly, at the moment, Fermi runs out of resources at that block size because the DE filter function is so complex. + //This is the maximum box dimension for density filtering which consists of (blockSize * blockSize) + (2 * filterWidth). + //These blocks should be square, and ideally, 32x32. + //Sadly, at the moment, the GPU runs out of resources at that block size because the DE filter function is so complex. //The next best block size seems to be 24x24. //AMD is further limited because of less local memory so these have to be 16 on AMD. - m_MaxDEBlockSizeW = m_Devices[0]->Nvidia() ? 24 : 16;//These *must* both be divisible by 8 or else pixels will go missing. - m_MaxDEBlockSizeH = m_Devices[0]->Nvidia() ? 24 : 16; + //Users have reported crashes on Nvidia cards even at size 24, so just to be safe, make them both 16 for all manufacturers. + m_MaxDEBlockSizeW = 16; + m_MaxDEBlockSizeH = 16; FillSeeds(); for (size_t device = 0; device < m_Devices.size(); device++) @@ -1191,22 +1192,18 @@ eRenderStatus RendererCL::RunDensityFilter() if (kernelIndex != -1) { - uint leftBound = m_DensityFilterCL.m_Supersample - 1; - uint rightBound = m_DensityFilterCL.m_SuperRasW - (m_DensityFilterCL.m_Supersample - 1); - uint topBound = leftBound; - uint botBound = m_DensityFilterCL.m_SuperRasH - (m_DensityFilterCL.m_Supersample - 1); + uint ssm1 = m_DensityFilterCL.m_Supersample - 1; + uint leftBound = ssm1; + uint rightBound = m_DensityFilterCL.m_SuperRasW - ssm1; + uint topBound = leftBound; + uint botBound = m_DensityFilterCL.m_SuperRasH - ssm1; size_t gridW = rightBound - leftBound; size_t gridH = botBound - topBound; - size_t blockSizeW = m_MaxDEBlockSizeW;//These *must* both be divisible by 16 or else pixels will go missing. + size_t blockSizeW = m_MaxDEBlockSizeW; size_t blockSizeH = m_MaxDEBlockSizeH; - auto& wrapper = m_Devices[0]->m_Wrapper; - - //OpenCL runs out of resources when using double or a supersample of 2. - //Remedy this by reducing the height of the block by 2. - if (m_DoublePrecision || m_DensityFilterCL.m_Supersample > 1) - blockSizeH -= 2; - - //Can't just blindly pass dimension in vals. Must adjust them first to evenly divide the block count + double fw2 = m_DensityFilterCL.m_FilterWidth * 2.0; + auto& wrapper = m_Devices[0]->m_Wrapper; + //Can't just blindly pass dimension in vals. Must adjust them first to evenly divide the thread count //into the total grid dimensions. OpenCLWrapper::MakeEvenGridDims(blockSizeW, blockSizeH, gridW, gridH); //t.Tic(); @@ -1215,11 +1212,11 @@ eRenderStatus RendererCL::RunDensityFilter() //The other is to proces the entire image in multiple passes, and each pass processes blocks of pixels //that are far enough apart such that their filters do not overlap. //Do the latter. - //Gap is in terms of blocks. How many blocks must separate two blocks running at the same time. - uint gapW = uint(ceil((m_DensityFilterCL.m_FilterWidth * 2.0) / double(blockSizeW))); - uint chunkSizeW = gapW + 1; - uint gapH = uint(ceil((m_DensityFilterCL.m_FilterWidth * 2.0) / double(blockSizeH))); - uint chunkSizeH = gapH + 1; + //Gap is in terms of blocks and specifies how many blocks must separate two blocks running at the same time. + uint gapW = uint(ceil(fw2 / blockSizeW)); + uint chunkSizeW = gapW + 1;//Chunk size is also in terms of blocks and is one block (the one running) plus the gap to the right of it. + uint gapH = uint(ceil(fw2 / blockSizeH)); + uint chunkSizeH = gapH + 1;//Chunk size is also in terms of blocks and is one block (the one running) plus the gap below it. double totalChunks = chunkSizeW * chunkSizeH; if (b && !(b = wrapper.AddAndWriteBuffer(m_DEFilterParamsBufferName, reinterpret_cast(&m_DensityFilterCL), sizeof(m_DensityFilterCL)))) { AddToReport(loc); } @@ -1257,22 +1254,22 @@ eRenderStatus RendererCL::RunDensityFilter() } #else - gridW /= chunkSizeW; + gridW /= chunkSizeW;//Grid must be scaled down by number of chunks. gridH /= chunkSizeH; OpenCLWrapper::MakeEvenGridDims(blockSizeW, blockSizeH, gridW, gridH); - for (uint rowChunk = 0; b && !m_Abort && rowChunk < chunkSizeH; rowChunk++) + for (uint rowChunkPass = 0; b && !m_Abort && rowChunkPass < chunkSizeH; rowChunkPass++)//Number of vertical passes. { - for (uint colChunk = 0; b && !m_Abort && colChunk < chunkSizeW; colChunk++) + for (uint colChunkPass = 0; b && !m_Abort && colChunkPass < chunkSizeW; colChunkPass++)//Number of horizontal passes. { //t2.Tic(); - if (b && !(b = RunDensityFilterPrivate(kernelIndex, gridW, gridH, blockSizeW, blockSizeH, chunkSizeW, chunkSizeH, colChunk, rowChunk))) { m_Abort = true; AddToReport(loc); } + if (b && !(b = RunDensityFilterPrivate(kernelIndex, gridW, gridH, blockSizeW, blockSizeH, chunkSizeW, chunkSizeH, colChunkPass, rowChunkPass))) { m_Abort = true; AddToReport(loc); } //t2.Toc(loc); if (b && m_Callback) { - double percent = (double((rowChunk * chunkSizeW) + (colChunk + 1)) / totalChunks) * 100.0; + double percent = (double((rowChunkPass * chunkSizeW) + (colChunkPass + 1)) / totalChunks) * 100.0; double etaMs = ((100.0 - percent) / percent) * t.Toc(); if (!m_Callback->ProgressFunc(m_Ember, m_ProgressParameter, percent, 1, etaMs)) @@ -1456,11 +1453,11 @@ bool RendererCL::ClearBuffer(size_t device, const string& bufferName /// Block height /// Chunk size width (gapW + 1) /// Chunk size height (gapH + 1) -/// Row parity -/// Column parity +/// The current horizontal pass index +/// The current vertical pass index /// True if success, else false. template -bool RendererCL::RunDensityFilterPrivate(size_t kernelIndex, size_t gridW, size_t gridH, size_t blockW, size_t blockH, uint chunkSizeW, uint chunkSizeH, uint chunkW, uint chunkH) +bool RendererCL::RunDensityFilterPrivate(size_t kernelIndex, size_t gridW, size_t gridH, size_t blockW, size_t blockH, uint chunkSizeW, uint chunkSizeH, uint colChunkPass, uint rowChunkPass) { //Timing t(4); bool b = true; @@ -1487,9 +1484,9 @@ bool RendererCL::RunDensityFilterPrivate(size_t kernelIndex, size_t if (b && !(b = wrapper.SetArg(kernelIndex, argIndex, chunkSizeH))) { AddToReport(loc); } argIndex++;//Chunk size height (gapH + 1). - if (b && !(b = wrapper.SetArg(kernelIndex, argIndex, chunkW))) { AddToReport(loc); } argIndex++;//Column chunk. + if (b && !(b = wrapper.SetArg(kernelIndex, argIndex, colChunkPass))) { AddToReport(loc); } argIndex++;//Column chunk, horizontal pass. - if (b && !(b = wrapper.SetArg(kernelIndex, argIndex, chunkH))) { AddToReport(loc); } argIndex++;//Row chunk. + if (b && !(b = wrapper.SetArg(kernelIndex, argIndex, rowChunkPass))) { AddToReport(loc); } argIndex++;//Row chunk, vertical pass. //t.Toc(__FUNCTION__ " set args"); diff --git a/Source/EmberCL/RendererCL.h b/Source/EmberCL/RendererCL.h index ce52292..4f5093b 100644 --- a/Source/EmberCL/RendererCL.h +++ b/Source/EmberCL/RendererCL.h @@ -178,7 +178,7 @@ private: eRenderStatus RunDensityFilter(); eRenderStatus RunFinalAccum(); bool ClearBuffer(size_t device, const string& bufferName, uint width, uint height, uint elementSize); - bool RunDensityFilterPrivate(size_t kernelIndex, size_t gridW, size_t gridH, size_t blockW, size_t blockH, uint chunkSizeW, uint chunkSizeH, uint chunkW, uint chunkH); + bool RunDensityFilterPrivate(size_t kernelIndex, size_t gridW, size_t gridH, size_t blockW, size_t blockH, uint chunkSizeW, uint chunkSizeH, uint colChunkPass, uint rowChunkPass); int MakeAndGetDensityFilterProgram(size_t ss, uint filterWidth); int MakeAndGetFinalAccumProgram(double& alphaBase, double& alphaScale); int MakeAndGetGammaCorrectionProgram();