mirror of
https://bitbucket.org/mfeemster/fractorium.git
synced 2025-02-12 15:58:31 -05:00
--User changes
-Remove some warnings about interpolation type on first and last flames.
--Code changes
-Make DE block size always be 16x16; this should help stability on some Nvidia cards. No changes for AMD cards since they were that size already.
-Since the block size is now so small, do not reduce it further when supersampling.
-Clean up some variable names and documentation around OpenCL DE to be more clear.
This commit is contained in:
parent
65be0143ff
commit
19cb27b83a
@ -437,16 +437,12 @@ public:
|
|||||||
{
|
{
|
||||||
if (i1 == 0)
|
if (i1 == 0)
|
||||||
{
|
{
|
||||||
//fprintf(stderr, "error: cannot use smooth interpolation on first segment.\n");
|
|
||||||
//fprintf(stderr, "reverting to linear interpolation.\n");
|
|
||||||
Align(&embers[i1], &localEmbers[0], 2);
|
Align(&embers[i1], &localEmbers[0], 2);
|
||||||
smoothFlag = false;
|
smoothFlag = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i2 == size - 1)
|
if (i2 == size - 1)
|
||||||
{
|
{
|
||||||
//fprintf(stderr, "error: cannot use smooth interpolation on last segment.\n");
|
|
||||||
//fprintf(stderr, "reverting to linear interpolation.\n");
|
|
||||||
Align(&embers[i1], &localEmbers[0], 2);
|
Align(&embers[i1], &localEmbers[0], 2);
|
||||||
smoothFlag = false;
|
smoothFlag = false;
|
||||||
}
|
}
|
||||||
|
@ -344,17 +344,11 @@ public:
|
|||||||
if (emberSize > 0)
|
if (emberSize > 0)
|
||||||
{
|
{
|
||||||
if (embers[0].m_Interp == eInterp::EMBER_INTERP_SMOOTH)
|
if (embers[0].m_Interp == eInterp::EMBER_INTERP_SMOOTH)
|
||||||
{
|
|
||||||
cout << "Warning: smooth interpolation cannot be used for first segment.\n switching to linear.\n";
|
|
||||||
embers[0].m_Interp = eInterp::EMBER_INTERP_LINEAR;
|
embers[0].m_Interp = eInterp::EMBER_INTERP_LINEAR;
|
||||||
}
|
|
||||||
|
|
||||||
if (emberSize >= 2 && embers[emberSize - 2].m_Interp == eInterp::EMBER_INTERP_SMOOTH)
|
if (emberSize >= 2 && embers[emberSize - 2].m_Interp == eInterp::EMBER_INTERP_SMOOTH)
|
||||||
{
|
|
||||||
cout << "Warning: smooth interpolation cannot be used for last segment.\n switching to linear.\n";
|
|
||||||
embers[emberSize - 2].m_Interp = eInterp::EMBER_INTERP_LINEAR;
|
embers[emberSize - 2].m_Interp = eInterp::EMBER_INTERP_LINEAR;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
//Finally, ensure that consecutive 'rotate' parameters never exceed
|
//Finally, ensure that consecutive 'rotate' parameters never exceed
|
||||||
//a difference of more than 180 degrees (+/-) for interpolation.
|
//a difference of more than 180 degrees (+/-) for interpolation.
|
||||||
|
@ -6,8 +6,6 @@ namespace EmberCLns
|
|||||||
/// <summary>
|
/// <summary>
|
||||||
/// Constructor that sets all kernel entry points as well as composes
|
/// Constructor that sets all kernel entry points as well as composes
|
||||||
/// all kernel source strings.
|
/// all kernel source strings.
|
||||||
/// Note that no versions of kernels that use the cache are compiled because
|
|
||||||
/// the cache is not big enough to hold double4.
|
|
||||||
/// No program compilation is done here, the user must explicitly do it.
|
/// No program compilation is done here, the user must explicitly do it.
|
||||||
/// The caller must specify whether they are using an nVidia or AMD card because it changes
|
/// The caller must specify whether they are using an nVidia or AMD card because it changes
|
||||||
/// the amount of local memory available.
|
/// the amount of local memory available.
|
||||||
@ -18,7 +16,6 @@ DEOpenCLKernelCreator::DEOpenCLKernelCreator(bool doublePrecision, bool nVidia)
|
|||||||
{
|
{
|
||||||
m_DoublePrecision = doublePrecision;
|
m_DoublePrecision = doublePrecision;
|
||||||
m_NVidia = nVidia;
|
m_NVidia = nVidia;
|
||||||
|
|
||||||
#ifdef ROW_ONLY_DE
|
#ifdef ROW_ONLY_DE
|
||||||
m_LogScaleAssignDEEntryPoint = "LogScaleAssignDensityFilterKernel";
|
m_LogScaleAssignDEEntryPoint = "LogScaleAssignDensityFilterKernel";
|
||||||
m_GaussianDEWithoutSsEntryPoint = "GaussianDEWithoutSsKernel";
|
m_GaussianDEWithoutSsEntryPoint = "GaussianDEWithoutSsKernel";
|
||||||
@ -68,30 +65,31 @@ const string& DEOpenCLKernelCreator::LogScaleAssignDEEntryPoint() const { return
|
|||||||
const string& DEOpenCLKernelCreator::GaussianDEKernel(size_t ss, uint filterWidth) const
|
const string& DEOpenCLKernelCreator::GaussianDEKernel(size_t ss, uint filterWidth) const
|
||||||
{
|
{
|
||||||
#ifndef ROW_ONLY_DE
|
#ifndef ROW_ONLY_DE
|
||||||
|
|
||||||
if (filterWidth > MaxDEFilterSize())
|
if (filterWidth > MaxDEFilterSize())
|
||||||
{
|
{
|
||||||
if (ss > 1)
|
if (ss > 1)
|
||||||
{
|
{
|
||||||
if (!(ss & 1))
|
if (!(ss & 1))
|
||||||
return m_GaussianDESsWithScfNoCacheKernel;
|
return m_GaussianDESsWithScfNoCacheKernel;//SS 2 or 4.
|
||||||
else
|
else
|
||||||
return m_GaussianDESsWithoutScfNoCacheKernel;
|
return m_GaussianDESsWithoutScfNoCacheKernel;//SS 3.
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
return m_GaussianDEWithoutSsNoCacheKernel;
|
return m_GaussianDEWithoutSsNoCacheKernel;//SS 1;
|
||||||
}
|
}
|
||||||
else
|
else//Use cache.
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
if (ss > 1)
|
if (ss > 1)
|
||||||
{
|
{
|
||||||
if (!(ss & 1))
|
if (!(ss & 1))
|
||||||
return m_GaussianDESsWithScfKernel;
|
return m_GaussianDESsWithScfKernel;//SS 2 or 4.
|
||||||
else
|
else
|
||||||
return m_GaussianDESsWithoutScfKernel;
|
return m_GaussianDESsWithoutScfKernel;//SS 3.
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
return m_GaussianDEWithoutSsKernel;
|
return m_GaussianDEWithoutSsKernel;//SS 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -104,6 +102,7 @@ const string& DEOpenCLKernelCreator::GaussianDEKernel(size_t ss, uint filterWidt
|
|||||||
const string& DEOpenCLKernelCreator::GaussianDEEntryPoint(size_t ss, uint filterWidth) const
|
const string& DEOpenCLKernelCreator::GaussianDEEntryPoint(size_t ss, uint filterWidth) const
|
||||||
{
|
{
|
||||||
#ifndef ROW_ONLY_DE
|
#ifndef ROW_ONLY_DE
|
||||||
|
|
||||||
if (filterWidth > MaxDEFilterSize())
|
if (filterWidth > MaxDEFilterSize())
|
||||||
{
|
{
|
||||||
if (ss > 1)
|
if (ss > 1)
|
||||||
@ -181,7 +180,6 @@ uint DEOpenCLKernelCreator::SolveMaxBoxSize(uint localMem)
|
|||||||
string DEOpenCLKernelCreator::CreateLogScaleAssignDEKernelString()
|
string DEOpenCLKernelCreator::CreateLogScaleAssignDEKernelString()
|
||||||
{
|
{
|
||||||
ostringstream os;
|
ostringstream os;
|
||||||
|
|
||||||
os <<
|
os <<
|
||||||
ConstantDefinesString(m_DoublePrecision) <<
|
ConstantDefinesString(m_DoublePrecision) <<
|
||||||
DensityFilterCLStructString <<
|
DensityFilterCLStructString <<
|
||||||
@ -205,7 +203,6 @@ string DEOpenCLKernelCreator::CreateLogScaleAssignDEKernelString()
|
|||||||
" barrier(CLK_GLOBAL_MEM_FENCE);\n"//Just to be safe. Makes no speed difference to do all of the time or only when there's a hit.
|
" barrier(CLK_GLOBAL_MEM_FENCE);\n"//Just to be safe. Makes no speed difference to do all of the time or only when there's a hit.
|
||||||
" }\n"
|
" }\n"
|
||||||
"}\n";
|
"}\n";
|
||||||
|
|
||||||
return os.str();
|
return os.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -215,7 +212,6 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
|
|||||||
bool doSS = ss > 1;
|
bool doSS = ss > 1;
|
||||||
bool doScf = !(ss & 1);
|
bool doScf = !(ss & 1);
|
||||||
ostringstream os;
|
ostringstream os;
|
||||||
|
|
||||||
os <<
|
os <<
|
||||||
ConstantDefinesString(m_DoublePrecision) <<
|
ConstantDefinesString(m_DoublePrecision) <<
|
||||||
DensityFilterCLStructString <<
|
DensityFilterCLStructString <<
|
||||||
@ -229,14 +225,14 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
|
|||||||
" const __global uint* coefIndices,\n"
|
" const __global uint* coefIndices,\n"
|
||||||
" const uint chunkSizeW,\n"
|
" const uint chunkSizeW,\n"
|
||||||
" const uint chunkSizeH,\n"
|
" const uint chunkSizeH,\n"
|
||||||
" const uint chunkW,\n"
|
" const uint colChunkPass,\n"
|
||||||
" const uint chunkH\n"
|
" const uint rowChunkPass\n"
|
||||||
"\t)\n"
|
"\t)\n"
|
||||||
"{\n"
|
"{\n"
|
||||||
" uint rowsToProcess = 32;\n"//Rows to process.
|
" uint rowsToProcess = 32;\n"//Rows to process.
|
||||||
"\n"
|
"\n"
|
||||||
" if (((((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) ||\n"
|
" if (((((BLOCK_ID_X * chunkSizeW) + colChunkPass) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) ||\n"
|
||||||
" ((((BLOCK_ID_Y * chunkSizeH) + chunkH) * rowsToProcess) + THREAD_ID_Y >= densityFilter->m_SuperRasH))\n"
|
" ((((BLOCK_ID_Y * chunkSizeH) + rowChunkPass) * rowsToProcess) + THREAD_ID_Y >= densityFilter->m_SuperRasH))\n"
|
||||||
" return;\n"
|
" return;\n"
|
||||||
"\n";
|
"\n";
|
||||||
|
|
||||||
@ -273,9 +269,9 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
|
|||||||
//Start and end values are the indices in the histogram read from
|
//Start and end values are the indices in the histogram read from
|
||||||
//and written to in the accumulator. They are not the indices for the local block of data.
|
//and written to in the accumulator. They are not the indices for the local block of data.
|
||||||
//Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds.
|
//Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds.
|
||||||
" blockHistStartRow = min(botBound, topBound + (((BLOCK_ID_Y * chunkSizeH) + chunkH) * rowsToProcess));\n"//The first histogram row this block will process.
|
" blockHistStartRow = min(botBound, topBound + (((BLOCK_ID_Y * chunkSizeH) + rowChunkPass) * rowsToProcess));\n"//The first histogram row this block will process.
|
||||||
" blockHistEndRow = min(botBound, blockHistStartRow + rowsToProcess);\n"//The last histogram row this block will process, clamped to the last row.
|
" blockHistEndRow = min(botBound, blockHistStartRow + rowsToProcess);\n"//The last histogram row this block will process, clamped to the last row.
|
||||||
" blockHistStartCol = min(rightBound, leftBound + (((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X));\n"//The first histogram column this block will process.
|
" blockHistStartCol = min(rightBound, leftBound + (((BLOCK_ID_X * chunkSizeW) + colChunkPass) * BLOCK_SIZE_X));\n"//The first histogram column this block will process.
|
||||||
" boxReadStartCol = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first box col this block will read from when copying to the accumulator.
|
" boxReadStartCol = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first box col this block will read from when copying to the accumulator.
|
||||||
" boxReadEndCol = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + BLOCK_SIZE_X, densityFilter->m_SuperRasW - blockHistStartCol);\n"//The last box col this block will read from when copying to the accumulator.
|
" boxReadEndCol = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + BLOCK_SIZE_X, densityFilter->m_SuperRasW - blockHistStartCol);\n"//The last box col this block will read from when copying to the accumulator.
|
||||||
"\n"
|
"\n"
|
||||||
@ -299,9 +295,7 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
|
|||||||
" real_bucket_t filterSelect;\n"
|
" real_bucket_t filterSelect;\n"
|
||||||
" real4_bucket bucket;\n"
|
" real4_bucket bucket;\n"
|
||||||
;
|
;
|
||||||
|
|
||||||
os << " __local real4reals_bucket filterBox[192];\n";//Must be >= fullTempBoxWidth.
|
os << " __local real4reals_bucket filterBox[192];\n";//Must be >= fullTempBoxWidth.
|
||||||
|
|
||||||
os <<
|
os <<
|
||||||
"\n"
|
"\n"
|
||||||
" colsToZeroOffset = colsToZero * THREAD_ID_X;\n"
|
" colsToZeroOffset = colsToZero * THREAD_ID_X;\n"
|
||||||
@ -412,7 +406,6 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
|
|||||||
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
|
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
|
||||||
" }\n"//for() histogram rows.
|
" }\n"//for() histogram rows.
|
||||||
"}\n";
|
"}\n";
|
||||||
|
|
||||||
return os.str();
|
return os.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -443,7 +436,6 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
|
|||||||
bool doSS = ss > 1;
|
bool doSS = ss > 1;
|
||||||
bool doScf = !(ss & 1);
|
bool doScf = !(ss & 1);
|
||||||
ostringstream os;
|
ostringstream os;
|
||||||
|
|
||||||
os <<
|
os <<
|
||||||
ConstantDefinesString(m_DoublePrecision) <<
|
ConstantDefinesString(m_DoublePrecision) <<
|
||||||
DensityFilterCLStructString <<
|
DensityFilterCLStructString <<
|
||||||
@ -457,12 +449,12 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
|
|||||||
" const __global uint* coefIndices,\n"
|
" const __global uint* coefIndices,\n"
|
||||||
" const uint chunkSizeW,\n"
|
" const uint chunkSizeW,\n"
|
||||||
" const uint chunkSizeH,\n"
|
" const uint chunkSizeH,\n"
|
||||||
" const uint chunkW,\n"
|
" const uint colChunkPass,\n"
|
||||||
" const uint chunkH\n"
|
" const uint rowChunkPass\n"
|
||||||
"\t)\n"
|
"\t)\n"
|
||||||
"{\n"
|
"{\n"
|
||||||
" if (((((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) ||\n"
|
" if (((((BLOCK_ID_X * chunkSizeW) + colChunkPass) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) ||\n"
|
||||||
" ((((BLOCK_ID_Y * chunkSizeH) + chunkH) * BLOCK_SIZE_Y) + THREAD_ID_Y >= densityFilter->m_SuperRasH))\n"
|
" ((((BLOCK_ID_Y * chunkSizeH) + rowChunkPass) * BLOCK_SIZE_Y) + THREAD_ID_Y >= densityFilter->m_SuperRasH))\n"
|
||||||
" return;\n"
|
" return;\n"
|
||||||
"\n";
|
"\n";
|
||||||
|
|
||||||
@ -489,7 +481,6 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
|
|||||||
" uint blockHistStartRow, blockHistEndRow, boxReadStartRow, boxReadEndRow;\n"
|
" uint blockHistStartRow, blockHistEndRow, boxReadStartRow, boxReadEndRow;\n"
|
||||||
" uint blockHistStartCol, boxReadStartCol, boxReadEndCol;\n"
|
" uint blockHistStartCol, boxReadStartCol, boxReadEndCol;\n"
|
||||||
" uint accumWriteStartRow, accumWriteStartCol, colsToWrite;\n"
|
" uint accumWriteStartRow, accumWriteStartCol, colsToWrite;\n"
|
||||||
|
|
||||||
//If any of the variables above end up being made __local, init them here.
|
//If any of the variables above end up being made __local, init them here.
|
||||||
//At the moment, it's slower even though it's more memory efficient.
|
//At the moment, it's slower even though it's more memory efficient.
|
||||||
//" if (THREAD_ID_X == 0 && THREAD_ID_Y == 0)\n"
|
//" if (THREAD_ID_X == 0 && THREAD_ID_Y == 0)\n"
|
||||||
@ -510,11 +501,11 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
|
|||||||
//Start and end values are the indices in the histogram read from
|
//Start and end values are the indices in the histogram read from
|
||||||
//and written to in the accumulator. They are not the indices for the local block of data.
|
//and written to in the accumulator. They are not the indices for the local block of data.
|
||||||
//Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds.
|
//Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds.
|
||||||
" blockHistStartRow = min(botBound, (uint)(topBound + (((BLOCK_ID_Y * chunkSizeH) + chunkH) * BLOCK_SIZE_Y)));\n"//The first histogram row this block will process.
|
" blockHistStartRow = min(botBound, (uint)(topBound + (((BLOCK_ID_Y * chunkSizeH) + rowChunkPass) * BLOCK_SIZE_Y)));\n"//The first histogram row this block will process.
|
||||||
" blockHistEndRow = min(botBound, (uint)(blockHistStartRow + BLOCK_SIZE_Y));\n"//The last histogram row this block will process, clamped to the last row.
|
" blockHistEndRow = min(botBound, (uint)(blockHistStartRow + BLOCK_SIZE_Y));\n"//The last histogram row this block will process, clamped to the last row.
|
||||||
" boxReadStartRow = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartRow);\n"//The first row in the local box to read from when writing back to the final accumulator for this block.
|
" boxReadStartRow = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartRow);\n"//The first row in the local box to read from when writing back to the final accumulator for this block.
|
||||||
" boxReadEndRow = densityFilter->m_FilterWidth + min((uint)(densityFilter->m_FilterWidth + BLOCK_SIZE_Y), densityFilter->m_SuperRasH - blockHistStartRow);\n"//The last row in the local box to read from when writing back to the final accumulator for this block.
|
" boxReadEndRow = densityFilter->m_FilterWidth + min((uint)(densityFilter->m_FilterWidth + BLOCK_SIZE_Y), densityFilter->m_SuperRasH - blockHistStartRow);\n"//The last row in the local box to read from when writing back to the final accumulator for this block.
|
||||||
" blockHistStartCol = min(rightBound, leftBound + (uint)(((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X));\n"//The first histogram column this block will process.
|
" blockHistStartCol = min(rightBound, leftBound + (uint)(((BLOCK_ID_X * chunkSizeW) + colChunkPass) * BLOCK_SIZE_X));\n"//The first histogram column this block will process.
|
||||||
" boxReadStartCol = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first box col this block will read from when copying to the accumulator.
|
" boxReadStartCol = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first box col this block will read from when copying to the accumulator.
|
||||||
" boxReadEndCol = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + (uint)BLOCK_SIZE_X, densityFilter->m_SuperRasW - blockHistStartCol);\n"//The last box col this block will read from when copying to the accumulator.
|
" boxReadEndCol = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + (uint)BLOCK_SIZE_X, densityFilter->m_SuperRasW - blockHistStartCol);\n"//The last box col this block will read from when copying to the accumulator.
|
||||||
"\n"
|
"\n"
|
||||||
@ -526,7 +517,6 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
|
|||||||
" uint threadHistRow = blockHistStartRow + THREAD_ID_Y;\n"//The histogram row this individual thread will be reading from.
|
" uint threadHistRow = blockHistStartRow + THREAD_ID_Y;\n"//The histogram row this individual thread will be reading from.
|
||||||
" uint threadHistCol = blockHistStartCol + THREAD_ID_X;\n"//The histogram column this individual thread will be reading from.
|
" uint threadHistCol = blockHistStartCol + THREAD_ID_X;\n"//The histogram column this individual thread will be reading from.
|
||||||
"\n"
|
"\n"
|
||||||
|
|
||||||
//Compute the center position in this local box to serve as the center position
|
//Compute the center position in this local box to serve as the center position
|
||||||
//from which filter application offsets are computed.
|
//from which filter application offsets are computed.
|
||||||
//These are the local indices for the local data that are temporarily accumulated to before
|
//These are the local indices for the local data that are temporarily accumulated to before
|
||||||
@ -540,13 +530,8 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
|
|||||||
" real_bucket_t filterSelect;\n"
|
" real_bucket_t filterSelect;\n"
|
||||||
" real4_bucket bucket;\n"
|
" real4_bucket bucket;\n"
|
||||||
;
|
;
|
||||||
|
|
||||||
//This will be treated as having dimensions of (BLOCK_SIZE_X + (fw * 2)) x (BLOCK_SIZE_Y + (fw * 2)).
|
//This will be treated as having dimensions of (BLOCK_SIZE_X + (fw * 2)) x (BLOCK_SIZE_Y + (fw * 2)).
|
||||||
if (m_NVidia)
|
os << " __local real4reals_bucket filterBox[1200];\n";//Really only need 1156
|
||||||
os << " __local real4reals_bucket filterBox[3000];\n";
|
|
||||||
else
|
|
||||||
os << " __local real4reals_bucket filterBox[1200];\n";
|
|
||||||
|
|
||||||
os <<
|
os <<
|
||||||
//Zero the temp buffers first. This splits the zeroization evenly across all threads (columns) in the first block row.
|
//Zero the temp buffers first. This splits the zeroization evenly across all threads (columns) in the first block row.
|
||||||
//This is a middle ground solution. Previous methods tried:
|
//This is a middle ground solution. Previous methods tried:
|
||||||
@ -673,7 +658,6 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
|
|||||||
" }\n"
|
" }\n"
|
||||||
" }\n"
|
" }\n"
|
||||||
"}\n";
|
"}\n";
|
||||||
|
|
||||||
return os.str();
|
return os.str();
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
@ -701,7 +685,6 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernelNoLocalCache(size_t ss)
|
|||||||
bool doSS = ss > 1;
|
bool doSS = ss > 1;
|
||||||
bool doScf = !(ss & 1);
|
bool doScf = !(ss & 1);
|
||||||
ostringstream os;
|
ostringstream os;
|
||||||
|
|
||||||
os <<
|
os <<
|
||||||
ConstantDefinesString(m_DoublePrecision) <<
|
ConstantDefinesString(m_DoublePrecision) <<
|
||||||
DensityFilterCLStructString <<
|
DensityFilterCLStructString <<
|
||||||
@ -716,12 +699,12 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernelNoLocalCache(size_t ss)
|
|||||||
" const __global uint* coefIndices,\n"
|
" const __global uint* coefIndices,\n"
|
||||||
" const uint chunkSizeW,\n"
|
" const uint chunkSizeW,\n"
|
||||||
" const uint chunkSizeH,\n"
|
" const uint chunkSizeH,\n"
|
||||||
" const uint chunkW,\n"
|
" const uint colChunkPass,\n"
|
||||||
" const uint chunkH\n"
|
" const uint rowChunkPass\n"
|
||||||
"\t)\n"
|
"\t)\n"
|
||||||
"{\n"
|
"{\n"
|
||||||
" if (((((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) ||\n"
|
" if (((((BLOCK_ID_X * chunkSizeW) + colChunkPass) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) ||\n"
|
||||||
" ((((BLOCK_ID_Y * chunkSizeH) + chunkH) * BLOCK_SIZE_Y) + THREAD_ID_Y >= densityFilter->m_SuperRasH))\n"
|
" ((((BLOCK_ID_Y * chunkSizeH) + rowChunkPass) * BLOCK_SIZE_Y) + THREAD_ID_Y >= densityFilter->m_SuperRasH))\n"
|
||||||
" return;\n"
|
" return;\n"
|
||||||
"\n";
|
"\n";
|
||||||
|
|
||||||
@ -747,10 +730,10 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernelNoLocalCache(size_t ss)
|
|||||||
"\n"
|
"\n"
|
||||||
//Start and end values are the indices in the histogram read from and written to in the accumulator.
|
//Start and end values are the indices in the histogram read from and written to in the accumulator.
|
||||||
//Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds.
|
//Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds.
|
||||||
" uint blockHistStartRow = min(botBound, (uint)(topBound + (((BLOCK_ID_Y * chunkSizeH) + chunkH) * BLOCK_SIZE_Y)));\n"//The first histogram row this block will process.
|
" uint blockHistStartRow = min(botBound, (uint)(topBound + (((BLOCK_ID_Y * chunkSizeH) + rowChunkPass) * BLOCK_SIZE_Y)));\n"//The first histogram row this block will process.
|
||||||
" uint threadHistRow = blockHistStartRow + THREAD_ID_Y;\n"//The histogram row this individual thread will be reading from.
|
" uint threadHistRow = blockHistStartRow + THREAD_ID_Y;\n"//The histogram row this individual thread will be reading from.
|
||||||
"\n"
|
"\n"
|
||||||
" uint blockHistStartCol = min(rightBound, leftBound + (uint)(((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X));\n"//The first histogram column this block will process.
|
" uint blockHistStartCol = min(rightBound, leftBound + (uint)(((BLOCK_ID_X * chunkSizeW) + colChunkPass) * BLOCK_SIZE_X));\n"//The first histogram column this block will process.
|
||||||
" uint threadHistCol = blockHistStartCol + THREAD_ID_X;\n"//The histogram column this individual thread will be reading from.
|
" uint threadHistCol = blockHistStartCol + THREAD_ID_X;\n"//The histogram column this individual thread will be reading from.
|
||||||
"\n"
|
"\n"
|
||||||
" int i, j;\n"
|
" int i, j;\n"
|
||||||
@ -833,7 +816,6 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernelNoLocalCache(size_t ss)
|
|||||||
//"\n"
|
//"\n"
|
||||||
//" barrier(CLK_GLOBAL_MEM_FENCE);\n"//Just to be safe.
|
//" barrier(CLK_GLOBAL_MEM_FENCE);\n"//Just to be safe.
|
||||||
"}\n";
|
"}\n";
|
||||||
|
|
||||||
return os.str();
|
return os.str();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -940,12 +940,20 @@ size_t OpenCLWrapper::GlobalMemSize() const { return m_GlobalMemSize; }
|
|||||||
size_t OpenCLWrapper::MaxAllocSize() const { return m_MaxAllocSize; }
|
size_t OpenCLWrapper::MaxAllocSize() const { return m_MaxAllocSize; }
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// Makes the even grid dims.
|
/// Make even grid dimensions.
|
||||||
|
/// The size of the blocks in terms of threads must divide evenly into the total number of threads in the grid.
|
||||||
|
/// In the case of a remainder, expand the width and height of the grid to the next highest evenly divisible value.
|
||||||
|
/// Ex:
|
||||||
|
/// blockW = 5, blockH = 5
|
||||||
|
/// gridW = 18, gridH = 27
|
||||||
|
///
|
||||||
|
/// To make these even:
|
||||||
|
/// gridW = 20, gridH = 30
|
||||||
/// </summary>
|
/// </summary>
|
||||||
/// <param name="blockW">The block w.</param>
|
/// <param name="blockW">The width of each block in terms of threads.</param>
|
||||||
/// <param name="blockH">The block h.</param>
|
/// <param name="blockH">The height of each block in terms of threads.</param>
|
||||||
/// <param name="gridW">The grid w.</param>
|
/// <param name="gridW">The width of the entire grid in terms of threads.</param>
|
||||||
/// <param name="gridH">The grid h.</param>
|
/// <param name="gridH">The height of the entire grid in terms of threads.</param>
|
||||||
void OpenCLWrapper::MakeEvenGridDims(size_t blockW, size_t blockH, size_t& gridW, size_t& gridH)
|
void OpenCLWrapper::MakeEvenGridDims(size_t blockW, size_t blockH, size_t& gridW, size_t& gridH)
|
||||||
{
|
{
|
||||||
if (gridW % blockW != 0)
|
if (gridW % blockW != 0)
|
||||||
|
@ -166,12 +166,13 @@ bool RendererCL<T, bucketT>::Init(const vector<pair<size_t, size_t>>& devices, b
|
|||||||
if (b)
|
if (b)
|
||||||
{
|
{
|
||||||
//This is the maximum box dimension for density filtering which consists of (blockSize * blockSize) + (2 * filterWidth).
|
//This is the maximum box dimension for density filtering which consists of (blockSize * blockSize) + (2 * filterWidth).
|
||||||
//These blocks must be square, and ideally, 32x32.
|
//These blocks should be square, and ideally, 32x32.
|
||||||
//Sadly, at the moment, Fermi runs out of resources at that block size because the DE filter function is so complex.
|
//Sadly, at the moment, the GPU runs out of resources at that block size because the DE filter function is so complex.
|
||||||
//The next best block size seems to be 24x24.
|
//The next best block size seems to be 24x24.
|
||||||
//AMD is further limited because of less local memory so these have to be 16 on AMD.
|
//AMD is further limited because of less local memory so these have to be 16 on AMD.
|
||||||
m_MaxDEBlockSizeW = m_Devices[0]->Nvidia() ? 24 : 16;//These *must* both be divisible by 8 or else pixels will go missing.
|
//Users have reported crashes on Nvidia cards even at size 24, so just to be safe, make them both 16 for all manufacturers.
|
||||||
m_MaxDEBlockSizeH = m_Devices[0]->Nvidia() ? 24 : 16;
|
m_MaxDEBlockSizeW = 16;
|
||||||
|
m_MaxDEBlockSizeH = 16;
|
||||||
FillSeeds();
|
FillSeeds();
|
||||||
|
|
||||||
for (size_t device = 0; device < m_Devices.size(); device++)
|
for (size_t device = 0; device < m_Devices.size(); device++)
|
||||||
@ -1191,22 +1192,18 @@ eRenderStatus RendererCL<T, bucketT>::RunDensityFilter()
|
|||||||
|
|
||||||
if (kernelIndex != -1)
|
if (kernelIndex != -1)
|
||||||
{
|
{
|
||||||
uint leftBound = m_DensityFilterCL.m_Supersample - 1;
|
uint ssm1 = m_DensityFilterCL.m_Supersample - 1;
|
||||||
uint rightBound = m_DensityFilterCL.m_SuperRasW - (m_DensityFilterCL.m_Supersample - 1);
|
uint leftBound = ssm1;
|
||||||
|
uint rightBound = m_DensityFilterCL.m_SuperRasW - ssm1;
|
||||||
uint topBound = leftBound;
|
uint topBound = leftBound;
|
||||||
uint botBound = m_DensityFilterCL.m_SuperRasH - (m_DensityFilterCL.m_Supersample - 1);
|
uint botBound = m_DensityFilterCL.m_SuperRasH - ssm1;
|
||||||
size_t gridW = rightBound - leftBound;
|
size_t gridW = rightBound - leftBound;
|
||||||
size_t gridH = botBound - topBound;
|
size_t gridH = botBound - topBound;
|
||||||
size_t blockSizeW = m_MaxDEBlockSizeW;//These *must* both be divisible by 16 or else pixels will go missing.
|
size_t blockSizeW = m_MaxDEBlockSizeW;
|
||||||
size_t blockSizeH = m_MaxDEBlockSizeH;
|
size_t blockSizeH = m_MaxDEBlockSizeH;
|
||||||
|
double fw2 = m_DensityFilterCL.m_FilterWidth * 2.0;
|
||||||
auto& wrapper = m_Devices[0]->m_Wrapper;
|
auto& wrapper = m_Devices[0]->m_Wrapper;
|
||||||
|
//Can't just blindly pass dimension in vals. Must adjust them first to evenly divide the thread count
|
||||||
//OpenCL runs out of resources when using double or a supersample of 2.
|
|
||||||
//Remedy this by reducing the height of the block by 2.
|
|
||||||
if (m_DoublePrecision || m_DensityFilterCL.m_Supersample > 1)
|
|
||||||
blockSizeH -= 2;
|
|
||||||
|
|
||||||
//Can't just blindly pass dimension in vals. Must adjust them first to evenly divide the block count
|
|
||||||
//into the total grid dimensions.
|
//into the total grid dimensions.
|
||||||
OpenCLWrapper::MakeEvenGridDims(blockSizeW, blockSizeH, gridW, gridH);
|
OpenCLWrapper::MakeEvenGridDims(blockSizeW, blockSizeH, gridW, gridH);
|
||||||
//t.Tic();
|
//t.Tic();
|
||||||
@ -1215,11 +1212,11 @@ eRenderStatus RendererCL<T, bucketT>::RunDensityFilter()
|
|||||||
//The other is to process the entire image in multiple passes, and each pass processes blocks of pixels
|
//The other is to process the entire image in multiple passes, and each pass processes blocks of pixels
|
||||||
//that are far enough apart such that their filters do not overlap.
|
//that are far enough apart such that their filters do not overlap.
|
||||||
//Do the latter.
|
//Do the latter.
|
||||||
//Gap is in terms of blocks. How many blocks must separate two blocks running at the same time.
|
//Gap is in terms of blocks and specifies how many blocks must separate two blocks running at the same time.
|
||||||
uint gapW = uint(ceil((m_DensityFilterCL.m_FilterWidth * 2.0) / double(blockSizeW)));
|
uint gapW = uint(ceil(fw2 / blockSizeW));
|
||||||
uint chunkSizeW = gapW + 1;
|
uint chunkSizeW = gapW + 1;//Chunk size is also in terms of blocks and is one block (the one running) plus the gap to the right of it.
|
||||||
uint gapH = uint(ceil((m_DensityFilterCL.m_FilterWidth * 2.0) / double(blockSizeH)));
|
uint gapH = uint(ceil(fw2 / blockSizeH));
|
||||||
uint chunkSizeH = gapH + 1;
|
uint chunkSizeH = gapH + 1;//Chunk size is also in terms of blocks and is one block (the one running) plus the gap below it.
|
||||||
double totalChunks = chunkSizeW * chunkSizeH;
|
double totalChunks = chunkSizeW * chunkSizeH;
|
||||||
|
|
||||||
if (b && !(b = wrapper.AddAndWriteBuffer(m_DEFilterParamsBufferName, reinterpret_cast<void*>(&m_DensityFilterCL), sizeof(m_DensityFilterCL)))) { AddToReport(loc); }
|
if (b && !(b = wrapper.AddAndWriteBuffer(m_DEFilterParamsBufferName, reinterpret_cast<void*>(&m_DensityFilterCL), sizeof(m_DensityFilterCL)))) { AddToReport(loc); }
|
||||||
@ -1257,22 +1254,22 @@ eRenderStatus RendererCL<T, bucketT>::RunDensityFilter()
|
|||||||
}
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
gridW /= chunkSizeW;
|
gridW /= chunkSizeW;//Grid must be scaled down by number of chunks.
|
||||||
gridH /= chunkSizeH;
|
gridH /= chunkSizeH;
|
||||||
OpenCLWrapper::MakeEvenGridDims(blockSizeW, blockSizeH, gridW, gridH);
|
OpenCLWrapper::MakeEvenGridDims(blockSizeW, blockSizeH, gridW, gridH);
|
||||||
|
|
||||||
for (uint rowChunk = 0; b && !m_Abort && rowChunk < chunkSizeH; rowChunk++)
|
for (uint rowChunkPass = 0; b && !m_Abort && rowChunkPass < chunkSizeH; rowChunkPass++)//Number of vertical passes.
|
||||||
{
|
{
|
||||||
for (uint colChunk = 0; b && !m_Abort && colChunk < chunkSizeW; colChunk++)
|
for (uint colChunkPass = 0; b && !m_Abort && colChunkPass < chunkSizeW; colChunkPass++)//Number of horizontal passes.
|
||||||
{
|
{
|
||||||
//t2.Tic();
|
//t2.Tic();
|
||||||
if (b && !(b = RunDensityFilterPrivate(kernelIndex, gridW, gridH, blockSizeW, blockSizeH, chunkSizeW, chunkSizeH, colChunk, rowChunk))) { m_Abort = true; AddToReport(loc); }
|
if (b && !(b = RunDensityFilterPrivate(kernelIndex, gridW, gridH, blockSizeW, blockSizeH, chunkSizeW, chunkSizeH, colChunkPass, rowChunkPass))) { m_Abort = true; AddToReport(loc); }
|
||||||
|
|
||||||
//t2.Toc(loc);
|
//t2.Toc(loc);
|
||||||
|
|
||||||
if (b && m_Callback)
|
if (b && m_Callback)
|
||||||
{
|
{
|
||||||
double percent = (double((rowChunk * chunkSizeW) + (colChunk + 1)) / totalChunks) * 100.0;
|
double percent = (double((rowChunkPass * chunkSizeW) + (colChunkPass + 1)) / totalChunks) * 100.0;
|
||||||
double etaMs = ((100.0 - percent) / percent) * t.Toc();
|
double etaMs = ((100.0 - percent) / percent) * t.Toc();
|
||||||
|
|
||||||
if (!m_Callback->ProgressFunc(m_Ember, m_ProgressParameter, percent, 1, etaMs))
|
if (!m_Callback->ProgressFunc(m_Ember, m_ProgressParameter, percent, 1, etaMs))
|
||||||
@ -1456,11 +1453,11 @@ bool RendererCL<T, bucketT>::ClearBuffer(size_t device, const string& bufferName
|
|||||||
/// <param name="blockH">Block height</param>
|
/// <param name="blockH">Block height</param>
|
||||||
/// <param name="chunkSizeW">Chunk size width (gapW + 1)</param>
|
/// <param name="chunkSizeW">Chunk size width (gapW + 1)</param>
|
||||||
/// <param name="chunkSizeH">Chunk size height (gapH + 1)</param>
|
/// <param name="chunkSizeH">Chunk size height (gapH + 1)</param>
|
||||||
/// <param name="rowParity">Row parity</param>
|
/// <param name="colChunkPass">The current horizontal pass index</param>
|
||||||
/// <param name="colParity">Column parity</param>
|
/// <param name="rowChunkPass">The current vertical pass index</param>
|
||||||
/// <returns>True if success, else false.</returns>
|
/// <returns>True if success, else false.</returns>
|
||||||
template <typename T, typename bucketT>
|
template <typename T, typename bucketT>
|
||||||
bool RendererCL<T, bucketT>::RunDensityFilterPrivate(size_t kernelIndex, size_t gridW, size_t gridH, size_t blockW, size_t blockH, uint chunkSizeW, uint chunkSizeH, uint chunkW, uint chunkH)
|
bool RendererCL<T, bucketT>::RunDensityFilterPrivate(size_t kernelIndex, size_t gridW, size_t gridH, size_t blockW, size_t blockH, uint chunkSizeW, uint chunkSizeH, uint colChunkPass, uint rowChunkPass)
|
||||||
{
|
{
|
||||||
//Timing t(4);
|
//Timing t(4);
|
||||||
bool b = true;
|
bool b = true;
|
||||||
@ -1487,9 +1484,9 @@ bool RendererCL<T, bucketT>::RunDensityFilterPrivate(size_t kernelIndex, size_t
|
|||||||
|
|
||||||
if (b && !(b = wrapper.SetArg(kernelIndex, argIndex, chunkSizeH))) { AddToReport(loc); } argIndex++;//Chunk size height (gapH + 1).
|
if (b && !(b = wrapper.SetArg(kernelIndex, argIndex, chunkSizeH))) { AddToReport(loc); } argIndex++;//Chunk size height (gapH + 1).
|
||||||
|
|
||||||
if (b && !(b = wrapper.SetArg(kernelIndex, argIndex, chunkW))) { AddToReport(loc); } argIndex++;//Column chunk.
|
if (b && !(b = wrapper.SetArg(kernelIndex, argIndex, colChunkPass))) { AddToReport(loc); } argIndex++;//Column chunk, horizontal pass.
|
||||||
|
|
||||||
if (b && !(b = wrapper.SetArg(kernelIndex, argIndex, chunkH))) { AddToReport(loc); } argIndex++;//Row chunk.
|
if (b && !(b = wrapper.SetArg(kernelIndex, argIndex, rowChunkPass))) { AddToReport(loc); } argIndex++;//Row chunk, vertical pass.
|
||||||
|
|
||||||
//t.Toc(__FUNCTION__ " set args");
|
//t.Toc(__FUNCTION__ " set args");
|
||||||
|
|
||||||
|
@ -178,7 +178,7 @@ private:
|
|||||||
eRenderStatus RunDensityFilter();
|
eRenderStatus RunDensityFilter();
|
||||||
eRenderStatus RunFinalAccum();
|
eRenderStatus RunFinalAccum();
|
||||||
bool ClearBuffer(size_t device, const string& bufferName, uint width, uint height, uint elementSize);
|
bool ClearBuffer(size_t device, const string& bufferName, uint width, uint height, uint elementSize);
|
||||||
bool RunDensityFilterPrivate(size_t kernelIndex, size_t gridW, size_t gridH, size_t blockW, size_t blockH, uint chunkSizeW, uint chunkSizeH, uint chunkW, uint chunkH);
|
bool RunDensityFilterPrivate(size_t kernelIndex, size_t gridW, size_t gridH, size_t blockW, size_t blockH, uint chunkSizeW, uint chunkSizeH, uint colChunkPass, uint rowChunkPass);
|
||||||
int MakeAndGetDensityFilterProgram(size_t ss, uint filterWidth);
|
int MakeAndGetDensityFilterProgram(size_t ss, uint filterWidth);
|
||||||
int MakeAndGetFinalAccumProgram(double& alphaBase, double& alphaScale);
|
int MakeAndGetFinalAccumProgram(double& alphaBase, double& alphaScale);
|
||||||
int MakeAndGetGammaCorrectionProgram();
|
int MakeAndGetGammaCorrectionProgram();
|
||||||
|
Loading…
Reference in New Issue
Block a user