mirror of
https://bitbucket.org/mfeemster/fractorium.git
synced 2025-07-01 13:56:06 -04:00
--Bug fixes
-Fix improper usage of rand() in cpow2, cpow3, hypertile1, hypertile3D1, hypertile3D2, juliac, juliaq. -Fix program crashing during density filtering on some Nvidia cards. -hypertile3D1 was wrong. -Parsing phoenix_julia when coming from Apophysis was wrong. -Density filtering was freezing on certain Nvidia cards. --Code changes -Optimize juliac, npolar. -Add a new function Crand() which behaves like the legacy C rand() which returns an integer between 0 and 32766, inclusive. -Use RandBit() in some places. -Remove Zeps() from vignette, it's not needed. -Restructure OpenCL code for density filtering such that it does not hang after being compiled on some Nvidia cards, such as the gtx 1660. Remove barriers from conditionals where possible.
This commit is contained in:
@ -3,6 +3,16 @@
|
||||
|
||||
namespace EmberCLns
|
||||
{
|
||||
|
||||
/// <summary>
|
||||
/// Empty constructor.
|
||||
/// This is needed because this class will need to be empty before it's initialized in RendererCL
|
||||
/// with specific arguments.
|
||||
/// </summary>
|
||||
DEOpenCLKernelCreator::DEOpenCLKernelCreator()
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Constructor that sets all kernel entry points as well as composes
|
||||
/// all kernel source strings.
|
||||
@ -330,77 +340,81 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
|
||||
"\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
"\n"
|
||||
" if (threadHistRow < botBound && threadHistCol < rightBound)\n"
|
||||
" {\n"
|
||||
" if (threadHistRow < botBound && threadHistCol < rightBound)\n"//This is done to avoid putting barriers inside of conidtionals.
|
||||
" bucket = histogram[(threadHistRow * densityFilter->m_SuperRasW) + threadHistCol];\n"
|
||||
" else\n"
|
||||
" bucket = 0.0;\n"
|
||||
"\n"
|
||||
" if (bucket.w != 0)\n"
|
||||
" {\n"
|
||||
" cacheLog = (densityFilter->m_K1 * log((real_bucket_t)fma(bucket.w, densityFilter->m_K2, (real_bucket_t)1.0))) / bucket.w;\n";
|
||||
" if (bucket.w != 0)\n"//This is done to avoid putting barriers inside of conidtionals.
|
||||
" {\n"
|
||||
" cacheLog = (densityFilter->m_K1 * log((real_bucket_t)fma(bucket.w, densityFilter->m_K2, (real_bucket_t)1.0))) / bucket.w;\n";
|
||||
|
||||
if (doSS)
|
||||
{
|
||||
os <<
|
||||
" filterSelect = 0;\n"
|
||||
" densityBoxLeftX = threadHistCol - min(threadHistCol, ss);\n"
|
||||
" densityBoxRightX = threadHistCol + min(ss, (densityFilter->m_SuperRasW - threadHistCol) - 1);\n"
|
||||
" densityBoxTopY = threadHistRow - min(threadHistRow, ss);\n"
|
||||
" densityBoxBottomY = threadHistRow + min(ss, (densityFilter->m_SuperRasH - threadHistRow) - 1);\n"
|
||||
" filterSelect = 0;\n"
|
||||
" densityBoxLeftX = threadHistCol - min(threadHistCol, ss);\n"
|
||||
" densityBoxRightX = threadHistCol + min(ss, (densityFilter->m_SuperRasW - threadHistCol) - 1);\n"
|
||||
" densityBoxTopY = threadHistRow - min(threadHistRow, ss);\n"
|
||||
" densityBoxBottomY = threadHistRow + min(ss, (densityFilter->m_SuperRasH - threadHistRow) - 1);\n"
|
||||
"\n"
|
||||
" for (j = densityBoxTopY; j <= densityBoxBottomY; j++)\n"
|
||||
" for (j = densityBoxTopY; j <= densityBoxBottomY; j++)\n"
|
||||
" {\n"
|
||||
" for (i = densityBoxLeftX; i <= densityBoxRightX; i++)\n"
|
||||
" {\n"
|
||||
" for (i = densityBoxLeftX; i <= densityBoxRightX; i++)\n"
|
||||
" {\n"
|
||||
" filterSelect += histogram[i + (j * densityFilter->m_SuperRasW)].w;\n"
|
||||
" }\n"
|
||||
" filterSelect += histogram[i + (j * densityFilter->m_SuperRasW)].w;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"\n";
|
||||
|
||||
if (doScf)
|
||||
os <<
|
||||
" filterSelect *= scfact;\n";
|
||||
" filterSelect *= scfact;\n";
|
||||
}
|
||||
else
|
||||
{
|
||||
os <<
|
||||
" filterSelect = bucket.w;\n";
|
||||
" filterSelect = bucket.w;\n";
|
||||
}
|
||||
|
||||
os <<
|
||||
" }\n"
|
||||
" else\n"
|
||||
" {\n"
|
||||
" cacheLog = 0.0;\n"
|
||||
" filterSelect = 1.0;\n"//Will subtract 1 to be 0 below.
|
||||
" }\n"
|
||||
"\n"
|
||||
" if (filterSelect > densityFilter->m_MaxFilteredCounts)\n"
|
||||
" filterSelectInt = densityFilter->m_MaxFilterIndex;\n"
|
||||
" else if (filterSelect <= DE_THRESH)\n"
|
||||
" filterSelectInt = (int)ceil(filterSelect) - 1;\n"
|
||||
" else\n"
|
||||
" filterSelectInt = (int)DE_THRESH + (int)floor(pow((real_bucket_t)(filterSelect - DE_THRESH), densityFilter->m_Curve));\n"
|
||||
" if (filterSelect > densityFilter->m_MaxFilteredCounts)\n"
|
||||
" filterSelectInt = densityFilter->m_MaxFilterIndex;\n"
|
||||
" else if (filterSelect <= DE_THRESH)\n"
|
||||
" filterSelectInt = (int)ceil(filterSelect) - 1;\n"
|
||||
" else\n"
|
||||
" filterSelectInt = (int)DE_THRESH + (int)floor(pow((real_bucket_t)(filterSelect - DE_THRESH), densityFilter->m_Curve));\n"
|
||||
"\n"
|
||||
" if (filterSelectInt > densityFilter->m_MaxFilterIndex)\n"
|
||||
" filterSelectInt = densityFilter->m_MaxFilterIndex;\n"
|
||||
" if (filterSelectInt > densityFilter->m_MaxFilterIndex)\n"
|
||||
" filterSelectInt = densityFilter->m_MaxFilterIndex;\n"
|
||||
"\n"
|
||||
" filterCoefIndex = filterSelectInt * densityFilter->m_KernelSize;\n"
|
||||
" filterCoefIndex = filterSelectInt * densityFilter->m_KernelSize;\n"
|
||||
"\n"
|
||||
//With this new method, only accumulate to the temp local buffer first. Write to the final accumulator last.
|
||||
//For each loop through, note that there is a local memory barrier call inside of each call to AddToAccumNoCheck().
|
||||
//If this isn't done, pixel errors occurr and even an out of resources error occurrs because too many writes are done to the same place in memory at once.
|
||||
" k = (int)densityFilter->m_FilterWidth;\n"//Need a signed int to use below, really is filter width, but reusing a variable to save space.
|
||||
" k = (int)densityFilter->m_FilterWidth;\n"//Need a signed int to use below, really is filter width, but reusing a variable to save space.
|
||||
"\n"
|
||||
" for (j = -k; j <= k; j++)\n"
|
||||
" for (j = -k; j <= k; j++)\n"
|
||||
" {\n"
|
||||
" for (i = -k; i <= k; i++)\n"
|
||||
" {\n"
|
||||
" filterSelectInt = filterCoefIndex + coefIndices[(abs(j) * (densityFilter->m_FilterWidth + 1)) + abs(i)];\n"//Really is filterCoeffIndexPlusOffset, but reusing a variable to save space.
|
||||
"\n"
|
||||
" if (filterCoefs[filterSelectInt] != 0)\n"//This conditional actually improves speed, despite SIMT being bad at conditionals.
|
||||
" {\n"
|
||||
" for (i = -k; i <= k; i++)\n"
|
||||
" {\n"
|
||||
" filterSelectInt = filterCoefIndex + coefIndices[(abs(j) * (densityFilter->m_FilterWidth + 1)) + abs(i)];\n"//Really is filterCoeffIndexPlusOffset, but reusing a variable to save space.
|
||||
"\n"
|
||||
" if (filterCoefs[filterSelectInt] != 0)\n"//This conditional actually improves speed, despite SIMT being bad at conditionals.
|
||||
" {\n"
|
||||
" filterBox[(i + boxCol) + ((j + boxRow) * fullTempBoxWidth)].m_Real4 += (bucket * (filterCoefs[filterSelectInt] * cacheLog));\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"//If this is the only barrier and the block size is exactly 16, it works perfectly. Otherwise, no chunks occur, but a many streaks.
|
||||
" filterBox[(i + boxCol) + ((j + boxRow) * fullTempBoxWidth)].m_Real4 += (bucket * (filterCoefs[filterSelectInt] * cacheLog));\n"
|
||||
" }\n"
|
||||
" }\n"//bucket.w != 0.
|
||||
" }\n"//In bounds.
|
||||
"\n"
|
||||
" }\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"//If this is the only barrier and the block size is exactly 16, it works perfectly. Otherwise, no chunks occur, but a many streaks.
|
||||
" }\n"
|
||||
"\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);\n"
|
||||
"\n"
|
||||
|
@ -29,6 +29,7 @@ namespace EmberCLns
|
||||
class EMBERCL_API DEOpenCLKernelCreator
|
||||
{
|
||||
public:
|
||||
DEOpenCLKernelCreator();
|
||||
DEOpenCLKernelCreator(bool doublePrecision, bool nVidia);
|
||||
|
||||
//Accessors.
|
||||
|
@ -184,6 +184,11 @@ static const char* RandFunctionString =
|
||||
" return MwcNext(s) * (real_t)(1.0 / 4294967296.0);\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"inline uint MwcNextCrand(uint2* s)\n"
|
||||
"{\n"
|
||||
" return MwcNextRange(s, 32767u);\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"inline real_t MwcNextFRange(uint2* s, real_t lower, real_t upper)\n"
|
||||
"{\n"
|
||||
" real_t f = (real_t)MwcNext(s) / (real_t)UINT_MAX;\n"
|
||||
|
@ -56,7 +56,7 @@ using namespace EmberNs;
|
||||
|
||||
//This special define is made to fix buggy OpenCL compilers on Mac.
|
||||
//Rendering is much slower there for unknown reasons. Michel traced it down
|
||||
//to the consec variable which keeps track of how many tries are needed to computer
|
||||
//to the consec variable which keeps track of how many tries are needed to compute
|
||||
//a point which is not a bad value. Strangely, keeping this as a local variable
|
||||
//is slower than keeping it as an element in a global array.
|
||||
//This is counterintuitive, and lends further weight to the idea that OpenCL on Mac
|
||||
|
@ -26,7 +26,6 @@ template <typename T, typename bucketT>
|
||||
RendererCL<T, bucketT>::RendererCL(const vector<pair<size_t, size_t>>& devices, bool shared, GLuint outputTexID)
|
||||
:
|
||||
m_IterOpenCLKernelCreator(),
|
||||
m_DEOpenCLKernelCreator(typeid(T) == typeid(double), false),
|
||||
m_FinalAccumOpenCLKernelCreator(typeid(T) == typeid(double))
|
||||
{
|
||||
m_PaletteFormat.image_channel_order = CL_RGBA;
|
||||
@ -114,7 +113,7 @@ bool RendererCL<T, bucketT>::Init(const vector<pair<size_t, size_t>>& devices, b
|
||||
if (b && (m_Devices.size() == devices.size()))
|
||||
{
|
||||
auto& firstWrapper = m_Devices[0]->m_Wrapper;
|
||||
m_DEOpenCLKernelCreator = DEOpenCLKernelCreator(m_DoublePrecision, m_Devices[0]->Nvidia());
|
||||
m_DEOpenCLKernelCreator = DEOpenCLKernelCreator(m_DoublePrecision, m_Devices[0]->Nvidia());//This will cause it to be created a second time, because it was already done once in the constructor.
|
||||
|
||||
//Build a simple program to ensure OpenCL is working right.
|
||||
if (b && !(b = firstWrapper.AddProgram(m_DEOpenCLKernelCreator.LogScaleAssignDEEntryPoint(), m_DEOpenCLKernelCreator.LogScaleAssignDEKernel(), m_DEOpenCLKernelCreator.LogScaleAssignDEEntryPoint(), m_DoublePrecision))) { ErrorStr(loc, "failed to init log scale program", m_Devices[0].get()); }
|
||||
|
@ -240,7 +240,7 @@ private:
|
||||
string m_FinalImageName = "Final";
|
||||
string m_PointsBufferName = "Points";
|
||||
#ifdef KNL_USE_GLOBAL_CONSEC
|
||||
string m_ConsecBufferName = "Consec";
|
||||
string m_ConsecBufferName = "Consec";
|
||||
#endif
|
||||
string m_VarStateBufferName = "VarState";
|
||||
|
||||
|
Reference in New Issue
Block a user