diff --git a/Source/EmberCL/DEOpenCLKernelCreator.cpp b/Source/EmberCL/DEOpenCLKernelCreator.cpp
index 6dc5d9d..e5ad8f7 100644
--- a/Source/EmberCL/DEOpenCLKernelCreator.cpp
+++ b/Source/EmberCL/DEOpenCLKernelCreator.cpp
@@ -56,8 +56,8 @@ DEOpenCLKernelCreator::DEOpenCLKernelCreator(bool doublePrecision, bool nVidia)
/// Kernel source and entry point properties, getters only.
///
-string DEOpenCLKernelCreator::LogScaleAssignDEKernel() { return m_LogScaleAssignDEKernel; }
-string DEOpenCLKernelCreator::LogScaleAssignDEEntryPoint() { return m_LogScaleAssignDEEntryPoint; }
+const string& DEOpenCLKernelCreator::LogScaleAssignDEKernel() const { return m_LogScaleAssignDEKernel; }
+const string& DEOpenCLKernelCreator::LogScaleAssignDEEntryPoint() const { return m_LogScaleAssignDEEntryPoint; }
///
/// Get the kernel source for the specified supersample and filterWidth.
@@ -65,7 +65,7 @@ string DEOpenCLKernelCreator::LogScaleAssignDEEntryPoint() { return m_LogScaleAs
/// The supersample being used
/// Filter width
/// The kernel source
-string DEOpenCLKernelCreator::GaussianDEKernel(size_t ss, uint filterWidth)
+const string& DEOpenCLKernelCreator::GaussianDEKernel(size_t ss, uint filterWidth) const
{
#ifndef ROW_ONLY_DE
if (filterWidth > MaxDEFilterSize())
@@ -101,7 +101,7 @@ string DEOpenCLKernelCreator::GaussianDEKernel(size_t ss, uint filterWidth)
/// The supersample being used
/// Filter width
/// The name of the density estimation filtering entry point kernel function
-string DEOpenCLKernelCreator::GaussianDEEntryPoint(size_t ss, uint filterWidth)
+const string& DEOpenCLKernelCreator::GaussianDEEntryPoint(size_t ss, uint filterWidth) const
{
#ifndef ROW_ONLY_DE
if (filterWidth > MaxDEFilterSize())
diff --git a/Source/EmberCL/DEOpenCLKernelCreator.h b/Source/EmberCL/DEOpenCLKernelCreator.h
index 68305e0..4793036 100644
--- a/Source/EmberCL/DEOpenCLKernelCreator.h
+++ b/Source/EmberCL/DEOpenCLKernelCreator.h
@@ -34,10 +34,10 @@ public:
DEOpenCLKernelCreator(bool doublePrecision, bool nVidia);
//Accessors.
- string LogScaleAssignDEKernel();
- string LogScaleAssignDEEntryPoint();
- string GaussianDEKernel(size_t ss, uint filterWidth);
- string GaussianDEEntryPoint(size_t ss, uint filterWidth);
+ const string& LogScaleAssignDEKernel() const;
+ const string& LogScaleAssignDEEntryPoint() const;
+ const string& GaussianDEKernel(size_t ss, uint filterWidth) const;
+ const string& GaussianDEEntryPoint(size_t ss, uint filterWidth) const;
//Miscellaneous static functions.
static uint MaxDEFilterSize();
diff --git a/Source/EmberCL/EmberCLFunctions.h b/Source/EmberCL/EmberCLFunctions.h
index 4230646..c9048c7 100644
--- a/Source/EmberCL/EmberCLFunctions.h
+++ b/Source/EmberCL/EmberCLFunctions.h
@@ -351,14 +351,11 @@ static const char* CarToRasFunctionString =
"}\n"
"\n";
-static string AtomicString(bool doublePrecision, bool dp64AtomicSupport)
+static string AtomicString()
{
ostringstream os;
- //If they want single precision, or if they want double precision and have dp atomic support.
- if (!doublePrecision || dp64AtomicSupport)
- {
- os <<
+ os <<
"void AtomicAdd(volatile __global real_bucket_t* source, const real_bucket_t operand)\n"
"{\n"
" union\n"
@@ -379,32 +376,6 @@ static string AtomicString(bool doublePrecision, bool dp64AtomicSupport)
" newVal.realVal = prevVal.realVal + operand;\n"
" } while (atomic_cmpxchg((volatile __global atomi*)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);\n"
"}\n";
- }
- else//They want double precision and do not have dp atomic support.
- {
- os <<
- "void AtomicAdd(volatile __global double* source, const double operand)\n"
- "{\n"
- " union\n"
- " {\n"
- " uint intVal[2];\n"
- " double realVal;\n"
- " } newVal;\n"
- "\n"
- " union\n"
- " {\n"
- " uint intVal[2];\n"
- " double realVal;\n"
- " } prevVal;\n"
- "\n"
- " do\n"
- " {\n"
- " prevVal.realVal = *source;\n"
- " newVal.realVal = prevVal.realVal + operand;\n"
- " } while ((atomic_cmpxchg((volatile __global uint*)source, prevVal.intVal[0], newVal.intVal[0]) != prevVal.intVal[0]) ||\n"
- " (atomic_cmpxchg((volatile __global uint*)source + 1, prevVal.intVal[1], newVal.intVal[1]) != prevVal.intVal[1]));\n"
- "}\n";
- }
return os.str();
}
diff --git a/Source/EmberCL/EmberCLStructs.h b/Source/EmberCL/EmberCLStructs.h
index 59b7521..fa31933 100644
--- a/Source/EmberCL/EmberCLStructs.h
+++ b/Source/EmberCL/EmberCLStructs.h
@@ -39,7 +39,7 @@ static string ConstantDefinesString(bool doublePrecision)
<< "#endif\n"
<< "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n"//Only supported on nVidia.
<< "typedef long intPrec;\n"
- << "typedef ulong atomi;\n"
+ << "typedef uint atomi;\n"//Same size as real_bucket_t, always 4 bytes.
<< "typedef double real_t;\n"
<< "typedef float real_bucket_t;\n"//Assume buckets are always float, even though iter calcs are in double.
<< "typedef double4 real4;\n"
diff --git a/Source/EmberCL/FinalAccumOpenCLKernelCreator.cpp b/Source/EmberCL/FinalAccumOpenCLKernelCreator.cpp
index e06ce79..7ebf7ee 100644
--- a/Source/EmberCL/FinalAccumOpenCLKernelCreator.cpp
+++ b/Source/EmberCL/FinalAccumOpenCLKernelCreator.cpp
@@ -37,24 +37,24 @@ FinalAccumOpenCLKernelCreator::FinalAccumOpenCLKernelCreator(bool doublePrecisio
/// Kernel source and entry point properties, getters only.
///
-string FinalAccumOpenCLKernelCreator::GammaCorrectionWithAlphaCalcKernel() { return m_GammaCorrectionWithAlphaCalcKernel; }
-string FinalAccumOpenCLKernelCreator::GammaCorrectionWithAlphaCalcEntryPoint() { return m_GammaCorrectionWithAlphaCalcEntryPoint; }
-string FinalAccumOpenCLKernelCreator::GammaCorrectionWithoutAlphaCalcKernel() { return m_GammaCorrectionWithoutAlphaCalcKernel; }
-string FinalAccumOpenCLKernelCreator::GammaCorrectionWithoutAlphaCalcEntryPoint() { return m_GammaCorrectionWithoutAlphaCalcEntryPoint; }
+const string& FinalAccumOpenCLKernelCreator::GammaCorrectionWithAlphaCalcKernel() const { return m_GammaCorrectionWithAlphaCalcKernel; }
+const string& FinalAccumOpenCLKernelCreator::GammaCorrectionWithAlphaCalcEntryPoint() const { return m_GammaCorrectionWithAlphaCalcEntryPoint; }
+const string& FinalAccumOpenCLKernelCreator::GammaCorrectionWithoutAlphaCalcKernel() const { return m_GammaCorrectionWithoutAlphaCalcKernel; }
+const string& FinalAccumOpenCLKernelCreator::GammaCorrectionWithoutAlphaCalcEntryPoint() const { return m_GammaCorrectionWithoutAlphaCalcEntryPoint; }
-string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipKernel() { return m_FinalAccumEarlyClipKernel; }
-string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipEntryPoint() { return m_FinalAccumEarlyClipEntryPoint; }
-string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel() { return m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel; }
-string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint() { return m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint; }
-string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel() { return m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel; }
-string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint() { return m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint; }
+const string& FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipKernel() const { return m_FinalAccumEarlyClipKernel; }
+const string& FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipEntryPoint() const { return m_FinalAccumEarlyClipEntryPoint; }
+const string& FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel() const { return m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel; }
+const string& FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint() const { return m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint; }
+const string& FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel() const { return m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel; }
+const string& FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint() const { return m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint; }
-string FinalAccumOpenCLKernelCreator::FinalAccumLateClipKernel() { return m_FinalAccumLateClipKernel; }
-string FinalAccumOpenCLKernelCreator::FinalAccumLateClipEntryPoint() { return m_FinalAccumLateClipEntryPoint; }
-string FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel() { return m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel; }
-string FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint() { return m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint; }
-string FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel() { return m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel; }
-string FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint() { return m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint; }
+const string& FinalAccumOpenCLKernelCreator::FinalAccumLateClipKernel() const { return m_FinalAccumLateClipKernel; }
+const string& FinalAccumOpenCLKernelCreator::FinalAccumLateClipEntryPoint() const { return m_FinalAccumLateClipEntryPoint; }
+const string& FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel() const { return m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel; }
+const string& FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint() const { return m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint; }
+const string& FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel() const { return m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel; }
+const string& FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint() const { return m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint; }
///
/// Get the gamma correction entry point.
@@ -62,7 +62,7 @@ string FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithoutAlphaCalcWithAlph
/// The number of channels used, 3 or 4.
/// True if channels equals 4 and using transparency, else false.
/// The name of the gamma correction entry point kernel function
-string FinalAccumOpenCLKernelCreator::GammaCorrectionEntryPoint(size_t channels, bool transparency)
+const string& FinalAccumOpenCLKernelCreator::GammaCorrectionEntryPoint(size_t channels, bool transparency) const
{
bool alphaCalc = ((channels > 3) && transparency);
return alphaCalc ? m_GammaCorrectionWithAlphaCalcEntryPoint : m_GammaCorrectionWithoutAlphaCalcEntryPoint;
@@ -74,7 +74,7 @@ string FinalAccumOpenCLKernelCreator::GammaCorrectionEntryPoint(size_t channels,
/// The number of channels used, 3 or 4.
/// True if channels equals 4 and using transparency, else false.
/// The gamma correction kernel string
-string FinalAccumOpenCLKernelCreator::GammaCorrectionKernel(size_t channels, bool transparency)
+const string& FinalAccumOpenCLKernelCreator::GammaCorrectionKernel(size_t channels, bool transparency) const
{
bool alphaCalc = ((channels > 3) && transparency);
return alphaCalc ? m_GammaCorrectionWithAlphaCalcKernel : m_GammaCorrectionWithoutAlphaCalcKernel;
@@ -89,7 +89,7 @@ string FinalAccumOpenCLKernelCreator::GammaCorrectionKernel(size_t channels, boo
/// Storage for the alpha base value used in the kernel. 0 if transparency is true, else 255.
/// Storage for the alpha scale value used in the kernel. 255 if transparency is true, else 0.
/// The name of the final accumulation entry point kernel function
-string FinalAccumOpenCLKernelCreator::FinalAccumEntryPoint(bool earlyClip, size_t channels, bool transparency, double& alphaBase, double& alphaScale)
+const string& FinalAccumOpenCLKernelCreator::FinalAccumEntryPoint(bool earlyClip, size_t channels, bool transparency, double& alphaBase, double& alphaScale) const
{
bool alphaCalc = ((channels > 3) && transparency);
bool alphaAccum = channels > 3;
@@ -109,7 +109,7 @@ string FinalAccumOpenCLKernelCreator::FinalAccumEntryPoint(bool earlyClip, size_
else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency.
return FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint();
else
- return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense.
+ return m_Empty;//Cannot have alphaCalc and !alphaAccum, it makes no sense.
}
else
{
@@ -120,7 +120,7 @@ string FinalAccumOpenCLKernelCreator::FinalAccumEntryPoint(bool earlyClip, size_
else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency.
return FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint();
else
- return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense.
+ return m_Empty;//Cannot have alphaCalc and !alphaAccum, it makes no sense.
}
}
@@ -131,7 +131,7 @@ string FinalAccumOpenCLKernelCreator::FinalAccumEntryPoint(bool earlyClip, size_
/// The number of channels used, 3 or 4.
/// True if channels equals 4 and using transparency, else false.
/// The final accumulation kernel string
-string FinalAccumOpenCLKernelCreator::FinalAccumKernel(bool earlyClip, size_t channels, bool transparency)
+const string& FinalAccumOpenCLKernelCreator::FinalAccumKernel(bool earlyClip, size_t channels, bool transparency) const
{
bool alphaCalc = (channels > 3 && transparency);
bool alphaAccum = channels > 3;
@@ -145,7 +145,7 @@ string FinalAccumOpenCLKernelCreator::FinalAccumKernel(bool earlyClip, size_t ch
else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency.
return FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel();
else
- return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense.
+ return m_Empty;//Cannot have alphaCalc and !alphaAccum, it makes no sense.
}
else
{
@@ -156,7 +156,7 @@ string FinalAccumOpenCLKernelCreator::FinalAccumKernel(bool earlyClip, size_t ch
else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency.
return FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel();
else
- return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense.
+ return m_Empty;//Cannot have alphaCalc and !alphaAccum, it makes no sense.
}
}
diff --git a/Source/EmberCL/FinalAccumOpenCLKernelCreator.h b/Source/EmberCL/FinalAccumOpenCLKernelCreator.h
index 6d6ee26..1018768 100644
--- a/Source/EmberCL/FinalAccumOpenCLKernelCreator.h
+++ b/Source/EmberCL/FinalAccumOpenCLKernelCreator.h
@@ -25,29 +25,29 @@ class EMBERCL_API FinalAccumOpenCLKernelCreator
public:
FinalAccumOpenCLKernelCreator(bool doublePrecision);
- string GammaCorrectionWithAlphaCalcKernel();
- string GammaCorrectionWithAlphaCalcEntryPoint();
+ const string& GammaCorrectionWithAlphaCalcKernel() const;
+ const string& GammaCorrectionWithAlphaCalcEntryPoint() const;
- string GammaCorrectionWithoutAlphaCalcKernel();
- string GammaCorrectionWithoutAlphaCalcEntryPoint();
+ const string& GammaCorrectionWithoutAlphaCalcKernel() const;
+ const string& GammaCorrectionWithoutAlphaCalcEntryPoint() const;
- string FinalAccumEarlyClipKernel();
- string FinalAccumEarlyClipEntryPoint();
- string FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel();
- string FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint();
- string FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel();
- string FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint();
+ const string& FinalAccumEarlyClipKernel() const;
+ const string& FinalAccumEarlyClipEntryPoint() const;
+ const string& FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel() const;
+ const string& FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint() const;
+ const string& FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel() const;
+ const string& FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint() const;
- string FinalAccumLateClipKernel();
- string FinalAccumLateClipEntryPoint();
- string FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel();
- string FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint();
- string FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel();
- string FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint();
- string GammaCorrectionEntryPoint(size_t channels, bool transparency);
- string GammaCorrectionKernel(size_t channels, bool transparency);
- string FinalAccumEntryPoint(bool earlyClip, size_t channels, bool transparency, double& alphaBase, double& alphaScale);
- string FinalAccumKernel(bool earlyClip, size_t channels, bool transparency);
+ const string& FinalAccumLateClipKernel() const;
+ const string& FinalAccumLateClipEntryPoint() const;
+ const string& FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel() const;
+ const string& FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint() const;
+ const string& FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel() const;
+ const string& FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint() const;
+ const string& GammaCorrectionEntryPoint(size_t channels, bool transparency) const;
+ const string& GammaCorrectionKernel(size_t channels, bool transparency) const;
+ const string& FinalAccumEntryPoint(bool earlyClip, size_t channels, bool transparency, double& alphaBase, double& alphaScale) const;
+ const string& FinalAccumKernel(bool earlyClip, size_t channels, bool transparency) const;
private:
string CreateFinalAccumKernelString(bool earlyClip, size_t channels, bool transparency);
@@ -56,6 +56,7 @@ private:
string CreateFinalAccumKernelString(bool earlyClip, bool alphaCalc, bool alphaAccum);
string CreateGammaCorrectionFunctionString(bool globalBucket, bool alphaCalc, bool alphaAccum, bool finalOut);
string CreateCalcNewRgbFunctionString(bool globalBucket);
+
string m_GammaCorrectionWithAlphaCalcKernel;
string m_GammaCorrectionWithAlphaCalcEntryPoint;
@@ -76,6 +77,7 @@ private:
string m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel;//False, true.
string m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint;
+ string m_Empty;
bool m_DoublePrecision;
};
}
diff --git a/Source/EmberCL/IterOpenCLKernelCreator.cpp b/Source/EmberCL/IterOpenCLKernelCreator.cpp
index d1a8e23..d634152 100644
--- a/Source/EmberCL/IterOpenCLKernelCreator.cpp
+++ b/Source/EmberCL/IterOpenCLKernelCreator.cpp
@@ -10,11 +10,9 @@ namespace EmberCLns
/// Constructor that sets up some basic entry point strings and creates
/// the zeroization kernel string since it requires no conditional inputs.
///
-/// True if running on an nVidia card, else false.
template
-IterOpenCLKernelCreator::IterOpenCLKernelCreator(bool nVidia)
+IterOpenCLKernelCreator::IterOpenCLKernelCreator()
{
- m_NVidia = nVidia;
m_IterEntryPoint = "IterateKernel";
m_ZeroizeEntryPoint = "ZeroizeKernel";
m_ZeroizeKernel = CreateZeroizeKernelString();
@@ -24,9 +22,9 @@ IterOpenCLKernelCreator::IterOpenCLKernelCreator(bool nVidia)
/// Accessors.
///
-template string IterOpenCLKernelCreator::ZeroizeKernel() { return m_ZeroizeKernel; }
-template string IterOpenCLKernelCreator::ZeroizeEntryPoint() { return m_ZeroizeEntryPoint; }
-template string IterOpenCLKernelCreator::IterEntryPoint() { return m_IterEntryPoint; }
+template const string& IterOpenCLKernelCreator::ZeroizeKernel() const { return m_ZeroizeKernel; }
+template const string& IterOpenCLKernelCreator::ZeroizeEntryPoint() const { return m_ZeroizeEntryPoint; }
+template const string& IterOpenCLKernelCreator::IterEntryPoint() const { return m_IterEntryPoint; }
///
/// Create the iteration kernel string using the Cuburn method.
@@ -221,8 +219,12 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin
EmberCLStructString <<
UnionCLStructString <<
CarToRasCLStructString <<
- CarToRasFunctionString <<
- AtomicString(doublePrecision, m_NVidia) <<
+ CarToRasFunctionString;
+
+ if (lockAccum)
+ os << AtomicString();
+
+ os <<
xformFuncs.str() <<
"__kernel void " << m_IterEntryPoint << "(\n" <<
" uint iterCount,\n"
diff --git a/Source/EmberCL/IterOpenCLKernelCreator.h b/Source/EmberCL/IterOpenCLKernelCreator.h
index 4eb1e2a..9054ce1 100644
--- a/Source/EmberCL/IterOpenCLKernelCreator.h
+++ b/Source/EmberCL/IterOpenCLKernelCreator.h
@@ -23,10 +23,10 @@ template
class EMBERCL_API IterOpenCLKernelCreator
{
public:
- IterOpenCLKernelCreator(bool nVidia);
- string ZeroizeKernel();
- string ZeroizeEntryPoint();
- string IterEntryPoint();
+ IterOpenCLKernelCreator();
+ const string& ZeroizeKernel() const;
+ const string& ZeroizeEntryPoint() const;
+ const string& IterEntryPoint() const;
string CreateIterKernelString(Ember& ember, string& parVarDefines, bool lockAccum = false, bool doAccum = true);
static void ParVarIndexDefines(Ember& ember, pair>& params, bool doVals = true, bool doString = true);
static bool IsBuildRequired(Ember& ember1, Ember& ember2);
@@ -38,7 +38,6 @@ private:
string m_IterEntryPoint;
string m_ZeroizeKernel;
string m_ZeroizeEntryPoint;
- bool m_NVidia;
};
#ifdef OPEN_CL_TEST_AREA
diff --git a/Source/EmberCL/RendererCL.cpp b/Source/EmberCL/RendererCL.cpp
index 118051b..c7faf68 100644
--- a/Source/EmberCL/RendererCL.cpp
+++ b/Source/EmberCL/RendererCL.cpp
@@ -15,7 +15,7 @@ namespace EmberCLns
template
RendererCL::RendererCL(uint platform, uint device, bool shared, GLuint outputTexID)
:
- m_IterOpenCLKernelCreator(false),
+ m_IterOpenCLKernelCreator(),
m_DEOpenCLKernelCreator(typeid(T) == typeid(double), false),
m_FinalAccumOpenCLKernelCreator(typeid(T) == typeid(double))
{
@@ -105,7 +105,7 @@ bool RendererCL::Init(uint platform, uint device, bool shared, GLuin
{
m_NVidia = ToLower(m_Wrapper.DeviceAndPlatformNames()).find_first_of("nvidia") != string::npos && m_Wrapper.LocalMemSize() > (32 * 1024);
m_WarpSize = m_NVidia ? 32 : 64;
- m_IterOpenCLKernelCreator = IterOpenCLKernelCreator(m_NVidia);
+ m_IterOpenCLKernelCreator = IterOpenCLKernelCreator();
m_DEOpenCLKernelCreator = DEOpenCLKernelCreator(m_DoublePrecision, m_NVidia);
string zeroizeProgram = m_IterOpenCLKernelCreator.ZeroizeKernel();
@@ -286,7 +286,7 @@ bool RendererCL::WriteRandomPoints()
///
/// The string representation of the kernel for the last built iter program.
template
-string RendererCL::IterKernel() { return m_IterKernel; }
+const string& RendererCL::IterKernel() const { return m_IterKernel; }
///
@@ -294,14 +294,14 @@ string RendererCL::IterKernel() { return m_IterKernel; }
///
/// The string representation of the kernel for the last built density filtering program.
template
-string RendererCL::DEKernel() { return m_DEOpenCLKernelCreator.GaussianDEKernel(Supersample(), m_DensityFilterCL.m_FilterWidth); }
+const string& RendererCL::DEKernel() const { return m_DEOpenCLKernelCreator.GaussianDEKernel(Supersample(), m_DensityFilterCL.m_FilterWidth); }
///
/// Get the kernel string for the last built final accumulation program.
///
/// The string representation of the kernel for the last built final accumulation program.
template
-string RendererCL::FinalAccumKernel() { return m_FinalAccumOpenCLKernelCreator.FinalAccumKernel(EarlyClip(), Renderer::NumChannels(), Transparency()); }
+const string& RendererCL::FinalAccumKernel() const { return m_FinalAccumOpenCLKernelCreator.FinalAccumKernel(EarlyClip(), Renderer::NumChannels(), Transparency()); }
///
/// Virtual functions overridden from RendererCLBase.
@@ -567,8 +567,8 @@ bool RendererCL::Alloc()
m_XformsCL.resize(m_Ember.TotalXformCount());
bool b = true;
- size_t histLength = SuperSize() * sizeof(v4T);
- size_t accumLength = SuperSize() * sizeof(v4T);
+ size_t histLength = SuperSize() * sizeof(v4bT);
+ size_t accumLength = SuperSize() * sizeof(v4bT);
const char* loc = __FUNCTION__;
if (b && !(b = m_Wrapper.AddBuffer(m_EmberBufferName, sizeof(m_EmberCL)))) { this->m_ErrorReport.push_back(loc); }
@@ -694,18 +694,37 @@ EmberStats RendererCL::Iterate(size_t iterCount, size_t temporalSamp
EmberStats stats;//Do not record bad vals with with GPU. If the user needs to investigate bad vals, use the CPU.
const char* loc = __FUNCTION__;
- IterOpenCLKernelCreator::ParVarIndexDefines(m_Ember, m_Params, true, false);//Always do this to get the values (but no string), regardless of whether a rebuild is necessary.
-
- //Don't know the size of the parametric varations parameters buffer until the ember is examined.
- //So set it up right before the run.
- if (!m_Params.second.empty())
+ //Only need to do this once at the beginning of a new render. Last iter will always be 0 at the beginning of a full render or temporal sample.
+ if (m_LastIter == 0)
{
- if (!m_Wrapper.AddAndWriteBuffer(m_ParVarsBufferName, m_Params.second.data(), m_Params.second.size() * sizeof(m_Params.second[0])))
+ ConvertEmber(m_Ember, m_EmberCL, m_XformsCL);
+ ConvertCarToRas(*CoordMap());
+
+ if (b && !(b = m_Wrapper.WriteBuffer(m_EmberBufferName, reinterpret_cast(&m_EmberCL), sizeof(m_EmberCL)))) { this->m_ErrorReport.push_back(loc); }
+ if (b && !(b = m_Wrapper.WriteBuffer(m_XformsBufferName, reinterpret_cast(m_XformsCL.data()), sizeof(m_XformsCL[0]) * m_XformsCL.size()))) { this->m_ErrorReport.push_back(loc); }
+ if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_DistBufferName, reinterpret_cast(const_cast(XformDistributions())), XformDistributionsSize()))) { this->m_ErrorReport.push_back(loc); }//Will be resized for xaos.
+ if (b && !(b = m_Wrapper.WriteBuffer(m_CarToRasBufferName, reinterpret_cast(&m_CarToRasCL), sizeof(m_CarToRasCL)))) { this->m_ErrorReport.push_back(loc); }
+
+ if (b && !(b = m_Wrapper.AddAndWriteImage("Palette", CL_MEM_READ_ONLY, m_PaletteFormat, m_DmapCL.m_Entries.size(), 1, 0, m_DmapCL.m_Entries.data()))) { this->m_ErrorReport.push_back(loc); }
+
+ if (b)
{
- m_Abort = true;
- this->m_ErrorReport.push_back(loc);
- return stats;
+ IterOpenCLKernelCreator::ParVarIndexDefines(m_Ember, m_Params, true, false);//Always do this to get the values (but no string), regardless of whether a rebuild is necessary.
+
+ //Don't know the size of the parametric variations parameters buffer until the ember is examined.
+ //So set it up right before the run.
+ if (!m_Params.second.empty())
+ {
+ if (!m_Wrapper.AddAndWriteBuffer(m_ParVarsBufferName, m_Params.second.data(), m_Params.second.size() * sizeof(m_Params.second[0])))
+ {
+ m_Abort = true;
+ this->m_ErrorReport.push_back(loc);
+ return stats;
+ }
+ }
}
+ else
+ return stats;
}
//Rebuilding is expensive, so only do it if it's required.
@@ -716,7 +735,7 @@ EmberStats RendererCL::Iterate(size_t iterCount, size_t temporalSamp
{
m_IterTimer.Tic();//Tic() here to avoid including build time in iter time measurement.
- if (m_Stats.m_Iters == 0)//Only reset the call count on the beginning of a new render. Do not reset on KEEP_ITERATING.
+ if (m_LastIter == 0)//Only reset the call count on the beginning of a new render. Do not reset on KEEP_ITERATING.
m_Calls = 0;
b = RunIter(iterCount, temporalSample, stats.m_Iters);
@@ -772,10 +791,8 @@ bool RendererCL::BuildIterProgramForEmber(bool doAccum)
///
/// Run the iteration kernel.
-/// Fusing on the CPU is done once per sub batch, usually 10,000 iters, however
-/// determining when to do it in OpenCL is much more difficult.
-/// Currently it's done once every 4 kernel calls which seems to be a good balance
-/// between quality of the final image and performance.
+/// Fusing on the CPU is done once per sub batch, usually 10,000 iters. Here,
+/// the same fusing frequency is kept, but is done per kernel thread.
///
/// The number of iterations to run
/// The temporal sample this is running for
@@ -803,16 +820,6 @@ bool RendererCL::RunIter(size_t iterCount, size_t temporalSample, si
if (kernelIndex != -1)
{
- ConvertEmber(m_Ember, m_EmberCL, m_XformsCL);
- m_CarToRasCL = ConvertCarToRas(*CoordMap());
-
- if (b && !(b = m_Wrapper.WriteBuffer (m_EmberBufferName, reinterpret_cast(&m_EmberCL), sizeof(m_EmberCL)))) { this->m_ErrorReport.push_back(loc); }
- if (b && !(b = m_Wrapper.WriteBuffer (m_XformsBufferName, reinterpret_cast(m_XformsCL.data()), sizeof(m_XformsCL[0]) * m_XformsCL.size()))) { this->m_ErrorReport.push_back(loc); }
- if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_DistBufferName, reinterpret_cast(const_cast(XformDistributions())), XformDistributionsSize()))) { this->m_ErrorReport.push_back(loc); }//Will be resized for xaos.
- if (b && !(b = m_Wrapper.WriteBuffer (m_CarToRasBufferName, reinterpret_cast(&m_CarToRasCL), sizeof(m_CarToRasCL)))) { this->m_ErrorReport.push_back(loc); }
-
- if (b && !(b = m_Wrapper.AddAndWriteImage("Palette", CL_MEM_READ_ONLY, m_PaletteFormat, m_DmapCL.m_Entries.size(), 1, 0, m_DmapCL.m_Entries.data()))) { this->m_ErrorReport.push_back(loc); }
-
//If animating, treat each temporal sample as a newly started render for fusing purposes.
if (temporalSample > 0)
m_Calls = 0;
@@ -1255,13 +1262,13 @@ bool RendererCL::RunDensityFilterPrivate(uint kernelIndex, uint grid
template
int RendererCL::MakeAndGetDensityFilterProgram(size_t ss, uint filterWidth)
{
- string deEntryPoint = m_DEOpenCLKernelCreator.GaussianDEEntryPoint(ss, filterWidth);
+ auto& deEntryPoint = m_DEOpenCLKernelCreator.GaussianDEEntryPoint(ss, filterWidth);
int kernelIndex = m_Wrapper.FindKernelIndex(deEntryPoint);
const char* loc = __FUNCTION__;
if (kernelIndex == -1)//Has not been built yet.
{
- string kernel = m_DEOpenCLKernelCreator.GaussianDEKernel(ss, filterWidth);
+ auto& kernel = m_DEOpenCLKernelCreator.GaussianDEKernel(ss, filterWidth);
bool b = m_Wrapper.AddProgram(deEntryPoint, kernel, deEntryPoint, m_DoublePrecision);
if (b)
@@ -1288,13 +1295,13 @@ int RendererCL::MakeAndGetDensityFilterProgram(size_t ss, uint filte
template
int RendererCL::MakeAndGetFinalAccumProgram(double& alphaBase, double& alphaScale)
{
- string finalAccumEntryPoint = m_FinalAccumOpenCLKernelCreator.FinalAccumEntryPoint(EarlyClip(), Renderer::NumChannels(), Transparency(), alphaBase, alphaScale);
+ auto& finalAccumEntryPoint = m_FinalAccumOpenCLKernelCreator.FinalAccumEntryPoint(EarlyClip(), Renderer::NumChannels(), Transparency(), alphaBase, alphaScale);
int kernelIndex = m_Wrapper.FindKernelIndex(finalAccumEntryPoint);
const char* loc = __FUNCTION__;
if (kernelIndex == -1)//Has not been built yet.
{
- string kernel = m_FinalAccumOpenCLKernelCreator.FinalAccumKernel(EarlyClip(), Renderer::NumChannels(), Transparency());
+ auto& kernel = m_FinalAccumOpenCLKernelCreator.FinalAccumKernel(EarlyClip(), Renderer::NumChannels(), Transparency());
bool b = m_Wrapper.AddProgram(finalAccumEntryPoint, kernel, finalAccumEntryPoint, m_DoublePrecision);
if (b)
@@ -1313,13 +1320,13 @@ int RendererCL::MakeAndGetFinalAccumProgram(double& alphaBase, doubl
template
int RendererCL::MakeAndGetGammaCorrectionProgram()
{
- string gammaEntryPoint = m_FinalAccumOpenCLKernelCreator.GammaCorrectionEntryPoint(Renderer::NumChannels(), Transparency());
+ auto& gammaEntryPoint = m_FinalAccumOpenCLKernelCreator.GammaCorrectionEntryPoint(Renderer::NumChannels(), Transparency());
int kernelIndex = m_Wrapper.FindKernelIndex(gammaEntryPoint);
const char* loc = __FUNCTION__;
if (kernelIndex == -1)//Has not been built yet.
{
- string kernel = m_FinalAccumOpenCLKernelCreator.GammaCorrectionKernel(Renderer::NumChannels(), Transparency());
+ auto& kernel = m_FinalAccumOpenCLKernelCreator.GammaCorrectionKernel(Renderer::NumChannels(), Transparency());
bool b = m_Wrapper.AddProgram(gammaEntryPoint, kernel, gammaEntryPoint, m_DoublePrecision);
if (b)
@@ -1454,21 +1461,17 @@ void RendererCL::ConvertEmber(Ember& ember, EmberCL& emberCL,
/// The CarToRas object to convert
/// The CarToRasCL object
template
-CarToRasCL RendererCL::ConvertCarToRas(const CarToRas& carToRas)
+void RendererCL::ConvertCarToRas(const CarToRas& carToRas)
{
- CarToRasCL carToRasCL;
-
- carToRasCL.m_RasWidth = uint(carToRas.RasWidth());
- carToRasCL.m_PixPerImageUnitW = carToRas.PixPerImageUnitW();
- carToRasCL.m_RasLlX = carToRas.RasLlX();
- carToRasCL.m_PixPerImageUnitH = carToRas.PixPerImageUnitH();
- carToRasCL.m_RasLlY = carToRas.RasLlY();
- carToRasCL.m_CarLlX = carToRas.CarLlX();
- carToRasCL.m_CarLlY = carToRas.CarLlY();
- carToRasCL.m_CarUrX = carToRas.CarUrX();
- carToRasCL.m_CarUrY = carToRas.CarUrY();
-
- return carToRasCL;
+ m_CarToRasCL.m_RasWidth = uint(carToRas.RasWidth());
+ m_CarToRasCL.m_PixPerImageUnitW = carToRas.PixPerImageUnitW();
+ m_CarToRasCL.m_RasLlX = carToRas.RasLlX();
+ m_CarToRasCL.m_PixPerImageUnitH = carToRas.PixPerImageUnitH();
+ m_CarToRasCL.m_RasLlY = carToRas.RasLlY();
+ m_CarToRasCL.m_CarLlX = carToRas.CarLlX();
+ m_CarToRasCL.m_CarLlY = carToRas.CarLlY();
+ m_CarToRasCL.m_CarUrX = carToRas.CarUrX();
+ m_CarToRasCL.m_CarUrY = carToRas.CarUrY();
}
///
diff --git a/Source/EmberCL/RendererCL.h b/Source/EmberCL/RendererCL.h
index b1b3e6d..bf714ba 100644
--- a/Source/EmberCL/RendererCL.h
+++ b/Source/EmberCL/RendererCL.h
@@ -124,9 +124,9 @@ public:
#ifdef TEST_CL
bool WriteRandomPoints();
#endif
- string IterKernel();
- string DEKernel();
- string FinalAccumKernel();
+ const string& IterKernel() const;
+ const string& DEKernel() const;
+ const string& FinalAccumKernel() const;
//Virtual functions overridden from RendererCLBase.
virtual bool ReadFinal(byte* pixels);
@@ -179,7 +179,7 @@ private:
void ConvertDensityFilter();
void ConvertSpatialFilter();
void ConvertEmber(Ember& ember, EmberCL& emberCL, vector>& xformsCL);
- static CarToRasCL ConvertCarToRas(const CarToRas& carToRas);
+ void ConvertCarToRas(const CarToRas& carToRas);
bool m_Init;
bool m_NVidia;
diff --git a/Source/EmberTester/EmberTester.cpp b/Source/EmberTester/EmberTester.cpp
index 4aee375..21ba90b 100644
--- a/Source/EmberTester/EmberTester.cpp
+++ b/Source/EmberTester/EmberTester.cpp
@@ -71,7 +71,7 @@ Ember CreateBasicEmber(uint width, uint height, uint ss, T quality, T centerX
string GetEmberCLKernelString(Ember& ember, bool iter, bool log, bool de, uint ss, bool accum)
{
ostringstream os;
- IterOpenCLKernelCreator iterCreator(false);
+ IterOpenCLKernelCreator iterCreator;
DEOpenCLKernelCreator deCreator(false, false);
FinalAccumOpenCLKernelCreator accumCreator(false);
pair> pair;
diff --git a/Source/Fractorium/AboutDialog.ui b/Source/Fractorium/AboutDialog.ui
index 4a13f20..5384bf2 100644
--- a/Source/Fractorium/AboutDialog.ui
+++ b/Source/Fractorium/AboutDialog.ui
@@ -7,7 +7,7 @@
0
0
488
- 567
+ 595
@@ -58,7 +58,7 @@
- <html><head/><body><p align="center"><br/><span style=" font-size:12pt;">Fractorium 0.4.1.9 Beta</span></p><p align="center"><span style=" font-size:10pt;"><br/>A Qt-based fractal flame editor which uses a C++ re-write of the flam3 algorithm named Ember and a GPU capable version named EmberCL which implements a portion of the cuburn algorithm in OpenCL.</span></p><p align="center"><span style=" font-size:10pt;">Matt Feemster</span></p></body></html>
+ <html><head/><body><p align="center"><br/>Fractorium 0.4.1.9 Beta</p><p align="center"><span style=" font-size:10pt;"><br/>A Qt-based fractal flame editor which uses a C++ re-write of the flam3 algorithm named Ember and a GPU capable version named EmberCL which implements a portion of the cuburn algorithm in OpenCL.</span></p><p align="center"><span style=" font-size:10pt;">Lead: Matt Feemster</span></p><p align="center"><span style=" font-size:10pt;">Contributors: Simon Detheridge</span></p></body></html>
Qt::RichText