--User changes

-Remove Hue as a saved parameter, as well as animation parameters associated with it. It's now a GUI-only field that is never saved.
 -Make histogram, density filter buffer, and all associated fields always float, even when using double. In that case, only the iteration calculations are now double. Suggested by Thomas Ludwig.
 -Print all three kernels in EmberRender when the --dump_kernel option is specified.
 -Apply variations filter to randoms.

--Bug fixes
 -Fix bug where hue was not being preserved when switching controllers and embers. The bug was very hard to reproduce, but it is mostly avoided by eliminating hue as a saved parameter.

--Code changes
 -De-templatize DEOpenCLKernelCreator and FinalAccumOpenCLKernelCreator. Their constructors now take a bool parameter to specify double precision.
 -To accommodate the buffers being float, introduce new typedefs in EmberCL called real_bucket_t, real4_bucket, and real4reals_bucket.
 -Density and spatial filtering structs now use real_bucket_t for their real-valued fields.
 -ConvertDensityFilter() and ConvertSpatialFilter() no longer return a value; they just assign to the member.
This commit is contained in:
mfeemster
2015-08-10 20:10:23 -07:00
parent 6b702334b9
commit eecd3c254f
38 changed files with 695 additions and 771 deletions

View File

@ -4,58 +4,22 @@
namespace EmberCLns
{
/// <summary>
/// Empty constructor that does nothing. The user must call the one which takes a bool
/// argument before using this class.
/// This constructor only exists so the class can be a member of a class.
/// </summary>
template <typename T>
DEOpenCLKernelCreator<T>::DEOpenCLKernelCreator()
{
}
/// <summary>
/// Constructor for float template type that sets all kernel entry points as well as composes
/// all kernel source strings.
/// No program compilation is done here, the user must explicitly do it.
/// The caller must specify whether they are using an nVidia or AMD card because it changes
/// the amount of local memory available.
/// </summary>
/// <param name="nVidia">True if running on an nVidia card, else false.</param>
template <>
DEOpenCLKernelCreator<float>::DEOpenCLKernelCreator(bool nVidia)
{
m_NVidia = nVidia;
m_LogScaleAssignDEEntryPoint = "LogScaleAssignDensityFilterKernel";
m_GaussianDEWithoutSsEntryPoint = "GaussianDEWithoutSsKernel";
m_GaussianDESsWithScfEntryPoint = "GaussianDESsWithScfKernel";
m_GaussianDESsWithoutScfEntryPoint = "GaussianDESsWithoutScfKernel";
m_GaussianDEWithoutSsNoCacheEntryPoint = "GaussianDEWithoutSsNoCacheKernel";
m_GaussianDESsWithScfNoCacheEntryPoint = "GaussianDESsWithScfNoCacheKernel";
m_GaussianDESsWithoutScfNoCacheEntryPoint = "GaussianDESsWithoutScfNoCacheKernel";
m_LogScaleAssignDEKernel = CreateLogScaleAssignDEKernelString();
m_GaussianDEWithoutSsKernel = CreateGaussianDEKernel(1);
m_GaussianDESsWithScfKernel = CreateGaussianDEKernel(2);
m_GaussianDESsWithoutScfKernel = CreateGaussianDEKernel(3);
m_GaussianDEWithoutSsNoCacheKernel = CreateGaussianDEKernelNoLocalCache(1);
m_GaussianDESsWithScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache(2);
m_GaussianDESsWithoutScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache(3);
}
/// <summary>
/// Constructor for double template type that sets all kernel entry points as well as composes
/// Constructor that sets all kernel entry points as well as composes
/// all kernel source strings.
/// Note that no versions of kernels that use the cache are compiled because
/// the cache is not big enough to hold double4.
/// No program compilation is done here, the user must explicitly do it.
/// Specifying true or false for the bool parameter has no effect since no local memory
/// is used when instantiated with type double.
/// The caller must specify whether they are using an nVidia or AMD card because it changes
/// the amount of local memory available.
/// </summary>
/// <param name="nVidia">True if running on an nVidia card, else false. Ignored.</param>
template <>
DEOpenCLKernelCreator<double>::DEOpenCLKernelCreator(bool nVidia)
/// <param name="doublePrecision">True if double precision, else false for float.</param>
/// <param name="nVidia">True if running on an nVidia card, else false.</param>
DEOpenCLKernelCreator::DEOpenCLKernelCreator(bool doublePrecision, bool nVidia)
{
#ifdef ROW_ONLY_DE
m_DoublePrecision = doublePrecision;
m_NVidia = nVidia;
#ifdef ROW_ONLY_DE
m_LogScaleAssignDEEntryPoint = "LogScaleAssignDensityFilterKernel";
m_GaussianDEWithoutSsEntryPoint = "GaussianDEWithoutSsKernel";
m_GaussianDESsWithScfEntryPoint = "GaussianDESsWithScfKernel";
@ -71,24 +35,29 @@ DEOpenCLKernelCreator<double>::DEOpenCLKernelCreator(bool nVidia)
m_GaussianDESsWithScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache(2);
m_GaussianDESsWithoutScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache(3);
#else
m_NVidia = nVidia;
m_LogScaleAssignDEEntryPoint = "LogScaleAssignDensityFilterKernel";
m_GaussianDEWithoutSsNoCacheEntryPoint = "GaussianDEWithoutSsNoCacheKernel";
m_GaussianDESsWithScfNoCacheEntryPoint = "GaussianDESsWithScfNoCacheKernel";
m_LogScaleAssignDEEntryPoint = "LogScaleAssignDensityFilterKernel";
m_GaussianDEWithoutSsEntryPoint = "GaussianDEWithoutSsKernel";
m_GaussianDESsWithScfEntryPoint = "GaussianDESsWithScfKernel";
m_GaussianDESsWithoutScfEntryPoint = "GaussianDESsWithoutScfKernel";
m_GaussianDEWithoutSsNoCacheEntryPoint = "GaussianDEWithoutSsNoCacheKernel";
m_GaussianDESsWithScfNoCacheEntryPoint = "GaussianDESsWithScfNoCacheKernel";
m_GaussianDESsWithoutScfNoCacheEntryPoint = "GaussianDESsWithoutScfNoCacheKernel";
m_LogScaleAssignDEKernel = CreateLogScaleAssignDEKernelString();
m_GaussianDEWithoutSsNoCacheKernel = CreateGaussianDEKernelNoLocalCache(1);
m_GaussianDESsWithScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache(2);
m_GaussianDESsWithoutScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache(3);
#endif
m_LogScaleAssignDEKernel = CreateLogScaleAssignDEKernelString();
m_GaussianDEWithoutSsKernel = CreateGaussianDEKernel(1);
m_GaussianDESsWithScfKernel = CreateGaussianDEKernel(2);
m_GaussianDESsWithoutScfKernel = CreateGaussianDEKernel(3);
m_GaussianDEWithoutSsNoCacheKernel = CreateGaussianDEKernelNoLocalCache(1);
m_GaussianDESsWithScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache(2);
m_GaussianDESsWithoutScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache(3);
#endif
}
/// <summary>
/// Kernel source and entry point properties, getters only.
/// </summary>
template <typename T> string DEOpenCLKernelCreator<T>::LogScaleAssignDEKernel() { return m_LogScaleAssignDEKernel; }
template <typename T> string DEOpenCLKernelCreator<T>::LogScaleAssignDEEntryPoint() { return m_LogScaleAssignDEEntryPoint; }
string DEOpenCLKernelCreator::LogScaleAssignDEKernel() { return m_LogScaleAssignDEKernel; }
string DEOpenCLKernelCreator::LogScaleAssignDEEntryPoint() { return m_LogScaleAssignDEEntryPoint; }
/// <summary>
/// Get the kernel source for the specified supersample and filterWidth.
@ -96,11 +65,10 @@ template <typename T> string DEOpenCLKernelCreator<T>::LogScaleAssignDEEntryPoin
/// <param name="ss">The supersample being used</param>
/// <param name="filterWidth">Filter width</param>
/// <returns>The kernel source</returns>
template <typename T>
string DEOpenCLKernelCreator<T>::GaussianDEKernel(size_t ss, uint filterWidth)
string DEOpenCLKernelCreator::GaussianDEKernel(size_t ss, uint filterWidth)
{
#ifndef ROW_ONLY_DE
if ((typeid(T) == typeid(double)) || (filterWidth > MaxDEFilterSize()))//Type double does not use cache.
if (filterWidth > MaxDEFilterSize())
{
if (ss > 1)
{
@ -133,11 +101,10 @@ string DEOpenCLKernelCreator<T>::GaussianDEKernel(size_t ss, uint filterWidth)
/// <param name="ss">The supersample being used</param>
/// <param name="filterWidth">Filter width</param>
/// <returns>The name of the density estimation filtering entry point kernel function</returns>
template <typename T>
string DEOpenCLKernelCreator<T>::GaussianDEEntryPoint(size_t ss, uint filterWidth)
string DEOpenCLKernelCreator::GaussianDEEntryPoint(size_t ss, uint filterWidth)
{
#ifndef ROW_ONLY_DE
if ((typeid(T) == typeid(double)) || (filterWidth > MaxDEFilterSize()))//Type double does not use cache.
if (filterWidth > MaxDEFilterSize())
{
if (ss > 1)
{
@ -169,8 +136,7 @@ string DEOpenCLKernelCreator<T>::GaussianDEEntryPoint(size_t ss, uint filterWidt
/// Filters larger than this value will run the version without local memory caching.
/// </summary>
/// <returns>The maximum filter size allowed for running the local memory version of density filtering</returns>
template <typename T>
uint DEOpenCLKernelCreator<T>::MaxDEFilterSize() { return 9; }//The true max would be (maxBoxSize - 1) / 2, but that's impractical because it can give us a tiny block size.
uint DEOpenCLKernelCreator::MaxDEFilterSize() { return 9; }//The true max would be (maxBoxSize - 1) / 2, but that's impractical because it can give us a tiny block size.
/// <summary>
/// Solve for the maximum filter radius.
@ -185,8 +151,7 @@ uint DEOpenCLKernelCreator<T>::MaxDEFilterSize() { return 9; }//The true max wou
/// <param name="desiredFilterSize">Size of the desired filter.</param>
/// <param name="ss">The supersample being used</param>
/// <returns>The maximum filter radius allowed</returns>
template <typename T>
T DEOpenCLKernelCreator<T>::SolveMaxDERad(uint maxBoxSize, T desiredFilterSize, T ss)
double DEOpenCLKernelCreator::SolveMaxDERad(uint maxBoxSize, double desiredFilterSize, double ss)
{
uint finalFilterSize = uint((ceil(desiredFilterSize) * ss) + (ss - 1.0));
@ -195,7 +160,7 @@ T DEOpenCLKernelCreator<T>::SolveMaxDERad(uint maxBoxSize, T desiredFilterSize,
return desiredFilterSize;
//The final size doesn't fit, so scale the original down until it fits.
return T(floor((MaxDEFilterSize() - (ss - 1.0)) / ss));
return floor((MaxDEFilterSize() - (ss - 1.0)) / ss);
}
/// <summary>
@ -204,10 +169,9 @@ T DEOpenCLKernelCreator<T>::SolveMaxDERad(uint maxBoxSize, T desiredFilterSize,
/// </summary>
/// <param name="localMem">The local memory available to a block</param>
/// <returns>The maximum filter box size allowed</returns>
template <typename T>
uint DEOpenCLKernelCreator<T>::SolveMaxBoxSize(uint localMem)
uint DEOpenCLKernelCreator::SolveMaxBoxSize(uint localMem)
{
return uint(floor(std::sqrt(floor(T(localMem) / 16.0))));//Divide by 16 because each element is float4.
return uint(floor(std::sqrt(floor(localMem / 16.0))));//Divide by 16 because each element is float4.
}
/// <summary>
@ -215,17 +179,16 @@ uint DEOpenCLKernelCreator<T>::SolveMaxBoxSize(uint localMem)
/// Use this when Passes == 1.
/// </summary>
/// <returns>The kernel string</returns>
template <typename T>
string DEOpenCLKernelCreator<T>::CreateLogScaleAssignDEKernelString()
string DEOpenCLKernelCreator::CreateLogScaleAssignDEKernelString()
{
ostringstream os;
os <<
ConstantDefinesString(typeid(T) == typeid(double)) <<
ConstantDefinesString(m_DoublePrecision) <<
DensityFilterCLStructString <<
"__kernel void " << m_LogScaleAssignDEEntryPoint << "(\n"
" const __global real4* histogram,\n"
" __global real4* accumulator,\n"
" const __global real4_bucket* histogram,\n"
" __global real4_bucket* accumulator,\n"
" __constant DensityFilterCL* logFilter\n"
"\t)\n"
"{\n"
@ -235,7 +198,7 @@ string DEOpenCLKernelCreator<T>::CreateLogScaleAssignDEKernelString()
"\n"
" if (histogram[index].w != 0)\n"
" {\n"
" real_t logScale = (logFilter->m_K1 * log(1.0 + histogram[index].w * logFilter->m_K2)) / histogram[index].w;\n"
" real_bucket_t logScale = (logFilter->m_K1 * log(1.0 + histogram[index].w * logFilter->m_K2)) / histogram[index].w;\n"
"\n"
" accumulator[index] = histogram[index] * logScale;\n"//Using a single real4 vector operation doubles the speed from doing each component individually.
" }\n"
@ -248,23 +211,22 @@ string DEOpenCLKernelCreator<T>::CreateLogScaleAssignDEKernelString()
}
#ifdef ROW_ONLY_DE
template <typename T>
string DEOpenCLKernelCreator<T>::CreateGaussianDEKernel(size_t ss)
string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
{
bool doSS = ss > 1;
bool doScf = !(ss & 1);
ostringstream os;
os <<
ConstantDefinesString(typeid(T) == typeid(double)) <<
ConstantDefinesString(m_DoublePrecision) <<
DensityFilterCLStructString <<
UnionCLStructString <<
"__kernel void " << GaussianDEEntryPoint(ss, MaxDEFilterSize()) << "(\n" <<
" const __global real4* histogram,\n"
" __global real4reals* accumulator,\n"
" const __global real4_bucket* histogram,\n"
" __global real4reals_bucket* accumulator,\n"
" __constant DensityFilterCL* densityFilter,\n"
" const __global real_t* filterCoefs,\n"
" const __global real_t* filterWidths,\n"
" const __global real_bucket_t* filterCoefs,\n"
" const __global real_bucket_t* filterWidths,\n"
" const __global uint* coefIndices,\n"
" const uint chunkSizeW,\n"
" const uint chunkSizeH,\n"
@ -282,7 +244,7 @@ string DEOpenCLKernelCreator<T>::CreateGaussianDEKernel(size_t ss)
if (doSS)
{
os <<
" uint ss = (uint)floor((real_t)densityFilter->m_Supersample / 2.0);\n"
" uint ss = (uint)floor((real_bucket_t)densityFilter->m_Supersample / 2.0);\n"
" int densityBoxLeftX;\n"
" int densityBoxRightX;\n"
" int densityBoxTopY;\n"
@ -291,7 +253,7 @@ string DEOpenCLKernelCreator<T>::CreateGaussianDEKernel(size_t ss)
if (doScf)
os <<
" real_t scfact = pow(densityFilter->m_Supersample / (densityFilter->m_Supersample + (real_t)1.0), (real_t)2.0);\n";
" real_bucket_t scfact = pow(densityFilter->m_Supersample / (densityFilter->m_Supersample + (real_bucket_t)1.0), (real_bucket_t)2.0);\n";
}
os <<
@ -320,7 +282,7 @@ string DEOpenCLKernelCreator<T>::CreateGaussianDEKernel(size_t ss)
"\n"
//Last, the indices in the global accumulator that the local bounds will be writing to.
" accumWriteStartCol = blockHistStartCol - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first column in the accumulator this block will write to.
" colsToWrite = ceil((real_t)(boxReadEndCol - boxReadStartCol) / (real_t)BLOCK_SIZE_X);\n"//Elements per thread to be written to the accumulator.
" colsToWrite = ceil((real_bucket_t)(boxReadEndCol - boxReadStartCol) / (real_bucket_t)BLOCK_SIZE_X);\n"//Elements per thread to be written to the accumulator.
" histCol = blockHistStartCol + THREAD_ID_X;\n"//The histogram column this individual thread will be reading from.
"\n"
" if (histCol >= rightBound)\n"
@ -331,15 +293,15 @@ string DEOpenCLKernelCreator<T>::CreateGaussianDEKernel(size_t ss)
//These are the local indices for the local data that are temporarily accumulated to before
//writing out to the global accumulator.
" uint boxCol = densityFilter->m_FilterWidth + THREAD_ID_X;\n"
" uint colsToZeroOffset, colsToZero = ceil((real_t)fullTempBoxWidth / (real_t)(BLOCK_SIZE_X));\n"//Usually is 2.
" uint colsToZeroOffset, colsToZero = ceil((real_bucket_t)fullTempBoxWidth / (real_bucket_t)(BLOCK_SIZE_X));\n"//Usually is 2.
" int i, j, k, jmin, jmax;\n"
" uint filterSelectInt, filterCoefIndex;\n"
" real_t cacheLog;\n"
" real_t filterSelect;\n"
" real4 bucket;\n"
" real_bucket_t cacheLog;\n"
" real_bucket_t filterSelect;\n"
" real4_bucket bucket;\n"
;
os << " __local real4reals filterBox[192];\n";//Must be >= fullTempBoxWidth.
os << " __local real4reals_bucket filterBox[192];\n";//Must be >= fullTempBoxWidth.
os <<
"\n"
@ -389,7 +351,7 @@ string DEOpenCLKernelCreator<T>::CreateGaussianDEKernel(size_t ss)
" else if (filterSelect <= DE_THRESH)\n"
" filterSelectInt = (int)ceil(filterSelect) - 1;\n"
" else if (filterSelect != 0)\n"
" filterSelectInt = (int)DE_THRESH + (int)floor(pow((real_t)(filterSelect - DE_THRESH), densityFilter->m_Curve));\n"
" filterSelectInt = (int)DE_THRESH + (int)floor(pow((real_bucket_t)(filterSelect - DE_THRESH), densityFilter->m_Curve));\n"
" else\n"
" filterSelectInt = 0;\n"
"\n"
@ -477,23 +439,22 @@ string DEOpenCLKernelCreator<T>::CreateGaussianDEKernel(size_t ss)
/// </summary>
/// <param name="ss">The supersample being used</param>
/// <returns>The kernel string</returns>
template <typename T>
string DEOpenCLKernelCreator<T>::CreateGaussianDEKernel(size_t ss)
string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
{
bool doSS = ss > 1;
bool doScf = !(ss & 1);
ostringstream os;
os <<
ConstantDefinesString(typeid(T) == typeid(double)) <<
ConstantDefinesString(m_DoublePrecision) <<
DensityFilterCLStructString <<
UnionCLStructString <<
"__kernel void " << GaussianDEEntryPoint(ss, MaxDEFilterSize()) << "(\n" <<
" const __global real4* histogram,\n"
" __global real4reals* accumulator,\n"
" const __global real4_bucket* histogram,\n"
" __global real4reals_bucket* accumulator,\n"
" __constant DensityFilterCL* densityFilter,\n"
" const __global real_t* filterCoefs,\n"
" const __global real_t* filterWidths,\n"
" const __global real_bucket_t* filterCoefs,\n"
" const __global real_bucket_t* filterWidths,\n"
" const __global uint* coefIndices,\n"
" const uint chunkSizeW,\n"
" const uint chunkSizeH,\n"
@ -509,7 +470,7 @@ string DEOpenCLKernelCreator<T>::CreateGaussianDEKernel(size_t ss)
if (doSS)
{
os <<
" uint ss = (uint)floor((real_t)densityFilter->m_Supersample / 2.0);\n"
" uint ss = (uint)floor((real_bucket_t)densityFilter->m_Supersample / 2.0);\n"
" int densityBoxLeftX;\n"
" int densityBoxRightX;\n"
" int densityBoxTopY;\n"
@ -518,7 +479,7 @@ string DEOpenCLKernelCreator<T>::CreateGaussianDEKernel(size_t ss)
if (doScf)
os <<
" real_t scfact = pow(densityFilter->m_Supersample / (densityFilter->m_Supersample + (real_t)1.0), (real_t)2.0);\n";
" real_bucket_t scfact = pow(densityFilter->m_Supersample / (densityFilter->m_Supersample + (real_bucket_t)1.0), (real_bucket_t)2.0);\n";
}
//Compute the size of the temporary box which is the block width + 2 * filter width x block height + 2 * filter width.
@ -561,7 +522,7 @@ string DEOpenCLKernelCreator<T>::CreateGaussianDEKernel(size_t ss)
//Last, the indices in the global accumulator that the local bounds will be writing to.
" accumWriteStartRow = blockHistStartRow - min(densityFilter->m_FilterWidth, blockHistStartRow);\n"//Will be fw - 0 except for boundary columns, it will be less.
" accumWriteStartCol = blockHistStartCol - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"
" colsToWrite = ceil((real_t)(boxReadEndCol - boxReadStartCol) / (real_t)BLOCK_SIZE_X);\n"
" colsToWrite = ceil((real_bucket_t)(boxReadEndCol - boxReadStartCol) / (real_bucket_t)BLOCK_SIZE_X);\n"
"\n"
" uint threadHistRow = blockHistStartRow + THREAD_ID_Y;\n"//The histogram row this individual thread will be reading from.
" uint threadHistCol = blockHistStartCol + THREAD_ID_X;\n"//The histogram column this individual thread will be reading from.
@ -573,19 +534,19 @@ string DEOpenCLKernelCreator<T>::CreateGaussianDEKernel(size_t ss)
//writing out to the global accumulator.
" uint boxRow = densityFilter->m_FilterWidth + THREAD_ID_Y;\n"
" uint boxCol = densityFilter->m_FilterWidth + THREAD_ID_X;\n"
" uint colElementsToZero = ceil((real_t)fullTempBoxWidth / (real_t)(BLOCK_SIZE_X));\n"//Usually is 2.
" uint colElementsToZero = ceil((real_bucket_t)fullTempBoxWidth / (real_bucket_t)(BLOCK_SIZE_X));\n"//Usually is 2.
" int i, j, k;\n"
" uint filterSelectInt, filterCoefIndex;\n"
" real_t cacheLog;\n"
" real_t filterSelect;\n"
" real4 bucket;\n"
" real_bucket_t cacheLog;\n"
" real_bucket_t filterSelect;\n"
" real4_bucket bucket;\n"
;
//This will be treated as having dimensions of (BLOCK_SIZE_X + (fw * 2)) x (BLOCK_SIZE_Y + (fw * 2)).
if (m_NVidia)
os << " __local real4reals filterBox[3000];\n";
os << " __local real4reals_bucket filterBox[3000];\n";
else
os << " __local real4reals filterBox[1200];\n";
os << " __local real4reals_bucket filterBox[1200];\n";
os <<
//Zero the temp buffers first. This splits the zeroization evenly across all threads (columns) in the first block row.
@ -662,7 +623,7 @@ string DEOpenCLKernelCreator<T>::CreateGaussianDEKernel(size_t ss)
" else if (filterSelect <= DE_THRESH)\n"
" filterSelectInt = (int)ceil(filterSelect) - 1;\n"
" else\n"
" filterSelectInt = (int)DE_THRESH + (int)floor(pow((real_t)(filterSelect - DE_THRESH), densityFilter->m_Curve));\n"
" filterSelectInt = (int)DE_THRESH + (int)floor(pow((real_bucket_t)(filterSelect - DE_THRESH), densityFilter->m_Curve));\n"
"\n"
" if (filterSelectInt > densityFilter->m_MaxFilterIndex)\n"
" filterSelectInt = densityFilter->m_MaxFilterIndex;\n"
@ -736,24 +697,23 @@ string DEOpenCLKernelCreator<T>::CreateGaussianDEKernel(size_t ss)
/// </summary>
/// <param name="ss">The supersample being used</param>
/// <returns>The kernel string</returns>
template <typename T>
string DEOpenCLKernelCreator<T>::CreateGaussianDEKernelNoLocalCache(size_t ss)
string DEOpenCLKernelCreator::CreateGaussianDEKernelNoLocalCache(size_t ss)
{
bool doSS = ss > 1;
bool doScf = !(ss & 1);
ostringstream os;
os <<
ConstantDefinesString(typeid(T) == typeid(double)) <<
ConstantDefinesString(m_DoublePrecision) <<
DensityFilterCLStructString <<
UnionCLStructString <<
AddToAccumWithCheckFunctionString <<
"__kernel void " << GaussianDEEntryPoint(ss, MaxDEFilterSize() + 1) << "(\n" <<
" const __global real4* histogram,\n"
" __global real4reals* accumulator,\n"
" const __global real4_bucket* histogram,\n"
" __global real4reals_bucket* accumulator,\n"
" __constant DensityFilterCL* densityFilter,\n"
" const __global real_t* filterCoefs,\n"
" const __global real_t* filterWidths,\n"
" const __global real_bucket_t* filterCoefs,\n"
" const __global real_bucket_t* filterWidths,\n"
" const __global uint* coefIndices,\n"
" const uint chunkSizeW,\n"
" const uint chunkSizeH,\n"
@ -769,14 +729,14 @@ string DEOpenCLKernelCreator<T>::CreateGaussianDEKernelNoLocalCache(size_t ss)
if (doSS)
{
os <<
" uint ss = (uint)floor((real_t)densityFilter->m_Supersample / 2.0);\n"
" uint ss = (uint)floor((real_bucket_t)densityFilter->m_Supersample / 2.0);\n"
" int densityBoxLeftX;\n"
" int densityBoxRightX;\n"
" int densityBoxTopY;\n"
" int densityBoxBottomY;\n";
if (doScf)
os << " real_t scfact = pow((real_t)densityFilter->m_Supersample / ((real_t)densityFilter->m_Supersample + (real_t)1.0), (real_t)2.0);\n";
os << " real_bucket_t scfact = pow((real_bucket_t)densityFilter->m_Supersample / ((real_bucket_t)densityFilter->m_Supersample + (real_bucket_t)1.0), (real_bucket_t)2.0);\n";
}
os <<
@ -796,10 +756,9 @@ string DEOpenCLKernelCreator<T>::CreateGaussianDEKernelNoLocalCache(size_t ss)
"\n"
" int i, j;\n"
" uint filterSelectInt, filterCoefIndex;\n"
" real_t cacheLog;\n"
" real_t logScale;\n"
" real_t filterSelect;\n"
" real4 bucket;\n"
" real_bucket_t cacheLog;\n"
" real_bucket_t filterSelect;\n"
" real4_bucket bucket;\n"
"\n"
" if (threadHistRow < botBound && threadHistCol < rightBound)\n"
" {\n"
@ -843,7 +802,7 @@ string DEOpenCLKernelCreator<T>::CreateGaussianDEKernelNoLocalCache(size_t ss)
" else if (filterSelect <= DE_THRESH)\n"
" filterSelectInt = (int)ceil(filterSelect) - 1;\n"
" else\n"
" filterSelectInt = (int)DE_THRESH + (int)floor(pow((real_t)(filterSelect - DE_THRESH), densityFilter->m_Curve));\n"
" filterSelectInt = (int)DE_THRESH + (int)floor(pow((real_bucket_t)(filterSelect - DE_THRESH), densityFilter->m_Curve));\n"
"\n"
" if (filterSelectInt > densityFilter->m_MaxFilterIndex)\n"
" filterSelectInt = densityFilter->m_MaxFilterIndex;\n"
@ -877,10 +836,4 @@ string DEOpenCLKernelCreator<T>::CreateGaussianDEKernelNoLocalCache(size_t ss)
return os.str();
}
template EMBERCL_API class DEOpenCLKernelCreator<float>;
#ifdef DO_DOUBLE
template EMBERCL_API class DEOpenCLKernelCreator<double>;
#endif
}

View File

@ -27,14 +27,11 @@ namespace EmberCLns
/// ends up being not much faster than doing it on the CPU.
/// String members are kept for the program source and entry points
/// for each version of the program.
/// Template argument expected to be float or double.
/// </summary>
template <typename T>
class EMBERCL_API DEOpenCLKernelCreator
{
public:
DEOpenCLKernelCreator();
DEOpenCLKernelCreator(bool nVidia);
DEOpenCLKernelCreator(bool doublePrecision, bool nVidia);
//Accessors.
string LogScaleAssignDEKernel();
@ -44,7 +41,7 @@ public:
//Miscellaneous static functions.
static uint MaxDEFilterSize();
static T SolveMaxDERad(uint maxBoxSize, T desiredFilterSize, T ss);
static double SolveMaxDERad(uint maxBoxSize, double desiredFilterSize, double ss);
static uint SolveMaxBoxSize(uint localMem);
private:
@ -74,6 +71,7 @@ private:
string m_GaussianDESsWithoutScfNoCacheKernel;
string m_GaussianDESsWithoutScfNoCacheEntryPoint;
bool m_DoublePrecision;
bool m_NVidia;
};
}

View File

@ -15,9 +15,9 @@ namespace EmberCLns
static const char* RgbToHsvFunctionString =
//rgb 0 - 1,
//h 0 - 6, s 0 - 1, v 0 - 1
"static inline void RgbToHsv(real4* rgb, real4* hsv)\n"
"static inline void RgbToHsv(real4_bucket* rgb, real4_bucket* hsv)\n"
"{\n"
" real_t max, min, del, rc, gc, bc;\n"
" real_bucket_t max, min, del, rc, gc, bc;\n"
"\n"
//Compute maximum of r, g, b.
" if ((*rgb).x >= (*rgb).y)\n"
@ -85,10 +85,10 @@ static const char* RgbToHsvFunctionString =
static const char* HsvToRgbFunctionString =
//h 0 - 6, s 0 - 1, v 0 - 1
//rgb 0 - 1
"static inline void HsvToRgb(real4* hsv, real4* rgb)\n"
"static inline void HsvToRgb(real4_bucket* hsv, real4_bucket* rgb)\n"
"{\n"
" int j;\n"
" real_t f, p, q, t;\n"
" real_bucket_t f, p, q, t;\n"
"\n"
" while ((*hsv).x >= 6)\n"
" (*hsv).x = (*hsv).x - 6;\n"
@ -119,9 +119,9 @@ static const char* HsvToRgbFunctionString =
/// OpenCL equivalent of Palette::CalcAlpha().
/// </summary>
static const char* CalcAlphaFunctionString =
"static inline real_t CalcAlpha(real_t density, real_t gamma, real_t linrange)\n"//Not the slightest clue what this is doing.//DOC
"static inline real_t CalcAlpha(real_bucket_t density, real_bucket_t gamma, real_bucket_t linrange)\n"//Not the slightest clue what this is doing.//DOC
"{\n"
" real_t frac, alpha, funcval = pow(linrange, gamma);\n"
" real_bucket_t frac, alpha, funcval = pow(linrange, gamma);\n"
"\n"
" if (density > 0)\n"
" {\n"
@ -147,7 +147,7 @@ static const char* CalcAlphaFunctionString =
/// during final accumulation, which only takes floats.
/// </summary>
static const char* CurveAdjustFunctionString =
"static inline void CurveAdjust(__constant real4reals* csa, float* a, uint index)\n"
"static inline void CurveAdjust(__constant real4reals_bucket* csa, float* a, uint index)\n"
"{\n"
" uint tempIndex = (uint)Clamp(*a, 0.0, (float)COLORMAP_LENGTH_MINUS_1);\n"
" uint tempIndex2 = (uint)Clamp(csa[tempIndex].m_Real4.x, 0.0, (real_t)COLORMAP_LENGTH_MINUS_1);\n"
@ -359,18 +359,18 @@ static string AtomicString(bool doublePrecision, bool dp64AtomicSupport)
if (!doublePrecision || dp64AtomicSupport)
{
os <<
"void AtomicAdd(volatile __global real_t* source, const real_t operand)\n"
"void AtomicAdd(volatile __global real_bucket_t* source, const real_bucket_t operand)\n"
"{\n"
" union\n"
" {\n"
" atomi intVal;\n"
" real_t realVal;\n"
" real_bucket_t realVal;\n"
" } newVal;\n"
"\n"
" union\n"
" {\n"
" atomi intVal;\n"
" real_t realVal;\n"
" real_bucket_t realVal;\n"
" } prevVal;\n"
"\n"
" do\n"
@ -383,18 +383,18 @@ static string AtomicString(bool doublePrecision, bool dp64AtomicSupport)
else//They want double precision and do not have dp atomic support.
{
os <<
"void AtomicAdd(volatile __global real_t* source, const real_t operand)\n"
"void AtomicAdd(volatile __global double* source, const double operand)\n"
"{\n"
" union\n"
" {\n"
" uint intVal[2];\n"
" real_t realVal;\n"
" double realVal;\n"
" } newVal;\n"
"\n"
" union\n"
" {\n"
" uint intVal[2];\n"
" real_t realVal;\n"
" double realVal;\n"
" } prevVal;\n"
"\n"
" do\n"
@ -408,27 +408,4 @@ static string AtomicString(bool doublePrecision, bool dp64AtomicSupport)
return os.str();
}
#ifdef GRAVEYARD
/*"void AtomicLocalAdd(volatile __local real_t* source, const real_t operand)\n"
"{\n"
" union\n"
" {\n"
" atomi intVal;\n"
" real_t realVal;\n"
" } newVal;\n"
"\n"
" union\n"
" {\n"
" atomi intVal;\n"
" real_t realVal;\n"
" } prevVal;\n"
"\n"
" do\n"
" {\n"
" prevVal.realVal = *source;\n"
" newVal.realVal = prevVal.realVal + operand;\n"
" } while (atomic_cmpxchg((volatile __local atomi*)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);\n"
"}\n"*/
#endif
}

View File

@ -41,7 +41,9 @@ static string ConstantDefinesString(bool doublePrecision)
<< "typedef long intPrec;\n"
<< "typedef ulong atomi;\n"
<< "typedef double real_t;\n"
<< "typedef float real_bucket_t;\n"//Assume buckets are always float, even though iter calcs are in double.
<< "typedef double4 real4;\n"
<< "typedef float4 real4_bucket;\n"//And here too.
<< "#define EPS (DBL_EPSILON)\n"
;
}
@ -50,7 +52,9 @@ static string ConstantDefinesString(bool doublePrecision)
os << "typedef int intPrec;\n"
"typedef uint atomi;\n"
"typedef float real_t;\n"
"typedef float real_bucket_t;\n"
"typedef float4 real4;\n"
"typedef float4 real4_bucket;\n"
"#define EPS (FLT_EPSILON)\n"
;
}
@ -284,9 +288,9 @@ struct ALIGN DensityFilterCL
static const char* DensityFilterCLStructString =
"typedef struct __attribute__ " ALIGN_CL " _DensityFilterCL\n"
"{\n"
" real_t m_Curve;\n"
" real_t m_K1;\n"
" real_t m_K2;\n"
" real_bucket_t m_Curve;\n"
" real_bucket_t m_K1;\n"
" real_bucket_t m_K2;\n"
" uint m_Supersample;\n"
" uint m_SuperRasW;\n"
" uint m_SuperRasH;\n"
@ -340,11 +344,11 @@ static const char* SpatialFilterCLStructString =
" uint m_DensityFilterOffset;\n"
" uint m_Transparency;\n"
" uint m_YAxisUp;\n"
" real_t m_Vibrancy;\n"
" real_t m_HighlightPower;\n"
" real_t m_Gamma;\n"
" real_t m_LinRange;\n"
" real_t m_Background[4];\n"//For some reason, using float4/double4 here does not align no matter what. So just use an array of 4.
" real_bucket_t m_Vibrancy;\n"
" real_bucket_t m_HighlightPower;\n"
" real_bucket_t m_Gamma;\n"
" real_bucket_t m_LinRange;\n"
" real_bucket_t m_Background[4];\n"//For some reason, using float4/double4 here does not align no matter what. So just use an array of 4.
"} SpatialFilterCL;\n"
"\n";
@ -383,5 +387,11 @@ static const char* UnionCLStructString =
" real4 m_Real4;\n"
" real_t m_Reals[4];\n"
"} real4reals;\n"
"\n"
"typedef union\n"//Used to match the bucket template type.
"{\n"
" real4_bucket m_Real4;\n"
" real_bucket_t m_Reals[4];\n"
"} real4reals_bucket;\n"
"\n";
}

View File

@ -7,9 +7,9 @@ namespace EmberCLns
/// Constructor that creates all kernel strings.
/// The caller will access these strings through the accessor functions.
/// </summary>
template <typename T>
FinalAccumOpenCLKernelCreator<T>::FinalAccumOpenCLKernelCreator()
FinalAccumOpenCLKernelCreator::FinalAccumOpenCLKernelCreator(bool doublePrecision)
{
m_DoublePrecision = doublePrecision;
m_GammaCorrectionWithAlphaCalcEntryPoint = "GammaCorrectionWithAlphaCalcKernel";
m_GammaCorrectionWithoutAlphaCalcEntryPoint = "GammaCorrectionWithoutAlphaCalcKernel";
@ -37,24 +37,24 @@ FinalAccumOpenCLKernelCreator<T>::FinalAccumOpenCLKernelCreator()
/// Kernel source and entry point properties, getters only.
/// </summary>
template <typename T> string FinalAccumOpenCLKernelCreator<T>::GammaCorrectionWithAlphaCalcKernel() { return m_GammaCorrectionWithAlphaCalcKernel; }
template <typename T> string FinalAccumOpenCLKernelCreator<T>::GammaCorrectionWithAlphaCalcEntryPoint() { return m_GammaCorrectionWithAlphaCalcEntryPoint; }
template <typename T> string FinalAccumOpenCLKernelCreator<T>::GammaCorrectionWithoutAlphaCalcKernel() { return m_GammaCorrectionWithoutAlphaCalcKernel; }
template <typename T> string FinalAccumOpenCLKernelCreator<T>::GammaCorrectionWithoutAlphaCalcEntryPoint() { return m_GammaCorrectionWithoutAlphaCalcEntryPoint; }
string FinalAccumOpenCLKernelCreator::GammaCorrectionWithAlphaCalcKernel() { return m_GammaCorrectionWithAlphaCalcKernel; }
string FinalAccumOpenCLKernelCreator::GammaCorrectionWithAlphaCalcEntryPoint() { return m_GammaCorrectionWithAlphaCalcEntryPoint; }
string FinalAccumOpenCLKernelCreator::GammaCorrectionWithoutAlphaCalcKernel() { return m_GammaCorrectionWithoutAlphaCalcKernel; }
string FinalAccumOpenCLKernelCreator::GammaCorrectionWithoutAlphaCalcEntryPoint() { return m_GammaCorrectionWithoutAlphaCalcEntryPoint; }
template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumEarlyClipKernel() { return m_FinalAccumEarlyClipKernel; }
template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumEarlyClipEntryPoint() { return m_FinalAccumEarlyClipEntryPoint; }
template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel() { return m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel; }
template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint() { return m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint; }
template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel() { return m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel; }
template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint() { return m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint; }
string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipKernel() { return m_FinalAccumEarlyClipKernel; }
string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipEntryPoint() { return m_FinalAccumEarlyClipEntryPoint; }
string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel() { return m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel; }
string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint() { return m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint; }
string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel() { return m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel; }
string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint() { return m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint; }
template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumLateClipKernel() { return m_FinalAccumLateClipKernel; }
template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumLateClipEntryPoint() { return m_FinalAccumLateClipEntryPoint; }
template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel() { return m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel; }
template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint() { return m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint; }
template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel() { return m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel; }
template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint() { return m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint; }
string FinalAccumOpenCLKernelCreator::FinalAccumLateClipKernel() { return m_FinalAccumLateClipKernel; }
string FinalAccumOpenCLKernelCreator::FinalAccumLateClipEntryPoint() { return m_FinalAccumLateClipEntryPoint; }
string FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel() { return m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel; }
string FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint() { return m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint; }
string FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel() { return m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel; }
string FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint() { return m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint; }
/// <summary>
/// Get the gamma correction entry point.
@ -62,8 +62,7 @@ template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumLateCli
/// <param name="channels">The number of channels used, 3 or 4.</param>
/// <param name="transparency">True if channels equals 4 and using transparency, else false.</param>
/// <returns>The name of the gamma correction entry point kernel function</returns>
template <typename T>
string FinalAccumOpenCLKernelCreator<T>::GammaCorrectionEntryPoint(size_t channels, bool transparency)
string FinalAccumOpenCLKernelCreator::GammaCorrectionEntryPoint(size_t channels, bool transparency)
{
bool alphaCalc = ((channels > 3) && transparency);
return alphaCalc ? m_GammaCorrectionWithAlphaCalcEntryPoint : m_GammaCorrectionWithoutAlphaCalcEntryPoint;
@ -75,8 +74,7 @@ string FinalAccumOpenCLKernelCreator<T>::GammaCorrectionEntryPoint(size_t channe
/// <param name="channels">The number of channels used, 3 or 4.</param>
/// <param name="transparency">True if channels equals 4 and using transparency, else false.</param>
/// <returns>The gamma correction kernel string</returns>
template <typename T>
string FinalAccumOpenCLKernelCreator<T>::GammaCorrectionKernel(size_t channels, bool transparency)
string FinalAccumOpenCLKernelCreator::GammaCorrectionKernel(size_t channels, bool transparency)
{
bool alphaCalc = ((channels > 3) && transparency);
return alphaCalc ? m_GammaCorrectionWithAlphaCalcKernel : m_GammaCorrectionWithoutAlphaCalcKernel;
@ -91,16 +89,15 @@ string FinalAccumOpenCLKernelCreator<T>::GammaCorrectionKernel(size_t channels,
/// <param name="alphaBase">Storage for the alpha base value used in the kernel. 0 if transparency is true, else 255.</param>
/// <param name="alphaScale">Storage for the alpha scale value used in the kernel. 255 if transparency is true, else 0.</param>
/// <returns>The name of the final accumulation entry point kernel function</returns>
template <typename T>
string FinalAccumOpenCLKernelCreator<T>::FinalAccumEntryPoint(bool earlyClip, size_t channels, bool transparency, T& alphaBase, T& alphaScale)
string FinalAccumOpenCLKernelCreator::FinalAccumEntryPoint(bool earlyClip, size_t channels, bool transparency, double& alphaBase, double& alphaScale)
{
bool alphaCalc = ((channels > 3) && transparency);
bool alphaAccum = channels > 3;
if (alphaAccum)
{
alphaBase = transparency ? 0.0f : 255.0f;//See the table below.
alphaScale = transparency ? 255.0f : 0.0f;
alphaBase = transparency ? 0 : 255;//See the table below.
alphaScale = transparency ? 255 : 0;
}
if (earlyClip)
@ -134,8 +131,7 @@ string FinalAccumOpenCLKernelCreator<T>::FinalAccumEntryPoint(bool earlyClip, si
/// <param name="channels">The number of channels used, 3 or 4.</param>
/// <param name="transparency">True if channels equals 4 and using transparency, else false.</param>
/// <returns>The final accumulation kernel string</returns>
template <typename T>
string FinalAccumOpenCLKernelCreator<T>::FinalAccumKernel(bool earlyClip, size_t channels, bool transparency)
string FinalAccumOpenCLKernelCreator::FinalAccumKernel(bool earlyClip, size_t channels, bool transparency)
{
bool alphaCalc = (channels > 3 && transparency);
bool alphaAccum = channels > 3;
@ -171,8 +167,7 @@ string FinalAccumOpenCLKernelCreator<T>::FinalAccumKernel(bool earlyClip, size_t
/// <param name="channels">The number of channels used, 3 or 4.</param>
/// <param name="transparency">True if channels equals 4 and using transparency, else false.</param>
/// <returns>The final accumulation kernel string</returns>
template <typename T>
string FinalAccumOpenCLKernelCreator<T>::CreateFinalAccumKernelString(bool earlyClip, size_t channels, bool transparency)
string FinalAccumOpenCLKernelCreator::CreateFinalAccumKernelString(bool earlyClip, size_t channels, bool transparency)
{
return CreateFinalAccumKernelString(earlyClip, (channels > 3 && transparency), channels > 3);
}
@ -184,14 +179,13 @@ string FinalAccumOpenCLKernelCreator<T>::CreateFinalAccumKernelString(bool early
/// <param name="alphaCalc">True if channels equals 4 and transparency is desired, else false.</param>
/// <param name="alphaAccum">True if channels equals 4</param>
/// <returns>The final accumulation kernel string</returns>
template <typename T>
string FinalAccumOpenCLKernelCreator<T>::CreateFinalAccumKernelString(bool earlyClip, bool alphaCalc, bool alphaAccum)
string FinalAccumOpenCLKernelCreator::CreateFinalAccumKernelString(bool earlyClip, bool alphaCalc, bool alphaAccum)
{
ostringstream os;
string channels = alphaAccum ? "4" : "3";
os <<
ConstantDefinesString(typeid(T) == typeid(double)) <<
ConstantDefinesString(m_DoublePrecision) <<
ClampRealFunctionString <<
UnionCLStructString <<
RgbToHsvFunctionString <<
@ -228,14 +222,14 @@ string FinalAccumOpenCLKernelCreator<T>::CreateFinalAccumKernelString(bool early
}
os <<
" const __global real4reals* accumulator,\n"
" const __global real4reals_bucket* accumulator,\n"
" __write_only image2d_t pixels,\n"
" __constant SpatialFilterCL* spatialFilter,\n"
" __constant real_t* filterCoefs,\n"
" __constant real4reals* csa,\n"
" __constant real_bucket_t* filterCoefs,\n"
" __constant real4reals_bucket* csa,\n"
" const uint doCurves,\n"
" const real_t alphaBase,\n"
" const real_t alphaScale\n"
" const real_bucket_t alphaBase,\n"
" const real_bucket_t alphaScale\n"
"\t)\n"
"{\n"
"\n"
@ -250,8 +244,8 @@ string FinalAccumOpenCLKernelCreator<T>::CreateFinalAccumKernelString(bool early
" float4floats finalColor;\n"
" int ii, jj;\n"
" uint filterKRowIndex;\n"
" const __global real4reals* accumBucket;\n"
" real4reals newBucket;\n"
" const __global real4reals_bucket* accumBucket;\n"
" real4reals_bucket newBucket;\n"
" newBucket.m_Real4 = 0;\n"
"\n"
" for (jj = 0; jj < spatialFilter->m_FilterWidth; jj++)\n"
@ -260,7 +254,7 @@ string FinalAccumOpenCLKernelCreator<T>::CreateFinalAccumKernelString(bool early
"\n"
" for (ii = 0; ii < spatialFilter->m_FilterWidth; ii++)\n"
" {\n"
" real_t k = filterCoefs[ii + filterKRowIndex];\n"
" real_bucket_t k = filterCoefs[ii + filterKRowIndex];\n"
"\n"
" accumBucket = accumulator + (accumX + ii) + ((accumY + jj) * spatialFilter->m_SuperRasW);\n"
" newBucket.m_Real4 += (k * accumBucket->m_Real4);\n"
@ -287,10 +281,10 @@ string FinalAccumOpenCLKernelCreator<T>::CreateFinalAccumKernelString(bool early
else
{
//Late clip, so must gamma correct from the temp new bucket to temp float4.
if (typeid(T) == typeid(double))
if (m_DoublePrecision)
{
os <<
" real4reals realFinal;\n"
" real4reals_bucket realFinal;\n"
"\n"
" GammaCorrectionFloats(&newBucket, &(spatialFilter->m_Background[0]), spatialFilter->m_Gamma, spatialFilter->m_LinRange, spatialFilter->m_Vibrancy, spatialFilter->m_HighlightPower, alphaBase, alphaScale, &(realFinal.m_Reals[0]));\n"
" finalColor.m_Float4.x = (float)realFinal.m_Real4.x;\n"
@ -333,21 +327,20 @@ string FinalAccumOpenCLKernelCreator<T>::CreateFinalAccumKernelString(bool early
/// <param name="alphaAccum">True if channels equals 4</param>
/// <param name="finalOut">True if writing to a non-__global output for final accumulation (late clip), else false to write back to the __global buffer (early clip).</param>
/// <returns>The gamma correction function string</returns>
template <typename T>
string FinalAccumOpenCLKernelCreator<T>::CreateGammaCorrectionFunctionString(bool globalBucket, bool alphaCalc, bool alphaAccum, bool finalOut)
string FinalAccumOpenCLKernelCreator::CreateGammaCorrectionFunctionString(bool globalBucket, bool alphaCalc, bool alphaAccum, bool finalOut)
{
ostringstream os;
string dataType;
string unionMember;
dataType = "real_t";
dataType = "real_bucket_t";
//Use real_bucket_t for all cases, early clip and final accum.
os << "void GammaCorrectionFloats(" << (globalBucket ? "__global " : "") << "real4reals* bucket, __constant real_t* background, real_t g, real_t linRange, real_t vibrancy, real_t highlightPower, real_t alphaBase, real_t alphaScale, " << (finalOut ? "" : "__global") << " real_t* correctedChannels)\n";
os << "void GammaCorrectionFloats(" << (globalBucket ? "__global " : "") << "real4reals_bucket* bucket, __constant real_bucket_t* background, real_bucket_t g, real_bucket_t linRange, real_bucket_t vibrancy, real_bucket_t highlightPower, real_bucket_t alphaBase, real_bucket_t alphaScale, " << (finalOut ? "" : "__global") << " real_bucket_t* correctedChannels)\n";
os
<< "{\n"
<< " real_t alpha, ls, tmp, a;\n"
<< " real4reals newRgb;\n"
<< " real_bucket_t alpha, ls, tmp, a;\n"
<< " real4reals_bucket newRgb;\n"
<< "\n"
<< " if (bucket->m_Reals[3] <= 0)\n"
<< " {\n"
@ -359,7 +352,7 @@ string FinalAccumOpenCLKernelCreator<T>::CreateGammaCorrectionFunctionString(boo
<< " tmp = bucket->m_Reals[3];\n"
<< " alpha = CalcAlpha(tmp, g, linRange);\n"
<< " ls = vibrancy * 256.0 * alpha / tmp;\n"
<< " ClampRef(&alpha, 0.0, 1.0);\n"
<< " alpha = clamp(alpha, (real_bucket_t)0.0, (real_bucket_t)1.0);\n"
<< " }\n"
<< "\n"
<< " CalcNewRgb(bucket, ls, highlightPower, &newRgb);\n"
@ -385,7 +378,7 @@ string FinalAccumOpenCLKernelCreator<T>::CreateGammaCorrectionFunctionString(boo
os <<
"\n"
" correctedChannels[rgbi] = (" << dataType << ")clamp(a, (real_t)0.0, (real_t)255.0);\n"
" correctedChannels[rgbi] = (" << dataType << ")clamp(a, (real_bucket_t)0.0, (real_bucket_t)255.0);\n"
" }\n"
"\n";
@ -416,19 +409,18 @@ string FinalAccumOpenCLKernelCreator<T>::CreateGammaCorrectionFunctionString(boo
/// </summary>
/// <param name="globalBucket">True if writing the corrected value to a global buffer (early clip), else false (late clip).</param>
/// <returns>The CalcNewRgb function string</returns>
template <typename T>
string FinalAccumOpenCLKernelCreator<T>::CreateCalcNewRgbFunctionString(bool globalBucket)
string FinalAccumOpenCLKernelCreator::CreateCalcNewRgbFunctionString(bool globalBucket)
{
ostringstream os;
os <<
"static void CalcNewRgb(" << (globalBucket ? "__global " : "") << "real4reals* oldRgb, real_t ls, real_t highPow, real4reals* newRgb)\n"
"static void CalcNewRgb(" << (globalBucket ? "__global " : "") << "real4reals_bucket* oldRgb, real_bucket_t ls, real_bucket_t highPow, real4reals_bucket* newRgb)\n"
"{\n"
" int rgbi;\n"
" real_t newls, lsratio;\n"
" real4reals newHsv;\n"
" real_t maxa, maxc;\n"
" real_t adjhlp;\n"
" real_bucket_t newls, lsratio;\n"
" real4reals_bucket newHsv;\n"
" real_bucket_t maxa, maxc;\n"
" real_bucket_t adjhlp;\n"
"\n"
" if (ls == 0 || (oldRgb->m_Real4.x == 0 && oldRgb->m_Real4.y == 0 && oldRgb->m_Real4.z == 0))\n"//Can't do a vector compare to zero.
" {\n"
@ -485,14 +477,13 @@ string FinalAccumOpenCLKernelCreator<T>::CreateCalcNewRgbFunctionString(bool glo
/// </summary>
/// <param name="alphaCalc">True if channels equals 4 and transparency is desired, else false.</param>
/// <returns>The gamma correction kernel string used for early clipping</returns>
template <typename T>
string FinalAccumOpenCLKernelCreator<T>::CreateGammaCorrectionKernelString(bool alphaCalc)
string FinalAccumOpenCLKernelCreator::CreateGammaCorrectionKernelString(bool alphaCalc)
{
ostringstream os;
string dataType;
os <<
ConstantDefinesString(typeid(T) == typeid(double)) <<
ConstantDefinesString(m_DoublePrecision) <<
ClampRealFunctionString <<
UnionCLStructString <<
RgbToHsvFunctionString <<
@ -503,7 +494,7 @@ string FinalAccumOpenCLKernelCreator<T>::CreateGammaCorrectionKernelString(bool
CreateGammaCorrectionFunctionString(true, alphaCalc, true, false);//Will only be used with float in this case, early clip. Will always alpha accum.
os << "__kernel void " << (alphaCalc ? m_GammaCorrectionWithAlphaCalcEntryPoint : m_GammaCorrectionWithoutAlphaCalcEntryPoint) << "(\n" <<
" __global real4reals* accumulator,\n"
" __global real4reals_bucket* accumulator,\n"
" __constant SpatialFilterCL* spatialFilter\n"
")\n"
"{\n"
@ -513,7 +504,7 @@ string FinalAccumOpenCLKernelCreator<T>::CreateGammaCorrectionKernelString(bool
" return;\n"
"\n"
" uint superIndex = (GLOBAL_ID_Y * spatialFilter->m_SuperRasW) + GLOBAL_ID_X;\n"
" __global real4reals* bucket = accumulator + superIndex;\n"
" __global real4reals_bucket* bucket = accumulator + superIndex;\n"
//Pass in an alphaBase and alphaScale of 0, 1 which means to just directly assign the computed alpha value.
" GammaCorrectionFloats(bucket, &(spatialFilter->m_Background[0]), spatialFilter->m_Gamma, spatialFilter->m_LinRange, spatialFilter->m_Vibrancy, spatialFilter->m_HighlightPower, 0.0, 1.0, &(bucket->m_Reals[0]));\n"
"}\n"
@ -521,10 +512,4 @@ string FinalAccumOpenCLKernelCreator<T>::CreateGammaCorrectionKernelString(bool
return os.str();
}
template EMBERCL_API class FinalAccumOpenCLKernelCreator<float>;
#ifdef DO_DOUBLE
template EMBERCL_API class FinalAccumOpenCLKernelCreator<double>;
#endif
}

View File

@ -19,13 +19,11 @@ namespace EmberCLns
/// Early clip/late clip
/// Alpha channel, no alpha channel
/// Alpha with/without transparency
/// Template argument expected to be float or double.
/// </summary>
template <typename T>
class EMBERCL_API FinalAccumOpenCLKernelCreator
{
public:
FinalAccumOpenCLKernelCreator();
FinalAccumOpenCLKernelCreator(bool doublePrecision);
string GammaCorrectionWithAlphaCalcKernel();
string GammaCorrectionWithAlphaCalcEntryPoint();
@ -48,7 +46,7 @@ public:
string FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint();
string GammaCorrectionEntryPoint(size_t channels, bool transparency);
string GammaCorrectionKernel(size_t channels, bool transparency);
string FinalAccumEntryPoint(bool earlyClip, size_t channels, bool transparency, T& alphaBase, T& alphaScale);
string FinalAccumEntryPoint(bool earlyClip, size_t channels, bool transparency, double& alphaBase, double& alphaScale);
string FinalAccumKernel(bool earlyClip, size_t channels, bool transparency);
private:
@ -77,5 +75,7 @@ private:
string m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint;
string m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel;//False, true.
string m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint;
bool m_DoublePrecision;
};
}

View File

@ -6,20 +6,11 @@
namespace EmberCLns
{
/// <summary>
/// Empty constructor that does nothing. The user must call the one which takes a bool
/// argument before using this class.
/// This constructor only exists so the class can be a member of a class.
/// </summary>
template <typename T>
IterOpenCLKernelCreator<T>::IterOpenCLKernelCreator()
{
}
/// <summary>
/// Constructor that sets up some basic entry point strings and creates
/// the zeroization kernel string since it requires no conditional inputs.
/// </summary>
/// <param name="nVidia">True if running on an nVidia card, else false.</param>
template <typename T>
IterOpenCLKernelCreator<T>::IterOpenCLKernelCreator(bool nVidia)
{
@ -242,7 +233,7 @@ string IterOpenCLKernelCreator<T>::CreateIterKernelString(Ember<T>& ember, strin
" __constant real_t* parVars,\n"
" __global uchar* xformDistributions,\n"//Using uchar is quicker than uint. Can't be constant because the size can be too large to fit when using xaos.//FINALOPT
" __constant CarToRasCL* carToRas,\n"
" __global real4reals* histogram,\n"
" __global real4reals_bucket* histogram,\n"
" uint histSize,\n"
" __read_only image2d_t palette,\n"
" __global Point* points\n"
@ -506,41 +497,16 @@ string IterOpenCLKernelCreator<T>::CreateIterKernelString(Ember<T>& ember, strin
if (lockAccum)
{
if (typeid(T) == typeid(double))
{
os <<
" AtomicAdd(&(histogram[histIndex].m_Reals[0]), (real_t)palColor1.x * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"//Always apply opacity, even though it's usually 1.
" AtomicAdd(&(histogram[histIndex].m_Reals[1]), (real_t)palColor1.y * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
" AtomicAdd(&(histogram[histIndex].m_Reals[2]), (real_t)palColor1.z * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
" AtomicAdd(&(histogram[histIndex].m_Reals[3]), (real_t)palColor1.w * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
}
else
{
os <<
" AtomicAdd(&(histogram[histIndex].m_Reals[0]), palColor1.x * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"//Always apply opacity, even though it's usually 1.
" AtomicAdd(&(histogram[histIndex].m_Reals[1]), palColor1.y * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
" AtomicAdd(&(histogram[histIndex].m_Reals[2]), palColor1.z * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
" AtomicAdd(&(histogram[histIndex].m_Reals[3]), palColor1.w * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
}
os <<
" AtomicAdd(&(histogram[histIndex].m_Reals[0]), palColor1.x * (real_bucket_t)xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"//Always apply opacity, even though it's usually 1.
" AtomicAdd(&(histogram[histIndex].m_Reals[1]), palColor1.y * (real_bucket_t)xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
" AtomicAdd(&(histogram[histIndex].m_Reals[2]), palColor1.z * (real_bucket_t)xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
" AtomicAdd(&(histogram[histIndex].m_Reals[3]), palColor1.w * (real_bucket_t)xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
}
else
{
if (typeid(T) == typeid(double))
{
os <<
" real4 realColor;\n"
"\n"
" realColor.x = (real_t)palColor1.x;\n"
" realColor.y = (real_t)palColor1.y;\n"
" realColor.z = (real_t)palColor1.z;\n"
" realColor.w = (real_t)palColor1.w;\n"
" histogram[histIndex].m_Real4 += (realColor * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
}
else
{
os <<
" histogram[histIndex].m_Real4 += (palColor1 * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
}
os <<
" histogram[histIndex].m_Real4 += (palColor1 * (real_bucket_t)xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";//real_bucket_t should always be float.
}
os <<

View File

@ -23,7 +23,6 @@ template <typename T>
class EMBERCL_API IterOpenCLKernelCreator
{
public:
IterOpenCLKernelCreator();
IterOpenCLKernelCreator(bool nVidia);
string ZeroizeKernel();
string ZeroizeEntryPoint();
@ -41,22 +40,6 @@ private:
string m_ZeroizeEntryPoint;
bool m_NVidia;
};
//
//template EMBERCL_API class IterOpenCLKernelCreator<float>;
//
//#ifdef DO_DOUBLE
// template EMBERCL_API class IterOpenCLKernelCreator<double>;
//#endif
//
//template EMBERCL_API string IterOpenCLKernelCreator::CreateIterKernelString<float>(Ember<float>& ember, string& parVarDefines, bool lockAccum, bool doAccum);
//template EMBERCL_API string IterOpenCLKernelCreator::CreateIterKernelString<double>(Ember<double>& ember, string& parVarDefines, bool lockAccum, bool doAccum);
//
//template EMBERCL_API void IterOpenCLKernelCreator::ParVarIndexDefines<float>(Ember<float>& ember, pair<string, vector<float>>& params, bool doVals, bool doString);
//template EMBERCL_API void IterOpenCLKernelCreator::ParVarIndexDefines<double>(Ember<double>& ember, pair<string, vector<double>>& params, bool doVals, bool doString);
//
//template EMBERCL_API bool IterOpenCLKernelCreator::IsBuildRequired<float>(Ember<float>& ember1, Ember<float>& ember2);
//template EMBERCL_API bool IterOpenCLKernelCreator::IsBuildRequired<double>(Ember<double>& ember1, Ember<double>& ember2);
#ifdef OPEN_CL_TEST_AREA
typedef void (*KernelFuncPointer) (uint gridWidth, uint gridHeight, uint blockWidth, uint blockHeight,

View File

@ -6,13 +6,18 @@ namespace EmberCLns
/// <summary>
/// Constructor that initializes various buffer names, block dimensions, image formats
/// and finally initializes OpenCL using the passed in parameters.
/// Kernel creators are set to be non-nvidia by default. Will be properly set in Init().
/// </summary>
/// <param name="platform">The index of the platform to use. Default: 0.</param>
/// <param name="device">The index of the device to use. Default: 0.</param>
/// <param name="shared">True if shared with OpenGL, else false. Default: false.</param>
/// <param name="outputTexID">The texture ID of the shared OpenGL texture if shared. Default: 0.</param>
template <typename T>
RendererCL<T>::RendererCL(uint platform, uint device, bool shared, GLuint outputTexID)
template <typename T, typename bucketT>
RendererCL<T, bucketT>::RendererCL(uint platform, uint device, bool shared, GLuint outputTexID)
:
m_IterOpenCLKernelCreator(false),
m_DEOpenCLKernelCreator(typeid(T) == typeid(double), false),
m_FinalAccumOpenCLKernelCreator(typeid(T) == typeid(double))
{
m_Init = false;
m_NVidia = false;
@ -61,8 +66,8 @@ RendererCL<T>::RendererCL(uint platform, uint device, bool shared, GLuint output
/// <summary>
/// Virtual destructor.
/// </summary>
template <typename T>
RendererCL<T>::~RendererCL()
template <typename T, typename bucketT>
RendererCL<T, bucketT>::~RendererCL()
{
}
@ -82,8 +87,8 @@ RendererCL<T>::~RendererCL()
/// <param name="shared">True if shared with OpenGL, else false.</param>
/// <param name="outputTexID">The texture ID of the shared OpenGL texture if shared</param>
/// <returns>True if success, else false.</returns>
template <typename T>
bool RendererCL<T>::Init(uint platform, uint device, bool shared, GLuint outputTexID)
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::Init(uint platform, uint device, bool shared, GLuint outputTexID)
{
//Timing t;
bool b = true;
@ -101,12 +106,12 @@ bool RendererCL<T>::Init(uint platform, uint device, bool shared, GLuint outputT
//Use find() to match the whole "nvidia" substring; find_first_of() would match any single one of its characters (e.g. the 'a' or 'd' in "amd").
m_NVidia = ToLower(m_Wrapper.DeviceAndPlatformNames()).find("nvidia") != string::npos && m_Wrapper.LocalMemSize() > (32 * 1024);
m_WarpSize = m_NVidia ? 32 : 64;
m_IterOpenCLKernelCreator = IterOpenCLKernelCreator<T>(m_NVidia);
m_DEOpenCLKernelCreator = DEOpenCLKernelCreator<T>(m_NVidia);
m_DEOpenCLKernelCreator = DEOpenCLKernelCreator(m_DoublePrecision, m_NVidia);
string zeroizeProgram = m_IterOpenCLKernelCreator.ZeroizeKernel();
string logAssignProgram = m_DEOpenCLKernelCreator.LogScaleAssignDEKernel();//Build a couple of simple programs to ensure OpenCL is working right.
if (b && !(b = m_Wrapper.AddProgram(m_IterOpenCLKernelCreator.ZeroizeEntryPoint(), zeroizeProgram, m_IterOpenCLKernelCreator.ZeroizeEntryPoint(), m_DoublePrecision))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddProgram(m_IterOpenCLKernelCreator.ZeroizeEntryPoint(), zeroizeProgram, m_IterOpenCLKernelCreator.ZeroizeEntryPoint(), m_DoublePrecision))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddProgram(m_DEOpenCLKernelCreator.LogScaleAssignDEEntryPoint(), logAssignProgram, m_DEOpenCLKernelCreator.LogScaleAssignDEEntryPoint(), m_DoublePrecision))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddAndWriteImage("Palette", CL_MEM_READ_ONLY, m_PaletteFormat, 256, 1, 0, nullptr))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_SeedsBufferName, reinterpret_cast<void*>(m_Seeds.data()), SizeOf(m_Seeds)))) { m_ErrorReport.push_back(loc); }
@ -130,8 +135,8 @@ bool RendererCL<T>::Init(uint platform, uint device, bool shared, GLuint outputT
/// </summary>
/// <param name="outputTexID">The texture ID of the shared OpenGL texture if shared</param>
/// <returns>True if success, else false.</returns>
template <typename T>
bool RendererCL<T>::SetOutputTexture(GLuint outputTexID)
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::SetOutputTexture(GLuint outputTexID)
{
bool success = true;
const char* loc = __FUNCTION__;
@ -157,38 +162,38 @@ bool RendererCL<T>::SetOutputTexture(GLuint outputTexID)
/// </summary>
//Iters per kernel/block/grid.
template <typename T> uint RendererCL<T>::IterCountPerKernel() const { return m_IterCountPerKernel; }
template <typename T> uint RendererCL<T>::IterCountPerBlock() const { return IterCountPerKernel() * IterBlockKernelCount(); }
template <typename T> uint RendererCL<T>::IterCountPerGrid() const { return IterCountPerKernel() * IterGridKernelCount(); }
template <typename T, typename bucketT> uint RendererCL<T, bucketT>::IterCountPerKernel() const { return m_IterCountPerKernel; }
template <typename T, typename bucketT> uint RendererCL<T, bucketT>::IterCountPerBlock() const { return IterCountPerKernel() * IterBlockKernelCount(); }
template <typename T, typename bucketT> uint RendererCL<T, bucketT>::IterCountPerGrid() const { return IterCountPerKernel() * IterGridKernelCount(); }
//Kernels per block.
template <typename T> uint RendererCL<T>::IterBlockKernelWidth() const { return m_IterBlockWidth; }
template <typename T> uint RendererCL<T>::IterBlockKernelHeight() const { return m_IterBlockHeight; }
template <typename T> uint RendererCL<T>::IterBlockKernelCount() const { return IterBlockKernelWidth() * IterBlockKernelHeight(); }
template <typename T, typename bucketT> uint RendererCL<T, bucketT>::IterBlockKernelWidth() const { return m_IterBlockWidth; }
template <typename T, typename bucketT> uint RendererCL<T, bucketT>::IterBlockKernelHeight() const { return m_IterBlockHeight; }
template <typename T, typename bucketT> uint RendererCL<T, bucketT>::IterBlockKernelCount() const { return IterBlockKernelWidth() * IterBlockKernelHeight(); }
//Kernels per grid.
template <typename T> uint RendererCL<T>::IterGridKernelWidth() const { return IterGridBlockWidth() * IterBlockKernelWidth(); }
template <typename T> uint RendererCL<T>::IterGridKernelHeight() const { return IterGridBlockHeight() * IterBlockKernelHeight(); }
template <typename T> uint RendererCL<T>::IterGridKernelCount() const { return IterGridKernelWidth() * IterGridKernelHeight(); }
template <typename T, typename bucketT> uint RendererCL<T, bucketT>::IterGridKernelWidth() const { return IterGridBlockWidth() * IterBlockKernelWidth(); }
template <typename T, typename bucketT> uint RendererCL<T, bucketT>::IterGridKernelHeight() const { return IterGridBlockHeight() * IterBlockKernelHeight(); }
template <typename T, typename bucketT> uint RendererCL<T, bucketT>::IterGridKernelCount() const { return IterGridKernelWidth() * IterGridKernelHeight(); }
//Blocks per grid.
template <typename T> uint RendererCL<T>::IterGridBlockWidth() const { return m_IterBlocksWide; }
template <typename T> uint RendererCL<T>::IterGridBlockHeight() const { return m_IterBlocksHigh; }
template <typename T> uint RendererCL<T>::IterGridBlockCount() const { return IterGridBlockWidth() * IterGridBlockHeight(); }
template <typename T, typename bucketT> uint RendererCL<T, bucketT>::IterGridBlockWidth() const { return m_IterBlocksWide; }
template <typename T, typename bucketT> uint RendererCL<T, bucketT>::IterGridBlockHeight() const { return m_IterBlocksHigh; }
template <typename T, typename bucketT> uint RendererCL<T, bucketT>::IterGridBlockCount() const { return IterGridBlockWidth() * IterGridBlockHeight(); }
template <typename T> uint RendererCL<T>::PlatformIndex() { return m_Wrapper.PlatformIndex(); }
template <typename T> uint RendererCL<T>::DeviceIndex() { return m_Wrapper.DeviceIndex(); }
template <typename T, typename bucketT> uint RendererCL<T, bucketT>::PlatformIndex() { return m_Wrapper.PlatformIndex(); }
template <typename T, typename bucketT> uint RendererCL<T, bucketT>::DeviceIndex() { return m_Wrapper.DeviceIndex(); }
/// <summary>
/// Read the histogram into the host side CPU buffer.
/// Used for debugging.
/// </summary>
/// <returns>True if success, else false.</returns>
template <typename T>
bool RendererCL<T>::ReadHist()
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::ReadHist()
{
if (Renderer<T, T>::Alloc())//Allocate the memory to read into.
return m_Wrapper.ReadBuffer(m_HistBufferName, reinterpret_cast<void*>(HistBuckets()), SuperSize() * sizeof(v4T));
if (Renderer<T, bucketT>::Alloc())//Allocate the memory to read into.
return m_Wrapper.ReadBuffer(m_HistBufferName, reinterpret_cast<void*>(HistBuckets()), SuperSize() * sizeof(v4bT));
return false;
}
@ -198,11 +203,11 @@ bool RendererCL<T>::ReadHist()
/// Used for debugging.
/// </summary>
/// <returns>True if success, else false.</returns>
template <typename T>
bool RendererCL<T>::ReadAccum()
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::ReadAccum()
{
if (Renderer<T, T>::Alloc())//Allocate the memory to read into.
return m_Wrapper.ReadBuffer(m_AccumBufferName, reinterpret_cast<void*>(AccumulatorBuckets()), SuperSize() * sizeof(v4T));
if (Renderer<T, bucketT>::Alloc())//Allocate the memory to read into.
return m_Wrapper.ReadBuffer(m_AccumBufferName, reinterpret_cast<void*>(AccumulatorBuckets()), SuperSize() * sizeof(v4bT));
return false;
}
@ -213,8 +218,8 @@ bool RendererCL<T>::ReadAccum()
/// </summary>
/// <param name="vec">The host side buffer to read into</param>
/// <returns>True if success, else false.</returns>
template <typename T>
bool RendererCL<T>::ReadPoints(vector<PointCL<T>>& vec)
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::ReadPoints(vector<PointCL<T>>& vec)
{
vec.resize(IterGridKernelCount());//Allocate the memory to read into.
@ -228,20 +233,20 @@ bool RendererCL<T>::ReadPoints(vector<PointCL<T>>& vec)
/// Clear the histogram buffer with all zeroes.
/// </summary>
/// <returns>True if success, else false.</returns>
template <typename T>
bool RendererCL<T>::ClearHist()
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::ClearHist()
{
return ClearBuffer(m_HistBufferName, uint(SuperRasW()), uint(SuperRasH()), sizeof(v4T));
return ClearBuffer(m_HistBufferName, uint(SuperRasW()), uint(SuperRasH()), sizeof(v4bT));
}
/// <summary>
/// Clear the density filtering buffer with all zeroes.
/// </summary>
/// <returns>True if success, else false.</returns>
template <typename T>
bool RendererCL<T>::ClearAccum()
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::ClearAccum()
{
return ClearBuffer(m_AccumBufferName, uint(SuperRasW()), uint(SuperRasH()), sizeof(v4T));
return ClearBuffer(m_AccumBufferName, uint(SuperRasW()), uint(SuperRasH()), sizeof(v4bT));
}
/// <summary>
@ -250,15 +255,15 @@ bool RendererCL<T>::ClearAccum()
/// </summary>
/// <param name="vec">The host side buffer whose values to write</param>
/// <returns>True if success, else false.</returns>
template <typename T>
bool RendererCL<T>::WritePoints(vector<PointCL<T>>& vec)
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::WritePoints(vector<PointCL<T>>& vec)
{
return m_Wrapper.WriteBuffer(m_PointsBufferName, reinterpret_cast<void*>(vec.data()), SizeOf(vec));
}
#ifdef TEST_CL
template <typename T>
bool RendererCL<T>::WriteRandomPoints()
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::WriteRandomPoints()
{
size_t size = IterGridKernelCount();
vector<PointCL<T>> vec(size);
@ -280,23 +285,23 @@ bool RendererCL<T>::WriteRandomPoints()
/// Get the kernel string for the last built iter program.
/// </summary>
/// <returns>The string representation of the kernel for the last built iter program.</returns>
template <typename T>
string RendererCL<T>::IterKernel() { return m_IterKernel; }
template <typename T, typename bucketT>
string RendererCL<T, bucketT>::IterKernel() { return m_IterKernel; }
/// <summary>
/// Get the kernel string for the last built density filtering program.
/// </summary>
/// <returns>The string representation of the kernel for the last built density filtering program.</returns>
template <typename T>
string RendererCL<T>::DEKernel() { return m_DEOpenCLKernelCreator.GaussianDEKernel(Supersample(), m_DensityFilterCL.m_FilterWidth); }
template <typename T, typename bucketT>
string RendererCL<T, bucketT>::DEKernel() { return m_DEOpenCLKernelCreator.GaussianDEKernel(Supersample(), m_DensityFilterCL.m_FilterWidth); }
/// <summary>
/// Get the kernel string for the last built final accumulation program.
/// </summary>
/// <returns>The string representation of the kernel for the last built final accumulation program.</returns>
template <typename T>
string RendererCL<T>::FinalAccumKernel() { return m_FinalAccumOpenCLKernelCreator.FinalAccumKernel(EarlyClip(), Renderer<T, T>::NumChannels(), Transparency()); }
template <typename T, typename bucketT>
string RendererCL<T, bucketT>::FinalAccumKernel() { return m_FinalAccumOpenCLKernelCreator.FinalAccumKernel(EarlyClip(), Renderer<T, bucketT>::NumChannels(), Transparency()); }
/// <summary>
/// Virtual functions overridden from RendererCLBase.
@ -308,8 +313,8 @@ string RendererCL<T>::FinalAccumKernel() { return m_FinalAccumOpenCLKernelCreato
/// </summary>
/// <param name="pixels">The host side buffer to read into</param>
/// <returns>True if success, else false.</returns>
template <typename T>
bool RendererCL<T>::ReadFinal(byte* pixels)
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::ReadFinal(byte* pixels)
{
if (pixels)
return m_Wrapper.ReadImage(m_FinalImageName, FinalRasW(), FinalRasH(), 0, m_Wrapper.Shared(), pixels);
@ -322,8 +327,8 @@ bool RendererCL<T>::ReadFinal(byte* pixels)
/// Slow, but never used because the final output image is always completely overwritten.
/// </summary>
/// <returns>True if success, else false.</returns>
template <typename T>
bool RendererCL<T>::ClearFinal()
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::ClearFinal()
{
vector<byte> v;
uint index = m_Wrapper.FindImageIndex(m_FinalImageName, m_Wrapper.Shared());
@ -349,8 +354,8 @@ bool RendererCL<T>::ClearFinal()
/// The amount of video RAM available on the GPU to render with.
/// </summary>
/// <returns>An unsigned 64-bit integer specifying how much video memory is available</returns>
template <typename T>
size_t RendererCL<T>::MemoryAvailable()
template <typename T, typename bucketT>
size_t RendererCL<T, bucketT>::MemoryAvailable()
{
return Ok() ? m_Wrapper.GlobalMemSize() : 0ULL;
}
@ -359,8 +364,8 @@ size_t RendererCL<T>::MemoryAvailable()
/// Return whether OpenCL has been properly initialized.
/// </summary>
/// <returns>True if OpenCL has been properly initialized, else false.</returns>
template <typename T>
bool RendererCL<T>::Ok() const
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::Ok() const
{
return m_Init;
}
@ -370,8 +375,8 @@ bool RendererCL<T>::Ok() const
/// since the output is actually an image rather than just a buffer.
/// </summary>
/// <param name="numChannels">The number of channels, ignored.</param>
template <typename T>
void RendererCL<T>::NumChannels(size_t numChannels)
template <typename T, typename bucketT>
void RendererCL<T, bucketT>::NumChannels(size_t numChannels)
{
m_NumChannels = 4;
}
@ -379,8 +384,8 @@ void RendererCL<T>::NumChannels(size_t numChannels)
/// <summary>
/// Dump the error report for this class as well as the OpenCLWrapper member.
/// </summary>
template <typename T>
void RendererCL<T>::DumpErrorReport()
template <typename T, typename bucketT>
void RendererCL<T, bucketT>::DumpErrorReport()
{
EmberReport::DumpErrorReport();
m_Wrapper.DumpErrorReport();
@ -389,8 +394,8 @@ void RendererCL<T>::DumpErrorReport()
/// <summary>
/// Clear the error report for this class as well as the OpenCLWrapper member.
/// </summary>
template <typename T>
void RendererCL<T>::ClearErrorReport()
template <typename T, typename bucketT>
void RendererCL<T, bucketT>::ClearErrorReport()
{
EmberReport::ClearErrorReport();
m_Wrapper.ClearErrorReport();
@ -402,8 +407,8 @@ void RendererCL<T>::ClearErrorReport()
/// change this.
/// </summary>
/// <returns>The number of iterations ran in a single kernel call</returns>
template <typename T>
size_t RendererCL<T>::SubBatchSize() const
template <typename T, typename bucketT>
size_t RendererCL<T, bucketT>::SubBatchSize() const
{
return IterCountPerGrid();
}
@ -413,8 +418,8 @@ size_t RendererCL<T>::SubBatchSize() const
/// the kernel internally runs many threads.
/// </summary>
/// <returns>1</returns>
template <typename T>
size_t RendererCL<T>::ThreadCount() const
template <typename T, typename bucketT>
size_t RendererCL<T, bucketT>::ThreadCount() const
{
return 1;
}
@ -425,22 +430,21 @@ size_t RendererCL<T>::ThreadCount() const
/// </summary>
/// <param name="newAlloc">True if a new filter instance was created, else false.</param>
/// <returns>True if success, else false.</returns>
template <typename T>
bool RendererCL<T>::CreateDEFilter(bool& newAlloc)
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::CreateDEFilter(bool& newAlloc)
{
bool b = true;
if (Renderer<T, T>::CreateDEFilter(newAlloc))
if (Renderer<T, bucketT>::CreateDEFilter(newAlloc))
{
//Copy coefs and widths here. Convert and copy the other filter params right before calling the filtering kernel.
if (newAlloc)
{
const char* loc = __FUNCTION__;
DensityFilter<T>* filter = dynamic_cast<DensityFilter<T>*>(GetDensityFilter());
if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_DECoefsBufferName, reinterpret_cast<void*>(const_cast<T*>(filter->Coefs())), filter->CoefsSizeBytes()))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_DEWidthsBufferName, reinterpret_cast<void*>(const_cast<T*>(filter->Widths())), filter->WidthsSizeBytes()))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_DECoefIndicesBufferName, reinterpret_cast<void*>(const_cast<uint*>(filter->CoefIndices())), filter->CoefsIndicesSizeBytes()))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_DECoefsBufferName, reinterpret_cast<void*>(const_cast<bucketT*>(m_DensityFilter->Coefs())), m_DensityFilter->CoefsSizeBytes()))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_DEWidthsBufferName, reinterpret_cast<void*>(const_cast<bucketT*>(m_DensityFilter->Widths())), m_DensityFilter->WidthsSizeBytes()))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_DECoefIndicesBufferName, reinterpret_cast<void*>(const_cast<uint*>(m_DensityFilter->CoefIndices())), m_DensityFilter->CoefsIndicesSizeBytes()))) { m_ErrorReport.push_back(loc); }
}
}
else
@ -455,15 +459,15 @@ bool RendererCL<T>::CreateDEFilter(bool& newAlloc)
/// </summary>
/// <param name="newAlloc">True if a new filter instance was created, else false.</param>
/// <returns>True if success, else false.</returns>
template <typename T>
bool RendererCL<T>::CreateSpatialFilter(bool& newAlloc)
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::CreateSpatialFilter(bool& newAlloc)
{
bool b = true;
if (Renderer<T, T>::CreateSpatialFilter(newAlloc))
if (Renderer<T, bucketT>::CreateSpatialFilter(newAlloc))
{
if (newAlloc)
if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_SpatialFilterCoefsBufferName, reinterpret_cast<void*>(GetSpatialFilter()->Filter()), GetSpatialFilter()->BufferSizeBytes()))) { m_ErrorReport.push_back(__FUNCTION__); }
if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_SpatialFilterCoefsBufferName, reinterpret_cast<void*>(m_SpatialFilter->Filter()), m_SpatialFilter->BufferSizeBytes()))) { m_ErrorReport.push_back(__FUNCTION__); }
}
else
@ -476,8 +480,8 @@ bool RendererCL<T>::CreateSpatialFilter(bool& newAlloc)
/// Get the renderer type enum.
/// </summary>
/// <returns>OPENCL_RENDERER</returns>
template <typename T>
eRendererType RendererCL<T>::RendererType() const
template <typename T, typename bucketT>
eRendererType RendererCL<T, bucketT>::RendererType() const
{
return OPENCL_RENDERER;
}
@ -487,8 +491,8 @@ eRendererType RendererCL<T>::RendererType() const
/// OpenCLWrapper member as a single string.
/// </summary>
/// <returns>The concatenated error report string</returns>
template <typename T>
string RendererCL<T>::ErrorReportString()
template <typename T, typename bucketT>
string RendererCL<T, bucketT>::ErrorReportString()
{
return EmberReport::ErrorReportString() + m_Wrapper.ErrorReportString();
}
@ -498,8 +502,8 @@ string RendererCL<T>::ErrorReportString()
/// OpenCLWrapper member as a vector of strings.
/// </summary>
/// <returns>The concatenated error report vector of strings</returns>
template <typename T>
vector<string> RendererCL<T>::ErrorReport()
template <typename T, typename bucketT>
vector<string> RendererCL<T, bucketT>::ErrorReport()
{
auto ours = EmberReport::ErrorReport();
auto wrappers = m_Wrapper.ErrorReport();
@ -514,10 +518,10 @@ vector<string> RendererCL<T>::ErrorReport()
/// </summary>
/// <param name="randVec">The vector of random contexts to assign</param>
/// <returns>True if the size of the vector matched the number of threads used for rendering and writing seeds to OpenCL succeeded, else false.</returns>
template <typename T>
bool RendererCL<T>::RandVec(vector<QTIsaac<ISAAC_SIZE, ISAAC_INT>>& randVec)
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::RandVec(vector<QTIsaac<ISAAC_SIZE, ISAAC_INT>>& randVec)
{
bool b = Renderer<T, T>::RandVec(randVec);
bool b = Renderer<T, bucketT>::RandVec(randVec);
const char* loc = __FUNCTION__;
if (m_Wrapper.Ok())
@ -540,8 +544,8 @@ bool RendererCL<T>::RandVec(vector<QTIsaac<ISAAC_SIZE, ISAAC_INT>>& randVec)
/// only supports floats for texture images.
/// </summary>
/// <param name="colorScalar">The color scalar to multiply the ember's palette by</param>
template <typename T>
void RendererCL<T>::MakeDmap(T colorScalar)
template <typename T, typename bucketT>
void RendererCL<T, bucketT>::MakeDmap(T colorScalar)
{
//m_Ember.m_Palette.MakeDmap<float>(m_DmapCL, colorScalar);
m_Ember.m_Palette.MakeDmap(m_DmapCL, colorScalar);
@ -553,8 +557,8 @@ void RendererCL<T>::MakeDmap(T colorScalar)
/// 2D image.
/// </summary>
/// <returns>True if success, else false.</returns>
template <typename T>
bool RendererCL<T>::Alloc()
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::Alloc()
{
if (!m_Wrapper.Ok())
return false;
@ -567,17 +571,17 @@ bool RendererCL<T>::Alloc()
size_t accumLength = SuperSize() * sizeof(v4T);
const char* loc = __FUNCTION__;
if (b && !(b = m_Wrapper.AddBuffer(m_EmberBufferName, sizeof(m_EmberCL)))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddBuffer(m_XformsBufferName, SizeOf(m_XformsCL)))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddBuffer(m_ParVarsBufferName, 128 * sizeof(T)))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddBuffer(m_DistBufferName, CHOOSE_XFORM_GRAIN))) { m_ErrorReport.push_back(loc); }//Will be resized for xaos.
if (b && !(b = m_Wrapper.AddBuffer(m_CarToRasBufferName, sizeof(m_CarToRasCL)))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddBuffer(m_DEFilterParamsBufferName, sizeof(m_DensityFilterCL)))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddBuffer(m_SpatialFilterParamsBufferName, sizeof(m_SpatialFilterCL)))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddBuffer(m_CurvesCsaName, SizeOf(m_Csa.m_Entries)))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddBuffer(m_HistBufferName, histLength))) { m_ErrorReport.push_back(loc); }//Histogram. Will memset to zero later.
if (b && !(b = m_Wrapper.AddBuffer(m_AccumBufferName, accumLength))) { m_ErrorReport.push_back(loc); }//Accum buffer.
if (b && !(b = m_Wrapper.AddBuffer(m_PointsBufferName, IterGridKernelCount() * sizeof(PointCL<T>)))) { m_ErrorReport.push_back(loc); }//Points between iter calls.
if (b && !(b = m_Wrapper.AddBuffer(m_EmberBufferName, sizeof(m_EmberCL)))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddBuffer(m_XformsBufferName, SizeOf(m_XformsCL)))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddBuffer(m_ParVarsBufferName, 128 * sizeof(T)))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddBuffer(m_DistBufferName, CHOOSE_XFORM_GRAIN))) { m_ErrorReport.push_back(loc); }//Will be resized for xaos.
if (b && !(b = m_Wrapper.AddBuffer(m_CarToRasBufferName, sizeof(m_CarToRasCL)))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddBuffer(m_DEFilterParamsBufferName, sizeof(m_DensityFilterCL)))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddBuffer(m_SpatialFilterParamsBufferName, sizeof(m_SpatialFilterCL)))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddBuffer(m_CurvesCsaName, SizeOf(m_Csa.m_Entries)))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddBuffer(m_HistBufferName, histLength))) { m_ErrorReport.push_back(loc); }//Histogram. Will memset to zero later.
if (b && !(b = m_Wrapper.AddBuffer(m_AccumBufferName, accumLength))) { m_ErrorReport.push_back(loc); }//Accum buffer.
if (b && !(b = m_Wrapper.AddBuffer(m_PointsBufferName, IterGridKernelCount() * sizeof(PointCL<T>)))) { m_ErrorReport.push_back(loc); }//Points between iter calls.
LeaveResize();
@ -592,8 +596,8 @@ bool RendererCL<T>::Alloc()
/// <param name="resetHist">Clear histogram if true, else don't.</param>
/// <param name="resetAccum">Clear density filtering buffer if true, else don't.</param>
/// <returns>True if success, else false.</returns>
template <typename T>
bool RendererCL<T>::ResetBuckets(bool resetHist, bool resetAccum)
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::ResetBuckets(bool resetHist, bool resetAccum)
{
bool b = true;
@ -610,8 +614,8 @@ bool RendererCL<T>::ResetBuckets(bool resetHist, bool resetAccum)
/// Perform log scale density filtering.
/// </summary>
/// <returns>True if success and not aborted, else false.</returns>
template <typename T>
eRenderStatus RendererCL<T>::LogScaleDensityFilter()
template <typename T, typename bucketT>
eRenderStatus RendererCL<T, bucketT>::LogScaleDensityFilter()
{
return RunLogScaleFilter();
}
@ -620,8 +624,8 @@ eRenderStatus RendererCL<T>::LogScaleDensityFilter()
/// Run gaussian density estimation filtering.
/// </summary>
/// <returns>True if success and not aborted, else false.</returns>
template <typename T>
eRenderStatus RendererCL<T>::GaussianDensityFilter()
template <typename T, typename bucketT>
eRenderStatus RendererCL<T, bucketT>::GaussianDensityFilter()
{
//This commented section is for debugging density filtering by making it run on the CPU
//then copying the results back to the GPU.
@ -630,8 +634,8 @@ eRenderStatus RendererCL<T>::GaussianDensityFilter()
// uint accumLength = SuperSize() * sizeof(glm::detail::tvec4<T>);
// const char* loc = __FUNCTION__;
//
// Renderer<T, T>::ResetBuckets(false, true);
// Renderer<T, T>::GaussianDensityFilter();
// Renderer<T, bucketT>::ResetBuckets(false, true);
// Renderer<T, bucketT>::GaussianDensityFilter();
//
// if (!m_Wrapper.WriteBuffer(m_AccumBufferName, AccumulatorBuckets(), accumLength)) { m_ErrorReport.push_back(loc); return RENDER_ERROR; }
// return RENDER_OK;
@ -656,8 +660,8 @@ eRenderStatus RendererCL<T>::GaussianDensityFilter()
/// <param name="pixels">The pixels to copy the final image to if not nullptr</param>
/// <param name="finalOffset">Offset in the buffer to store the pixels to</param>
/// <returns>True if success and not aborted, else false.</returns>
template <typename T>
eRenderStatus RendererCL<T>::AccumulatorToFinalImage(byte* pixels, size_t finalOffset)
template <typename T, typename bucketT>
eRenderStatus RendererCL<T, bucketT>::AccumulatorToFinalImage(byte* pixels, size_t finalOffset)
{
eRenderStatus status = RunFinalAccum();
@ -683,8 +687,8 @@ eRenderStatus RendererCL<T>::AccumulatorToFinalImage(byte* pixels, size_t finalO
/// <param name="iterCount">The number of iterations to run</param>
/// <param name="temporalSample">The temporal sample within the current pass this is running for</param>
/// <returns>Rendering statistics</returns>
template <typename T>
EmberStats RendererCL<T>::Iterate(size_t iterCount, size_t temporalSample)
template <typename T, typename bucketT>
EmberStats RendererCL<T, bucketT>::Iterate(size_t iterCount, size_t temporalSample)
{
bool b = true;
EmberStats stats;//Do not record bad vals with with GPU. If the user needs to investigate bad vals, use the CPU.
@ -740,8 +744,8 @@ EmberStats RendererCL<T>::Iterate(size_t iterCount, size_t temporalSample)
/// </summary>
/// <param name="doAccum">Whether to build in accumulation, only for debugging. Default: true.</param>
/// <returns>True if success, else false.</returns>
template <typename T>
bool RendererCL<T>::BuildIterProgramForEmber(bool doAccum)
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::BuildIterProgramForEmber(bool doAccum)
{
//Timing t;
const char* loc = __FUNCTION__;
@ -777,8 +781,8 @@ bool RendererCL<T>::BuildIterProgramForEmber(bool doAccum)
/// <param name="temporalSample">The temporal sample this is running for</param>
/// <param name="itersRan">The storage for the number of iterations ran</param>
/// <returns>True if success, else false.</returns>
template <typename T>
bool RendererCL<T>::RunIter(size_t iterCount, size_t temporalSample, size_t& itersRan)
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::RunIter(size_t iterCount, size_t temporalSample, size_t& itersRan)
{
Timing t;//, t2(4);
bool b = true;
@ -787,7 +791,7 @@ bool RendererCL<T>::RunIter(size_t iterCount, size_t temporalSample, size_t& ite
uint iterCountPerBlock = IterCountPerBlock();
uint supersize = uint(SuperSize());
int kernelIndex = m_Wrapper.FindKernelIndex(m_IterOpenCLKernelCreator.IterEntryPoint());
size_t fuseFreq = Renderer<T, T>::SubBatchSize() / m_IterCountPerKernel;//Use the base sbs to determine when to fuse.
size_t fuseFreq = Renderer<T, bucketT>::SubBatchSize() / m_IterCountPerKernel;//Use the base sbs to determine when to fuse.
size_t itersRemaining;
double percent, etaMs;
const char* loc = __FUNCTION__;
@ -802,10 +806,10 @@ bool RendererCL<T>::RunIter(size_t iterCount, size_t temporalSample, size_t& ite
ConvertEmber(m_Ember, m_EmberCL, m_XformsCL);
m_CarToRasCL = ConvertCarToRas(*CoordMap());
if (b && !(b = m_Wrapper.WriteBuffer (m_EmberBufferName, reinterpret_cast<void*>(&m_EmberCL), sizeof(m_EmberCL)))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.WriteBuffer (m_XformsBufferName, reinterpret_cast<void*>(m_XformsCL.data()), sizeof(m_XformsCL[0]) * m_XformsCL.size()))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_DistBufferName, reinterpret_cast<void*>(const_cast<byte*>(XformDistributions())), XformDistributionsSize()))) { m_ErrorReport.push_back(loc); }//Will be resized for xaos.
if (b && !(b = m_Wrapper.WriteBuffer (m_CarToRasBufferName, reinterpret_cast<void*>(&m_CarToRasCL), sizeof(m_CarToRasCL)))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.WriteBuffer (m_EmberBufferName, reinterpret_cast<void*>(&m_EmberCL), sizeof(m_EmberCL)))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.WriteBuffer (m_XformsBufferName, reinterpret_cast<void*>(m_XformsCL.data()), sizeof(m_XformsCL[0]) * m_XformsCL.size()))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_DistBufferName, reinterpret_cast<void*>(const_cast<byte*>(XformDistributions())), XformDistributionsSize()))) { m_ErrorReport.push_back(loc); }//Will be resized for xaos.
if (b && !(b = m_Wrapper.WriteBuffer (m_CarToRasBufferName, reinterpret_cast<void*>(&m_CarToRasCL), sizeof(m_CarToRasCL)))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddAndWriteImage("Palette", CL_MEM_READ_ONLY, m_PaletteFormat, m_DmapCL.m_Entries.size(), 1, 0, m_DmapCL.m_Entries.data()))) { m_ErrorReport.push_back(loc); }
@ -825,7 +829,7 @@ bool RendererCL<T>::RunIter(size_t iterCount, size_t temporalSample, size_t& ite
//fuse = ((m_Calls % 4) == 0 ? 100u : 0u);
#endif
itersRemaining = iterCount - itersRan;
uint gridW = uint(std::min(ceil(double(itersRemaining) / double(iterCountPerBlock)), double(IterGridBlockWidth())));
uint gridW = uint(std::min(ceil(double(itersRemaining) / double(iterCountPerBlock)), double(IterGridBlockWidth())));
uint gridH = uint(std::min(ceil(double(itersRemaining) / double(gridW * iterCountPerBlock)), double(IterGridBlockHeight())));
uint iterCountThisLaunch = iterCountPerBlock * gridW * gridH;
@ -910,8 +914,8 @@ bool RendererCL<T>::RunIter(size_t iterCount, size_t temporalSample, size_t& ite
/// Run the log scale filter.
/// </summary>
/// <returns>True if success, else false.</returns>
template <typename T>
eRenderStatus RendererCL<T>::RunLogScaleFilter()
template <typename T, typename bucketT>
eRenderStatus RendererCL<T, bucketT>::RunLogScaleFilter()
{
//Timing t(4);
bool b = true;
@ -920,7 +924,7 @@ eRenderStatus RendererCL<T>::RunLogScaleFilter()
if (kernelIndex != -1)
{
m_DensityFilterCL = ConvertDensityFilter();
ConvertDensityFilter();
uint argIndex = 0;
uint blockW = m_WarpSize;
uint blockH = 4;//A height of 4 seems to run the fastest.
@ -953,15 +957,15 @@ eRenderStatus RendererCL<T>::RunLogScaleFilter()
/// <summary>
/// Run the Gaussian density filter.
/// Method 7: Each block processes a 32x32 block and exits. No column or row advancements happen.
/// Method 7: Each block processes a 16x16(AMD) or 32x32(Nvidia) block and exits. No column or row advancements happen.
/// </summary>
/// <returns>True if success and not aborted, else false.</returns>
template <typename T>
eRenderStatus RendererCL<T>::RunDensityFilter()
template <typename T, typename bucketT>
eRenderStatus RendererCL<T, bucketT>::RunDensityFilter()
{
bool b = true;
Timing t(4);// , t2(4);
m_DensityFilterCL = ConvertDensityFilter();
ConvertDensityFilter();
int kernelIndex = MakeAndGetDensityFilterProgram(Supersample(), m_DensityFilterCL.m_FilterWidth);
const char* loc = __FUNCTION__;
@ -1074,13 +1078,13 @@ eRenderStatus RendererCL<T>::RunDensityFilter()
/// Run final accumulation to the 2D output image.
/// </summary>
/// <returns>True if success and not aborted, else false.</returns>
template <typename T>
eRenderStatus RendererCL<T>::RunFinalAccum()
template <typename T, typename bucketT>
eRenderStatus RendererCL<T, bucketT>::RunFinalAccum()
{
//Timing t(4);
bool b = true;
T alphaBase;
T alphaScale;
double alphaBase;
double alphaScale;
int accumKernelIndex = MakeAndGetFinalAccumProgram(alphaBase, alphaScale);
uint argIndex;
uint gridW;
@ -1093,10 +1097,10 @@ eRenderStatus RendererCL<T>::RunFinalAccum()
if (!m_Abort && accumKernelIndex != -1)
{
//This is needed with or without early clip.
m_SpatialFilterCL = ConvertSpatialFilter();
ConvertSpatialFilter();
if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_SpatialFilterParamsBufferName, reinterpret_cast<void*>(&m_SpatialFilterCL), sizeof(m_SpatialFilterCL)))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_CurvesCsaName, m_Csa.m_Entries.data(), SizeOf(m_Csa.m_Entries)))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_CurvesCsaName, m_Csa.m_Entries.data(), SizeOf(m_Csa.m_Entries)))) { m_ErrorReport.push_back(loc); }
//Since early clip requires gamma correcting the entire accumulator first,
//it can't be done inside of the normal final accumulation kernel, so
@ -1140,8 +1144,8 @@ eRenderStatus RendererCL<T>::RunFinalAccum()
if (b && !(b = m_Wrapper.SetBufferArg(accumKernelIndex, argIndex++, m_CurvesCsaName))) { m_ErrorReport.push_back(loc); }//Curve points.
if (b && !(b = m_Wrapper.SetArg (accumKernelIndex, argIndex++, curvesSet))) { m_ErrorReport.push_back(loc); }//Do curves.
if (b && !(b = m_Wrapper.SetArg (accumKernelIndex, argIndex++, alphaBase))) { m_ErrorReport.push_back(loc); }//Alpha base.
if (b && !(b = m_Wrapper.SetArg (accumKernelIndex, argIndex++, alphaScale))) { m_ErrorReport.push_back(loc); }//Alpha scale.
if (b && !(b = m_Wrapper.SetArg (accumKernelIndex, argIndex++, bucketT(alphaBase)))) { m_ErrorReport.push_back(loc); }//Alpha base.
if (b && !(b = m_Wrapper.SetArg (accumKernelIndex, argIndex++, bucketT(alphaScale)))) { m_ErrorReport.push_back(loc); }//Alpha scale.
if (b && m_Wrapper.Shared())
if (b && !(b = m_Wrapper.EnqueueAcquireGLObjects(m_FinalImageName))) { m_ErrorReport.push_back(loc); }
@ -1170,8 +1174,8 @@ eRenderStatus RendererCL<T>::RunFinalAccum()
/// <param name="height">Height in elements</param>
/// <param name="elementSize">Size of each element</param>
/// <returns>True if success, else false.</returns>
template <typename T>
bool RendererCL<T>::ClearBuffer(const string& bufferName, uint width, uint height, uint elementSize)
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::ClearBuffer(const string& bufferName, uint width, uint height, uint elementSize)
{
bool b = true;
int kernelIndex = m_Wrapper.FindKernelIndex(m_IterOpenCLKernelCreator.ZeroizeEntryPoint());
@ -1215,8 +1219,8 @@ bool RendererCL<T>::ClearBuffer(const string& bufferName, uint width, uint heigh
/// <param name="rowParity">Row parity</param>
/// <param name="colParity">Column parity</param>
/// <returns>True if success, else false.</returns>
template <typename T>
bool RendererCL<T>::RunDensityFilterPrivate(uint kernelIndex, uint gridW, uint gridH, uint blockW, uint blockH, uint chunkSizeW, uint chunkSizeH, uint chunkW, uint chunkH)
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::RunDensityFilterPrivate(uint kernelIndex, uint gridW, uint gridH, uint blockW, uint blockH, uint chunkSizeW, uint chunkSizeH, uint chunkW, uint chunkH)
{
//Timing t(4);
bool b = true;
@ -1248,8 +1252,8 @@ bool RendererCL<T>::RunDensityFilterPrivate(uint kernelIndex, uint gridW, uint g
/// <param name="ss">The supersample being used for the current ember</param>
/// <param name="filterWidth">Width of the gaussian filter</param>
/// <returns>The kernel index if successful, else -1.</returns>
template <typename T>
int RendererCL<T>::MakeAndGetDensityFilterProgram(size_t ss, uint filterWidth)
template <typename T, typename bucketT>
int RendererCL<T, bucketT>::MakeAndGetDensityFilterProgram(size_t ss, uint filterWidth)
{
string deEntryPoint = m_DEOpenCLKernelCreator.GaussianDEEntryPoint(ss, filterWidth);
int kernelIndex = m_Wrapper.FindKernelIndex(deEntryPoint);
@ -1281,16 +1285,16 @@ int RendererCL<T>::MakeAndGetDensityFilterProgram(size_t ss, uint filterWidth)
/// <param name="alphaBase">Storage for the alpha base value used in the kernel. 0 if transparency is true, else 255.</param>
/// <param name="alphaScale">Storage for the alpha scale value used in the kernel. 255 if transparency is true, else 0.</param>
/// <returns>The kernel index if successful, else -1.</returns>
template <typename T>
int RendererCL<T>::MakeAndGetFinalAccumProgram(T& alphaBase, T& alphaScale)
template <typename T, typename bucketT>
int RendererCL<T, bucketT>::MakeAndGetFinalAccumProgram(double& alphaBase, double& alphaScale)
{
string finalAccumEntryPoint = m_FinalAccumOpenCLKernelCreator.FinalAccumEntryPoint(EarlyClip(), Renderer<T, T>::NumChannels(), Transparency(), alphaBase, alphaScale);
string finalAccumEntryPoint = m_FinalAccumOpenCLKernelCreator.FinalAccumEntryPoint(EarlyClip(), Renderer<T, bucketT>::NumChannels(), Transparency(), alphaBase, alphaScale);
int kernelIndex = m_Wrapper.FindKernelIndex(finalAccumEntryPoint);
const char* loc = __FUNCTION__;
if (kernelIndex == -1)//Has not been built yet.
{
string kernel = m_FinalAccumOpenCLKernelCreator.FinalAccumKernel(EarlyClip(), Renderer<T, T>::NumChannels(), Transparency());
string kernel = m_FinalAccumOpenCLKernelCreator.FinalAccumKernel(EarlyClip(), Renderer<T, bucketT>::NumChannels(), Transparency());
bool b = m_Wrapper.AddProgram(finalAccumEntryPoint, kernel, finalAccumEntryPoint, m_DoublePrecision);
if (b)
@ -1306,16 +1310,16 @@ int RendererCL<T>::MakeAndGetFinalAccumProgram(T& alphaBase, T& alphaScale)
/// Make the gamma correction program for early clipping and return its index.
/// </summary>
/// <returns>The kernel index if successful, else -1.</returns>
template <typename T>
int RendererCL<T>::MakeAndGetGammaCorrectionProgram()
template <typename T, typename bucketT>
int RendererCL<T, bucketT>::MakeAndGetGammaCorrectionProgram()
{
string gammaEntryPoint = m_FinalAccumOpenCLKernelCreator.GammaCorrectionEntryPoint(Renderer<T, T>::NumChannels(), Transparency());
string gammaEntryPoint = m_FinalAccumOpenCLKernelCreator.GammaCorrectionEntryPoint(Renderer<T, bucketT>::NumChannels(), Transparency());
int kernelIndex = m_Wrapper.FindKernelIndex(gammaEntryPoint);
const char* loc = __FUNCTION__;
if (kernelIndex == -1)//Has not been built yet.
{
string kernel = m_FinalAccumOpenCLKernelCreator.GammaCorrectionKernel(Renderer<T, T>::NumChannels(), Transparency());
string kernel = m_FinalAccumOpenCLKernelCreator.GammaCorrectionKernel(Renderer<T, bucketT>::NumChannels(), Transparency());
bool b = m_Wrapper.AddProgram(gammaEntryPoint, kernel, gammaEntryPoint, m_DoublePrecision);
if (b)
@ -1336,28 +1340,22 @@ int RendererCL<T>::MakeAndGetGammaCorrectionProgram()
/// for passing to OpenCL.
/// </summary>
/// <returns>Nothing; the converted values are assigned to the m_DensityFilterCL member.</returns>
template <typename T>
DensityFilterCL<T> RendererCL<T>::ConvertDensityFilter()
template <typename T, typename bucketT>
void RendererCL<T, bucketT>::ConvertDensityFilter()
{
DensityFilterCL<T> filterCL;
DensityFilter<T>* densityFilter = dynamic_cast<DensityFilter<T>*>(GetDensityFilter());
filterCL.m_Supersample = uint(Supersample());
filterCL.m_SuperRasW = uint(SuperRasW());
filterCL.m_SuperRasH = uint(SuperRasH());
filterCL.m_K1 = K1();
filterCL.m_K2 = K2();
if (densityFilter)
if (m_DensityFilter.get())
{
filterCL.m_Curve = densityFilter->Curve();
filterCL.m_KernelSize = uint(densityFilter->KernelSize());
filterCL.m_MaxFilterIndex = uint(densityFilter->MaxFilterIndex());
filterCL.m_MaxFilteredCounts = uint(densityFilter->MaxFilteredCounts());
filterCL.m_FilterWidth = uint(densityFilter->FilterWidth());
m_DensityFilterCL.m_Supersample = uint(Supersample());
m_DensityFilterCL.m_SuperRasW = uint(SuperRasW());
m_DensityFilterCL.m_SuperRasH = uint(SuperRasH());
m_DensityFilterCL.m_K1 = K1();
m_DensityFilterCL.m_K2 = K2();
m_DensityFilterCL.m_Curve = m_DensityFilter->Curve();
m_DensityFilterCL.m_KernelSize = uint(m_DensityFilter->KernelSize());
m_DensityFilterCL.m_MaxFilterIndex = uint(m_DensityFilter->MaxFilterIndex());
m_DensityFilterCL.m_MaxFilteredCounts = uint(m_DensityFilter->MaxFilteredCounts());
m_DensityFilterCL.m_FilterWidth = uint(m_DensityFilter->FilterWidth());
}
return filterCL;
}
/// <summary>
@ -1365,33 +1363,33 @@ DensityFilterCL<T> RendererCL<T>::ConvertDensityFilter()
/// for passing to OpenCL.
/// </summary>
/// Nothing is returned; the converted values are assigned to the member m_SpatialFilterCL.
template <typename T>
SpatialFilterCL<T> RendererCL<T>::ConvertSpatialFilter()
template <typename T, typename bucketT>
void RendererCL<T, bucketT>::ConvertSpatialFilter()
{
T g, linRange, vibrancy;
Color<T> background;
SpatialFilterCL<T> filterCL;
bucketT g, linRange, vibrancy;
Color<bucketT> background;
this->PrepFinalAccumVals(background, g, linRange, vibrancy);
if (m_SpatialFilter.get())
{
this->PrepFinalAccumVals(background, g, linRange, vibrancy);
filterCL.m_SuperRasW = uint(SuperRasW());
filterCL.m_SuperRasH = uint(SuperRasH());
filterCL.m_FinalRasW = uint(FinalRasW());
filterCL.m_FinalRasH = uint(FinalRasH());
filterCL.m_Supersample = uint(Supersample());
filterCL.m_FilterWidth = uint(GetSpatialFilter()->FinalFilterWidth());
filterCL.m_NumChannels = uint(Renderer<T, T>::NumChannels());
filterCL.m_BytesPerChannel = uint(BytesPerChannel());
filterCL.m_DensityFilterOffset = uint(DensityFilterOffset());
filterCL.m_Transparency = Transparency();
filterCL.m_YAxisUp = uint(m_YAxisUp);
filterCL.m_Vibrancy = vibrancy;
filterCL.m_HighlightPower = HighlightPower();
filterCL.m_Gamma = g;
filterCL.m_LinRange = linRange;
filterCL.m_Background = background;
return filterCL;
m_SpatialFilterCL.m_SuperRasW = uint(SuperRasW());
m_SpatialFilterCL.m_SuperRasH = uint(SuperRasH());
m_SpatialFilterCL.m_FinalRasW = uint(FinalRasW());
m_SpatialFilterCL.m_FinalRasH = uint(FinalRasH());
m_SpatialFilterCL.m_Supersample = uint(Supersample());
m_SpatialFilterCL.m_FilterWidth = uint(m_SpatialFilter->FinalFilterWidth());
m_SpatialFilterCL.m_NumChannels = uint(Renderer<T, bucketT>::NumChannels());
m_SpatialFilterCL.m_BytesPerChannel = uint(BytesPerChannel());
m_SpatialFilterCL.m_DensityFilterOffset = uint(DensityFilterOffset());
m_SpatialFilterCL.m_Transparency = Transparency();
m_SpatialFilterCL.m_YAxisUp = uint(m_YAxisUp);
m_SpatialFilterCL.m_Vibrancy = vibrancy;
m_SpatialFilterCL.m_HighlightPower = HighlightPower();
m_SpatialFilterCL.m_Gamma = g;
m_SpatialFilterCL.m_LinRange = linRange;
m_SpatialFilterCL.m_Background = background;
}
}
/// <summary>
@ -1401,8 +1399,8 @@ SpatialFilterCL<T> RendererCL<T>::ConvertSpatialFilter()
/// <param name="ember">The Ember object to convert</param>
/// <param name="emberCL">The converted EmberCL</param>
/// <param name="xformsCL">The converted vector of XformCL</param>
template <typename T>
void RendererCL<T>::ConvertEmber(Ember<T>& ember, EmberCL<T>& emberCL, vector<XformCL<T>>& xformsCL)
template <typename T, typename bucketT>
void RendererCL<T, bucketT>::ConvertEmber(Ember<T>& ember, EmberCL<T>& emberCL, vector<XformCL<T>>& xformsCL)
{
memset(&emberCL, 0, sizeof(EmberCL<T>));//Might not really be needed.
@ -1455,8 +1453,8 @@ void RendererCL<T>::ConvertEmber(Ember<T>& ember, EmberCL<T>& emberCL, vector<Xf
/// </summary>
/// <param name="carToRas">The CarToRas object to convert</param>
/// <returns>The CarToRasCL object</returns>
template <typename T>
CarToRasCL<T> RendererCL<T>::ConvertCarToRas(const CarToRas<T>& carToRas)
template <typename T, typename bucketT>
CarToRasCL<T> RendererCL<T, bucketT>::ConvertCarToRas(const CarToRas<T>& carToRas)
{
CarToRasCL<T> carToRasCL;
@ -1479,8 +1477,8 @@ CarToRasCL<T> RendererCL<T>::ConvertCarToRas(const CarToRas<T>& carToRas)
/// Note, WriteBuffer() must be called after this to actually copy the
/// data from the host to the device.
/// </summary>
template <typename T>
void RendererCL<T>::FillSeeds()
template <typename T, typename bucketT>
void RendererCL<T, bucketT>::FillSeeds()
{
double start, delta = std::floor((double)std::numeric_limits<uint>::max() / (IterGridKernelCount() * 2));
m_Seeds.resize(IterGridKernelCount());
@ -1495,9 +1493,9 @@ void RendererCL<T>::FillSeeds()
}
}
template EMBERCL_API class RendererCL<float>;
template EMBERCL_API class RendererCL<float, float>;
#ifdef DO_DOUBLE
template EMBERCL_API class RendererCL<double>;
template EMBERCL_API class RendererCL<double, float>;
#endif
}

View File

@ -33,55 +33,55 @@ public:
/// It takes separate template arguments for T and bucketT
/// and passes both to the base.
/// </summary>
template <typename T>
class EMBERCL_API RendererCL : public Renderer<T, T>, public RendererCLBase
template <typename T, typename bucketT>
class EMBERCL_API RendererCL : public Renderer<T, bucketT>, public RendererCLBase
{
using EmberNs::Renderer<T, T>::RendererBase::Abort;
using EmberNs::Renderer<T, T>::RendererBase::EarlyClip;
using EmberNs::Renderer<T, T>::RendererBase::Transparency;
using EmberNs::Renderer<T, T>::RendererBase::EnterResize;
using EmberNs::Renderer<T, T>::RendererBase::LeaveResize;
using EmberNs::Renderer<T, T>::RendererBase::FinalRasW;
using EmberNs::Renderer<T, T>::RendererBase::FinalRasH;
using EmberNs::Renderer<T, T>::RendererBase::SuperRasW;
using EmberNs::Renderer<T, T>::RendererBase::SuperRasH;
using EmberNs::Renderer<T, T>::RendererBase::SuperSize;
using EmberNs::Renderer<T, T>::RendererBase::BytesPerChannel;
using EmberNs::Renderer<T, T>::RendererBase::TemporalSamples;
using EmberNs::Renderer<T, T>::RendererBase::ItersPerTemporalSample;
using EmberNs::Renderer<T, T>::RendererBase::FuseCount;
using EmberNs::Renderer<T, T>::RendererBase::DensityFilterOffset;
using EmberNs::Renderer<T, T>::RendererBase::m_ProgressParameter;
using EmberNs::Renderer<T, T>::RendererBase::m_YAxisUp;
using EmberNs::Renderer<T, T>::RendererBase::m_LockAccum;
using EmberNs::Renderer<T, T>::RendererBase::m_Abort;
using EmberNs::Renderer<T, T>::RendererBase::m_NumChannels;
using EmberNs::Renderer<T, T>::RendererBase::m_LastIter;
using EmberNs::Renderer<T, T>::RendererBase::m_LastIterPercent;
using EmberNs::Renderer<T, T>::RendererBase::m_Stats;
using EmberNs::Renderer<T, T>::RendererBase::m_Callback;
using EmberNs::Renderer<T, T>::RendererBase::m_Rand;
using EmberNs::Renderer<T, T>::RendererBase::m_RenderTimer;
using EmberNs::Renderer<T, T>::RendererBase::m_IterTimer;
using EmberNs::Renderer<T, T>::RendererBase::m_ProgressTimer;
using EmberNs::Renderer<T, T>::RendererBase::EmberReport::m_ErrorReport;
using EmberNs::Renderer<T, T>::m_RotMat;
using EmberNs::Renderer<T, T>::m_Ember;
using EmberNs::Renderer<T, T>::m_Csa;
using EmberNs::Renderer<T, T>::m_CurvesSet;
using EmberNs::Renderer<T, T>::CenterX;
using EmberNs::Renderer<T, T>::CenterY;
using EmberNs::Renderer<T, T>::K1;
using EmberNs::Renderer<T, T>::K2;
using EmberNs::Renderer<T, T>::Supersample;
using EmberNs::Renderer<T, T>::HighlightPower;
using EmberNs::Renderer<T, T>::HistBuckets;
using EmberNs::Renderer<T, T>::AccumulatorBuckets;
using EmberNs::Renderer<T, T>::GetDensityFilter;
using EmberNs::Renderer<T, T>::GetSpatialFilter;
using EmberNs::Renderer<T, T>::CoordMap;
using EmberNs::Renderer<T, T>::XformDistributions;
using EmberNs::Renderer<T, T>::XformDistributionsSize;
using EmberNs::Renderer<T, bucketT>::RendererBase::Abort;
using EmberNs::Renderer<T, bucketT>::RendererBase::EarlyClip;
using EmberNs::Renderer<T, bucketT>::RendererBase::Transparency;
using EmberNs::Renderer<T, bucketT>::RendererBase::EnterResize;
using EmberNs::Renderer<T, bucketT>::RendererBase::LeaveResize;
using EmberNs::Renderer<T, bucketT>::RendererBase::FinalRasW;
using EmberNs::Renderer<T, bucketT>::RendererBase::FinalRasH;
using EmberNs::Renderer<T, bucketT>::RendererBase::SuperRasW;
using EmberNs::Renderer<T, bucketT>::RendererBase::SuperRasH;
using EmberNs::Renderer<T, bucketT>::RendererBase::SuperSize;
using EmberNs::Renderer<T, bucketT>::RendererBase::BytesPerChannel;
using EmberNs::Renderer<T, bucketT>::RendererBase::TemporalSamples;
using EmberNs::Renderer<T, bucketT>::RendererBase::ItersPerTemporalSample;
using EmberNs::Renderer<T, bucketT>::RendererBase::FuseCount;
using EmberNs::Renderer<T, bucketT>::RendererBase::DensityFilterOffset;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_ProgressParameter;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_YAxisUp;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_LockAccum;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_Abort;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_NumChannels;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_LastIter;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_LastIterPercent;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_Stats;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_Callback;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_Rand;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_RenderTimer;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_IterTimer;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_ProgressTimer;
using EmberNs::Renderer<T, bucketT>::RendererBase::EmberReport::m_ErrorReport;
using EmberNs::Renderer<T, bucketT>::m_RotMat;
using EmberNs::Renderer<T, bucketT>::m_Ember;
using EmberNs::Renderer<T, bucketT>::m_Csa;
using EmberNs::Renderer<T, bucketT>::m_CurvesSet;
using EmberNs::Renderer<T, bucketT>::CenterX;
using EmberNs::Renderer<T, bucketT>::CenterY;
using EmberNs::Renderer<T, bucketT>::K1;
using EmberNs::Renderer<T, bucketT>::K2;
using EmberNs::Renderer<T, bucketT>::Supersample;
using EmberNs::Renderer<T, bucketT>::HighlightPower;
using EmberNs::Renderer<T, bucketT>::HistBuckets;
using EmberNs::Renderer<T, bucketT>::AccumulatorBuckets;
using EmberNs::Renderer<T, bucketT>::GetDensityFilter;
using EmberNs::Renderer<T, bucketT>::GetSpatialFilter;
using EmberNs::Renderer<T, bucketT>::CoordMap;
using EmberNs::Renderer<T, bucketT>::XformDistributions;
using EmberNs::Renderer<T, bucketT>::XformDistributionsSize;
public:
RendererCL(uint platform = 0, uint device = 0, bool shared = false, GLuint outputTexID = 0);
@ -169,13 +169,13 @@ private:
bool ClearBuffer(const string& bufferName, uint width, uint height, uint elementSize);
bool RunDensityFilterPrivate(uint kernelIndex, uint gridW, uint gridH, uint blockW, uint blockH, uint chunkSizeW, uint chunkSizeH, uint chunkW, uint chunkH);
int MakeAndGetDensityFilterProgram(size_t ss, uint filterWidth);
int MakeAndGetFinalAccumProgram(T& alphaBase, T& alphaScale);
int MakeAndGetFinalAccumProgram(double& alphaBase, double& alphaScale);
int MakeAndGetGammaCorrectionProgram();
void FillSeeds();
//Private functions passing data to OpenCL programs.
DensityFilterCL<T> ConvertDensityFilter();
SpatialFilterCL<T> ConvertSpatialFilter();
void ConvertDensityFilter();
void ConvertSpatialFilter();
void ConvertEmber(Ember<T>& ember, EmberCL<T>& emberCL, vector<XformCL<T>>& xformsCL);
static CarToRasCL<T> ConvertCarToRas(const CarToRas<T>& carToRas);
@ -221,13 +221,13 @@ private:
EmberCL<T> m_EmberCL;
vector<XformCL<T>> m_XformsCL;
vector<glm::highp_uvec2> m_Seeds;
Palette<float> m_DmapCL;//Used instead of the base class' m_Dmap because OpenCL only supports float textures.
Palette<float> m_DmapCL;//Used instead of the base class' m_Dmap because OpenCL only supports float textures. Likely not needed if we switch to float only hist.
CarToRasCL<T> m_CarToRasCL;
DensityFilterCL<T> m_DensityFilterCL;
SpatialFilterCL<T> m_SpatialFilterCL;
DensityFilterCL<bucketT> m_DensityFilterCL;
SpatialFilterCL<bucketT> m_SpatialFilterCL;
IterOpenCLKernelCreator<T> m_IterOpenCLKernelCreator;
DEOpenCLKernelCreator<T> m_DEOpenCLKernelCreator;
FinalAccumOpenCLKernelCreator<T> m_FinalAccumOpenCLKernelCreator;
DEOpenCLKernelCreator m_DEOpenCLKernelCreator;
FinalAccumOpenCLKernelCreator m_FinalAccumOpenCLKernelCreator;
pair<string, vector<T>> m_Params;
Ember<T> m_LastBuiltEmber;
};