mirror of
https://bitbucket.org/mfeemster/fractorium.git
synced 2025-07-01 05:46:06 -04:00
Initial source commit
Initial source commit
This commit is contained in:
698
Source/EmberCL/DEOpenCLKernelCreator.cpp
Normal file
698
Source/EmberCL/DEOpenCLKernelCreator.cpp
Normal file
@ -0,0 +1,698 @@
|
||||
#include "EmberCLPch.h"
|
||||
#include "DEOpenCLKernelCreator.h"
|
||||
|
||||
namespace EmberCLns
|
||||
{
|
||||
/// <summary>
|
||||
/// Empty constructor that does nothing. The user must call the one which takes a bool
|
||||
/// argument before using this class.
|
||||
/// This constructor only exists so the class can be a member of a class.
|
||||
/// </summary>
|
||||
template <typename T>
|
||||
DEOpenCLKernelCreator<T>::DEOpenCLKernelCreator()
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Constructor for float template type that sets all kernel entry points as well as composes
|
||||
/// all kernel source strings.
|
||||
/// No program compilation is done here, the user must explicitly do it.
|
||||
/// The caller must specify whether they are using an nVidia or AMD card because it changes
|
||||
/// the amount of local memory available.
|
||||
/// </summary>
|
||||
/// <param name="nVidia">True if running on an nVidia card, else false.</param>
|
||||
template <>
|
||||
DEOpenCLKernelCreator<float>::DEOpenCLKernelCreator(bool nVidia)
|
||||
{
|
||||
m_NVidia = nVidia;
|
||||
m_LogScaleSumDEEntryPoint = "LogScaleSumDensityFilterKernel";
|
||||
m_LogScaleAssignDEEntryPoint = "LogScaleAssignDensityFilterKernel";
|
||||
m_GaussianDEWithoutSsEntryPoint = "GaussianDEWithoutSsKernel";
|
||||
m_GaussianDESsWithScfEntryPoint = "GaussianDESsWithScfKernel";
|
||||
m_GaussianDESsWithoutScfEntryPoint = "GaussianDESsWithoutScfKernel";
|
||||
m_GaussianDEWithoutSsNoCacheEntryPoint = "GaussianDEWithoutSsNoCacheKernel";
|
||||
m_GaussianDESsWithScfNoCacheEntryPoint = "GaussianDESsWithScfNoCacheKernel";
|
||||
m_GaussianDESsWithoutScfNoCacheEntryPoint = "GaussianDESsWithoutScfNoCacheKernel";
|
||||
m_LogScaleSumDEKernel = CreateLogScaleSumDEKernelString();
|
||||
m_LogScaleAssignDEKernel = CreateLogScaleAssignDEKernelString();
|
||||
m_GaussianDEWithoutSsKernel = CreateGaussianDEKernel(1);
|
||||
m_GaussianDESsWithScfKernel = CreateGaussianDEKernel(2);
|
||||
m_GaussianDESsWithoutScfKernel = CreateGaussianDEKernel(3);
|
||||
m_GaussianDEWithoutSsNoCacheKernel = CreateGaussianDEKernelNoLocalCache(1);
|
||||
m_GaussianDESsWithScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache(2);
|
||||
m_GaussianDESsWithoutScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache(3);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Constructor for double template type that sets all kernel entry points as well as composes
|
||||
/// all kernel source strings.
|
||||
/// Note that no versions of kernels that use the cache are compiled because
|
||||
/// the cache is not big enough to hold double4.
|
||||
/// No program compilation is done here, the user must explicitly do it.
|
||||
/// Specifying true or false for the bool parameter has no effect since no local memory
|
||||
/// is used when instantiated with type double.
|
||||
/// </summary>
|
||||
/// <param name="nVidia">True if running on an nVidia card, else false. Ignored.</param>
|
||||
template <>
|
||||
DEOpenCLKernelCreator<double>::DEOpenCLKernelCreator(bool nVidia)
|
||||
{
|
||||
m_NVidia = nVidia;
|
||||
m_LogScaleSumDEEntryPoint = "LogScaleSumDensityFilterKernel";
|
||||
m_LogScaleAssignDEEntryPoint = "LogScaleAssignDensityFilterKernel";
|
||||
m_GaussianDEWithoutSsNoCacheEntryPoint = "GaussianDEWithoutSsNoCacheKernel";
|
||||
m_GaussianDESsWithScfNoCacheEntryPoint = "GaussianDESsWithScfNoCacheKernel";
|
||||
m_GaussianDESsWithoutScfNoCacheEntryPoint = "GaussianDESsWithoutScfNoCacheKernel";
|
||||
m_LogScaleSumDEKernel = CreateLogScaleSumDEKernelString();
|
||||
m_LogScaleAssignDEKernel = CreateLogScaleAssignDEKernelString();
|
||||
m_GaussianDEWithoutSsNoCacheKernel = CreateGaussianDEKernelNoLocalCache(1);
|
||||
m_GaussianDESsWithScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache(2);
|
||||
m_GaussianDESsWithoutScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache(3);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Kernel source and entry point properties, getters only.
|
||||
/// </summary>
|
||||
|
||||
template <typename T> string DEOpenCLKernelCreator<T>::LogScaleSumDEKernel() { return m_LogScaleSumDEKernel; }
|
||||
template <typename T> string DEOpenCLKernelCreator<T>::LogScaleSumDEEntryPoint() { return m_LogScaleSumDEEntryPoint; }
|
||||
template <typename T> string DEOpenCLKernelCreator<T>::LogScaleAssignDEKernel() { return m_LogScaleAssignDEKernel; }
|
||||
template <typename T> string DEOpenCLKernelCreator<T>::LogScaleAssignDEEntryPoint() { return m_LogScaleAssignDEEntryPoint; }
|
||||
|
||||
/// <summary>
|
||||
/// Get the kernel source for the specified supersample and filterWidth.
|
||||
/// </summary>
|
||||
/// <param name="ss">The supersample being used</param>
|
||||
/// <param name="filterWidth">Filter width</param>
|
||||
/// <returns>The kernel source</returns>
|
||||
template <typename T>
|
||||
string DEOpenCLKernelCreator<T>::GaussianDEKernel(unsigned int ss, unsigned int filterWidth)
|
||||
{
|
||||
if ((typeid(T) == typeid(double)) || (filterWidth > MaxDEFilterSize()))//Type double does not use cache.
|
||||
{
|
||||
if (ss > 1)
|
||||
{
|
||||
if (!(ss & 1))
|
||||
return m_GaussianDESsWithScfNoCacheKernel;
|
||||
else
|
||||
return m_GaussianDESsWithoutScfNoCacheKernel;
|
||||
}
|
||||
else
|
||||
return m_GaussianDEWithoutSsNoCacheKernel;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (ss > 1)
|
||||
{
|
||||
if (!(ss & 1))
|
||||
return m_GaussianDESsWithScfKernel;
|
||||
else
|
||||
return m_GaussianDESsWithoutScfKernel;
|
||||
}
|
||||
else
|
||||
return m_GaussianDEWithoutSsKernel;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Get the kernel entry point for the specified supersample and filterWidth.
|
||||
/// </summary>
|
||||
/// <param name="ss">The supersample being used</param>
|
||||
/// <param name="filterWidth">Filter width</param>
|
||||
/// <returns>The name of the density estimation filtering entry point kernel function</returns>
|
||||
template <typename T>
|
||||
string DEOpenCLKernelCreator<T>::GaussianDEEntryPoint(unsigned int ss, unsigned int filterWidth)
|
||||
{
|
||||
if ((typeid(T) == typeid(double)) || (filterWidth > MaxDEFilterSize()))//Type double does not use cache.
|
||||
{
|
||||
if (ss > 1)
|
||||
{
|
||||
if (!(ss & 1))
|
||||
return m_GaussianDESsWithScfNoCacheEntryPoint;
|
||||
else
|
||||
return m_GaussianDESsWithoutScfNoCacheEntryPoint;
|
||||
}
|
||||
else
|
||||
return m_GaussianDEWithoutSsNoCacheEntryPoint;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (ss > 1)
|
||||
{
|
||||
if (!(ss & 1))
|
||||
return m_GaussianDESsWithScfEntryPoint;
|
||||
else
|
||||
return m_GaussianDESsWithoutScfEntryPoint;
|
||||
}
|
||||
else
|
||||
return m_GaussianDEWithoutSsEntryPoint;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Get the maximum filter size allowed for running the local memory version of density filtering
|
||||
/// Filters larger than this value will run the version without local memory caching.
|
||||
/// </summary>
|
||||
/// <returns>The maximum filter size allowed for running the local memory version of density filtering</returns>
|
||||
template <typename T>
|
||||
unsigned int DEOpenCLKernelCreator<T>::MaxDEFilterSize() { return 9; }//The true max would be (maxBoxSize - 1) / 2, but that's impractical because it can give us a tiny block size.
|
||||
|
||||
/// <summary>
|
||||
/// Solve for the maximum filter radius.
|
||||
/// The final filter width is calculated by: (unsigned int)(ceil(m_MaxRad) * (T)m_Supersample) + (m_Supersample - 1);
|
||||
/// Must solve for what max rad should be in order to give a maximum final width of (maxBoxSize - 1) / 2, assuming
|
||||
/// a minimum block size of 1 which processes 1 pixel.
|
||||
/// Example: If a box size of 20 was allowed, a filter
|
||||
/// size of up to 9: (20 - 1) / 2 == (19 / 2) == 9 could be supported.
|
||||
/// This function is deprecated, the appropriate kernels take care of this problem now.
|
||||
/// </summary>
|
||||
/// <param name="maxBoxSize">Maximum size of the box.</param>
|
||||
/// <param name="desiredFilterSize">Size of the desired filter.</param>
|
||||
/// <param name="ss">The supersample being used</param>
|
||||
/// <returns>The maximum filter radius allowed</returns>
|
||||
template <typename T>
|
||||
T DEOpenCLKernelCreator<T>::SolveMaxDERad(unsigned int maxBoxSize, T desiredFilterSize, T ss)
|
||||
{
|
||||
unsigned int finalFilterSize = (unsigned int)((ceil(desiredFilterSize) * ss) + (ss - 1.0));
|
||||
|
||||
//Return the desired size if the final size of it will fit.
|
||||
if (finalFilterSize <= MaxDEFilterSize())
|
||||
return desiredFilterSize;
|
||||
|
||||
//The final size doesn't fit, so scale the original down until it fits.
|
||||
return (T)floor((MaxDEFilterSize() - (ss - 1.0)) / ss);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Determine the maximum filter box size based on the amount of local memory available
|
||||
/// to each block.
|
||||
/// </summary>
|
||||
/// <param name="localMem">The local memory available to a block</param>
|
||||
/// <returns>The maximum filter box size allowed</returns>
|
||||
template <typename T>
|
||||
unsigned int DEOpenCLKernelCreator<T>::SolveMaxBoxSize(unsigned int localMem)
|
||||
{
|
||||
return (unsigned int)floor(sqrt(floor((T)localMem / 16.0)));//Divide by 16 because each element is float4.
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create the log scale kernel string, using summation.
|
||||
/// This means each cell will be added to, rather than just assigned.
|
||||
/// Since adding is slower than assigning, this should only be used when Passes > 1,
|
||||
/// otherwise use the kernel created from CreateLogScaleAssignDEKernelString().
|
||||
/// </summary>
|
||||
/// <returns>The kernel string</returns>
|
||||
template <typename T>
|
||||
string DEOpenCLKernelCreator<T>::CreateLogScaleSumDEKernelString()
|
||||
{
|
||||
ostringstream os;
|
||||
|
||||
os <<
|
||||
ConstantDefinesString(typeid(T) == typeid(double)) <<
|
||||
DensityFilterCLStructString <<
|
||||
"__kernel void " << m_LogScaleSumDEEntryPoint << "(\n"
|
||||
" const __global real4* histogram,\n"
|
||||
" __global real4* accumulator,\n"
|
||||
" __constant DensityFilterCL* logFilter\n"
|
||||
"\t)\n"
|
||||
"{\n"
|
||||
" if ((GLOBAL_ID_X < logFilter->m_SuperRasW) && (GLOBAL_ID_Y < logFilter->m_SuperRasH))\n"
|
||||
" {\n"
|
||||
" uint index = (GLOBAL_ID_Y * logFilter->m_SuperRasW) + GLOBAL_ID_X;\n"
|
||||
"\n"
|
||||
" if (histogram[index].w != 0)\n"
|
||||
" {\n"
|
||||
" real_t logScale = (logFilter->m_K1 * log(1.0 + histogram[index].w * logFilter->m_K2)) / histogram[index].w;\n"
|
||||
"\n"
|
||||
" accumulator[index] += histogram[index] * logScale;\n"//Using a single real4 vector operation doubles the speed from doing each component individually.
|
||||
" }\n"
|
||||
"\n"
|
||||
" barrier(CLK_GLOBAL_MEM_FENCE);\n"//Just to be safe. Makes no speed difference to do all of the time or only when there's a hit.
|
||||
" }\n"
|
||||
"}\n";
|
||||
|
||||
return os.str();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create the log scale kernel string, using assignment.
|
||||
/// Use this when Passes == 1.
|
||||
/// </summary>
|
||||
/// <returns>The kernel string</returns>
|
||||
template <typename T>
|
||||
string DEOpenCLKernelCreator<T>::CreateLogScaleAssignDEKernelString()
|
||||
{
|
||||
ostringstream os;
|
||||
|
||||
os <<
|
||||
ConstantDefinesString(typeid(T) == typeid(double)) <<
|
||||
DensityFilterCLStructString <<
|
||||
"__kernel void " << m_LogScaleAssignDEEntryPoint << "(\n"
|
||||
" const __global real4* histogram,\n"
|
||||
" __global real4* accumulator,\n"
|
||||
" __constant DensityFilterCL* logFilter\n"
|
||||
"\t)\n"
|
||||
"{\n"
|
||||
" if ((GLOBAL_ID_X < logFilter->m_SuperRasW) && (GLOBAL_ID_Y < logFilter->m_SuperRasH))\n"
|
||||
" {\n"
|
||||
" uint index = (GLOBAL_ID_Y * logFilter->m_SuperRasW) + GLOBAL_ID_X;\n"
|
||||
"\n"
|
||||
" if (histogram[index].w != 0)\n"
|
||||
" {\n"
|
||||
" real_t logScale = (logFilter->m_K1 * log(1.0 + histogram[index].w * logFilter->m_K2)) / histogram[index].w;\n"
|
||||
"\n"
|
||||
" accumulator[index] = histogram[index] * logScale;\n"//Using a single real4 vector operation doubles the speed from doing each component individually.
|
||||
" }\n"
|
||||
"\n"
|
||||
" barrier(CLK_GLOBAL_MEM_FENCE);\n"//Just to be safe. Makes no speed difference to do all of the time or only when there's a hit.
|
||||
" }\n"
|
||||
"}\n";
|
||||
|
||||
return os.str();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create the gaussian density filtering kernel string.
|
||||
/// 6 different methods of processing were tried before settling on this final and fastest 7th one.
|
||||
/// Each block processes a box and exits. No column or row advancements happen.
|
||||
/// The block accumulates to a temporary box and writes the contents to the global density filter buffer when done.
|
||||
/// Note this applies the filter from top to bottom row and not from the center outward like the CPU version does.
|
||||
/// This allows the image to be filtered without suffering from pixel loss due to race conditions.
|
||||
/// It is run in multiple passes that are spaced far enough apart on the image so as to not overlap.
|
||||
/// This allows writing to the global buffer without ever overlapping or using atomics.
|
||||
/// The supersample parameter will produce three different kernels.
|
||||
/// SS = 1, SS > 1 && SS even, SS > 1 && SS odd.
|
||||
/// The width of the kernl this runs in must be evenly divisible by 16 or else artifacts will occur.
|
||||
/// Note that because this function uses so many variables and is so complex, OpenCL can easily run
|
||||
/// out of resources in some cases. Certain variables had to be reused to condense the kernel footprint
|
||||
/// down enough to be able to run a block size of 32x32.
|
||||
/// For double precision, or for SS > 1, a size of 32x30 is used.
|
||||
/// Box width = (BLOCK_SIZE_X + (fw * 2)).
|
||||
/// Box height = (BLOCK_SIZE_Y + (fw * 2)).
|
||||
/// </summary>
|
||||
/// <param name="ss">The supersample being used</param>
|
||||
/// <returns>The kernel string</returns>
|
||||
template <typename T>
|
||||
string DEOpenCLKernelCreator<T>::CreateGaussianDEKernel(unsigned int ss)
|
||||
{
|
||||
bool doSS = ss > 1;
|
||||
bool doScf = !(ss & 1);
|
||||
ostringstream os;
|
||||
|
||||
os <<
|
||||
ConstantDefinesString(typeid(T) == typeid(double)) <<
|
||||
DensityFilterCLStructString <<
|
||||
UnionCLStructString <<
|
||||
"__kernel void " << GaussianDEEntryPoint(ss, MaxDEFilterSize()) << "(\n" <<
|
||||
" const __global real4* histogram,\n"
|
||||
" __global real4reals* accumulator,\n"
|
||||
" __constant DensityFilterCL* densityFilter,\n"
|
||||
" const __global real_t* filterCoefs,\n"
|
||||
" const __global real_t* filterWidths,\n"
|
||||
" const __global uint* coefIndices,\n"
|
||||
" const uint chunkSizeW,\n"
|
||||
" const uint chunkSizeH,\n"
|
||||
" const uint rowParity,\n"
|
||||
" const uint colParity\n"
|
||||
"\t)\n"
|
||||
"{\n"
|
||||
//Parity determines if this function should execute.
|
||||
" if ((GLOBAL_ID_X >= densityFilter->m_SuperRasW) ||\n"
|
||||
" (GLOBAL_ID_Y >= densityFilter->m_SuperRasH) ||\n"
|
||||
" ((BLOCK_ID_X % chunkSizeW) != colParity) ||\n"
|
||||
" ((BLOCK_ID_Y % chunkSizeH) != rowParity)) \n"
|
||||
" return;\n"
|
||||
"\n";
|
||||
|
||||
if (doSS)
|
||||
{
|
||||
os <<
|
||||
" uint ss = (uint)floor((real_t)densityFilter->m_Supersample / 2.0);\n"
|
||||
" int densityBoxLeftX;\n"
|
||||
" int densityBoxRightX;\n"
|
||||
" int densityBoxTopY;\n"
|
||||
" int densityBoxBottomY;\n"
|
||||
"\n";
|
||||
|
||||
if (doScf)
|
||||
os <<
|
||||
" real_t scfact = pow(densityFilter->m_Supersample / (densityFilter->m_Supersample + 1.0), 2.0);\n";
|
||||
}
|
||||
|
||||
//Compute the size of the temporary box which is the block width + 2 * filter width x block height + 2 * filter width.
|
||||
//Ideally the block width and height are both 32. However, the height might be smaller if there isn't enough memory.
|
||||
os <<
|
||||
" uint fullTempBoxWidth, fullTempBoxHeight;\n"
|
||||
" uint leftBound, rightBound, topBound, botBound;\n"
|
||||
" uint blockHistStartRow, blockHistEndRow, boxReadStartRow, boxReadEndRow;\n"
|
||||
" uint blockHistStartCol, boxReadStartCol, boxReadEndCol;\n"
|
||||
" uint accumWriteStartRow, accumWriteStartCol, colsToWrite;\n"
|
||||
|
||||
//If any of the variables above end up being made __local, init them here.
|
||||
//At the moment, it's slower even though it's more memory efficient.
|
||||
//" if (THREAD_ID_X == 0 && THREAD_ID_Y == 0)\n"
|
||||
//" {\n"
|
||||
//Init local vars here.
|
||||
//" }\n"
|
||||
//"\n"
|
||||
//" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
"\n"
|
||||
" fullTempBoxWidth = BLOCK_SIZE_X + (densityFilter->m_FilterWidth * 2);\n"
|
||||
" fullTempBoxHeight = BLOCK_SIZE_Y + (densityFilter->m_FilterWidth * 2);\n"
|
||||
//Compute the bounds of the area to be sampled, which is just the ends minus the super sample minus 1.
|
||||
" leftBound = densityFilter->m_Supersample - 1;\n"
|
||||
" rightBound = densityFilter->m_SuperRasW - (densityFilter->m_Supersample - 1);\n"
|
||||
" topBound = densityFilter->m_Supersample - 1;\n"
|
||||
" botBound = densityFilter->m_SuperRasH - (densityFilter->m_Supersample - 1);\n"
|
||||
"\n"
|
||||
//Start and end values are the indices in the histogram read from
|
||||
//and written to in the accumulator. They are not the indices for the local block of data.
|
||||
//Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds.
|
||||
" blockHistStartRow = min(botBound, topBound + (BLOCK_ID_Y * BLOCK_SIZE_Y));\n"//The first histogram row this block will process.
|
||||
" blockHistEndRow = min(botBound, blockHistStartRow + BLOCK_SIZE_Y);\n"//The last histogram row this block will process, clamped to the last row.
|
||||
" boxReadStartRow = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartRow);\n"//The first row in the local box to read from when writing back to the final accumulator for this block.
|
||||
" boxReadEndRow = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + BLOCK_SIZE_Y, densityFilter->m_SuperRasH - blockHistStartRow);\n"//The last row in the local box to read from when writing back to the final accumulator for this block.
|
||||
" blockHistStartCol = min(rightBound, leftBound + (BLOCK_ID_X * BLOCK_SIZE_X));\n"//The first histogram column this block will process.
|
||||
" boxReadStartCol = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first box row this block will read from when copying to the accumulator.
|
||||
" boxReadEndCol = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + BLOCK_SIZE_X, densityFilter->m_SuperRasW - blockHistStartCol);\n"//The last box row this block will read from when copying to the accumulator.
|
||||
"\n"
|
||||
//Last, the indices in the global accumulator that the local bounds will be writing to.
|
||||
" accumWriteStartRow = blockHistStartRow - min(densityFilter->m_FilterWidth, blockHistStartRow);\n"//Will be fw - 0 except for boundary columns, it will be less.
|
||||
" accumWriteStartCol = blockHistStartCol - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"
|
||||
" colsToWrite = ceil((real_t)(boxReadEndCol - boxReadStartCol) / (real_t)BLOCK_SIZE_X);\n"
|
||||
"\n"
|
||||
" uint threadHistRow = blockHistStartRow + THREAD_ID_Y;\n"//The histogram row this individual thread will be reading from.
|
||||
" uint threadHistCol = blockHistStartCol + THREAD_ID_X;\n"//The histogram column this individual thread will be reading from.
|
||||
"\n"
|
||||
|
||||
//Compute the center position in this local box to serve as the center position
|
||||
//from which filter application offsets are computed.
|
||||
//These are the local indices for the local data that are temporarily accumulated to before
|
||||
//writing out to the global accumulator.
|
||||
" uint boxRow = densityFilter->m_FilterWidth + THREAD_ID_Y;\n"
|
||||
" uint boxCol = densityFilter->m_FilterWidth + THREAD_ID_X;\n"
|
||||
" uint colElementsToZero = ceil((real_t)fullTempBoxWidth / (real_t)(BLOCK_SIZE_X));\n"//Usually is 2.
|
||||
" int i, j, k;\n"
|
||||
" uint filterSelectInt, filterCoefIndex;\n"
|
||||
" real_t cacheLog;\n"
|
||||
" real_t filterSelect;\n"
|
||||
" real4 bucket;\n"
|
||||
;
|
||||
|
||||
//This will be treated as having dimensions of (BLOCK_SIZE_X + (fw * 2)) x (BLOCK_SIZE_Y + (fw * 2)).
|
||||
if (m_NVidia)
|
||||
os << " __local real4reals filterBox[3000];\n";
|
||||
else
|
||||
os << " __local real4reals filterBox[1200];\n";
|
||||
|
||||
os <<
|
||||
//Zero the temp buffers first. This splits the zeroization evenly across all threads (columns) in the first block row.
|
||||
//This is a middle ground solution. Previous methods tried:
|
||||
//Thread (0, 0) does all init. This works, but is the slowest.
|
||||
//Init is divided among all threads. This is the fastest but exposes a severe flaw in OpenCL,
|
||||
//in that it will not get executed by all threads before proceeding, despite the barrier statement
|
||||
//below. As a result, strange artifacts will get left around because filtering gets executed on a temp
|
||||
//box that has not been properly zeroized.
|
||||
//The only way to do it and still achieve reasonable speed is to have the first row do it. This is
|
||||
//most likely because the first row gets executed first, ensuring zeroization is done when the rest
|
||||
//of the threads execute.
|
||||
"\n"//Dummy test zeroization for debugging.
|
||||
//" if (THREAD_ID_Y == 0 && THREAD_ID_X == 0)\n"//First thread of the block takes the responsibility of zeroizing.
|
||||
//" {\n"
|
||||
//" for (k = 0; k < 2 * 1024; k++)\n"
|
||||
//" {\n"
|
||||
//" filterBox[k].m_Real4 = 0;\n"
|
||||
//" }\n"
|
||||
//" }\n"
|
||||
" if (THREAD_ID_Y == 0)\n"//First row of the block takes the responsibility of zeroizing.
|
||||
" {\n"
|
||||
" for (i = 0; i < fullTempBoxHeight; i++)\n"//Each column in the row iterates through all rows.
|
||||
" {\n"
|
||||
" for (j = 0; j < colElementsToZero && ((colElementsToZero * THREAD_ID_X) + j) < fullTempBoxWidth; j++)\n"//And zeroizes a few columns from that row.
|
||||
" {\n"
|
||||
" filterBox[(i * fullTempBoxWidth) + ((colElementsToZero * THREAD_ID_X) + j)].m_Real4 = 0;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
"\n"
|
||||
" if (threadHistRow < botBound && threadHistCol < rightBound)\n"
|
||||
" {\n"
|
||||
" bucket = histogram[(threadHistRow * densityFilter->m_SuperRasW) + threadHistCol];\n"
|
||||
"\n"
|
||||
" if (bucket.w != 0)\n"
|
||||
" {\n"
|
||||
" cacheLog = (densityFilter->m_K1 * log(1.0 + bucket.w * densityFilter->m_K2)) / bucket.w;\n";
|
||||
|
||||
if (doSS)
|
||||
{
|
||||
os <<
|
||||
" filterSelect = 0;\n"
|
||||
" densityBoxLeftX = threadHistCol - min(threadHistCol, ss);\n"
|
||||
" densityBoxRightX = threadHistCol + min(ss, (densityFilter->m_SuperRasW - threadHistCol) - 1);\n"
|
||||
" densityBoxTopY = threadHistRow - min(threadHistRow, ss);\n"
|
||||
" densityBoxBottomY = threadHistRow + min(ss, (densityFilter->m_SuperRasH - threadHistRow) - 1);\n"
|
||||
"\n"
|
||||
" for (j = densityBoxTopY; j <= densityBoxBottomY; j++)\n"
|
||||
" {\n"
|
||||
" for (i = densityBoxLeftX; i <= densityBoxRightX; i++)\n"
|
||||
" {\n"
|
||||
" filterSelect += histogram[i + (j * densityFilter->m_SuperRasW)].w;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"\n";
|
||||
|
||||
if (doScf)
|
||||
os << " filterSelect *= scfact;\n";
|
||||
}
|
||||
else
|
||||
{
|
||||
os
|
||||
<< " filterSelect = bucket.w;\n";
|
||||
}
|
||||
|
||||
os <<
|
||||
"\n"
|
||||
" if (filterSelect > densityFilter->m_MaxFilteredCounts)\n"
|
||||
" filterSelectInt = densityFilter->m_MaxFilterIndex;\n"
|
||||
" else if (filterSelect <= DE_THRESH)\n"
|
||||
" filterSelectInt = (int)ceil(filterSelect) - 1;\n"
|
||||
" else\n"
|
||||
" filterSelectInt = (int)DE_THRESH + (int)floor(pow((real_t)(filterSelect - DE_THRESH), densityFilter->m_Curve));\n"
|
||||
"\n"
|
||||
" if (filterSelectInt > densityFilter->m_MaxFilterIndex)\n"
|
||||
" filterSelectInt = densityFilter->m_MaxFilterIndex;\n"
|
||||
"\n"
|
||||
" filterCoefIndex = filterSelectInt * densityFilter->m_KernelSize;\n"
|
||||
"\n"
|
||||
//With this new method, only accumulate to the temp local buffer first. Write to the final accumulator last.
|
||||
//For each loop through, note that there is a local memory barrier call inside of each call to AddToAccumNoCheck().
|
||||
//If this isn't done, pixel errors occurr and even an out of resources error occurrs because too many writes are done to the same place in memory at once.
|
||||
" k = (int)densityFilter->m_FilterWidth;\n"//Need a signed int to use below, really is filter width, but reusing a variable to save space.
|
||||
"\n"
|
||||
" for (j = -k; j <= k; j++)\n"
|
||||
" {\n"
|
||||
" for (i = -k; i <= k; i++)\n"
|
||||
" {\n"
|
||||
" filterSelectInt = filterCoefIndex + coefIndices[(abs(j) * (densityFilter->m_FilterWidth + 1)) + abs(i)];\n"//Really is filterCoeffIndexPlusOffset, but reusing a variable to save space.
|
||||
"\n"
|
||||
" if (filterCoefs[filterSelectInt] != 0)\n"
|
||||
" {\n"
|
||||
" filterBox[(i + boxCol) + ((j + boxRow) * fullTempBoxWidth)].m_Real4 += (bucket * (filterCoefs[filterSelectInt] * cacheLog));\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"//If this is the only barrier and the block size is exactly 16, it works perfectly. Otherwise, no chunks occur, but a many streaks.
|
||||
" }\n"
|
||||
" }\n"//bucket.w != 0.
|
||||
" }\n"//In bounds.
|
||||
"\n"
|
||||
"\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);\n"
|
||||
"\n"
|
||||
" if (THREAD_ID_Y == 0)\n"
|
||||
" {\n"
|
||||
//At this point, all threads in this block have applied the filter to their surrounding pixel and stored the results in the temp local box.
|
||||
//Add the cells of it that are in bounds to the global accumulator.
|
||||
//Compute offsets in local box to read from, and offsets into global accumulator to write to.
|
||||
//Use a method here that is similar to the zeroization above: Each thread (column) in the first row iterates through all of the
|
||||
//rows and adds a few columns to the accumulator.
|
||||
" for (i = boxReadStartRow, j = accumWriteStartRow; i < boxReadEndRow; i++, j++)\n"
|
||||
" {\n"
|
||||
" for (k = 0; k < colsToWrite; k++)\n"//Write a few columns.
|
||||
" {\n"
|
||||
" boxCol = (colsToWrite * THREAD_ID_X) + k;\n"//Really is colOffset, but reusing a variable to save space.
|
||||
"\n"
|
||||
" if (boxReadStartCol + boxCol < boxReadEndCol)\n"
|
||||
" accumulator[(j * densityFilter->m_SuperRasW) + (accumWriteStartCol + boxCol)].m_Real4 += filterBox[(i * fullTempBoxWidth) + (boxReadStartCol + boxCol)].m_Real4;\n"
|
||||
" }\n"
|
||||
" barrier(CLK_GLOBAL_MEM_FENCE);\n"//This must be here or else chunks will go missing.
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n";
|
||||
|
||||
return os.str();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create the gaussian density filtering kernel string, but use no local cache and perform
|
||||
/// all writes directly to the global density filtering buffer.
|
||||
/// Note this applies the filter from top to bottom row and not from the center outward like the CPU version does.
|
||||
/// This allows the image to be filtered without suffering from pixel loss due to race conditions.
|
||||
/// This is used for when the filter box is greater than can fit in the local cache.
|
||||
/// While the cached version is incredibly fast, this version offers no real gain over doing it
|
||||
/// on the CPU because the frequent global memory access brings performance to a crawl.
|
||||
/// The supersample parameter will produce three different kernels.
|
||||
/// SS = 1, SS > 1 && SS even, SS > 1 && SS odd.
|
||||
/// The width of the kernl this runs in must be evenly divisible by 16 or else artifacts will occur.
|
||||
/// Note that because this function uses so many variables and is so complex, OpenCL can easily run
|
||||
/// out of resources in some cases. Certain variables had to be reused to condense the kernel footprint
|
||||
/// down enough to be able to run a block size of 32x32.
|
||||
/// For double precision, or for SS > 1, a size of 32x30 is used.
|
||||
/// </summary>
|
||||
/// <param name="ss">The supersample being used</param>
|
||||
/// <returns>The kernel string</returns>
|
||||
template <typename T>
|
||||
string DEOpenCLKernelCreator<T>::CreateGaussianDEKernelNoLocalCache(unsigned int ss)
|
||||
{
|
||||
bool doSS = ss > 1;
|
||||
bool doScf = !(ss & 1);
|
||||
ostringstream os;
|
||||
|
||||
os <<
|
||||
ConstantDefinesString(typeid(T) == typeid(double)) <<
|
||||
DensityFilterCLStructString <<
|
||||
UnionCLStructString <<
|
||||
AddToAccumWithCheckFunctionString <<
|
||||
"__kernel void " << GaussianDEEntryPoint(ss, MaxDEFilterSize() + 1) << "(\n" <<
|
||||
" const __global real4* histogram,\n"
|
||||
" __global real4reals* accumulator,\n"
|
||||
" __constant DensityFilterCL* densityFilter,\n"
|
||||
" const __global real_t* filterCoefs,\n"
|
||||
" const __global real_t* filterWidths,\n"
|
||||
" const __global uint* coefIndices,\n"
|
||||
" const uint chunkSizeW,\n"
|
||||
" const uint chunkSizeH,\n"
|
||||
" const uint rowParity,\n"
|
||||
" const uint colParity\n"
|
||||
"\t)\n"
|
||||
"{\n"
|
||||
//Parity determines if this function should execute.
|
||||
" if ((GLOBAL_ID_X >= densityFilter->m_SuperRasW) ||\n"
|
||||
" (GLOBAL_ID_Y >= densityFilter->m_SuperRasH) ||\n"
|
||||
" ((BLOCK_ID_X % chunkSizeW) != colParity) ||\n"
|
||||
" ((BLOCK_ID_Y % chunkSizeH) != rowParity)) \n"
|
||||
" return;\n"
|
||||
"\n";
|
||||
|
||||
if (doSS)
|
||||
{
|
||||
os <<
|
||||
" uint ss = (uint)floor((real_t)densityFilter->m_Supersample / 2.0);\n"
|
||||
" int densityBoxLeftX;\n"
|
||||
" int densityBoxRightX;\n"
|
||||
" int densityBoxTopY;\n"
|
||||
" int densityBoxBottomY;\n";
|
||||
|
||||
if (doScf)
|
||||
os << " real_t scfact = pow((real_t)densityFilter->m_Supersample / ((real_t)densityFilter->m_Supersample + 1.0), 2.0);\n";
|
||||
}
|
||||
|
||||
os <<
|
||||
//Compute the bounds of the area to be sampled, which is just the ends minus the super sample minus 1.
|
||||
" uint leftBound = densityFilter->m_Supersample - 1;\n"
|
||||
" uint rightBound = densityFilter->m_SuperRasW - (densityFilter->m_Supersample - 1);\n"
|
||||
" uint topBound = densityFilter->m_Supersample - 1;\n"
|
||||
" uint botBound = densityFilter->m_SuperRasH - (densityFilter->m_Supersample - 1);\n"
|
||||
"\n"
|
||||
//Start and end values are the indices in the histogram read from and written to in the accumulator.
|
||||
//Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds.
|
||||
" uint blockHistStartRow = min(botBound, topBound + (BLOCK_ID_Y * BLOCK_SIZE_Y));\n"//The first histogram row this block will process.
|
||||
" uint threadHistRow = blockHistStartRow + THREAD_ID_Y;\n"//The histogram row this individual thread will be reading from.
|
||||
"\n"
|
||||
" uint blockHistStartCol = min(rightBound, leftBound + (BLOCK_ID_X * BLOCK_SIZE_X));\n"//The first histogram column this block will process.
|
||||
" uint threadHistCol = blockHistStartCol + THREAD_ID_X;\n"//The histogram column this individual thread will be reading from.
|
||||
"\n"
|
||||
" int i, j;\n"
|
||||
" uint filterSelectInt, filterCoefIndex;\n"
|
||||
" real_t cacheLog;\n"
|
||||
" real_t logScale;\n"
|
||||
" real_t filterSelect;\n"
|
||||
" real4 bucket;\n"
|
||||
"\n"
|
||||
" if (threadHistRow < botBound && threadHistCol < rightBound)\n"
|
||||
" {\n"
|
||||
" bucket = histogram[(threadHistRow * densityFilter->m_SuperRasW) + threadHistCol];\n"
|
||||
"\n"
|
||||
" if (bucket.w != 0)\n"
|
||||
" {\n"
|
||||
" cacheLog = (densityFilter->m_K1 * log(1.0 + bucket.w * densityFilter->m_K2)) / bucket.w;\n";
|
||||
|
||||
if (doSS)
|
||||
{
|
||||
os <<
|
||||
" filterSelect = 0;\n"
|
||||
" densityBoxLeftX = threadHistCol - min(threadHistCol, ss);\n"
|
||||
" densityBoxRightX = threadHistCol + min(ss, (densityFilter->m_SuperRasW - threadHistCol) - 1);\n"
|
||||
" densityBoxTopY = threadHistRow - min(threadHistRow, ss);\n"
|
||||
" densityBoxBottomY = threadHistRow + min(ss, (densityFilter->m_SuperRasH - threadHistRow) - 1);\n"
|
||||
"\n"
|
||||
" for (j = densityBoxTopY; j <= densityBoxBottomY; j++)\n"
|
||||
" {\n"
|
||||
" for (i = densityBoxLeftX; i <= densityBoxRightX; i++)\n"
|
||||
" {\n"
|
||||
" filterSelect += histogram[i + (j * densityFilter->m_SuperRasW)].w;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"\n";
|
||||
|
||||
if (doScf)
|
||||
os << " filterSelect *= scfact;\n";
|
||||
}
|
||||
else
|
||||
{
|
||||
os
|
||||
<< " filterSelect = bucket.w;\n";
|
||||
}
|
||||
|
||||
os <<
|
||||
"\n"
|
||||
" if (filterSelect > densityFilter->m_MaxFilteredCounts)\n"
|
||||
" filterSelectInt = densityFilter->m_MaxFilterIndex;\n"
|
||||
" else if (filterSelect <= DE_THRESH)\n"
|
||||
" filterSelectInt = (int)ceil(filterSelect) - 1;\n"
|
||||
" else\n"
|
||||
" filterSelectInt = (int)DE_THRESH + (int)floor(pow((real_t)(filterSelect - DE_THRESH), densityFilter->m_Curve));\n"
|
||||
"\n"
|
||||
" if (filterSelectInt > densityFilter->m_MaxFilterIndex)\n"
|
||||
" filterSelectInt = densityFilter->m_MaxFilterIndex;\n"
|
||||
"\n"
|
||||
" filterCoefIndex = filterSelectInt * densityFilter->m_KernelSize;\n"
|
||||
"\n"
|
||||
" int fw = (int)densityFilter->m_FilterWidth;\n"//Need a signed int to use below.
|
||||
"\n"
|
||||
" for (j = -fw; j <= fw; j++)\n"
|
||||
" {\n"
|
||||
" for (i = -fw; i <= fw; i++)\n"
|
||||
" {\n"
|
||||
" if (AccumCheck(densityFilter->m_SuperRasW, densityFilter->m_SuperRasH, threadHistCol, i, threadHistRow, j))\n"
|
||||
" {\n"
|
||||
" filterSelectInt = filterCoefIndex + coefIndices[(abs(j) * (densityFilter->m_FilterWidth + 1)) + abs(i)];\n"//Really is filterCoeffIndexPlusOffset, but reusing a variable to save space.
|
||||
"\n"
|
||||
" if (filterCoefs[filterSelectInt] != 0)\n"
|
||||
" {\n"
|
||||
" accumulator[(i + threadHistCol) + ((j + threadHistRow) * densityFilter->m_SuperRasW)].m_Real4 += (bucket * (filterCoefs[filterSelectInt] * cacheLog));\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" barrier(CLK_GLOBAL_MEM_FENCE);\n"//Required to avoid streaks.
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"//bucket.w != 0.
|
||||
" }\n"//In bounds.
|
||||
"\n"
|
||||
//" barrier(CLK_GLOBAL_MEM_FENCE);\n"//Just to be safe.
|
||||
"}\n";
|
||||
|
||||
return os.str();
|
||||
}
|
||||
}
|
89
Source/EmberCL/DEOpenCLKernelCreator.h
Normal file
89
Source/EmberCL/DEOpenCLKernelCreator.h
Normal file
@ -0,0 +1,89 @@
|
||||
#pragma once
|
||||
|
||||
#include "EmberCLPch.h"
|
||||
#include "EmberCLStructs.h"
|
||||
#include "EmberCLFunctions.h"
|
||||
|
||||
/// <summary>
|
||||
/// DEOpenCLKernelCreator class.
|
||||
/// </summary>
|
||||
|
||||
namespace EmberCLns
|
||||
{
|
||||
/// <summary>
|
||||
/// Kernel creator for density filtering.
|
||||
/// This implements both basic log scale filtering
|
||||
/// as well as the full flam3 density estimation filtering
|
||||
/// in OpenCL.
|
||||
/// Several conditionals are present in the CPU version. They
|
||||
/// are stripped out of the kernels and instead a separate kernel
|
||||
/// is created for every possible case.
|
||||
/// If the filter width is 9 or less, then the entire process can be
|
||||
/// done in shared memory which is very fast.
|
||||
/// However, if the filter width is greater than 9, shared memory is not
|
||||
/// used and all filtering is done directly with main global VRAM. This
|
||||
/// ends up being not much faster than doing it on the CPU.
|
||||
/// String members are kept for the program source and entry points
|
||||
/// for each version of the program.
|
||||
/// Template argument expected to be float or double.
|
||||
/// </summary>
|
||||
template <typename T>
|
||||
class EMBERCL_API DEOpenCLKernelCreator
|
||||
{
|
||||
public:
|
||||
DEOpenCLKernelCreator();
|
||||
DEOpenCLKernelCreator(bool nVidia);
|
||||
|
||||
//Accessors.
|
||||
string LogScaleSumDEKernel();
|
||||
string LogScaleSumDEEntryPoint();
|
||||
string LogScaleAssignDEKernel();
|
||||
string LogScaleAssignDEEntryPoint();
|
||||
string GaussianDEKernel(unsigned int ss, unsigned int filterWidth);
|
||||
string GaussianDEEntryPoint(unsigned int ss, unsigned int filterWidth);
|
||||
|
||||
//Miscellaneous static functions.
|
||||
static unsigned int MaxDEFilterSize();
|
||||
static T SolveMaxDERad(unsigned int maxBoxSize, T desiredFilterSize, T ss);
|
||||
static unsigned int SolveMaxBoxSize(unsigned int localMem);
|
||||
|
||||
private:
|
||||
//Kernel creators.
|
||||
string CreateLogScaleSumDEKernelString();
|
||||
string CreateLogScaleAssignDEKernelString();
|
||||
string CreateGaussianDEKernel(unsigned int ss);
|
||||
string CreateGaussianDEKernelNoLocalCache(unsigned int ss);
|
||||
|
||||
string m_LogScaleSumDEKernel;
|
||||
string m_LogScaleSumDEEntryPoint;
|
||||
|
||||
string m_LogScaleAssignDEKernel;
|
||||
string m_LogScaleAssignDEEntryPoint;
|
||||
|
||||
string m_GaussianDEWithoutSsKernel;
|
||||
string m_GaussianDEWithoutSsEntryPoint;
|
||||
|
||||
string m_GaussianDESsWithScfKernel;
|
||||
string m_GaussianDESsWithScfEntryPoint;
|
||||
|
||||
string m_GaussianDESsWithoutScfKernel;
|
||||
string m_GaussianDESsWithoutScfEntryPoint;
|
||||
|
||||
string m_GaussianDEWithoutSsNoCacheKernel;
|
||||
string m_GaussianDEWithoutSsNoCacheEntryPoint;
|
||||
|
||||
string m_GaussianDESsWithScfNoCacheKernel;
|
||||
string m_GaussianDESsWithScfNoCacheEntryPoint;
|
||||
|
||||
string m_GaussianDESsWithoutScfNoCacheKernel;
|
||||
string m_GaussianDESsWithoutScfNoCacheEntryPoint;
|
||||
|
||||
bool m_NVidia;
|
||||
};
|
||||
|
||||
template EMBERCL_API class DEOpenCLKernelCreator<float>;
|
||||
|
||||
#ifdef DO_DOUBLE
|
||||
template EMBERCL_API class DEOpenCLKernelCreator<double>;
|
||||
#endif
|
||||
}
|
20
Source/EmberCL/DllMain.cpp
Normal file
20
Source/EmberCL/DllMain.cpp
Normal file
@ -0,0 +1,20 @@
|
||||
#include "EmberCLPch.h"
|
||||
|
||||
/// <summary>
|
||||
/// Generated by Visual Studio to make the DLL run properly.
|
||||
/// </summary>
|
||||
BOOL APIENTRY DllMain( HMODULE hModule,
|
||||
DWORD ul_reason_for_call,
|
||||
LPVOID lpReserved
|
||||
)
|
||||
{
|
||||
switch (ul_reason_for_call)
|
||||
{
|
||||
case DLL_PROCESS_ATTACH:
|
||||
case DLL_THREAD_ATTACH:
|
||||
case DLL_THREAD_DETACH:
|
||||
case DLL_PROCESS_DETACH:
|
||||
break;
|
||||
}
|
||||
return TRUE;
|
||||
}
|
413
Source/EmberCL/EmberCLFunctions.h
Normal file
413
Source/EmberCL/EmberCLFunctions.h
Normal file
@ -0,0 +1,413 @@
|
||||
#pragma once
|
||||
|
||||
#include "EmberCLPch.h"
|
||||
#include "EmberCLStructs.h"
|
||||
|
||||
/// <summary>
|
||||
/// OpenCL global function strings.
|
||||
/// </summary>
|
||||
|
||||
namespace EmberCLns
|
||||
{
|
||||
/// <summary>
|
||||
/// OpenCL equivalent of Palette::RgbToHsv().
|
||||
/// </summary>
|
||||
static const char* RgbToHsvFunctionString =
|
||||
//rgb 0 - 1,
|
||||
//h 0 - 6, s 0 - 1, v 0 - 1
|
||||
"static inline void RgbToHsv(real4* rgb, real4* hsv)\n"
|
||||
"{\n"
|
||||
" real_t max, min, del, rc, gc, bc;\n"
|
||||
"\n"
|
||||
//Compute maximum of r, g, b.
|
||||
" if ((*rgb).x >= (*rgb).y)\n"
|
||||
" {\n"
|
||||
" if ((*rgb).x >= (*rgb).z)\n"
|
||||
" max = (*rgb).x;\n"
|
||||
" else\n"
|
||||
" max = (*rgb).z;\n"
|
||||
" }\n"
|
||||
" else\n"
|
||||
" {\n"
|
||||
" if ((*rgb).y >= (*rgb).z)\n"
|
||||
" max = (*rgb).y;\n"
|
||||
" else\n"
|
||||
" max = (*rgb).z;\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
//Compute minimum of r, g, b.
|
||||
" if ((*rgb).x <= (*rgb).y)\n"
|
||||
" {\n"
|
||||
" if ((*rgb).x <= (*rgb).z)\n"
|
||||
" min = (*rgb).x;\n"
|
||||
" else\n"
|
||||
" min = (*rgb).z;\n"
|
||||
" }\n"
|
||||
" else\n"
|
||||
" {\n"
|
||||
" if ((*rgb).y <= (*rgb).z)\n"
|
||||
" min = (*rgb).y;\n"
|
||||
" else\n"
|
||||
" min = (*rgb).z;\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
" del = max - min;\n"
|
||||
" (*hsv).z = max;\n"
|
||||
"\n"
|
||||
" if (max != 0)\n"
|
||||
" (*hsv).y = del / max;\n"
|
||||
" else\n"
|
||||
" (*hsv).y = 0;\n"
|
||||
"\n"
|
||||
" (*hsv).x = 0;\n"
|
||||
" if ((*hsv).y != 0)\n"
|
||||
" {\n"
|
||||
" rc = (max - (*rgb).x) / del;\n"
|
||||
" gc = (max - (*rgb).y) / del;\n"
|
||||
" bc = (max - (*rgb).z) / del;\n"
|
||||
"\n"
|
||||
" if ((*rgb).x == max)\n"
|
||||
" (*hsv).x = bc - gc;\n"
|
||||
" else if ((*rgb).y == max)\n"
|
||||
" (*hsv).x = 2 + rc - bc;\n"
|
||||
" else if ((*rgb).z == max)\n"
|
||||
" (*hsv).x = 4 + gc - rc;\n"
|
||||
"\n"
|
||||
" if ((*hsv).x < 0)\n"
|
||||
" (*hsv).x += 6;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n";
|
||||
|
||||
/// <summary>
|
||||
/// OpenCL equivalent of Palette::HsvToRgb().
|
||||
/// </summary>
|
||||
static const char* HsvToRgbFunctionString =
|
||||
//h 0 - 6, s 0 - 1, v 0 - 1
|
||||
//rgb 0 - 1
|
||||
"static inline void HsvToRgb(real4* hsv, real4* rgb)\n"
|
||||
"{\n"
|
||||
" int j;\n"
|
||||
" real_t f, p, q, t;\n"
|
||||
"\n"
|
||||
" while ((*hsv).x >= 6)\n"
|
||||
" (*hsv).x = (*hsv).x - 6;\n"
|
||||
"\n"
|
||||
" while ((*hsv).x < 0)\n"
|
||||
" (*hsv).x = (*hsv).x + 6;\n"
|
||||
"\n"
|
||||
" j = (int)floor((*hsv).x);\n"
|
||||
" f = (*hsv).x - j;\n"
|
||||
" p = (*hsv).z * (1 - (*hsv).y);\n"
|
||||
" q = (*hsv).z * (1 - ((*hsv).y * f));\n"
|
||||
" t = (*hsv).z * (1 - ((*hsv).y * (1 - f)));\n"
|
||||
"\n"
|
||||
" switch (j)\n"
|
||||
" {\n"
|
||||
" case 0: (*rgb).x = (*hsv).z; (*rgb).y = t; (*rgb).z = p; break;\n"
|
||||
" case 1: (*rgb).x = q; (*rgb).y = (*hsv).z; (*rgb).z = p; break;\n"
|
||||
" case 2: (*rgb).x = p; (*rgb).y = (*hsv).z; (*rgb).z = t; break;\n"
|
||||
" case 3: (*rgb).x = p; (*rgb).y = q; (*rgb).z = (*hsv).z; break;\n"
|
||||
" case 4: (*rgb).x = t; (*rgb).y = p; (*rgb).z = (*hsv).z; break;\n"
|
||||
" case 5: (*rgb).x = (*hsv).z; (*rgb).y = p; (*rgb).z = q; break;\n"
|
||||
" default: (*rgb).x = (*hsv).z; (*rgb).y = t; (*rgb).z = p; break;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n";
|
||||
|
||||
/// <summary>
|
||||
/// OpenCL equivalent of Palette::CalcAlpha().
|
||||
/// </summary>
|
||||
static const char* CalcAlphaFunctionString =
|
||||
"static inline real_t CalcAlpha(real_t density, real_t gamma, real_t linrange)\n"//Not the slightest clue what this is doing.//DOC
|
||||
"{\n"
|
||||
" real_t frac, alpha, funcval = pow(linrange, gamma);\n"
|
||||
"\n"
|
||||
" if (density > 0)\n"
|
||||
" {\n"
|
||||
" if (density < linrange)\n"
|
||||
" {\n"
|
||||
" frac = density / linrange;\n"
|
||||
" alpha = (1.0 - frac) * density * (funcval / linrange) + frac * pow(density, gamma);\n"
|
||||
" }\n"
|
||||
" else\n"
|
||||
" alpha = pow(density, gamma);\n"
|
||||
" }\n"
|
||||
" else\n"
|
||||
" alpha = 0;\n"
|
||||
"\n"
|
||||
" return alpha;\n"
|
||||
"}\n"
|
||||
"\n";
|
||||
|
||||
/// <summary>
|
||||
/// Use MWC 64 from David Thomas at the Imperial College of London for
|
||||
/// random numbers in OpenCL, instead of ISAAC which was used
|
||||
/// for CPU rendering.
|
||||
/// </summary>
|
||||
static const char* RandFunctionString =
|
||||
"enum { MWC64X_A = 4294883355u };\n\n"
|
||||
"inline uint MwcNext(uint2* s)\n"
|
||||
"{\n"
|
||||
" uint res = (*s).x ^ (*s).y; \n"//Calculate the result.
|
||||
" uint hi = mul_hi((*s).x, MWC64X_A); \n"//Step the RNG.
|
||||
" (*s).x = (*s).x * MWC64X_A + (*s).y;\n"//Pack the state back up.
|
||||
" (*s).y = hi + ((*s).x < (*s).y); \n"
|
||||
" return res; \n"//Return the next result.
|
||||
"}\n"
|
||||
"\n"
|
||||
"inline uint MwcNextRange(uint2* s, uint val)\n"
|
||||
"{\n"
|
||||
" return (val == 0) ? MwcNext(s) : (MwcNext(s) % val);\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"inline real_t MwcNext01(uint2* s)\n"
|
||||
"{\n"
|
||||
" return MwcNext(s) * (1.0 / 4294967296.0);\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"inline real_t MwcNextNeg1Pos1(uint2* s)\n"
|
||||
"{\n"
|
||||
" real_t f = (real_t)MwcNext(s) / UINT_MAX;\n"
|
||||
" return -1.0 + (f * (1.0 - (-1.0)));\n"
|
||||
"}\n"
|
||||
"\n";
|
||||
|
||||
/// <summary>
|
||||
/// OpenCL equivalent of the global ClampRef().
|
||||
/// </summary>
|
||||
static const char* ClampRealFunctionString =
|
||||
"inline real_t Clamp(real_t val, real_t min, real_t max)\n"
|
||||
"{\n"
|
||||
" if (val < min)\n"
|
||||
" return min;\n"
|
||||
" else if (val > max)\n"
|
||||
" return max;\n"
|
||||
" else\n"
|
||||
" return val;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"inline void ClampRef(real_t* val, real_t min, real_t max)\n"
|
||||
"{\n"
|
||||
" if (*val < min)\n"
|
||||
" *val = min;\n"
|
||||
" else if (*val > max)\n"
|
||||
" *val = max;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"inline real_t ClampGte(real_t val, real_t gte)\n"
|
||||
"{\n"
|
||||
" return (val < gte) ? gte : val;\n"
|
||||
"}\n"
|
||||
"\n";
|
||||
|
||||
/// <summary>
|
||||
/// OpenCL equivalent of the global LRint().
|
||||
/// </summary>
|
||||
static const char* InlineMathFunctionsString =
|
||||
"inline real_t LRint(real_t x)\n"
|
||||
"{\n"
|
||||
" intPrec temp = (x >= 0.0 ? (intPrec)(x + 0.5) : (intPrec)(x - 0.5));\n"
|
||||
" return (real_t)temp;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"inline real_t Round(real_t r)\n"
|
||||
"{\n"
|
||||
" return (r > 0.0) ? floor(r + 0.5) : ceil(r - 0.5);\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"inline real_t Sign(real_t v)\n"
|
||||
"{\n"
|
||||
" return (v < 0.0) ? -1 : (v > 0.0) ? 1 : 0.0;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"inline real_t SignNz(real_t v)\n"
|
||||
"{\n"
|
||||
" return (v < 0.0) ? -1.0 : 1.0;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"inline real_t Sqr(real_t v)\n"
|
||||
"{\n"
|
||||
" return v * v;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"inline real_t SafeSqrt(real_t x)\n"
|
||||
"{\n"
|
||||
" if (x <= 0.0)\n"
|
||||
" return 0.0;\n"
|
||||
"\n"
|
||||
" return sqrt(x);\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"inline real_t Cube(real_t v)\n"
|
||||
"{\n"
|
||||
" return v * v * v;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"inline real_t Hypot(real_t x, real_t y)\n"
|
||||
"{\n"
|
||||
" return sqrt(SQR(x) + SQR(y));\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"inline real_t Spread(real_t x, real_t y)\n"
|
||||
"{\n"
|
||||
" return Hypot(x, y) * ((x) > 0.0 ? 1.0 : -1.0);\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"inline real_t Powq4(real_t x, real_t y)\n"
|
||||
"{\n"
|
||||
" return pow(fabs(x), y) * SignNz(x);\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"inline real_t Powq4c(real_t x, real_t y)\n"
|
||||
"{\n"
|
||||
" return y == 1.0 ? x : Powq4(x, y);\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"inline real_t Zeps(real_t x)\n"
|
||||
"{\n"
|
||||
" return x == 0.0 ? EPS6 : x;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"inline real_t Lerp(real_t a, real_t b, real_t p)\n"
|
||||
"{\n"
|
||||
" return a + (b - a) * p;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"inline real_t Fabsmod(real_t v)\n"
|
||||
"{\n"
|
||||
" real_t dummy;\n"
|
||||
"\n"
|
||||
" return modf(v, &dummy);\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"inline real_t Fosc(real_t p, real_t amp, real_t ph)\n"
|
||||
"{\n"
|
||||
" return 0.5 - cos(p * amp + ph) * 0.5;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"inline real_t Foscn(real_t p, real_t ph)\n"
|
||||
"{\n"
|
||||
" return 0.5 - cos(p + ph) * 0.5;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"inline real_t LogScale(real_t x)\n"
|
||||
"{\n"
|
||||
" return x == 0.0 ? 0.0 : log((fabs(x) + 1) * M_E) * SignNz(x) / M_E;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"inline real_t LogMap(real_t x)\n"
|
||||
"{\n"
|
||||
" return x == 0.0 ? 0.0 : (M_E + log(x * M_E)) * 0.25 * SignNz(x);\n"
|
||||
"}\n"
|
||||
"\n";
|
||||
|
||||
/// <summary>
|
||||
/// OpenCL equivalent Renderer::AddToAccum().
|
||||
/// </summary>
|
||||
static const char* AddToAccumWithCheckFunctionString =
|
||||
"inline bool AccumCheck(int superRasW, int superRasH, int i, int ii, int j, int jj)\n"
|
||||
"{\n"
|
||||
" return (j + jj >= 0 && j + jj < superRasH && i + ii >= 0 && i + ii < superRasW);\n"
|
||||
"}\n"
|
||||
"\n";
|
||||
|
||||
/// <summary>
|
||||
/// OpenCL equivalent various CarToRas member functions.
|
||||
/// </summary>
|
||||
static const char* CarToRasFunctionString =
|
||||
"inline void CarToRasConvertPointToSingle(__constant CarToRasCL* carToRas, Point* point, unsigned int* singleBufferIndex)\n"
|
||||
"{\n"
|
||||
" *singleBufferIndex = (unsigned int)(carToRas->m_PixPerImageUnitW * point->m_X - carToRas->m_RasLlX) + (carToRas->m_RasWidth * (unsigned int)(carToRas->m_PixPerImageUnitH * point->m_Y - carToRas->m_RasLlY));\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"inline bool CarToRasInBounds(__constant CarToRasCL* carToRas, Point* point)\n"
|
||||
"{\n"
|
||||
" return point->m_X >= carToRas->m_CarLlX &&\n"
|
||||
" point->m_X < carToRas->m_CarUrX &&\n"
|
||||
" point->m_Y < carToRas->m_CarUrY &&\n"
|
||||
" point->m_Y >= carToRas->m_CarLlY;\n"
|
||||
"}\n"
|
||||
"\n";
|
||||
|
||||
static string AtomicString(bool doublePrecision, bool dp64AtomicSupport)
|
||||
{
|
||||
ostringstream os;
|
||||
|
||||
//If they want single precision, or if they want double precision and have dp atomic support.
|
||||
if (!doublePrecision || dp64AtomicSupport)
|
||||
{
|
||||
os <<
|
||||
"void AtomicAdd(volatile __global real_t* source, const real_t operand)\n"
|
||||
"{\n"
|
||||
" union\n"
|
||||
" {\n"
|
||||
" atomi intVal;\n"
|
||||
" real_t realVal;\n"
|
||||
" } newVal;\n"
|
||||
"\n"
|
||||
" union\n"
|
||||
" {\n"
|
||||
" atomi intVal;\n"
|
||||
" real_t realVal;\n"
|
||||
" } prevVal;\n"
|
||||
"\n"
|
||||
" do\n"
|
||||
" {\n"
|
||||
" prevVal.realVal = *source;\n"
|
||||
" newVal.realVal = prevVal.realVal + operand;\n"
|
||||
" } while (atomic_cmpxchg((volatile __global atomi*)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);\n"
|
||||
"}\n";
|
||||
}
|
||||
else//They want double precision and do not have dp atomic support.
|
||||
{
|
||||
os <<
|
||||
"void AtomicAdd(volatile __global real_t* source, const real_t operand)\n"
|
||||
"{\n"
|
||||
" union\n"
|
||||
" {\n"
|
||||
" uint intVal[2];\n"
|
||||
" real_t realVal;\n"
|
||||
" } newVal;\n"
|
||||
"\n"
|
||||
" union\n"
|
||||
" {\n"
|
||||
" uint intVal[2];\n"
|
||||
" real_t realVal;\n"
|
||||
" } prevVal;\n"
|
||||
"\n"
|
||||
" do\n"
|
||||
" {\n"
|
||||
" prevVal.realVal = *source;\n"
|
||||
" newVal.realVal = prevVal.realVal + operand;\n"
|
||||
" } while ((atomic_cmpxchg((volatile __global uint*)source, prevVal.intVal[0], newVal.intVal[0]) != prevVal.intVal[0]) ||\n"
|
||||
" (atomic_cmpxchg((volatile __global uint*)source + 1, prevVal.intVal[1], newVal.intVal[1]) != prevVal.intVal[1]));\n"
|
||||
"}\n";
|
||||
}
|
||||
|
||||
return os.str();
|
||||
}
|
||||
|
||||
#ifdef GRAVEYARD
|
||||
/*"void AtomicLocalAdd(volatile __local real_t* source, const real_t operand)\n"
|
||||
"{\n"
|
||||
" union\n"
|
||||
" {\n"
|
||||
" atomi intVal;\n"
|
||||
" real_t realVal;\n"
|
||||
" } newVal;\n"
|
||||
"\n"
|
||||
" union\n"
|
||||
" {\n"
|
||||
" atomi intVal;\n"
|
||||
" real_t realVal;\n"
|
||||
" } prevVal;\n"
|
||||
"\n"
|
||||
" do\n"
|
||||
" {\n"
|
||||
" prevVal.realVal = *source;\n"
|
||||
" newVal.realVal = prevVal.realVal + operand;\n"
|
||||
" } while (atomic_cmpxchg((volatile __local atomi*)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);\n"
|
||||
"}\n"*/
|
||||
#endif
|
||||
}
|
39
Source/EmberCL/EmberCLPch.h
Normal file
39
Source/EmberCL/EmberCLPch.h
Normal file
@ -0,0 +1,39 @@
|
||||
#pragma once
|
||||
|
||||
/// <summary>
|
||||
/// Precompiled header file. Place all system includes here with appropriate #defines for different operating systems and compilers.
|
||||
/// </summary>
|
||||
|
||||
#define NOMINMAX
|
||||
#define WIN32_LEAN_AND_MEAN//Exclude rarely-used stuff from Windows headers.
|
||||
#define _USE_MATH_DEFINES
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
#include <SDKDDKVer.h>
|
||||
#endif
|
||||
|
||||
#include <utility>
|
||||
#include <CL/cl.hpp>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <iterator>
|
||||
#include <time.h>
|
||||
|
||||
#include "Timing.h"
|
||||
#include "Renderer.h"
|
||||
|
||||
#if defined(BUILDING_EMBERCL)
|
||||
#define EMBERCL_API __declspec(dllexport)
|
||||
#else
|
||||
#define EMBERCL_API __declspec(dllimport)
|
||||
#endif
|
||||
|
||||
using namespace std;
|
||||
using namespace EmberNs;
|
||||
//#define TEST_CL 1
|
383
Source/EmberCL/EmberCLStructs.h
Normal file
383
Source/EmberCL/EmberCLStructs.h
Normal file
@ -0,0 +1,383 @@
|
||||
#pragma once
|
||||
|
||||
#include "EmberCLPch.h"
|
||||
|
||||
/// <summary>
|
||||
/// Various data structures defined for the CPU and OpenCL.
|
||||
/// These are stripped down versions of THE classes in Ember, for use with OpenCL.
|
||||
/// Their sole purpose is to pass values from the host to the device.
|
||||
/// They retain most of the member variables, but do not contain the functions.
|
||||
/// Visual Studio defaults to alighment of 16, but it's made explicit in case another compiler is used.
|
||||
/// This must match the alignment specified in the kernel.
|
||||
/// </summary>
|
||||
|
||||
namespace EmberCLns
|
||||
{
|
||||
#define ALIGN __declspec(align(16))//These two must always match.
|
||||
#define ALIGN_CL "((aligned (16)))"//The extra parens are necessary.
|
||||
|
||||
/// <summary>
|
||||
/// Various constants needed for rendering.
|
||||
/// </summary>
|
||||
static string ConstantDefinesString(bool doublePrecision)
|
||||
{
|
||||
ostringstream os;
|
||||
|
||||
if (doublePrecision)
|
||||
{
|
||||
os << "#if defined(cl_amd_fp64)\n"//AMD extension available?
|
||||
<< " #pragma OPENCL EXTENSION cl_amd_fp64 : enable\n"
|
||||
<< "#endif\n"
|
||||
<< "#if defined(cl_khr_fp64)\n"//Khronos extension available?
|
||||
<< " #pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
|
||||
<< "#endif\n"
|
||||
<< "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n"//Only supported on nVidia.
|
||||
<< "typedef long intPrec;\n"
|
||||
<< "typedef ulong atomi;\n"
|
||||
<< "typedef double real_t;\n"
|
||||
<< "typedef double4 real4;\n";
|
||||
}
|
||||
else
|
||||
{
|
||||
os << "typedef int intPrec;\n"
|
||||
"typedef unsigned int atomi;\n"
|
||||
"typedef float real_t;\n"
|
||||
"typedef float4 real4;\n";
|
||||
}
|
||||
|
||||
os <<
|
||||
"typedef long int int64;\n"
|
||||
"typedef unsigned long int uint64;\n"
|
||||
"\n"
|
||||
"#define EPS ((1e-10))\n"//May need to change this, it might not be enough in some cases. Maybe try 1e-9 if things look funny when close to zero.
|
||||
"#define EPS6 ((1e-6))\n"
|
||||
"\n"
|
||||
"//The number of threads per block used in the iteration function. Don't change\n"
|
||||
"//it lightly; the block size is hard coded to be exactly 32 x 8.\n"
|
||||
"#define NTHREADS 256u\n"
|
||||
"#define THREADS_PER_WARP 32u\n"
|
||||
"#define NWARPS (NTHREADS / THREADS_PER_WARP)\n"
|
||||
"#define COLORMAP_LENGTH 256u\n"
|
||||
"#define COLORMAP_LENGTH_MINUS_1 255u\n"
|
||||
"#define DE_THRESH 100u\n"
|
||||
"#define BadVal(x) (((x) != (x)) || ((x) > 1e10) || ((x) < -1e10))\n"
|
||||
"#define Rint(A) floor((A) + (((A) < 0) ? -0.5 : 0.5))\n"
|
||||
"#define SQR(x) ((x) * (x))\n"
|
||||
"#define CUBE(x) ((x) * (x) * (x))\n"
|
||||
"#define M_2PI (M_PI * 2)\n"
|
||||
"#define M_3PI (M_PI * 3)\n"
|
||||
"#define SQRT5 2.2360679774997896964091736687313\n"
|
||||
"#define M_PHI 1.61803398874989484820458683436563\n"
|
||||
"#define DEG_2_RAD (M_PI / 180)\n"
|
||||
"\n"
|
||||
"//Index in each dimension of a thread within a block.\n"
|
||||
"#define THREAD_ID_X (get_local_id(0))\n"
|
||||
"#define THREAD_ID_Y (get_local_id(1))\n"
|
||||
"#define THREAD_ID_Z (get_local_id(2))\n"
|
||||
"\n"
|
||||
"//Index in each dimension of a block within a grid.\n"
|
||||
"#define BLOCK_ID_X (get_group_id(0))\n"
|
||||
"#define BLOCK_ID_Y (get_group_id(1))\n"
|
||||
"#define BLOCK_ID_Z (get_group_id(2))\n"
|
||||
"\n"
|
||||
"//Absolute index in each dimension of a thread within a grid.\n"
|
||||
"#define GLOBAL_ID_X (get_global_id(0))\n"
|
||||
"#define GLOBAL_ID_Y (get_global_id(1))\n"
|
||||
"#define GLOBAL_ID_Z (get_global_id(2))\n"
|
||||
"\n"
|
||||
"//Dimensions of a block.\n"
|
||||
"#define BLOCK_SIZE_X (get_local_size(0))\n"
|
||||
"#define BLOCK_SIZE_Y (get_local_size(1))\n"
|
||||
"#define BLOCK_SIZE_Z (get_local_size(2))\n"
|
||||
"\n"
|
||||
"//Dimensions of a grid, in terms of blocks.\n"
|
||||
"#define GRID_SIZE_X (get_num_groups(0))\n"
|
||||
"#define GRID_SIZE_Y (get_num_groups(1))\n"
|
||||
"#define GRID_SIZE_Z (get_num_groups(2))\n"
|
||||
"\n"
|
||||
"//Dimensions of a grid, in terms of threads.\n"
|
||||
"#define GLOBAL_SIZE_X (get_global_size(0))\n"
|
||||
"#define GLOBAL_SIZE_Y (get_global_size(1))\n"
|
||||
"#define GLOBAL_SIZE_Z (get_global_size(2))\n"
|
||||
"\n"
|
||||
"#define INDEX_IN_BLOCK_2D (THREAD_ID_Y * BLOCK_SIZE_X + THREAD_ID_X)\n"
|
||||
"#define INDEX_IN_BLOCK_3D ((BLOCK_SIZE_X * BLOCK_SIZE_Y * THREAD_ID_Z) + INDEX_IN_BLOCK_2D)\n"
|
||||
"\n"
|
||||
"#define INDEX_IN_GRID_2D (GLOBAL_ID_Y * GLOBAL_SIZE_X + GLOBAL_ID_X)\n"
|
||||
"#define INDEX_IN_GRID_3D ((GLOBAL_SIZE_X * GLOBAL_SIZE_Y * GLOBAL_ID_Z) + INDEX_IN_GRID_2D)\n"
|
||||
"\n";
|
||||
|
||||
return os.str();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A point structure on the host that maps to the one used on the device to iterate in OpenCL.
|
||||
/// It might seem better to use vec4, however 2D palettes and even 3D coordinates may eventually
|
||||
/// be supported, which will make it more than 4 members.
|
||||
/// </summary>
|
||||
template <typename T>
|
||||
struct ALIGN PointCL
|
||||
{
|
||||
T m_X;
|
||||
T m_Y;
|
||||
T m_Z;
|
||||
T m_ColorX;
|
||||
T m_LastXfUsed;
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// The point structure used to iterate in OpenCL.
|
||||
/// It might seem better to use float4, however 2D palettes and even 3D coordinates may eventually
|
||||
/// be supported, which will make it more than 4 members.
|
||||
/// </summary>
|
||||
static const char* PointCLStructString =
|
||||
"typedef struct __attribute__ " ALIGN_CL " _Point\n"
|
||||
"{\n"
|
||||
" real_t m_X;\n"
|
||||
" real_t m_Y;\n"
|
||||
" real_t m_Z;\n"
|
||||
" real_t m_ColorX;\n"
|
||||
" uint m_LastXfUsed;\n"
|
||||
"} Point;\n"
|
||||
"\n";
|
||||
|
||||
#define MAX_CL_VARS 8//These must always match.
|
||||
#define MAX_CL_VARS_STRING "8"
|
||||
|
||||
/// <summary>
|
||||
/// A structure on the host used to hold all of the needed information for an xform used on the device to iterate in OpenCL.
|
||||
/// Template argument expected to be float or double.
|
||||
/// </summary>
|
||||
template <typename T>
|
||||
struct ALIGN XformCL
|
||||
{
|
||||
T m_A, m_B, m_C, m_D, m_E, m_F;//24 (48)
|
||||
T m_VariationWeights[MAX_CL_VARS];//56 (112)
|
||||
T m_PostA, m_PostB, m_PostC, m_PostD, m_PostE, m_PostF;//80 (160)
|
||||
T m_DirectColor;//84 (168)
|
||||
T m_ColorSpeedCache;//88 (176)
|
||||
T m_OneMinusColorCache;//92 (184)
|
||||
T m_Opacity;//96 (192)
|
||||
T m_VizAdjusted;//100 (200)
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// The xform structure used to iterate in OpenCL.
|
||||
/// </summary>
|
||||
static const char* XformCLStructString =
|
||||
"typedef struct __attribute__ " ALIGN_CL " _XformCL\n"
|
||||
"{\n"
|
||||
" real_t m_A, m_B, m_C, m_D, m_E, m_F;\n"
|
||||
" real_t m_VariationWeights[" MAX_CL_VARS_STRING "];\n"
|
||||
" real_t m_PostA, m_PostB, m_PostC, m_PostD, m_PostE, m_PostF;\n"
|
||||
" real_t m_DirectColor;\n"
|
||||
" real_t m_ColorSpeedCache;\n"
|
||||
" real_t m_OneMinusColorCache;\n"
|
||||
" real_t m_Opacity;\n"
|
||||
" real_t m_VizAdjusted;\n"
|
||||
"} XformCL;\n"
|
||||
"\n";
|
||||
|
||||
#define MAX_CL_XFORM 21//These must always match.
|
||||
#define MAX_CL_XFORM_STRING "21"
|
||||
|
||||
/// <summary>
|
||||
/// A structure on the host used to hold all of the needed information for an ember used on the device to iterate in OpenCL.
|
||||
/// Template argument expected to be float or double.
|
||||
/// </summary>
|
||||
template <typename T>
|
||||
struct ALIGN EmberCL
|
||||
{
|
||||
unsigned int m_FinalXformIndex;
|
||||
XformCL<T> m_Xforms[MAX_CL_XFORM];
|
||||
T m_CamZPos;
|
||||
T m_CamPerspective;
|
||||
T m_CamYaw;
|
||||
T m_CamPitch;
|
||||
T m_CamDepthBlur;
|
||||
T m_BlurCoef;
|
||||
m3T m_CamMat;
|
||||
T m_CenterX, m_CenterY;
|
||||
T m_RotA, m_RotB, m_RotD, m_RotE;
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// The ember structure used to iterate in OpenCL.
|
||||
/// </summary>
|
||||
static const char* EmberCLStructString =
|
||||
"typedef struct __attribute__ " ALIGN_CL " _EmberCL\n"
|
||||
"{\n"
|
||||
" uint m_FinalXformIndex;\n"
|
||||
" XformCL m_Xforms[" MAX_CL_XFORM_STRING "];\n"
|
||||
" real_t m_CamZPos;\n"
|
||||
" real_t m_CamPerspective;\n"
|
||||
" real_t m_CamYaw;\n"
|
||||
" real_t m_CamPitch;\n"
|
||||
" real_t m_CamDepthBlur;\n"
|
||||
" real_t m_BlurCoef;\n"
|
||||
" real_t m_C00;\n"
|
||||
" real_t m_C01;\n"
|
||||
" real_t m_C02;\n"
|
||||
" real_t m_C10;\n"
|
||||
" real_t m_C11;\n"
|
||||
" real_t m_C12;\n"
|
||||
" real_t m_C20;\n"
|
||||
" real_t m_C21;\n"
|
||||
" real_t m_C22;\n"
|
||||
" real_t m_CenterX, m_CenterY;\n"
|
||||
" real_t m_RotA, m_RotB, m_RotD, m_RotE;\n"
|
||||
"} EmberCL;\n"
|
||||
"\n";
|
||||
|
||||
/// <summary>
|
||||
/// A structure on the host used to hold all of the needed information for cartesian to raster mapping used on the device to iterate in OpenCL.
|
||||
/// Template argument expected to be float or double.
|
||||
/// </summary>
|
||||
template <typename T>
|
||||
struct ALIGN CarToRasCL
|
||||
{
|
||||
T m_PixPerImageUnitW, m_RasLlX;
|
||||
unsigned int m_RasWidth;
|
||||
T m_PixPerImageUnitH, m_RasLlY;
|
||||
T m_CarLlX, m_CarUrX, m_CarUrY, m_CarLlY;
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// The cartesian to raster structure used to iterate in OpenCL.
|
||||
/// </summary>
|
||||
static const char* CarToRasCLStructString =
|
||||
"typedef struct __attribute__ " ALIGN_CL " _CarToRasCL\n"
|
||||
"{\n"
|
||||
" real_t m_PixPerImageUnitW, m_RasLlX;\n"
|
||||
" uint m_RasWidth;\n"
|
||||
" real_t m_PixPerImageUnitH, m_RasLlY;\n"
|
||||
" real_t m_CarLlX, m_CarUrX, m_CarUrY, m_CarLlY;\n"
|
||||
"} CarToRasCL;\n"
|
||||
"\n";
|
||||
|
||||
/// <summary>
|
||||
/// A structure on the host used to hold all of the needed information for density filtering used on the device to iterate in OpenCL.
|
||||
/// Note that the actual filter buffer is held elsewhere.
|
||||
/// Template argument expected to be float or double.
|
||||
/// </summary>
|
||||
template <typename T>
|
||||
struct ALIGN DensityFilterCL
|
||||
{
|
||||
T m_Curve;
|
||||
T m_K1;
|
||||
T m_K2;
|
||||
unsigned int m_Supersample;
|
||||
unsigned int m_SuperRasW;
|
||||
unsigned int m_SuperRasH;
|
||||
unsigned int m_KernelSize;
|
||||
unsigned int m_MaxFilterIndex;
|
||||
unsigned int m_MaxFilteredCounts;
|
||||
unsigned int m_FilterWidth;
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// The density filtering structure used to iterate in OpenCL.
|
||||
/// Note that the actual filter buffer is held elsewhere.
|
||||
/// </summary>
|
||||
static const char* DensityFilterCLStructString =
|
||||
"typedef struct __attribute__ " ALIGN_CL " _DensityFilterCL\n"
|
||||
"{\n"
|
||||
" real_t m_Curve;\n"
|
||||
" real_t m_K1;\n"
|
||||
" real_t m_K2;\n"
|
||||
" uint m_Supersample;\n"
|
||||
" uint m_SuperRasW;\n"
|
||||
" uint m_SuperRasH;\n"
|
||||
" uint m_KernelSize;\n"
|
||||
" uint m_MaxFilterIndex;\n"
|
||||
" uint m_MaxFilteredCounts;\n"
|
||||
" uint m_FilterWidth;\n"
|
||||
"} DensityFilterCL;\n"
|
||||
"\n";
|
||||
|
||||
/// <summary>
|
||||
/// A structure on the host used to hold all of the needed information for spatial filtering used on the device to iterate in OpenCL.
|
||||
/// Note that the actual filter buffer is held elsewhere.
|
||||
/// </summary>
|
||||
template <typename T>
|
||||
struct ALIGN SpatialFilterCL
|
||||
{
|
||||
unsigned int m_SuperRasW;
|
||||
unsigned int m_SuperRasH;
|
||||
unsigned int m_FinalRasW;
|
||||
unsigned int m_FinalRasH;
|
||||
unsigned int m_Supersample;
|
||||
unsigned int m_FilterWidth;
|
||||
unsigned int m_NumChannels;
|
||||
unsigned int m_BytesPerChannel;
|
||||
unsigned int m_DensityFilterOffset;
|
||||
unsigned int m_Transparency;
|
||||
T m_Vibrancy;
|
||||
T m_HighlightPower;
|
||||
T m_Gamma;
|
||||
T m_LinRange;
|
||||
Color<T> m_Background;
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// The spatial filtering structure used to iterate in OpenCL.
|
||||
/// Note that the actual filter buffer is held elsewhere.
|
||||
/// </summary>
|
||||
static const char* SpatialFilterCLStructString =
|
||||
"typedef struct __attribute__ ((aligned (16))) _SpatialFilterCL\n"
|
||||
"{\n"
|
||||
" uint m_SuperRasW;\n"
|
||||
" uint m_SuperRasH;\n"
|
||||
" uint m_FinalRasW;\n"
|
||||
" uint m_FinalRasH;\n"
|
||||
" uint m_Supersample;\n"
|
||||
" uint m_FilterWidth;\n"
|
||||
" uint m_NumChannels;\n"
|
||||
" uint m_BytesPerChannel;\n"
|
||||
" uint m_DensityFilterOffset;\n"
|
||||
" uint m_Transparency;\n"
|
||||
" real_t m_Vibrancy;\n"
|
||||
" real_t m_HighlightPower;\n"
|
||||
" real_t m_Gamma;\n"
|
||||
" real_t m_LinRange;\n"
|
||||
" real_t m_Background[4];\n"//For some reason, using float4/double4 here does not align no matter what. So just use an array of 4.
|
||||
"} SpatialFilterCL;\n"
|
||||
"\n";
|
||||
|
||||
/// <summary>
|
||||
/// EmberCL makes extensive use of the build in vector types, however accessing
|
||||
/// their members as a buffer is not natively supported.
|
||||
/// Declaring them in a union with a buffer resolves this problem.
|
||||
/// </summary>
|
||||
static const char* UnionCLStructString =
|
||||
"typedef union\n"
|
||||
"{\n"
|
||||
" uchar3 m_Uchar3;\n"
|
||||
" uchar m_Uchars[3];\n"
|
||||
"} uchar3uchars;\n"
|
||||
"\n"
|
||||
"typedef union\n"
|
||||
"{\n"
|
||||
" uchar4 m_Uchar4;\n"
|
||||
" uchar m_Uchars[4];\n"
|
||||
"} uchar4uchars;\n"
|
||||
"\n"
|
||||
"typedef union\n"
|
||||
"{\n"
|
||||
" uint4 m_Uint4;\n"
|
||||
" uint m_Uints[4];\n"
|
||||
"} uint4uints;\n"
|
||||
"\n"
|
||||
"typedef union\n"//Use in places where float is required.
|
||||
"{\n"
|
||||
" float4 m_Float4;\n"
|
||||
" float m_Floats[4];\n"
|
||||
"} float4floats;\n"
|
||||
"\n"
|
||||
"typedef union\n"//Use in places where float or double can be used depending on the template type.
|
||||
"{\n"
|
||||
" real4 m_Real4;\n"
|
||||
" real_t m_Reals[4];\n"
|
||||
"} real4reals;\n"
|
||||
"\n";
|
||||
}
|
517
Source/EmberCL/FinalAccumOpenCLKernelCreator.cpp
Normal file
517
Source/EmberCL/FinalAccumOpenCLKernelCreator.cpp
Normal file
@ -0,0 +1,517 @@
|
||||
#include "EmberCLPch.h"
|
||||
#include "FinalAccumOpenCLKernelCreator.h"
|
||||
|
||||
namespace EmberCLns
|
||||
{
|
||||
/// <summary>
|
||||
/// Constructor that creates all kernel strings.
|
||||
/// The caller will access these strings through the accessor functions.
|
||||
/// </summary>
|
||||
template <typename T>
|
||||
FinalAccumOpenCLKernelCreator<T>::FinalAccumOpenCLKernelCreator()
|
||||
{
|
||||
m_GammaCorrectionWithAlphaCalcEntryPoint = "GammaCorrectionWithAlphaCalcKernel";
|
||||
m_GammaCorrectionWithoutAlphaCalcEntryPoint = "GammaCorrectionWithoutAlphaCalcKernel";
|
||||
|
||||
m_GammaCorrectionWithAlphaCalcKernel = CreateGammaCorrectionKernelString(true);
|
||||
m_GammaCorrectionWithoutAlphaCalcKernel = CreateGammaCorrectionKernelString(false);
|
||||
|
||||
m_FinalAccumEarlyClipEntryPoint = "FinalAccumEarlyClipKernel";
|
||||
m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint = "FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel";
|
||||
m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint = "FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel";
|
||||
|
||||
m_FinalAccumEarlyClipKernel = CreateFinalAccumKernelString(true, false, false);
|
||||
m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel = CreateFinalAccumKernelString(true, true, true);
|
||||
m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel = CreateFinalAccumKernelString(true, false, true);
|
||||
|
||||
m_FinalAccumLateClipEntryPoint = "FinalAccumLateClipKernel";
|
||||
m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint = "FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel";
|
||||
m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint = "FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel";
|
||||
|
||||
m_FinalAccumLateClipKernel = CreateFinalAccumKernelString(false, false, false);
|
||||
m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel = CreateFinalAccumKernelString(false, true, true);
|
||||
m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel = CreateFinalAccumKernelString(false, false, true);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Kernel source and entry point properties, getters only.
|
||||
/// </summary>
|
||||
|
||||
template <typename T> string FinalAccumOpenCLKernelCreator<T>::GammaCorrectionWithAlphaCalcKernel() { return m_GammaCorrectionWithAlphaCalcKernel; }
|
||||
template <typename T> string FinalAccumOpenCLKernelCreator<T>::GammaCorrectionWithAlphaCalcEntryPoint() { return m_GammaCorrectionWithAlphaCalcEntryPoint; }
|
||||
template <typename T> string FinalAccumOpenCLKernelCreator<T>::GammaCorrectionWithoutAlphaCalcKernel() { return m_GammaCorrectionWithoutAlphaCalcKernel; }
|
||||
template <typename T> string FinalAccumOpenCLKernelCreator<T>::GammaCorrectionWithoutAlphaCalcEntryPoint() { return m_GammaCorrectionWithoutAlphaCalcEntryPoint; }
|
||||
|
||||
template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumEarlyClipKernel() { return m_FinalAccumEarlyClipKernel; }
|
||||
template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumEarlyClipEntryPoint() { return m_FinalAccumEarlyClipEntryPoint; }
|
||||
template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel() { return m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel; }
|
||||
template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint() { return m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint; }
|
||||
template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel() { return m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel; }
|
||||
template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint() { return m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint; }
|
||||
|
||||
template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumLateClipKernel() { return m_FinalAccumLateClipKernel; }
|
||||
template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumLateClipEntryPoint() { return m_FinalAccumLateClipEntryPoint; }
|
||||
template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel() { return m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel; }
|
||||
template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint() { return m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint; }
|
||||
template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel() { return m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel; }
|
||||
template <typename T> string FinalAccumOpenCLKernelCreator<T>::FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint() { return m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint; }
|
||||
|
||||
/// <summary>
|
||||
/// Get the gamma correction entry point.
|
||||
/// </summary>
|
||||
/// <param name="channels">The number of channels used, 3 or 4.</param>
|
||||
/// <param name="transparency">True if channels equals 4 and using transparency, else false.</param>
|
||||
/// <returns>The name of the gamma correction entry point kernel function</returns>
|
||||
template <typename T>
|
||||
string FinalAccumOpenCLKernelCreator<T>::GammaCorrectionEntryPoint(unsigned int channels, bool transparency)
|
||||
{
|
||||
bool alphaCalc = (channels > 3 && transparency);
|
||||
return alphaCalc ? m_GammaCorrectionWithAlphaCalcEntryPoint : m_GammaCorrectionWithoutAlphaCalcEntryPoint;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Get the gamma correction kernel string.
|
||||
/// </summary>
|
||||
/// <param name="channels">The number of channels used, 3 or 4.</param>
|
||||
/// <param name="transparency">True if channels equals 4 and using transparency, else false.</param>
|
||||
/// <returns>The gamma correction kernel string</returns>
|
||||
template <typename T>
|
||||
string FinalAccumOpenCLKernelCreator<T>::GammaCorrectionKernel(unsigned int channels, bool transparency)
|
||||
{
|
||||
bool alphaCalc = (channels > 3 && transparency);
|
||||
return alphaCalc ? m_GammaCorrectionWithAlphaCalcKernel : m_GammaCorrectionWithoutAlphaCalcKernel;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Get the final accumulation entry point.
|
||||
/// </summary>
|
||||
/// <param name="earlyClip">True if early clip is desired, else false.</param>
|
||||
/// <param name="channels">The number of channels used, 3 or 4.</param>
|
||||
/// <param name="transparency">True if channels equals 4 and using transparency, else false.</param>
|
||||
/// <param name="alphaBase">Storage for the alpha base value used in the kernel. 0 if transparency is true, else 255.</param>
|
||||
/// <param name="alphaScale">Storage for the alpha scale value used in the kernel. 255 if transparency is true, else 0.</param>
|
||||
/// <returns>The name of the final accumulation entry point kernel function</returns>
|
||||
template <typename T>
|
||||
string FinalAccumOpenCLKernelCreator<T>::FinalAccumEntryPoint(bool earlyClip, unsigned int channels, bool transparency, T& alphaBase, T& alphaScale)
|
||||
{
|
||||
bool alphaCalc = (channels > 3 && transparency);
|
||||
bool alphaAccum = channels > 3;
|
||||
|
||||
if (alphaAccum)
|
||||
{
|
||||
alphaBase = transparency ? 0.0f : 255.0f;//See the table below.
|
||||
alphaScale = transparency ? 255.0f : 0.0f;
|
||||
}
|
||||
|
||||
if (earlyClip)
|
||||
{
|
||||
if (!alphaCalc && !alphaAccum)//Rgb output, the most common case.
|
||||
return FinalAccumEarlyClipEntryPoint();
|
||||
else if (alphaCalc && alphaAccum)//Rgba output and Transparency.
|
||||
return FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint();
|
||||
else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency.
|
||||
return FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint();
|
||||
else
|
||||
return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense.
|
||||
}
|
||||
else
|
||||
{
|
||||
if (!alphaCalc && !alphaAccum)//Rgb output, the most common case.
|
||||
return FinalAccumLateClipEntryPoint();
|
||||
else if (alphaCalc && alphaAccum)//Rgba output and Transparency.
|
||||
return FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint();
|
||||
else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency.
|
||||
return FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint();
|
||||
else
|
||||
return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense.
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Get the final accumulation kernel string.
|
||||
/// </summary>
|
||||
/// <param name="earlyClip">True if early clip is desired, else false.</param>
|
||||
/// <param name="channels">The number of channels used, 3 or 4.</param>
|
||||
/// <param name="transparency">True if channels equals 4 and using transparency, else false.</param>
|
||||
/// <returns>The final accumulation kernel string</returns>
|
||||
template <typename T>
|
||||
string FinalAccumOpenCLKernelCreator<T>::FinalAccumKernel(bool earlyClip, unsigned int channels, bool transparency)
|
||||
{
|
||||
bool alphaCalc = (channels > 3 && transparency);
|
||||
bool alphaAccum = channels > 3;
|
||||
|
||||
if (earlyClip)
|
||||
{
|
||||
if (!alphaCalc && !alphaAccum)//Rgb output, the most common case.
|
||||
return FinalAccumEarlyClipKernel();
|
||||
else if (alphaCalc && alphaAccum)//Rgba output and Transparency.
|
||||
return FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel();
|
||||
else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency.
|
||||
return FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel();
|
||||
else
|
||||
return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense.
|
||||
}
|
||||
else
|
||||
{
|
||||
if (!alphaCalc && !alphaAccum)//Rgb output, the most common case.
|
||||
return FinalAccumLateClipKernel();
|
||||
else if (alphaCalc && alphaAccum)//Rgba output and Transparency.
|
||||
return FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel();
|
||||
else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency.
|
||||
return FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel();
|
||||
else
|
||||
return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense.
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Wrapper around CreateFinalAccumKernelString().
|
||||
/// </summary>
|
||||
/// <param name="earlyClip">True if early clip is desired, else false.</param>
|
||||
/// <param name="channels">The number of channels used, 3 or 4.</param>
|
||||
/// <param name="transparency">True if channels equals 4 and using transparency, else false.</param>
|
||||
/// <returns>The final accumulation kernel string</returns>
|
||||
template <typename T>
|
||||
string FinalAccumOpenCLKernelCreator<T>::CreateFinalAccumKernelString(bool earlyClip, unsigned int channels, bool transparency)
|
||||
{
|
||||
return CreateFinalAccumKernelString(earlyClip, (channels > 3 && transparency), channels > 3);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create the final accumulation kernel string
|
||||
/// </summary>
|
||||
/// <param name="earlyClip">True if early clip is desired, else false.</param>
|
||||
/// <param name="alphaCalc">True if channels equals 4 and transparency is desired, else false.</param>
|
||||
/// <param name="alphaAccum">True if channels equals 4</param>
|
||||
/// <returns>The final accumulation kernel string</returns>
|
||||
template <typename T>
|
||||
string FinalAccumOpenCLKernelCreator<T>::CreateFinalAccumKernelString(bool earlyClip, bool alphaCalc, bool alphaAccum)
|
||||
{
|
||||
ostringstream os;
|
||||
string channels = alphaAccum ? "4" : "3";
|
||||
|
||||
os <<
|
||||
ConstantDefinesString(typeid(T) == typeid(double)) <<
|
||||
ClampRealFunctionString <<
|
||||
UnionCLStructString <<
|
||||
RgbToHsvFunctionString <<
|
||||
HsvToRgbFunctionString <<
|
||||
CalcAlphaFunctionString <<
|
||||
SpatialFilterCLStructString;
|
||||
|
||||
if (earlyClip)
|
||||
{
|
||||
if (!alphaCalc && !alphaAccum)//Rgb output, the most common case.
|
||||
os << "__kernel void " << m_FinalAccumEarlyClipEntryPoint << "(\n";
|
||||
else if (alphaCalc && alphaAccum)//Rgba output and Transparency.
|
||||
os << "__kernel void " << m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint << "(\n";
|
||||
else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency.
|
||||
os << "__kernel void " << m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint << "(\n";
|
||||
else
|
||||
return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense.
|
||||
}
|
||||
else
|
||||
{
|
||||
os <<
|
||||
CreateCalcNewRgbFunctionString(false) <<
|
||||
CreateGammaCorrectionFunctionString(false, alphaCalc, alphaAccum, true);
|
||||
|
||||
if (!alphaCalc && !alphaAccum)//Rgb output, the most common case.
|
||||
os << "__kernel void " << m_FinalAccumLateClipEntryPoint << "(\n";
|
||||
else if (alphaCalc && alphaAccum)//Rgba output and Transparency.
|
||||
os << "__kernel void " << m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint << "(\n";
|
||||
else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency.
|
||||
os << "__kernel void " << m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint << "(\n";
|
||||
else
|
||||
return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense.
|
||||
}
|
||||
|
||||
os <<
|
||||
" const __global real4reals* accumulator,\n"
|
||||
" __write_only image2d_t pixels,\n"
|
||||
" __constant SpatialFilterCL* spatialFilter,\n"
|
||||
" __constant real_t* filterCoefs,\n"
|
||||
" const real_t alphaBase,\n"
|
||||
" const real_t alphaScale\n"
|
||||
"\t)\n"
|
||||
"{\n"
|
||||
"\n"
|
||||
" if ((GLOBAL_ID_Y >= spatialFilter->m_FinalRasH) || (GLOBAL_ID_X >= spatialFilter->m_FinalRasW))\n"
|
||||
" return;\n"
|
||||
"\n"
|
||||
" unsigned int accumX = spatialFilter->m_DensityFilterOffset + (GLOBAL_ID_X * spatialFilter->m_Supersample);\n"
|
||||
" unsigned int accumY = spatialFilter->m_DensityFilterOffset + (GLOBAL_ID_Y * spatialFilter->m_Supersample);\n"
|
||||
|
||||
" int2 finalCoord;\n"
|
||||
" finalCoord.x = GLOBAL_ID_X;\n"
|
||||
" finalCoord.y = GLOBAL_ID_Y;\n"
|
||||
" float4floats finalColor;\n"
|
||||
" real_t alpha, ls;\n"
|
||||
" int ii, jj;\n"
|
||||
" unsigned int filterKRowIndex;\n"
|
||||
" const __global real4reals* accumBucket;\n"
|
||||
" real4reals newBucket;\n"
|
||||
" newBucket.m_Real4 = 0;\n"
|
||||
" real4reals newRgb;\n"
|
||||
" newRgb.m_Real4 = 0;\n"
|
||||
"\n"
|
||||
" for (jj = 0; jj < spatialFilter->m_FilterWidth; jj++)\n"
|
||||
" {\n"
|
||||
" filterKRowIndex = jj * spatialFilter->m_FilterWidth;\n"
|
||||
"\n"
|
||||
" for (ii = 0; ii < spatialFilter->m_FilterWidth; ii++)\n"
|
||||
" {\n"
|
||||
" real_t k = filterCoefs[ii + filterKRowIndex];\n"
|
||||
"\n"
|
||||
" accumBucket = accumulator + (accumX + ii) + ((accumY + jj) * spatialFilter->m_SuperRasW);\n"
|
||||
" newBucket.m_Real4 += (k * accumBucket->m_Real4);\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"\n";
|
||||
|
||||
//Not supporting 2 bytes per channel on the GPU. If the user wants it, run on the CPU.
|
||||
if (earlyClip)//If early clip, simply assign values directly to the temp uint4 since they've been gamma corrected already, then write it straight to the output image below.
|
||||
{
|
||||
os <<
|
||||
" finalColor.m_Float4.x = (float)newBucket.m_Real4.x;\n"//CPU side clamps, skip here because write_imagef() does the clamping for us.
|
||||
" finalColor.m_Float4.y = (float)newBucket.m_Real4.y;\n"
|
||||
" finalColor.m_Float4.z = (float)newBucket.m_Real4.z;\n";
|
||||
|
||||
if (alphaAccum)
|
||||
{
|
||||
if (alphaCalc)
|
||||
os << " finalColor.m_Float4.w = (float)newBucket.m_Real4.w * 255.0f;\n";
|
||||
else
|
||||
os << " finalColor.m_Float4.w = 255;\n";
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
//Late clip, so must gamma correct from the temp new bucket to temp float4.
|
||||
if (typeid(T) == typeid(double))
|
||||
{
|
||||
os <<
|
||||
" real4reals realFinal;\n"
|
||||
"\n"
|
||||
" GammaCorrectionFloats(&newBucket, &(spatialFilter->m_Background[0]), spatialFilter->m_Gamma, spatialFilter->m_LinRange, spatialFilter->m_Vibrancy, spatialFilter->m_HighlightPower, alphaBase, alphaScale, &(realFinal.m_Reals[0]));\n"
|
||||
" finalColor.m_Float4.x = (float)realFinal.m_Real4.x;\n"
|
||||
" finalColor.m_Float4.y = (float)realFinal.m_Real4.y;\n"
|
||||
" finalColor.m_Float4.z = (float)realFinal.m_Real4.z;\n"
|
||||
" finalColor.m_Float4.w = (float)realFinal.m_Real4.w;\n"
|
||||
;
|
||||
}
|
||||
else
|
||||
{
|
||||
os <<
|
||||
" GammaCorrectionFloats(&newBucket, &(spatialFilter->m_Background[0]), spatialFilter->m_Gamma, spatialFilter->m_LinRange, spatialFilter->m_Vibrancy, spatialFilter->m_HighlightPower, alphaBase, alphaScale, &(finalColor.m_Floats[0]));\n";
|
||||
}
|
||||
}
|
||||
|
||||
os <<
|
||||
" finalColor.m_Float4 /= 255.0f;\n"
|
||||
" write_imagef(pixels, finalCoord, finalColor.m_Float4);\n"//Use write_imagef instead of write_imageui because only the former works when sharing with an OpenGL texture.
|
||||
" barrier(CLK_GLOBAL_MEM_FENCE);\n"//Required, or else page tearing will occur during interactive rendering.
|
||||
"}\n"
|
||||
;
|
||||
|
||||
return os.str();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates the gamma correction function string.
|
||||
/// This is not a full kernel, just a function that is used in the kernels.
|
||||
/// </summary>
|
||||
/// <param name="globalBucket">True if writing to a global buffer (early clip), else false (late clip).</param>
|
||||
/// <param name="alphaCalc">True if channels equals 4 and transparency is desired, else false.</param>
|
||||
/// <param name="alphaAccum">True if channels equals 4</param>
|
||||
/// <param name="finalOut">True if writing to global buffer (late clip), else false (early clip).</param>
|
||||
/// <returns>The gamma correction function string</returns>
|
||||
template <typename T>
|
||||
string FinalAccumOpenCLKernelCreator<T>::CreateGammaCorrectionFunctionString(bool globalBucket, bool alphaCalc, bool alphaAccum, bool finalOut)
|
||||
{
|
||||
ostringstream os;
|
||||
string dataType;
|
||||
string unionMember;
|
||||
dataType = "real_t";
|
||||
|
||||
//Use real_t for all cases, early clip and final accum.
|
||||
os << "void GammaCorrectionFloats(" << (globalBucket ? "__global " : "") << "real4reals* bucket, __constant real_t* background, real_t g, real_t linRange, real_t vibrancy, real_t highlightPower, real_t alphaBase, real_t alphaScale, " << (finalOut ? "" : "__global") << " real_t* correctedChannels)\n";
|
||||
|
||||
os
|
||||
<< "{\n"
|
||||
<< " real_t alpha, ls, tmp, a;\n"
|
||||
<< " real4reals newRgb;\n"
|
||||
<< "\n"
|
||||
<< " if (bucket->m_Reals[3] <= 0)\n"
|
||||
<< " {\n"
|
||||
<< " alpha = 0;\n"
|
||||
<< " ls = 0;\n"
|
||||
<< " }\n"
|
||||
<< " else\n"
|
||||
<< " {\n"
|
||||
<< " tmp = bucket->m_Reals[3];\n"
|
||||
<< " alpha = CalcAlpha(tmp, g, linRange);\n"
|
||||
<< " ls = vibrancy * 256.0 * alpha / tmp;\n"
|
||||
<< " ClampRef(&alpha, 0.0, 1.0);\n"
|
||||
<< " }\n"
|
||||
<< "\n"
|
||||
<< " CalcNewRgb(bucket, ls, highlightPower, &newRgb);\n"
|
||||
<< "\n"
|
||||
<< " for (unsigned int rgbi = 0; rgbi < 3; rgbi++)\n"
|
||||
<< " {\n"
|
||||
<< " a = newRgb.m_Reals[rgbi] + ((1.0 - vibrancy) * 256.0 * pow(bucket->m_Reals[rgbi], g));\n"
|
||||
<< "\n";
|
||||
|
||||
if (!alphaCalc)
|
||||
{
|
||||
os <<
|
||||
" a += ((1.0 - alpha) * background[rgbi]);\n";
|
||||
}
|
||||
else
|
||||
{
|
||||
os
|
||||
<< " if (alpha > 0)\n"
|
||||
<< " a /= alpha;\n"
|
||||
<< " else\n"
|
||||
<< " a = 0;\n";
|
||||
}
|
||||
|
||||
os <<
|
||||
"\n"
|
||||
" correctedChannels[rgbi] = (" << dataType << ")clamp(a, 0.0, 255.0);\n"
|
||||
" }\n"
|
||||
"\n";
|
||||
|
||||
//The CPU code has 3 cases for assigning alpha:
|
||||
//[3] = alpha.//Early clip.
|
||||
//[3] = alpha * 255.//Final Rgba with transparency.
|
||||
//[3] = 255.//Final Rgba without transparency.
|
||||
//Putting conditionals in GPU code is to be avoided. So do base + alpha * scale which will
|
||||
//work for all 3 cases without using a conditional, which should be faster on a GPU. This gives:
|
||||
//Base = 0, scale = 1. [3] = (0 + (alpha * 1)). [3] = alpha.
|
||||
//Base = 0, scale = 255. [3] = (0 + (alpha * 255)). [3] = alpha * 255.
|
||||
//Base = 255, scale = 0. [3] = (255 + (alpha * 0)). [3] = 255.
|
||||
if (alphaAccum)
|
||||
{
|
||||
os
|
||||
<< " correctedChannels[3] = (" << dataType << ")(alphaBase + (alpha * alphaScale));\n";
|
||||
}
|
||||
|
||||
os <<
|
||||
"}\n"
|
||||
"\n";
|
||||
|
||||
return os.str();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// OpenCL equivalent of Palette::CalcNewRgb().
|
||||
/// </summary>
|
||||
/// <param name="globalBucket">True if writing the corrected value to a global buffer (early clip), else false (late clip).</param>
|
||||
/// <returns>The CalcNewRgb function string</returns>
|
||||
template <typename T>
|
||||
string FinalAccumOpenCLKernelCreator<T>::CreateCalcNewRgbFunctionString(bool globalBucket)
|
||||
{
|
||||
ostringstream os;
|
||||
|
||||
os <<
|
||||
"static void CalcNewRgb(" << (globalBucket ? "__global " : "") << "real4reals* oldRgb, real_t ls, real_t highPow, real4reals* newRgb)\n"
|
||||
"{\n"
|
||||
" int rgbi;\n"
|
||||
" real_t newls, lsratio;\n"
|
||||
" real4reals newHsv;\n"
|
||||
" real_t maxa, maxc;\n"
|
||||
" real_t adjhlp;\n"
|
||||
"\n"
|
||||
" if (ls == 0 || (oldRgb->m_Real4.x == 0 && oldRgb->m_Real4.y == 0 && oldRgb->m_Real4.z == 0))\n"//Can't do a vector compare to zero.
|
||||
" {\n"
|
||||
" newRgb->m_Real4 = 0;\n"
|
||||
" return;\n"
|
||||
" }\n"
|
||||
"\n"
|
||||
//Identify the most saturated channel.
|
||||
" maxc = max(max(oldRgb->m_Reals[0], oldRgb->m_Reals[1]), oldRgb->m_Reals[2]);\n"
|
||||
" maxa = ls * maxc;\n"
|
||||
"\n"
|
||||
//If a channel is saturated and highlight power is non-negative
|
||||
//modify the color to prevent hue shift.
|
||||
" if (maxa > 255 && highPow >= 0)\n"
|
||||
" {\n"
|
||||
" newls = 255.0 / maxc;\n"
|
||||
" lsratio = pow(newls / ls, highPow);\n"
|
||||
"\n"
|
||||
//Calculate the max-value color (ranged 0 - 1).
|
||||
" for (rgbi = 0; rgbi < 3; rgbi++)\n"
|
||||
" newRgb->m_Reals[rgbi] = newls * oldRgb->m_Reals[rgbi] / 255.0;\n"
|
||||
"\n"
|
||||
//Reduce saturation by the lsratio.
|
||||
" RgbToHsv(&(newRgb->m_Real4), &(newHsv.m_Real4));\n"
|
||||
" newHsv.m_Real4.y *= lsratio;\n"
|
||||
" HsvToRgb(&(newHsv.m_Real4), &(newRgb->m_Real4));\n"
|
||||
"\n"
|
||||
" for (rgbi = 0; rgbi < 3; rgbi++)\n"//Unrolling and vectorizing makes no difference.
|
||||
" newRgb->m_Reals[rgbi] *= 255.0;\n"
|
||||
" }\n"
|
||||
" else\n"
|
||||
" {\n"
|
||||
" newls = 255.0 / maxc;\n"
|
||||
" adjhlp = -highPow;\n"
|
||||
"\n"
|
||||
" if (adjhlp > 1)\n"
|
||||
" adjhlp = 1;\n"
|
||||
"\n"
|
||||
" if (maxa <= 255)\n"
|
||||
" adjhlp = 1;\n"
|
||||
"\n"
|
||||
//Calculate the max-value color (ranged 0 - 1) interpolated with the old behavior.
|
||||
" for (rgbi = 0; rgbi < 3; rgbi++)\n"//Unrolling, caching and vectorizing makes no difference.
|
||||
" newRgb->m_Reals[rgbi] = ((1.0 - adjhlp) * newls + adjhlp * ls) * oldRgb->m_Reals[rgbi];\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n";
|
||||
|
||||
return os.str();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create the gamma correction kernel string used for early clipping.
|
||||
/// </summary>
|
||||
/// <param name="alphaCalc">True if channels equals 4 and transparency is desired, else false.</param>
|
||||
/// <returns>The gamma correction kernel string used for early clipping</returns>
|
||||
template <typename T>
|
||||
string FinalAccumOpenCLKernelCreator<T>::CreateGammaCorrectionKernelString(bool alphaCalc)
|
||||
{
|
||||
ostringstream os;
|
||||
string dataType;
|
||||
|
||||
os <<
|
||||
ConstantDefinesString(typeid(T) == typeid(double)) <<
|
||||
ClampRealFunctionString <<
|
||||
UnionCLStructString <<
|
||||
RgbToHsvFunctionString <<
|
||||
HsvToRgbFunctionString <<
|
||||
CalcAlphaFunctionString <<
|
||||
CreateCalcNewRgbFunctionString(true) <<
|
||||
SpatialFilterCLStructString <<
|
||||
CreateGammaCorrectionFunctionString(true, alphaCalc, true, false);//Will only be used with float in this case, early clip. Will always alpha accum.
|
||||
|
||||
os << "__kernel void " << (alphaCalc ? m_GammaCorrectionWithAlphaCalcEntryPoint : m_GammaCorrectionWithoutAlphaCalcEntryPoint) << "(\n" <<
|
||||
" __global real4reals* accumulator,\n"
|
||||
" __constant SpatialFilterCL* spatialFilter\n"
|
||||
")\n"
|
||||
"{\n"
|
||||
" int testGutter = 0;\n"
|
||||
"\n"
|
||||
" if (GLOBAL_ID_Y >= (spatialFilter->m_SuperRasH - testGutter) || GLOBAL_ID_X >= (spatialFilter->m_SuperRasW - testGutter))\n"
|
||||
" return;\n"
|
||||
"\n"
|
||||
" unsigned int superIndex = (GLOBAL_ID_Y * spatialFilter->m_SuperRasW) + GLOBAL_ID_X;\n"
|
||||
" __global real4reals* bucket = accumulator + superIndex;\n"
|
||||
//Pass in an alphaBase and alphaScale of 0, 1 which means to just directly assign the computed alpha value.
|
||||
" GammaCorrectionFloats(bucket, &(spatialFilter->m_Background[0]), spatialFilter->m_Gamma, spatialFilter->m_LinRange, spatialFilter->m_Vibrancy, spatialFilter->m_HighlightPower, 0.0, 1.0, &(bucket->m_Reals[0]));\n"
|
||||
"}\n"
|
||||
;
|
||||
|
||||
return os.str();
|
||||
}
|
||||
}
|
87
Source/EmberCL/FinalAccumOpenCLKernelCreator.h
Normal file
87
Source/EmberCL/FinalAccumOpenCLKernelCreator.h
Normal file
@ -0,0 +1,87 @@
|
||||
#pragma once
|
||||
|
||||
#include "EmberCLPch.h"
|
||||
#include "EmberCLStructs.h"
|
||||
#include "EmberCLFunctions.h"
|
||||
|
||||
/// <summary>
|
||||
/// FinalAccumOpenCLKernelCreator class.
|
||||
/// </summary>
|
||||
|
||||
namespace EmberCLns
|
||||
{
|
||||
/// <summary>
|
||||
/// Class for creating the final accumulation code in OpenCL.
|
||||
/// There are many conditionals in the CPU code to create the
|
||||
/// final output image. This class creates many different kernels
|
||||
/// with all conditionals and unnecessary calculations stripped out.
|
||||
/// The conditionals are:
|
||||
/// Early clip/late clip
|
||||
/// Alpha channel, no alpha channel
|
||||
/// Alpha with/without transparency
|
||||
/// Template argument expected to be float or double.
|
||||
/// </summary>
|
||||
template <typename T>
|
||||
class EMBERCL_API FinalAccumOpenCLKernelCreator
|
||||
{
|
||||
public:
|
||||
FinalAccumOpenCLKernelCreator();
|
||||
|
||||
string GammaCorrectionWithAlphaCalcKernel();
|
||||
string GammaCorrectionWithAlphaCalcEntryPoint();
|
||||
|
||||
string GammaCorrectionWithoutAlphaCalcKernel();
|
||||
string GammaCorrectionWithoutAlphaCalcEntryPoint();
|
||||
|
||||
string FinalAccumEarlyClipKernel();
|
||||
string FinalAccumEarlyClipEntryPoint();
|
||||
string FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel();
|
||||
string FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint();
|
||||
string FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel();
|
||||
string FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint();
|
||||
|
||||
string FinalAccumLateClipKernel();
|
||||
string FinalAccumLateClipEntryPoint();
|
||||
string FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel();
|
||||
string FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint();
|
||||
string FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel();
|
||||
string FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint();
|
||||
string GammaCorrectionEntryPoint(unsigned int channels, bool transparency);
|
||||
string GammaCorrectionKernel(unsigned int channels, bool transparency);
|
||||
string FinalAccumEntryPoint(bool earlyClip, unsigned int channels, bool transparency, T& alphaBase, T& alphaScale);
|
||||
string FinalAccumKernel(bool earlyClip, unsigned int channels, bool transparency);
|
||||
|
||||
private:
|
||||
string CreateFinalAccumKernelString(bool earlyClip, unsigned int channels, bool transparency);
|
||||
string CreateGammaCorrectionKernelString(bool alphaCalc);
|
||||
|
||||
string CreateFinalAccumKernelString(bool earlyClip, bool alphaCalc, bool alphaAccum);
|
||||
string CreateGammaCorrectionFunctionString(bool globalBucket, bool alphaCalc, bool alphaAccum, bool finalOut);
|
||||
string CreateCalcNewRgbFunctionString(bool globalBucket);
|
||||
string m_GammaCorrectionWithAlphaCalcKernel;
|
||||
string m_GammaCorrectionWithAlphaCalcEntryPoint;
|
||||
|
||||
string m_GammaCorrectionWithoutAlphaCalcKernel;
|
||||
string m_GammaCorrectionWithoutAlphaCalcEntryPoint;
|
||||
|
||||
string m_FinalAccumEarlyClipKernel;//False, false.
|
||||
string m_FinalAccumEarlyClipEntryPoint;
|
||||
string m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel;//True, true.
|
||||
string m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint;
|
||||
string m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel;//False, true.
|
||||
string m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint;
|
||||
|
||||
string m_FinalAccumLateClipKernel;//False, false.
|
||||
string m_FinalAccumLateClipEntryPoint;
|
||||
string m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel;//True, true.
|
||||
string m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint;
|
||||
string m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel;//False, true.
|
||||
string m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint;
|
||||
};
|
||||
|
||||
template EMBERCL_API class FinalAccumOpenCLKernelCreator<float>;
|
||||
|
||||
#ifdef DO_DOUBLE
|
||||
template EMBERCL_API class FinalAccumOpenCLKernelCreator<double>;
|
||||
#endif
|
||||
}
|
785
Source/EmberCL/IterOpenCLKernelCreator.cpp
Normal file
785
Source/EmberCL/IterOpenCLKernelCreator.cpp
Normal file
@ -0,0 +1,785 @@
|
||||
#include "EmberCLPch.h"
|
||||
#include "IterOpenCLKernelCreator.h"
|
||||
|
||||
namespace EmberCLns
|
||||
{
|
||||
/// <summary>
|
||||
/// Empty constructor that does nothing. The user must call the one which takes a bool
|
||||
/// argument before using this class.
|
||||
/// This constructor only exists so the class can be a member of a class.
|
||||
/// </summary>
|
||||
template <typename T>
|
||||
IterOpenCLKernelCreator<T>::IterOpenCLKernelCreator()
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Constructor that sets up some basic entry point strings and creates
|
||||
/// the zeroization kernel string since it requires no conditional inputs.
|
||||
/// </summary>
|
||||
template <typename T>
|
||||
IterOpenCLKernelCreator<T>::IterOpenCLKernelCreator(bool nVidia)
|
||||
{
|
||||
m_NVidia = nVidia;
|
||||
m_IterEntryPoint = "IterateKernel";
|
||||
m_ZeroizeEntryPoint = "ZeroizeKernel";
|
||||
m_ZeroizeKernel = CreateZeroizeKernelString();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Accessors.
|
||||
/// </summary>
|
||||
|
||||
template <typename T> string IterOpenCLKernelCreator<T>::ZeroizeKernel() { return m_ZeroizeKernel; }
|
||||
template <typename T> string IterOpenCLKernelCreator<T>::ZeroizeEntryPoint() { return m_ZeroizeEntryPoint; }
|
||||
template <typename T> string IterOpenCLKernelCreator<T>::IterEntryPoint() { return m_IterEntryPoint; }
|
||||
|
||||
/// <summary>
|
||||
/// Create the iteration kernel string using the Cuburn method.
|
||||
/// Template argument expected to be float or double.
|
||||
/// </summary>
|
||||
/// <param name="ember">The ember to create the kernel string for</param>
|
||||
/// <param name="params">The parametric variation #define string</param>
|
||||
/// <param name="doAccum">Debugging parameter to include or omit accumulating to the histogram. Default: true.</param>
|
||||
/// <returns>The kernel string</returns>
|
||||
template <typename T>
|
||||
string IterOpenCLKernelCreator<T>::CreateIterKernelString(Ember<T>& ember, string& parVarDefines, bool lockAccum, bool doAccum)
|
||||
{
|
||||
bool doublePrecision = typeid(T) == typeid(double);
|
||||
unsigned int i, v, varIndex, varCount, totalXformCount = ember.TotalXformCount();
|
||||
ostringstream kernelIterBody, xformFuncs, os;
|
||||
vector<Variation<T>*> variations;
|
||||
|
||||
xformFuncs << "\n" << parVarDefines << endl;
|
||||
ember.GetPresentVariations(variations);
|
||||
std::for_each(variations.begin(), variations.end(), [&](Variation<T>* var) { if (var) xformFuncs << var->OpenCLFuncsString(); });
|
||||
|
||||
for (i = 0; i < totalXformCount; i++)
|
||||
{
|
||||
Xform<T>* xform = ember.GetTotalXform(i);
|
||||
unsigned int totalVarCount = xform->TotalVariationCount();
|
||||
bool needPrecalcSumSquares = false;
|
||||
bool needPrecalcSqrtSumSquares = false;
|
||||
bool needPrecalcAngles = false;
|
||||
bool needPrecalcAtanXY = false;
|
||||
bool needPrecalcAtanYX = false;
|
||||
|
||||
v = varIndex = varCount = 0;
|
||||
xformFuncs <<
|
||||
"void Xform" << i << "(__constant XformCL* xform, __constant real_t* parVars, Point* inPoint, Point* outPoint, uint2* mwc)\n" <<
|
||||
"{\n"
|
||||
" real_t transX, transY, transZ;\n"
|
||||
" real4 vIn, vOut = 0.0;\n";
|
||||
|
||||
//Determine if any variations, regular, pre, or post need precalcs.
|
||||
while (Variation<T>* var = xform->GetVariation(v++))
|
||||
{
|
||||
needPrecalcSumSquares |= var->NeedPrecalcSumSquares();
|
||||
needPrecalcSqrtSumSquares |= var->NeedPrecalcSqrtSumSquares();
|
||||
needPrecalcAngles |= var->NeedPrecalcAngles();
|
||||
needPrecalcAtanXY |= var->NeedPrecalcAtanXY();
|
||||
needPrecalcAtanYX |= var->NeedPrecalcAtanYX();
|
||||
}
|
||||
|
||||
if (needPrecalcSumSquares)
|
||||
xformFuncs << "\treal_t precalcSumSquares;\n";
|
||||
|
||||
if (needPrecalcSqrtSumSquares)
|
||||
xformFuncs << "\treal_t precalcSqrtSumSquares;\n";
|
||||
|
||||
if (needPrecalcAngles)
|
||||
{
|
||||
xformFuncs << "\treal_t precalcSina;\n";
|
||||
xformFuncs << "\treal_t precalcCosa;\n";
|
||||
}
|
||||
|
||||
if (needPrecalcAtanXY)
|
||||
xformFuncs << "\treal_t precalcAtanxy;\n";
|
||||
|
||||
if (needPrecalcAtanYX)
|
||||
xformFuncs << "\treal_t precalcAtanyx;\n";
|
||||
|
||||
xformFuncs << "\treal_t tempColor = outPoint->m_ColorX = xform->m_ColorSpeedCache + (xform->m_OneMinusColorCache * inPoint->m_ColorX);\n";
|
||||
|
||||
if (xform->PreVariationCount() + xform->VariationCount() == 0)
|
||||
{
|
||||
xformFuncs <<
|
||||
" outPoint->m_X = (xform->m_A * inPoint->m_X) + (xform->m_B * inPoint->m_Y) + xform->m_C;\n" <<
|
||||
" outPoint->m_Y = (xform->m_D * inPoint->m_X) + (xform->m_E * inPoint->m_Y) + xform->m_F;\n" <<
|
||||
" outPoint->m_Z = inPoint->m_Z;\n";
|
||||
}
|
||||
else
|
||||
{
|
||||
xformFuncs <<
|
||||
" transX = (xform->m_A * inPoint->m_X) + (xform->m_B * inPoint->m_Y) + xform->m_C;\n" <<
|
||||
" transY = (xform->m_D * inPoint->m_X) + (xform->m_E * inPoint->m_Y) + xform->m_F;\n" <<
|
||||
" transZ = inPoint->m_Z;\n";
|
||||
|
||||
varCount = xform->PreVariationCount();
|
||||
|
||||
if (varCount > 0)
|
||||
{
|
||||
xformFuncs << "\n\t//Apply each of the " << varCount << " pre variations in this xform.\n";
|
||||
|
||||
//Output the code for each pre variation in this xform.
|
||||
for (varIndex = 0; varIndex < varCount; varIndex++)
|
||||
{
|
||||
if (Variation<T>* var = xform->GetVariation(varIndex))
|
||||
{
|
||||
xformFuncs << "\n\t//" << var->Name() << ".\n";
|
||||
xformFuncs << var->PrecalcOpenCLString();
|
||||
xformFuncs << xform->ReadOpenCLString(VARTYPE_PRE) << endl;
|
||||
xformFuncs << var->OpenCLString() << endl;
|
||||
xformFuncs << xform->WriteOpenCLString(VARTYPE_PRE, var->AssignType()) << endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (xform->VariationCount() > 0)
|
||||
{
|
||||
if (xform->NeedPrecalcSumSquares())
|
||||
xformFuncs << "\tprecalcSumSquares = SQR(transX) + SQR(transY);\n";
|
||||
|
||||
if (xform->NeedPrecalcSqrtSumSquares())
|
||||
xformFuncs << "\tprecalcSqrtSumSquares = sqrt(precalcSumSquares);\n";
|
||||
|
||||
if (xform->NeedPrecalcAngles())
|
||||
{
|
||||
xformFuncs << "\tprecalcSina = transX / precalcSqrtSumSquares;\n";
|
||||
xformFuncs << "\tprecalcCosa = transY / precalcSqrtSumSquares;\n";
|
||||
}
|
||||
|
||||
if (xform->NeedPrecalcAtanXY())
|
||||
xformFuncs << "\tprecalcAtanxy = atan2(transX, transY);\n";
|
||||
|
||||
if (xform->NeedPrecalcAtanYX())
|
||||
xformFuncs << "\tprecalcAtanyx = atan2(transY, transX);\n";
|
||||
|
||||
xformFuncs << "\n\toutPoint->m_X = 0;";
|
||||
xformFuncs << "\n\toutPoint->m_Y = 0;";
|
||||
xformFuncs << "\n\toutPoint->m_Z = 0;\n";
|
||||
xformFuncs << "\n\t//Apply each of the " << xform->VariationCount() << " regular variations in this xform.\n\n";
|
||||
xformFuncs << xform->ReadOpenCLString(VARTYPE_REG);
|
||||
|
||||
varCount += xform->VariationCount();
|
||||
|
||||
//Output the code for each regular variation in this xform.
|
||||
for (; varIndex < varCount; varIndex++)
|
||||
{
|
||||
if (Variation<T>* var = xform->GetVariation(varIndex))
|
||||
{
|
||||
xformFuncs << "\n\t//" << var->Name() << ".\n"
|
||||
<< var->OpenCLString() << (varIndex == varCount - 1 ? "\n" : "\n\n")
|
||||
<< xform->WriteOpenCLString(VARTYPE_REG, ASSIGNTYPE_SUM);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
xformFuncs <<
|
||||
" outPoint->m_X = transX;\n"
|
||||
" outPoint->m_Y = transY;\n"
|
||||
" outPoint->m_Z = transZ;\n";
|
||||
}
|
||||
}
|
||||
|
||||
if (xform->PostVariationCount() > 0)
|
||||
{
|
||||
varCount += xform->PostVariationCount();
|
||||
xformFuncs << "\n\t//Apply each of the " << xform->PostVariationCount() << " post variations in this xform.\n";
|
||||
|
||||
//Output the code for each post variation in this xform.
|
||||
for (; varIndex < varCount; varIndex++)
|
||||
{
|
||||
if (Variation<T>* var = xform->GetVariation(varIndex))
|
||||
{
|
||||
xformFuncs << "\n\t//" << var->Name() << ".\n";
|
||||
xformFuncs << var->PrecalcOpenCLString();
|
||||
xformFuncs << xform->ReadOpenCLString(VARTYPE_POST) << endl;
|
||||
xformFuncs << var->OpenCLString() << endl;
|
||||
xformFuncs << xform->WriteOpenCLString(VARTYPE_POST, var->AssignType()) << (varIndex == varCount - 1 ? "\n" : "\n\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (xform->HasPost())
|
||||
{
|
||||
xformFuncs <<
|
||||
"\n\t//Apply post affine transform.\n"
|
||||
"\treal_t tempX = outPoint->m_X;\n"
|
||||
"\n"
|
||||
"\toutPoint->m_X = (xform->m_PostA * tempX) + (xform->m_PostB * outPoint->m_Y) + xform->m_PostC;\n" <<
|
||||
"\toutPoint->m_Y = (xform->m_PostD * tempX) + (xform->m_PostE * outPoint->m_Y) + xform->m_PostF;\n";
|
||||
}
|
||||
|
||||
xformFuncs << "\toutPoint->m_ColorX = outPoint->m_ColorX + xform->m_DirectColor * (tempColor - outPoint->m_ColorX);\n";
|
||||
xformFuncs << "}\n"
|
||||
<< "\n";
|
||||
}
|
||||
|
||||
os <<
|
||||
ConstantDefinesString(doublePrecision) <<
|
||||
InlineMathFunctionsString <<
|
||||
ClampRealFunctionString <<
|
||||
RandFunctionString <<
|
||||
PointCLStructString <<
|
||||
XformCLStructString <<
|
||||
EmberCLStructString <<
|
||||
UnionCLStructString <<
|
||||
CarToRasCLStructString <<
|
||||
CarToRasFunctionString <<
|
||||
AtomicString(doublePrecision, m_NVidia) <<
|
||||
xformFuncs.str() <<
|
||||
"__kernel void " << m_IterEntryPoint << "(\n" <<
|
||||
" uint iterCount,\n"
|
||||
" uint fuseCount,\n"
|
||||
" uint seed,\n"
|
||||
" __constant EmberCL* ember,\n"
|
||||
" __constant real_t* parVars,\n"
|
||||
" __global uchar* xformDistributions,\n"//Using uchar is quicker than uint. Can't be constant because the size can be too large to fit when using xaos.//FINALOPT
|
||||
" __constant CarToRasCL* carToRas,\n"
|
||||
" __global real4reals* histogram,\n"
|
||||
" uint histSize,\n"
|
||||
" __read_only image2d_t palette,\n"
|
||||
" __global Point* points\n"
|
||||
"\t)\n"
|
||||
"{\n"
|
||||
" bool fuse, ok;\n"
|
||||
" uint threadIndex = INDEX_IN_BLOCK_2D;\n"
|
||||
" uint i, itersToDo;\n"
|
||||
" uint consec = 0;\n"
|
||||
//" int badvals = 0;\n"
|
||||
" uint histIndex;\n"
|
||||
" real_t p00, p01;\n"
|
||||
" Point firstPoint, secondPoint, tempPoint;\n"
|
||||
" uint2 mwc;\n"
|
||||
" float4 palColor1;\n"
|
||||
" int2 iPaletteCoord;\n"
|
||||
" const sampler_t paletteSampler = CLK_NORMALIZED_COORDS_FALSE |\n"//Coords from 0 to 255.
|
||||
" CLK_ADDRESS_CLAMP_TO_EDGE |\n"//Clamp to edge
|
||||
" CLK_FILTER_NEAREST;\n"//Don't interpolate
|
||||
" uint threadXY = (THREAD_ID_X + THREAD_ID_Y);\n"
|
||||
" uint threadXDivRows = (THREAD_ID_X / (NTHREADS / THREADS_PER_WARP));\n"
|
||||
" uint threadsMinus1 = NTHREADS - 1;\n"
|
||||
;
|
||||
|
||||
os <<
|
||||
"\n"
|
||||
" __local Point swap[NTHREADS];\n"
|
||||
" __local uint xfsel[NWARPS];\n"
|
||||
"\n"
|
||||
" unsigned int pointsIndex = INDEX_IN_GRID_2D;\n"
|
||||
" mwc.x = (pointsIndex + 1 * seed) & 0x7FFFFFFF;\n"
|
||||
" mwc.y = ((BLOCK_ID_X + 1) + (pointsIndex + 1) * seed) & 0x7FFFFFFF;\n"
|
||||
" iPaletteCoord.y = 0;\n"
|
||||
"\n"
|
||||
" if (fuseCount > 0)\n"
|
||||
" {\n"
|
||||
" fuse = true;\n"
|
||||
" itersToDo = fuseCount;\n"
|
||||
" firstPoint.m_X = MwcNextNeg1Pos1(&mwc);\n"
|
||||
" firstPoint.m_Y = MwcNextNeg1Pos1(&mwc);\n"
|
||||
" firstPoint.m_Z = 0.0;\n"
|
||||
" firstPoint.m_ColorX = MwcNext01(&mwc);\n"
|
||||
" firstPoint.m_LastXfUsed = 0;\n"
|
||||
" }\n"
|
||||
" else\n"
|
||||
" {\n"
|
||||
" fuse = false;\n"
|
||||
" itersToDo = iterCount;\n"
|
||||
" firstPoint = points[pointsIndex];\n"
|
||||
" }\n"
|
||||
"\n";
|
||||
|
||||
//This is done once initially here and then again after each swap-sync in the main loop.
|
||||
//This along with the randomness that the point shuffle provides gives sufficient randomness
|
||||
//to produce results identical to those produced on the CPU.
|
||||
os <<
|
||||
" if (THREAD_ID_Y == 0 && THREAD_ID_X < NWARPS)\n"
|
||||
" xfsel[THREAD_ID_X] = MwcNext(&mwc) % " << CHOOSE_XFORM_GRAIN << ";\n"//It's faster to do the % here ahead of time than every time an xform is looked up to use inside the loop.
|
||||
"\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
"\n"
|
||||
" for (i = 0; i < itersToDo; i++)\n"
|
||||
" {\n";
|
||||
|
||||
os <<
|
||||
" consec = 0;\n"
|
||||
"\n"
|
||||
" do\n"
|
||||
" {\n";
|
||||
if (ember.XaosPresent())
|
||||
{
|
||||
os <<
|
||||
" secondPoint.m_LastXfUsed = xformDistributions[xfsel[THREAD_ID_Y] + (" << CHOOSE_XFORM_GRAIN << " * (firstPoint.m_LastXfUsed + 1u))];\n\n";
|
||||
}
|
||||
else
|
||||
{
|
||||
os <<
|
||||
" secondPoint.m_LastXfUsed = xformDistributions[xfsel[THREAD_ID_Y]];\n\n";
|
||||
}
|
||||
|
||||
for (i = 0; i < ember.XformCount(); i++)
|
||||
{
|
||||
if (i == 0)
|
||||
os <<
|
||||
" if (secondPoint.m_LastXfUsed == " << i << ")\n";
|
||||
else
|
||||
os <<
|
||||
" else if (secondPoint.m_LastXfUsed == " << i << ")\n";
|
||||
|
||||
os <<
|
||||
" {\n" <<
|
||||
" Xform" << i << "(&(ember->m_Xforms[" << i << "]), parVars, &firstPoint, &secondPoint, &mwc);\n" <<
|
||||
" }\n";
|
||||
}
|
||||
os <<
|
||||
"\n"
|
||||
" ok = !BadVal(secondPoint.m_X) && !BadVal(secondPoint.m_Y);\n"
|
||||
//" ok = !BadVal(secondPoint.m_X) && !BadVal(secondPoint.m_Y) && !BadVal(secondPoint.m_Z);\n"
|
||||
"\n"
|
||||
" if (!ok)\n"
|
||||
" {\n"
|
||||
" firstPoint.m_X = MwcNextNeg1Pos1(&mwc);\n"
|
||||
" firstPoint.m_Y = MwcNextNeg1Pos1(&mwc);\n"
|
||||
" firstPoint.m_Z = 0.0;\n"
|
||||
" firstPoint.m_ColorX = secondPoint.m_ColorX;\n"
|
||||
" consec++;\n"
|
||||
//" badvals++;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" while (!ok && consec < 5);\n"
|
||||
"\n"
|
||||
" if (!ok)\n"
|
||||
" {\n"
|
||||
" secondPoint.m_X = MwcNextNeg1Pos1(&mwc);\n"
|
||||
" secondPoint.m_Y = MwcNextNeg1Pos1(&mwc);\n"
|
||||
" secondPoint.m_Z = 0.0;\n"
|
||||
" }\n"
|
||||
"\n"//Rotate points between threads. This is how randomization is achieved.
|
||||
" uint swr = threadXY + ((i & 1u) * threadXDivRows);\n"
|
||||
" uint sw = (swr * THREADS_PER_WARP + THREAD_ID_X) & threadsMinus1;\n"
|
||||
"\n"
|
||||
|
||||
//Write to another thread's location.
|
||||
" swap[sw] = secondPoint;\n"
|
||||
"\n"
|
||||
|
||||
//Populate randomized xform index buffer with new random values.
|
||||
" if (THREAD_ID_Y == 0 && THREAD_ID_X < NWARPS)\n"
|
||||
" xfsel[THREAD_ID_X] = MwcNext(&mwc) % " << CHOOSE_XFORM_GRAIN << ";\n"
|
||||
"\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
"\n"
|
||||
|
||||
//Another thread will have written to this thread's location, so read the new value and use it for accumulation below.
|
||||
" firstPoint = swap[threadIndex];\n"
|
||||
"\n"
|
||||
" if (fuse)\n"
|
||||
" {\n"
|
||||
" if (i >= fuseCount - 1)\n"
|
||||
" {\n"
|
||||
" i = 0;\n"
|
||||
" fuse = false;\n"
|
||||
" itersToDo = iterCount;\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"//Sort of seems necessary, sort of doesn't. Makes no speed difference.
|
||||
" }\n"
|
||||
"\n"
|
||||
" continue;\n"
|
||||
" }\n"
|
||||
"\n";
|
||||
|
||||
if (ember.UseFinalXform())
|
||||
{
|
||||
//CPU takes an extra step here to preserve the opacity of the randomly selected xform, rather than the final xform's opacity.
|
||||
//The same thing takes place here automatically because secondPoint.m_LastXfUsed is used below to retrieve the opacity when accumulating.
|
||||
os <<
|
||||
" if ((ember->m_Xforms[ember->m_FinalXformIndex].m_Opacity == 1) || (MwcNext01(&mwc) < ember->m_Xforms[ember->m_FinalXformIndex].m_Opacity))\n"
|
||||
" {\n"
|
||||
" Xform" << (ember.TotalXformCount() - 1) << "(&(ember->m_Xforms[ember->m_FinalXformIndex]), parVars, &secondPoint, &tempPoint, &mwc);\n"
|
||||
" secondPoint = tempPoint;\n"
|
||||
" }\n"
|
||||
"\n";
|
||||
}
|
||||
|
||||
os << CreateProjectionString(ember);
|
||||
|
||||
if (doAccum)
|
||||
{
|
||||
os <<
|
||||
" p00 = secondPoint.m_X - ember->m_CenterX;\n"
|
||||
" p01 = secondPoint.m_Y - ember->m_CenterY;\n"
|
||||
" tempPoint.m_X = (p00 * ember->m_RotA) + (p01 * ember->m_RotB) + ember->m_CenterX;\n"
|
||||
" tempPoint.m_Y = (p00 * ember->m_RotD) + (p01 * ember->m_RotE) + ember->m_CenterY;\n"
|
||||
"\n"
|
||||
//Add this point to the appropriate location in the histogram.
|
||||
" if (CarToRasInBounds(carToRas, &tempPoint))\n"
|
||||
" {\n"
|
||||
" CarToRasConvertPointToSingle(carToRas, &tempPoint, &histIndex);\n"
|
||||
"\n"
|
||||
" if (histIndex < histSize)\n"//Provides an extra level of safety and makes no speed difference.
|
||||
" {\n";
|
||||
|
||||
//Basic texture index interoplation does not produce identical results
|
||||
//to the CPU. So the code here must explicitly do the same thing and not
|
||||
//rely on the GPU texture coordinate lookup.
|
||||
if (ember.m_PaletteMode == PALETTE_LINEAR)
|
||||
{
|
||||
os <<
|
||||
" real_t colorIndexFrac;\n"
|
||||
" real_t colorIndex = secondPoint.m_ColorX * COLORMAP_LENGTH;\n"
|
||||
" int intColorIndex = (int)colorIndex;\n"
|
||||
" float4 palColor2;\n"
|
||||
"\n"
|
||||
" if (intColorIndex < 0)\n"
|
||||
" {\n"
|
||||
" intColorIndex = 0;\n"
|
||||
" colorIndexFrac = 0;\n"
|
||||
" }\n"
|
||||
" else if (intColorIndex >= COLORMAP_LENGTH_MINUS_1)\n"
|
||||
" {\n"
|
||||
" intColorIndex = COLORMAP_LENGTH_MINUS_1 - 1;\n"
|
||||
" colorIndexFrac = 1.0;\n"
|
||||
" }\n"
|
||||
" else\n"
|
||||
" {\n"
|
||||
" colorIndexFrac = colorIndex - (real_t)intColorIndex;\n"//Interpolate between intColorIndex and intColorIndex + 1.
|
||||
" }\n"
|
||||
"\n"
|
||||
" iPaletteCoord.x = intColorIndex;\n"//Palette operations are strictly float because OpenCL does not support dp64 textures.
|
||||
" palColor1 = read_imagef(palette, paletteSampler, iPaletteCoord);\n"
|
||||
" iPaletteCoord.x += 1;\n"
|
||||
" palColor2 = read_imagef(palette, paletteSampler, iPaletteCoord);\n"
|
||||
" palColor1 = (palColor1 * (1.0f - (float)colorIndexFrac)) + (palColor2 * (float)colorIndexFrac);\n";//The 1.0f here *must* have the 'f' suffix at the end to compile.
|
||||
}
|
||||
else if (ember.m_PaletteMode == PALETTE_STEP)
|
||||
{
|
||||
os <<
|
||||
" iPaletteCoord.x = (int)(secondPoint.m_ColorX * COLORMAP_LENGTH);\n"
|
||||
" palColor1 = read_imagef(palette, paletteSampler, iPaletteCoord);\n";
|
||||
}
|
||||
|
||||
if (lockAccum)
|
||||
{
|
||||
if (typeid(T) == typeid(double))
|
||||
{
|
||||
os <<
|
||||
" AtomicAdd(&(histogram[histIndex].m_Reals[0]), (real_t)palColor1.x * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"//Always apply opacity, even though it's usually 1.
|
||||
" AtomicAdd(&(histogram[histIndex].m_Reals[1]), (real_t)palColor1.y * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
|
||||
" AtomicAdd(&(histogram[histIndex].m_Reals[2]), (real_t)palColor1.z * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
|
||||
" AtomicAdd(&(histogram[histIndex].m_Reals[3]), (real_t)palColor1.w * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
|
||||
}
|
||||
else
|
||||
{
|
||||
os <<
|
||||
" AtomicAdd(&(histogram[histIndex].m_Reals[0]), palColor1.x * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"//Always apply opacity, even though it's usually 1.
|
||||
" AtomicAdd(&(histogram[histIndex].m_Reals[1]), palColor1.y * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
|
||||
" AtomicAdd(&(histogram[histIndex].m_Reals[2]), palColor1.z * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
|
||||
" AtomicAdd(&(histogram[histIndex].m_Reals[3]), palColor1.w * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (typeid(T) == typeid(double))
|
||||
{
|
||||
os <<
|
||||
" real4 realColor;\n"
|
||||
"\n"
|
||||
" realColor.x = (real_t)palColor1.x;\n"
|
||||
" realColor.y = (real_t)palColor1.y;\n"
|
||||
" realColor.z = (real_t)palColor1.z;\n"
|
||||
" realColor.w = (real_t)palColor1.w;\n"
|
||||
" histogram[histIndex].m_Real4 += (realColor * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
|
||||
}
|
||||
else
|
||||
{
|
||||
os <<
|
||||
" histogram[histIndex].m_Real4 += (palColor1 * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
|
||||
}
|
||||
}
|
||||
|
||||
os <<
|
||||
" }\n"//histIndex < histSize.
|
||||
" }\n"//CarToRasInBounds.
|
||||
"\n"
|
||||
" barrier(CLK_GLOBAL_MEM_FENCE);\n";//Barrier every time, whether or not the point was in bounds, else artifacts will occur when doing strips.
|
||||
}
|
||||
|
||||
os <<
|
||||
" }\n"//Main for loop.
|
||||
"\n"
|
||||
//At this point, iterating for this round is done, so write the final points back out
|
||||
//to the global points buffer to be used as inputs for the next round. This preserves point trajectory
|
||||
//between kernel calls.
|
||||
" points[pointsIndex] = firstPoint;\n"
|
||||
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
|
||||
"}\n";
|
||||
|
||||
return os.str();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create an OpenCL string of #defines and a corresponding host side vector for parametric variation values.
|
||||
/// Parametric variations present a special problem in the iteration code.
|
||||
/// The values can't be passed in with the array of other xform values because
|
||||
/// the length of the parametric values is unknown.
|
||||
/// This is solved by passing a separate buffer of values dedicated specifically
|
||||
/// to parametric variations.
|
||||
/// In OpenCL, a series of #define constants are declared which specify the indices in
|
||||
/// the buffer where the various values are stored.
|
||||
/// The possibility of a parametric variation type being present in multiple xforms is taken
|
||||
/// into account by appending the xform index to the #define, thus making each unique.
|
||||
/// The kernel creator then uses these to retrieve the values in the iteration code.
|
||||
/// Example:
|
||||
/// Xform1: Curl (curl_c1: 1.1, curl_c2: 2.2)
|
||||
/// Xform2: Curl (curl_c1: 4.4, curl_c2: 5.5)
|
||||
/// Xform3: Blob (blob_low: 1, blob_high: 2, blob_waves: 3)
|
||||
///
|
||||
/// Host vector to be passed as arg to the iter kernel call:
|
||||
/// [1.1][2.2][4.4][5.5][1][2][3]
|
||||
///
|
||||
/// #defines in OpenCL to access the buffer:
|
||||
///
|
||||
/// #define CURL_C1_1 0
|
||||
/// #define CURL_C2_1 1
|
||||
/// #define CURL_C1_2 2
|
||||
/// #define CURL_C2_2 3
|
||||
/// #define BLOB_LOW_3 4
|
||||
/// #define BLOB_HIGH_3 5
|
||||
/// #define BLOB_WAVES_ 6
|
||||
///
|
||||
/// The variations the use these #defines by first looking up the index of the
|
||||
/// xform they belong to in the parent ember and generating the OpenCL string based on that
|
||||
/// in their overriden OpenCLString() functions.
|
||||
/// Template argument expected to be float or double.
|
||||
/// </summary>
|
||||
/// <param name="ember">The ember to create the values from</param>
|
||||
/// <param name="params">The string,vector pair to store the values in</param>
|
||||
/// <param name="doVals">True if the vector should be populated, else false. Default: true.</param>
|
||||
/// <param name="doString">True if the string should be populated, else false. Default: true.</param>
|
||||
template <typename T>
|
||||
void IterOpenCLKernelCreator<T>::ParVarIndexDefines(Ember<T>& ember, pair<string, vector<T>>& params, bool doVals, bool doString)
|
||||
{
|
||||
unsigned int i, j, k, size = 0, xformCount = ember.TotalXformCount();
|
||||
Xform<T>* xform;
|
||||
ParametricVariation<T>* parVar;
|
||||
ostringstream os;
|
||||
|
||||
if (doVals)
|
||||
params.second.clear();
|
||||
|
||||
for (i = 0; i < xformCount; i++)
|
||||
{
|
||||
if (xform = ember.GetTotalXform(i))
|
||||
{
|
||||
unsigned int varCount = xform->TotalVariationCount();
|
||||
|
||||
for (j = 0; j < varCount; j++)
|
||||
{
|
||||
if (parVar = dynamic_cast<ParametricVariation<T>*>(xform->GetVariation(j)))
|
||||
{
|
||||
for (k = 0; k < parVar->ParamCount(); k++)
|
||||
{
|
||||
if (doString)
|
||||
os << "#define " << ToUpper(parVar->Params()[k].Name()) << "_" << i << " " << size << endl;//Uniquely identify this param in this variation in this xform.
|
||||
|
||||
if (doVals)
|
||||
params.second.push_back(parVar->Params()[k].ParamVal());
|
||||
|
||||
size++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (doString)
|
||||
{
|
||||
os << "\n";
|
||||
params.first = os.str();
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Determine whether the two embers passed in differ enough
|
||||
/// to require a rebuild of the iteration code.
|
||||
/// A rebuild is required if they differ in the following ways:
|
||||
/// Xform count
|
||||
/// Final xform presence
|
||||
/// Xaos presence
|
||||
/// Palette accumulation mode
|
||||
/// Xform post affine presence
|
||||
/// Variation count
|
||||
/// Variation type
|
||||
/// Template argument expected to be float or double.
|
||||
/// </summary>
|
||||
/// <param name="ember1">The first ember to compare</param>
|
||||
/// <param name="ember2">The second ember to compare</param>
|
||||
/// <returns>True if a rebuild is required, else false</returns>
|
||||
template <typename T>
|
||||
bool IterOpenCLKernelCreator<T>::IsBuildRequired(Ember<T>& ember1, Ember<T>& ember2)
|
||||
{
|
||||
unsigned int i, j, xformCount = ember1.TotalXformCount();
|
||||
|
||||
if (xformCount != ember2.TotalXformCount())
|
||||
return true;
|
||||
|
||||
if (ember1.UseFinalXform() != ember2.UseFinalXform())
|
||||
return true;
|
||||
|
||||
if (ember1.XaosPresent() != ember2.XaosPresent())
|
||||
return true;
|
||||
|
||||
if (ember1.m_PaletteMode != ember2.m_PaletteMode)
|
||||
return true;
|
||||
|
||||
if (ember1.ProjBits() != ember2.ProjBits())
|
||||
return true;
|
||||
|
||||
for (i = 0; i < xformCount; i++)
|
||||
{
|
||||
Xform<T>* xform1 = ember1.GetTotalXform(i);
|
||||
Xform<T>* xform2 = ember2.GetTotalXform(i);
|
||||
unsigned int varCount = xform1->TotalVariationCount();
|
||||
|
||||
if (xform1->HasPost() != xform2->HasPost())
|
||||
return true;
|
||||
|
||||
if (varCount != xform2->TotalVariationCount())
|
||||
return true;
|
||||
|
||||
for (j = 0; j < varCount; j++)
|
||||
if (xform1->GetVariation(j)->VariationId() != xform2->GetVariation(j)->VariationId())
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create the zeroize kernel string.
|
||||
/// OpenCL comes with no way to zeroize a buffer like memset()
|
||||
/// would do on the CPU. So a special kernel must be ran to set a range
|
||||
/// of memory addresses to zero.
|
||||
/// </summary>
|
||||
/// <returns>The kernel string</returns>
|
||||
template <typename T>
|
||||
string IterOpenCLKernelCreator<T>::CreateZeroizeKernelString()
|
||||
{
|
||||
ostringstream os;
|
||||
|
||||
os <<
|
||||
ConstantDefinesString(typeid(T) == typeid(double)) <<//Double precision doesn't matter here since it's not used.
|
||||
"__kernel void " << m_ZeroizeEntryPoint << "(__global uchar* buffer, uint width, uint height)\n"
|
||||
"{\n"
|
||||
" if (GLOBAL_ID_X >= width || GLOBAL_ID_Y >= height)\n"
|
||||
" return;\n"
|
||||
"\n"
|
||||
" buffer[(GLOBAL_ID_Y * width) + GLOBAL_ID_X] = 0;\n"//Can't use INDEX_IN_GRID_2D here because the grid might be larger than the buffer to make even dimensions.
|
||||
" barrier(CLK_GLOBAL_MEM_FENCE);\n"//Just to be safe.
|
||||
"}\n"
|
||||
"\n";
|
||||
|
||||
return os.str();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create the string for 3D projection based on the 3D values of the ember.
|
||||
/// Projection is done on the second point.
|
||||
/// If any of these fields toggle between 0 and nonzero between runs, a recompile is triggered.
|
||||
/// </summary>
|
||||
/// <param name="ember">The ember to create the projection string for</param>
|
||||
/// <returns>The kernel string</returns>
|
||||
template <typename T>
|
||||
string IterOpenCLKernelCreator<T>::CreateProjectionString(Ember<T>& ember)
|
||||
{
|
||||
unsigned int projBits = ember.ProjBits();
|
||||
ostringstream os;
|
||||
|
||||
if (projBits)
|
||||
{
|
||||
if (projBits & PROJBITS_BLUR)
|
||||
{
|
||||
if (projBits & PROJBITS_YAW)
|
||||
{
|
||||
os <<
|
||||
" real_t dsin, dcos;\n"
|
||||
" real_t t = MwcNext01(&mwc) * M_2PI;\n"
|
||||
" real_t z = secondPoint.m_Z - ember->m_CamZPos;\n"
|
||||
" real_t x = ember->m_C00 * secondPoint.m_X + ember->m_C10 * secondPoint.m_Y;\n"
|
||||
" real_t y = ember->m_C01 * secondPoint.m_X + ember->m_C11 * secondPoint.m_Y + ember->m_C21 * z;\n"
|
||||
"\n"
|
||||
" z = ember->m_C02 * secondPoint.m_X + ember->m_C12 * secondPoint.m_Y + ember->m_C22 * z;\n"
|
||||
"\n"
|
||||
" real_t zr = 1 - ember->m_CamPerspective * z;\n"
|
||||
" real_t dr = MwcNext01(&mwc) * ember->m_BlurCoef * z;\n"
|
||||
"\n"
|
||||
" dsin = sin(t);\n"
|
||||
" dcos = cos(t);\n"
|
||||
"\n"
|
||||
" secondPoint.m_X = (x + dr * dcos) / zr;\n"
|
||||
" secondPoint.m_Y = (y + dr * dsin) / zr;\n"
|
||||
" secondPoint.m_Z -= ember->m_CamZPos;\n";
|
||||
}
|
||||
else
|
||||
{
|
||||
os <<
|
||||
" real_t y, z, zr;\n"
|
||||
" real_t dsin, dcos;\n"
|
||||
" real_t t = MwcNext01(&mwc) * M_2PI;\n"
|
||||
"\n"
|
||||
" z = secondPoint.m_Z - ember->m_CamZPos;\n"
|
||||
" y = ember->m_C11 * secondPoint.m_Y + ember->m_C21 * z;\n"
|
||||
" z = ember->m_C12 * secondPoint.m_Y + ember->m_C22 * z;\n"
|
||||
" zr = 1 - ember->m_CamPerspective * z;\n"
|
||||
"\n"
|
||||
" dsin = sin(t);\n"
|
||||
" dcos = cos(t);\n"
|
||||
"\n"
|
||||
" real_t dr = MwcNext01(&mwc) * ember->m_BlurCoef * z;\n"
|
||||
"\n"
|
||||
" secondPoint.m_X = (secondPoint.m_X + dr * dcos) / zr;\n"
|
||||
" secondPoint.m_Y = (y + dr * dsin) / zr;\n"
|
||||
" secondPoint.m_Z -= ember->m_CamZPos;\n";
|
||||
}
|
||||
}
|
||||
else if ((projBits & PROJBITS_PITCH) || (projBits & PROJBITS_YAW))
|
||||
{
|
||||
if (projBits & PROJBITS_YAW)
|
||||
{
|
||||
os <<
|
||||
" real_t z = secondPoint.m_Z - ember->m_CamZPos;\n"
|
||||
" real_t x = ember->m_C00 * secondPoint.m_X + ember->m_C10 * secondPoint.m_Y;\n"
|
||||
" real_t y = ember->m_C01 * secondPoint.m_X + ember->m_C11 * secondPoint.m_Y + ember->m_C21 * z;\n"
|
||||
" real_t zr = 1 - ember->m_CamPerspective * (ember->m_C02 * secondPoint.m_X + ember->m_C12 * secondPoint.m_Y + ember->m_C22 * z);\n"
|
||||
"\n"
|
||||
" secondPoint.m_X = x / zr;\n"
|
||||
" secondPoint.m_Y = y / zr;\n"
|
||||
" secondPoint.m_Z -= ember->m_CamZPos;\n";
|
||||
}
|
||||
else
|
||||
{
|
||||
os <<
|
||||
" real_t z = secondPoint.m_Z - ember->m_CamZPos;\n"
|
||||
" real_t y = ember->m_C11 * secondPoint.m_Y + ember->m_C21 * z;\n"
|
||||
" real_t zr = 1 - ember->m_CamPerspective * (ember->m_C12 * secondPoint.m_Y + ember->m_C22 * z);\n"
|
||||
"\n"
|
||||
" secondPoint.m_X /= zr;\n"
|
||||
" secondPoint.m_Y = y / zr;\n"
|
||||
" secondPoint.m_Z -= ember->m_CamZPos;\n";
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
os <<
|
||||
" real_t zr = 1 - ember->m_CamPerspective * (secondPoint.m_Z - ember->m_CamZPos);\n"
|
||||
"\n"
|
||||
" secondPoint.m_X /= zr;\n"
|
||||
" secondPoint.m_Y /= zr;\n"
|
||||
" secondPoint.m_Z -= ember->m_CamZPos;\n";
|
||||
}
|
||||
}
|
||||
|
||||
return os.str();
|
||||
}
|
||||
}
|
89
Source/EmberCL/IterOpenCLKernelCreator.h
Normal file
89
Source/EmberCL/IterOpenCLKernelCreator.h
Normal file
@ -0,0 +1,89 @@
|
||||
#pragma once
|
||||
|
||||
#include "EmberCLPch.h"
|
||||
#include "EmberCLStructs.h"
|
||||
#include "EmberCLFunctions.h"
|
||||
|
||||
/// <summary>
|
||||
/// IterOpenCLKernelCreator class.
|
||||
/// </summary>
|
||||
|
||||
namespace EmberCLns
|
||||
{
|
||||
/// <summary>
|
||||
/// Class for creating the main iteration code in OpenCL.
|
||||
/// It uses the Cuburn method of iterating where all conditionals
|
||||
/// are stripped out and a specific kernel is compiled at run-time.
|
||||
/// It uses a very sophisticated method for randomization that avoids
|
||||
/// the problem of warp/wavefront divergence that would occur if every
|
||||
/// thread selected a random xform to apply.
|
||||
/// This only works with embers of type float, double is not supported.
|
||||
/// </summary>
|
||||
template <typename T>
|
||||
class EMBERCL_API IterOpenCLKernelCreator
|
||||
{
|
||||
public:
|
||||
IterOpenCLKernelCreator();
|
||||
IterOpenCLKernelCreator(bool nVidia);
|
||||
string ZeroizeKernel();
|
||||
string ZeroizeEntryPoint();
|
||||
string IterEntryPoint();
|
||||
string CreateIterKernelString(Ember<T>& ember, string& parVarDefines, bool lockAccum = false, bool doAccum = true);
|
||||
static void ParVarIndexDefines(Ember<T>& ember, pair<string, vector<T>>& params, bool doVals = true, bool doString = true);
|
||||
static bool IsBuildRequired(Ember<T>& ember1, Ember<T>& ember2);
|
||||
|
||||
private:
|
||||
string CreateZeroizeKernelString();
|
||||
string CreateProjectionString(Ember<T>& ember);
|
||||
|
||||
string m_IterEntryPoint;
|
||||
string m_ZeroizeKernel;
|
||||
string m_ZeroizeEntryPoint;
|
||||
bool m_NVidia;
|
||||
};
|
||||
|
||||
template EMBERCL_API class IterOpenCLKernelCreator<float>;
|
||||
|
||||
#ifdef DO_DOUBLE
|
||||
template EMBERCL_API class IterOpenCLKernelCreator<double>;
|
||||
#endif
|
||||
|
||||
//
|
||||
//template EMBERCL_API string IterOpenCLKernelCreator::CreateIterKernelString<float>(Ember<float>& ember, string& parVarDefines, bool lockAccum, bool doAccum);
|
||||
//template EMBERCL_API string IterOpenCLKernelCreator::CreateIterKernelString<double>(Ember<double>& ember, string& parVarDefines, bool lockAccum, bool doAccum);
|
||||
//
|
||||
//template EMBERCL_API void IterOpenCLKernelCreator::ParVarIndexDefines<float>(Ember<float>& ember, pair<string, vector<float>>& params, bool doVals, bool doString);
|
||||
//template EMBERCL_API void IterOpenCLKernelCreator::ParVarIndexDefines<double>(Ember<double>& ember, pair<string, vector<double>>& params, bool doVals, bool doString);
|
||||
//
|
||||
//template EMBERCL_API bool IterOpenCLKernelCreator::IsBuildRequired<float>(Ember<float>& ember1, Ember<float>& ember2);
|
||||
//template EMBERCL_API bool IterOpenCLKernelCreator::IsBuildRequired<double>(Ember<double>& ember1, Ember<double>& ember2);
|
||||
|
||||
#ifdef OPEN_CL_TEST_AREA
|
||||
typedef void (*KernelFuncPointer) (unsigned int gridWidth, unsigned int gridHeight, unsigned int blockWidth, unsigned int blockHeight,
|
||||
unsigned int BLOCK_ID_X, unsigned int BLOCK_ID_Y, unsigned int THREAD_ID_X, unsigned int THREAD_ID_Y);
|
||||
|
||||
static void OpenCLSim(unsigned int gridWidth, unsigned int gridHeight, unsigned int blockWidth, unsigned int blockHeight, KernelFuncPointer func)
|
||||
{
|
||||
cout << "OpenCLSim(): " << endl;
|
||||
cout << " Params: " << endl;
|
||||
cout << " gridW: " << gridWidth << endl;
|
||||
cout << " gridH: " << gridHeight << endl;
|
||||
cout << " blockW: " << blockWidth << endl;
|
||||
cout << " blockH: " << blockHeight << endl;
|
||||
|
||||
for (unsigned int i = 0; i < gridHeight; i += blockHeight)
|
||||
{
|
||||
for (unsigned int j = 0; j < gridWidth; j += blockWidth)
|
||||
{
|
||||
for (unsigned int k = 0; k < blockHeight; k++)
|
||||
{
|
||||
for (unsigned int l = 0; l < blockWidth; l++)
|
||||
{
|
||||
func(gridWidth, gridHeight, blockWidth, blockHeight, j / blockWidth, i / blockHeight, l, k);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
1366
Source/EmberCL/OpenCLWrapper.cpp
Normal file
1366
Source/EmberCL/OpenCLWrapper.cpp
Normal file
File diff suppressed because it is too large
Load Diff
219
Source/EmberCL/OpenCLWrapper.h
Normal file
219
Source/EmberCL/OpenCLWrapper.h
Normal file
@ -0,0 +1,219 @@
|
||||
#pragma once
|
||||
|
||||
#include "EmberCLPch.h"
|
||||
|
||||
/// <summary>
|
||||
/// OpenCLWrapper, Spk, NamedBuffer, NamedImage2D, NamedImage2DGL classes.
|
||||
/// </summary>
|
||||
|
||||
namespace EmberCLns
|
||||
{
|
||||
#if CL_VERSION_1_2
|
||||
#define IMAGEGL2D cl::ImageGL
|
||||
#else
|
||||
#define IMAGEGL2D cl::Image2DGL
|
||||
#endif
|
||||
|
||||
/// <summary>
|
||||
/// Class to contain all of the things needed to store an OpenCL program.
|
||||
/// The name of it, the source, the compiled program object and the kernel.
|
||||
/// </summary>
|
||||
class EMBERCL_API Spk
|
||||
{
|
||||
public:
|
||||
string m_Name;
|
||||
cl::Program::Sources m_Source;
|
||||
cl::Program m_Program;
|
||||
cl::Kernel m_Kernel;
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// Class to hold an OpenCL buffer with a name to identify it by.
|
||||
/// </summary>
|
||||
class EMBERCL_API NamedBuffer
|
||||
{
|
||||
public:
|
||||
NamedBuffer()
|
||||
{
|
||||
}
|
||||
|
||||
NamedBuffer(cl::Buffer& buff, string name)
|
||||
{
|
||||
m_Buffer = buff;
|
||||
m_Name = name;
|
||||
}
|
||||
|
||||
cl::Buffer m_Buffer;
|
||||
string m_Name;
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// Class to hold a 2D image with a name to identify it by.
|
||||
/// </summary>
|
||||
class EMBERCL_API NamedImage2D
|
||||
{
|
||||
public:
|
||||
NamedImage2D()
|
||||
{
|
||||
}
|
||||
|
||||
NamedImage2D(cl::Image2D& image, string name)
|
||||
{
|
||||
m_Image = image;
|
||||
m_Name = name;
|
||||
}
|
||||
|
||||
cl::Image2D m_Image;
|
||||
string m_Name;
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// Class to hold a 2D image that is mapped to an OpenGL texture
|
||||
/// and a name to identify it by.
|
||||
/// </summary>
|
||||
class EMBERCL_API NamedImage2DGL
|
||||
{
|
||||
public:
|
||||
NamedImage2DGL()
|
||||
{
|
||||
}
|
||||
|
||||
NamedImage2DGL(IMAGEGL2D& image, string name)
|
||||
{
|
||||
m_Image = image;
|
||||
m_Name = name;
|
||||
}
|
||||
|
||||
IMAGEGL2D m_Image;
|
||||
string m_Name;
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// Running kernels in OpenCL can require quite a bit of setup, tear down and
|
||||
/// general housekeeping. This class helps shield the user from such hassles.
|
||||
/// It's main utility is in holding collections of programs, buffers and images
|
||||
/// all identified by names. That way, a user can access them as needed without
|
||||
/// having to pollute their code.
|
||||
/// In addition, writing to an existing object by name determines if the object
|
||||
/// can be overwritten, or if it needs to be deleted and replaced by the new one.
|
||||
/// This class derives from EmberReport, so the caller is able
|
||||
/// to retrieve a text dump of error information if any errors occur.
|
||||
/// </summary>
|
||||
class EMBERCL_API OpenCLWrapper : public EmberReport
|
||||
{
|
||||
public:
|
||||
OpenCLWrapper();
|
||||
bool CheckOpenCL();
|
||||
bool Init(unsigned int platform, unsigned int device, bool shared = false);
|
||||
|
||||
//Programs.
|
||||
bool AddProgram(std::string name, std::string& program, std::string& entryPoint, bool doublePrecision);
|
||||
void ClearPrograms();
|
||||
|
||||
//Buffers.
|
||||
bool AddBuffer(string name, size_t size, cl_mem_flags flags = CL_MEM_READ_WRITE);
|
||||
bool AddAndWriteBuffer(string name, void* data, size_t size);
|
||||
bool WriteBuffer(string name, void* data, size_t size);
|
||||
bool WriteBuffer(unsigned int bufferIndex, void* data, size_t size);
|
||||
bool ReadBuffer(string name, void* data, size_t size);
|
||||
bool ReadBuffer(unsigned int bufferIndex, void* data, size_t size);
|
||||
int FindBufferIndex(string name);
|
||||
unsigned int GetBufferSize(string name);
|
||||
unsigned int GetBufferSize(unsigned int bufferIndex);
|
||||
void ClearBuffers();
|
||||
|
||||
//Images.
|
||||
bool AddAndWriteImage(string name, cl_mem_flags flags, const cl::ImageFormat& format, ::size_t width, ::size_t height, ::size_t row_pitch, void* data = NULL, bool shared = false, GLuint texName = 0);
|
||||
bool WriteImage2D(unsigned int index, bool shared, ::size_t width, ::size_t height, ::size_t row_pitch, void* data);
|
||||
bool ReadImage(string name, ::size_t width, ::size_t height, ::size_t row_pitch, bool shared, void* data);
|
||||
bool ReadImage(unsigned int imageIndex, ::size_t width, ::size_t height, ::size_t row_pitch, bool shared, void* data);
|
||||
int FindImageIndex(string name, bool shared);
|
||||
unsigned int GetImageSize(string name, bool shared);
|
||||
unsigned int GetImageSize(unsigned int imageIndex, bool shared);
|
||||
bool CompareImageParams(cl::Image& image, cl_mem_flags flags, const cl::ImageFormat& format, ::size_t width, ::size_t height, ::size_t row_pitch);
|
||||
void ClearImages(bool shared);
|
||||
bool CreateImage2D(cl::Image2D& image2D, cl_mem_flags flags, cl::ImageFormat format, ::size_t width, ::size_t height, ::size_t row_pitch = 0, void* data = NULL);
|
||||
bool CreateImage2DGL(IMAGEGL2D& image2DGL, cl_mem_flags flags, GLenum target, GLint miplevel, GLuint texobj);
|
||||
bool EnqueueAcquireGLObjects(string name);
|
||||
bool EnqueueAcquireGLObjects(IMAGEGL2D& image);
|
||||
bool EnqueueReleaseGLObjects(string name);
|
||||
bool EnqueueReleaseGLObjects(IMAGEGL2D& image);
|
||||
bool EnqueueAcquireGLObjects(const VECTOR_CLASS<cl::Memory>* memObjects = NULL);
|
||||
bool EnqueueReleaseGLObjects(const VECTOR_CLASS<cl::Memory>* memObjects = NULL);
|
||||
bool CreateSampler(cl::Sampler& sampler, cl_bool normalizedCoords, cl_addressing_mode addressingMode, cl_filter_mode filterMode);
|
||||
|
||||
//Arguments.
|
||||
bool SetBufferArg(unsigned int kernelIndex, unsigned int argIndex, string name);
|
||||
bool SetBufferArg(unsigned int kernelIndex, unsigned int argIndex, unsigned int bufferIndex);
|
||||
bool SetImageArg(unsigned int kernelIndex, unsigned int argIndex, bool shared, string name);
|
||||
bool SetImageArg(unsigned int kernelIndex, unsigned int argIndex, bool shared, unsigned int imageIndex);
|
||||
|
||||
/// <summary>
|
||||
/// Set an argument in the specified kernel, at the specified argument index.
|
||||
/// Must keep this here in the .h because it's templated.
|
||||
/// </summary>
|
||||
/// <param name="kernelIndex">Index of the kernel whose argument will be set</param>
|
||||
/// <param name="argIndex">Index of the argument to set</param>
|
||||
/// <param name="arg">The argument value to set</param>
|
||||
/// <returns>True if success, else false</returns>
|
||||
template <typename T>
|
||||
bool SetArg(unsigned int kernelIndex, unsigned int argIndex, T arg)
|
||||
{
|
||||
if (m_Init && kernelIndex < m_Programs.size())
|
||||
{
|
||||
cl_int err = m_Programs[kernelIndex].m_Kernel.setArg(argIndex, arg);
|
||||
|
||||
return CheckCL(err, "cl::Kernel::setArg()");
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
//Kernels.
|
||||
int FindKernelIndex(string name);
|
||||
bool RunKernel(unsigned int kernelIndex, unsigned int totalGridWidth, unsigned int totalGridHeight, unsigned int totalGridDepth, unsigned int blockWidth, unsigned int blockHeight, unsigned int blockDepth);
|
||||
|
||||
//Info.
|
||||
template<typename T>
|
||||
T GetInfo(size_t platform, size_t device, cl_device_info name);
|
||||
string PlatformName(size_t platform);
|
||||
vector<string> PlatformNames();
|
||||
string DeviceName(size_t platform, size_t device);
|
||||
vector<string> DeviceNames(size_t platform);
|
||||
string DeviceAndPlatformNames();
|
||||
string DumpInfo();
|
||||
|
||||
//Accessors.
|
||||
bool Ok();
|
||||
bool Shared();
|
||||
cl::Context Context();
|
||||
unsigned int PlatformIndex();
|
||||
unsigned int DeviceIndex();
|
||||
unsigned int LocalMemSize();
|
||||
|
||||
static void MakeEvenGridDims(unsigned int blockW, unsigned int blockH, unsigned int& gridW, unsigned int& gridH);
|
||||
|
||||
private:
|
||||
bool CreateContext(bool shared);
|
||||
bool CreateSPK(std::string& name, std::string& program, std::string& entryPoint, Spk& spk, bool doublePrecision);
|
||||
bool CheckCL(cl_int err, const char* name);
|
||||
std::string ErrorToStringCL(cl_int err);
|
||||
|
||||
bool m_Init;
|
||||
bool m_Shared;
|
||||
unsigned int m_PlatformIndex;
|
||||
unsigned int m_DeviceIndex;
|
||||
unsigned int m_LocalMemSize;
|
||||
cl::Platform m_Platform;
|
||||
cl::Context m_Context;
|
||||
cl::Device m_Device;
|
||||
cl::CommandQueue m_Queue;
|
||||
std::vector<cl::Platform> m_Platforms;
|
||||
std::vector<std::vector<cl::Device>> m_Devices;
|
||||
std::vector<cl::Device> m_DeviceVec;
|
||||
std::vector<Spk> m_Programs;
|
||||
std::vector<NamedBuffer> m_Buffers;
|
||||
std::vector<NamedImage2D> m_Images;
|
||||
std::vector<NamedImage2DGL> m_GLImages;
|
||||
};
|
||||
}
|
1340
Source/EmberCL/RendererCL.cpp
Normal file
1340
Source/EmberCL/RendererCL.cpp
Normal file
File diff suppressed because it is too large
Load Diff
156
Source/EmberCL/RendererCL.h
Normal file
156
Source/EmberCL/RendererCL.h
Normal file
@ -0,0 +1,156 @@
|
||||
#pragma once
|
||||
|
||||
#include "EmberCLPch.h"
|
||||
#include "OpenCLWrapper.h"
|
||||
#include "IterOpenCLKernelCreator.h"
|
||||
#include "DEOpenCLKernelCreator.h"
|
||||
#include "FinalAccumOpenCLKernelCreator.h"
|
||||
|
||||
/// <summary>
|
||||
/// RendererCL class.
|
||||
/// </summary>
|
||||
|
||||
namespace EmberCLns
|
||||
{
|
||||
class EMBERCL_API RendererCLBase
|
||||
{
|
||||
public:
|
||||
virtual bool ReadFinal(unsigned char* pixels) { return false; }
|
||||
virtual bool ClearFinal() { return false; }
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// RendererCL is a derivation of the basic CPU renderer which
|
||||
/// overrides various functions to render on the GPU using OpenCL.
|
||||
/// Since this class derives from EmberReport and also contains an
|
||||
/// OpenCLWrapper member which also derives from EmberReport, the
|
||||
/// reporting functions are overridden to aggregate the errors from
|
||||
/// both sources.
|
||||
/// It does not support different types for T and bucketT, so it only has one template argument
|
||||
/// and uses both for the base.
|
||||
/// </summary>
|
||||
template <typename T>
|
||||
class EMBERCL_API RendererCL : public RendererCLBase, public Renderer<T, T>
|
||||
{
|
||||
public:
|
||||
RendererCL(unsigned int platform = 0, unsigned int device = 0, bool shared = false, GLuint outputTexID = 0);
|
||||
~RendererCL();
|
||||
|
||||
//Ordinary member functions for OpenCL specific tasks.
|
||||
bool Init(unsigned int platform, unsigned int device, bool shared, GLuint outputTexID);
|
||||
inline unsigned int IterBlocksWide();
|
||||
inline unsigned int IterBlocksHigh();
|
||||
inline unsigned int IterBlockWidth();
|
||||
inline unsigned int IterBlockHeight();
|
||||
inline unsigned int IterGridWidth();
|
||||
inline unsigned int IterGridHeight();
|
||||
inline unsigned int TotalIterKernelCount();
|
||||
unsigned int PlatformIndex();
|
||||
unsigned int DeviceIndex();
|
||||
bool ReadHist();
|
||||
bool ReadAccum();
|
||||
bool ReadPoints(vector<PointCL<T>>& vec);
|
||||
virtual bool ReadFinal(unsigned char* pixels);
|
||||
virtual bool ClearFinal();
|
||||
bool ClearHist();
|
||||
bool ClearAccum();
|
||||
bool WritePoints(vector<PointCL<T>>& vec);
|
||||
string IterKernel();
|
||||
|
||||
//Public virtual functions overriden from Renderer.
|
||||
virtual unsigned __int64 MemoryAvailable();
|
||||
virtual bool Ok() const;
|
||||
virtual void NumChannels(unsigned int numChannels);
|
||||
virtual void DumpErrorReport();
|
||||
virtual void ClearErrorReport();
|
||||
virtual unsigned int SubBatchSize() const;
|
||||
virtual unsigned int ThreadCount() const;
|
||||
virtual void ThreadCount(unsigned int threads, const char* seedString = NULL);
|
||||
virtual bool CreateDEFilter(bool& newAlloc);
|
||||
virtual bool CreateSpatialFilter(bool& newAlloc);
|
||||
virtual eRendererType RendererType() const;
|
||||
virtual string ErrorReportString();
|
||||
virtual vector<string> ErrorReport();
|
||||
|
||||
#ifndef TEST_CL
|
||||
protected:
|
||||
#endif
|
||||
//Protected virtual functions overriden from Renderer.
|
||||
virtual void MakeDmap(T colorScalar);
|
||||
virtual bool Alloc();
|
||||
virtual bool ResetBuckets(bool resetHist = true, bool resetAccum = true);
|
||||
virtual eRenderStatus LogScaleDensityFilter();
|
||||
virtual eRenderStatus GaussianDensityFilter();
|
||||
virtual eRenderStatus AccumulatorToFinalImage(unsigned char* pixels, size_t finalOffset);
|
||||
virtual EmberStats Iterate(unsigned __int64 iterCount, unsigned int pass, unsigned int temporalSample);
|
||||
|
||||
private:
|
||||
//Private functions for making and running OpenCL programs.
|
||||
bool BuildIterProgramForEmber(bool doAccum = true);
|
||||
bool RunIter(unsigned __int64 iterCount, unsigned int pass, unsigned int temporalSample, unsigned __int64& itersRan);
|
||||
eRenderStatus RunLogScaleFilter();
|
||||
eRenderStatus RunDensityFilter();
|
||||
eRenderStatus RunFinalAccum();
|
||||
bool ClearBuffer(string bufferName, unsigned int width, unsigned int height, unsigned int elementSize);
|
||||
bool RunDensityFilterPrivate(unsigned int kernelIndex, unsigned int gridW, unsigned int gridH, unsigned int blockW, unsigned int blockH, unsigned int chunkSizeW, unsigned int chunkSizeH, unsigned int rowParity, unsigned int colParity);
|
||||
int MakeAndGetDensityFilterProgram(unsigned int ss, unsigned int filterWidth);
|
||||
int MakeAndGetFinalAccumProgram(T& alphaBase, T& alphaScale);
|
||||
int MakeAndGetGammaCorrectionProgram();
|
||||
|
||||
//Private functions passing data to OpenCL programs.
|
||||
DensityFilterCL<T> ConvertDensityFilter();
|
||||
SpatialFilterCL<T> ConvertSpatialFilter();
|
||||
EmberCL<T> ConvertEmber(Ember<T>& ember);
|
||||
static CarToRasCL<T> ConvertCarToRas(const CarToRas<T>& carToRas);
|
||||
|
||||
bool m_Init;
|
||||
bool m_NVidia;
|
||||
bool m_DoublePrecision;
|
||||
unsigned int m_IterBlocksWide, m_IterBlockWidth;
|
||||
unsigned int m_IterBlocksHigh, m_IterBlockHeight;
|
||||
unsigned int m_MaxDEBlockSizeW;
|
||||
unsigned int m_MaxDEBlockSizeH;
|
||||
unsigned int m_WarpSize;
|
||||
unsigned int m_Calls;
|
||||
|
||||
string m_EmberBufferName;
|
||||
string m_ParVarsBufferName;
|
||||
string m_DistBufferName;
|
||||
string m_CarToRasBufferName;
|
||||
string m_DEFilterParamsBufferName;
|
||||
string m_SpatialFilterParamsBufferName;
|
||||
string m_DECoefsBufferName;
|
||||
string m_DEWidthsBufferName;
|
||||
string m_DECoefIndicesBufferName;
|
||||
string m_SpatialFilterCoefsBufferName;
|
||||
string m_HistBufferName;
|
||||
string m_AccumBufferName;
|
||||
string m_FinalImageName;
|
||||
string m_PointsBufferName;
|
||||
|
||||
string m_IterKernel;
|
||||
|
||||
OpenCLWrapper m_Wrapper;
|
||||
cl::ImageFormat m_PaletteFormat;
|
||||
cl::ImageFormat m_FinalFormat;
|
||||
cl::Image2D m_Palette;
|
||||
IMAGEGL2D m_AccumImage;
|
||||
GLuint m_OutputTexID;
|
||||
EmberCL<T> m_EmberCL;
|
||||
Palette<float> m_Dmap;//Used instead of the base class' m_Dmap because OpenCL only supports float textures.
|
||||
CarToRasCL<T> m_CarToRasCL;
|
||||
DensityFilterCL<T> m_DensityFilterCL;
|
||||
SpatialFilterCL<T> m_SpatialFilterCL;
|
||||
IterOpenCLKernelCreator<T> m_IterOpenCLKernelCreator;
|
||||
DEOpenCLKernelCreator<T> m_DEOpenCLKernelCreator;
|
||||
FinalAccumOpenCLKernelCreator<T> m_FinalAccumOpenCLKernelCreator;
|
||||
pair<string, vector<T>> m_Params;
|
||||
Ember<T> m_LastBuiltEmber;
|
||||
};
|
||||
|
||||
template EMBERCL_API class RendererCL<float>;
|
||||
|
||||
#ifdef DO_DOUBLE
|
||||
template EMBERCL_API class RendererCL<double>;
|
||||
#endif
|
||||
}
|
Reference in New Issue
Block a user