mirror of
https://bitbucket.org/mfeemster/fractorium.git
synced 2025-01-22 05:30:06 -05:00
ea649bbda6
-Add backward compatibility option for the following variations: cos, cosh, cot, coth, csc, csch, sec, sech, sin, sinh, tan, tanh. -Add the ability to re-order variations by dragging them in the Info tab.
1207 lines
41 KiB
C++
1207 lines
41 KiB
C++
#include "EmberCLPch.h"
|
|
#include "IterOpenCLKernelCreator.h"
|
|
|
|
//#define STRAIGHT_RAND 1
|
|
|
|
namespace EmberCLns
|
|
{
|
|
/// <summary>
|
|
/// Constructor that sets up some basic entry point strings and creates
|
|
/// the zeroization kernel string since it requires no conditional inputs.
|
|
/// </summary>
|
|
template <typename T>
|
|
IterOpenCLKernelCreator<T>::IterOpenCLKernelCreator()
|
|
{
|
|
m_ZeroizeKernel = CreateZeroizeKernelString();
|
|
m_SumHistKernel = CreateSumHistKernelString();
|
|
}
|
|
|
|
/// <summary>
|
|
/// Accessors.
|
|
/// </summary>
|
|
|
|
template <typename T> const string& IterOpenCLKernelCreator<T>::ZeroizeKernel() const { return m_ZeroizeKernel; }
|
|
template <typename T> const string& IterOpenCLKernelCreator<T>::ZeroizeEntryPoint() const { return m_ZeroizeEntryPoint; }
|
|
template <typename T> const string& IterOpenCLKernelCreator<T>::SumHistKernel() const { return m_SumHistKernel; }
|
|
template <typename T> const string& IterOpenCLKernelCreator<T>::SumHistEntryPoint() const { return m_SumHistEntryPoint; }
|
|
template <typename T> const string& IterOpenCLKernelCreator<T>::IterEntryPoint() const { return m_IterEntryPoint; }
|
|
/// <summary>
|
|
/// Create the iteration kernel string using the Cuburn method.
|
|
/// Template argument expected to be float or double.
|
|
/// Pre Reg Post Formula
|
|
/// x trans = affine(inpoint)
|
|
/// foreach prevar
|
|
/// tempin = trans
|
|
/// tempout = prevar(i, tempin)
|
|
/// trans = tempout
|
|
/// outpoint = trans
|
|
///
|
|
/// x x trans = affine(inpoint)
|
|
/// foreach prevar
|
|
/// tempin = trans
|
|
/// tempout = prevar(i, tempin)
|
|
/// trans = tempout
|
|
/// tempin = trans
|
|
/// outpoint = 0
|
|
/// foreach regvar
|
|
/// tempout = regvar(i, tempin)
|
|
/// outpoint += tempout
|
|
//
|
|
/// x x x
|
|
/// trans = affine(inpoint)
|
|
/// foreach prevar
|
|
/// tempin = trans
|
|
/// tempout = prevar(i, tempin)
|
|
/// trans = tempout
|
|
/// tempin = trans
|
|
/// outpoint = 0
|
|
/// foreach regvar
|
|
/// tempout = regvar(i, tempin)
|
|
/// outpoint += tempout
|
|
/// foreach postvar
|
|
/// tempin = outpoint
|
|
/// tempout = postvar(i, tempin)
|
|
/// outpoint = tempout
|
|
///
|
|
/// x x
|
|
/// trans = affine(inpoint)
|
|
/// foreach prevar
|
|
/// tempin = trans
|
|
/// tempout = prevar(i, tempin)
|
|
/// trans = tempout
|
|
/// outpoint = trans
|
|
/// foreach postvar
|
|
/// tempin = outpoint
|
|
/// tempout = postvar(i, tempin)
|
|
/// outpoint = tempout
|
|
///
|
|
/// x
|
|
/// trans = affine(inpoint)
|
|
/// tempin = trans
|
|
/// outpoint = 0
|
|
/// foreach regvar
|
|
/// tempout = regvar(i, tempin)
|
|
/// outpoint += tempout
|
|
///
|
|
/// x x
|
|
/// trans = affine(inpoint)
|
|
/// tempin = trans
|
|
/// outpoint = 0
|
|
/// foreach regvar
|
|
/// tempout = regvar(i, tempin)
|
|
/// outpoint += tempout
|
|
/// foreach postvar
|
|
/// tempin = outpoint
|
|
/// tempout = postvar(i, tempin)
|
|
/// outpoint = tempout
|
|
///
|
|
/// x
|
|
/// trans = affine(inpoint)
|
|
/// outpoint = 0
|
|
/// foreach postvar
|
|
/// tempin = outpoint
|
|
/// tempout = postvar(i, tempin)
|
|
/// outpoint = tempout
|
|
///
|
|
/// none trans = affine(inpoint)
|
|
/// outpoint = 0
|
|
///
|
|
/// </summary>
|
|
/// <param name="ember">The ember to create the kernel string for</param>
|
|
/// <param name="params">The parametric variation #define string</param>
|
|
/// <param name="optAffine">True to optimize with a simple assignment when the pre affine transform is empty, else false. True is better for final renders, false for interactive to reduce repeated compilations.</param>
|
|
/// <param name="lockAccum">Whether to lock when accumulating to the histogram. This is only for debugging. Default: false.</param>
|
|
/// <param name="doAccum">Debugging parameter to include or omit accumulating to the histogram. Default: true.</param>
|
|
/// <returns>The kernel string</returns>
|
|
template <typename T>
|
|
string IterOpenCLKernelCreator<T>::CreateIterKernelString(const Ember<T>& ember, const string& parVarDefines, const string& globalSharedDefines, bool optAffine, bool lockAccum, bool doAccum)
|
|
{
|
|
bool doublePrecision = typeid(T) == typeid(double);
|
|
size_t i = 0, v, varIndex, varCount;
|
|
ostringstream kernelIterBody, xformFuncs, os;
|
|
vector<Variation<T>*> variations;
|
|
xformFuncs << VariationStateString(ember);
|
|
xformFuncs << parVarDefines << globalSharedDefines;
|
|
ember.GetPresentVariations(variations);
|
|
bool hasVarState = ember.GetVariationStateParamCount();
|
|
|
|
for (auto var : variations)
|
|
if (var)
|
|
xformFuncs << var->OpenCLFuncsString();
|
|
|
|
while (auto xform = ember.GetTotalXform(i))
|
|
{
|
|
bool needPrecalcSumSquares = false;
|
|
bool needPrecalcSqrtSumSquares = false;
|
|
bool needPrecalcAngles = false;
|
|
bool needPrecalcAtanXY = false;
|
|
bool needPrecalcAtanYX = false;
|
|
bool hasPreReg = (xform->PreVariationCount() + xform->VariationCount()) > 0;
|
|
v = varIndex = varCount = 0;
|
|
xformFuncs <<
|
|
"void Xform" << i << "(__constant XformCL* xform, __constant real_t* parVars, __global real_t* globalShared, Point* inPoint, Point* outPoint, uint2* mwc, VariationState* varState)\n" <<
|
|
"{\n"
|
|
" real_t transX, transY, transZ;\n"
|
|
" real4 vIn, vOut = 0.0;\n";
|
|
|
|
//Determine if any variations, regular, pre, or post need precalcs.
|
|
while (Variation<T>* var = xform->GetVariation(v++))
|
|
{
|
|
needPrecalcSumSquares |= var->NeedPrecalcSumSquares();
|
|
needPrecalcSqrtSumSquares |= var->NeedPrecalcSqrtSumSquares();
|
|
needPrecalcAngles |= var->NeedPrecalcAngles();
|
|
needPrecalcAtanXY |= var->NeedPrecalcAtanXY();
|
|
needPrecalcAtanYX |= var->NeedPrecalcAtanYX();
|
|
}
|
|
|
|
if (needPrecalcSumSquares)
|
|
xformFuncs << "\treal_t precalcSumSquares;\n";
|
|
|
|
if (needPrecalcSqrtSumSquares)
|
|
xformFuncs << "\treal_t precalcSqrtSumSquares;\n";
|
|
|
|
if (needPrecalcAngles)
|
|
{
|
|
xformFuncs << "\treal_t precalcSina;\n";
|
|
xformFuncs << "\treal_t precalcCosa;\n";
|
|
}
|
|
|
|
if (needPrecalcAtanXY)
|
|
xformFuncs << "\treal_t precalcAtanxy;\n";
|
|
|
|
if (needPrecalcAtanYX)
|
|
xformFuncs << "\treal_t precalcAtanyx;\n";
|
|
|
|
xformFuncs << "\treal_t tempColor = outPoint->m_ColorX = fma(xform->m_OneMinusColorCache, inPoint->m_ColorX, xform->m_ColorSpeedCache);\n\n";
|
|
|
|
if (optAffine && xform->m_Affine.IsID())
|
|
{
|
|
xformFuncs <<
|
|
" transX = inPoint->m_X;\n" <<
|
|
" transY = inPoint->m_Y;\n";
|
|
}
|
|
else
|
|
{
|
|
xformFuncs <<
|
|
" transX = fma(xform->m_A, inPoint->m_X, fma(xform->m_B, inPoint->m_Y, xform->m_C));\n" <<
|
|
" transY = fma(xform->m_D, inPoint->m_X, fma(xform->m_E, inPoint->m_Y, xform->m_F));\n";
|
|
}
|
|
|
|
xformFuncs << " transZ = inPoint->m_Z;\n";
|
|
varCount = xform->PreVariationCount();
|
|
|
|
if (hasPreReg)
|
|
{
|
|
if (varCount > 0)
|
|
{
|
|
xformFuncs << "\n\t//Apply each of the " << varCount << " pre variations in this xform.\n";
|
|
|
|
//Output the code for each pre variation in this xform.
|
|
for (varIndex = 0; varIndex < varCount; varIndex++)
|
|
{
|
|
if (Variation<T>* var = xform->GetVariation(varIndex))
|
|
{
|
|
xformFuncs << "\n\t//" << var->Name() << ".\n";
|
|
xformFuncs << xform->ReadOpenCLString(eVariationType::VARTYPE_PRE) << "\n";
|
|
xformFuncs << var->PrePostPrecalcOpenCLString();
|
|
xformFuncs << var->OpenCLString() << "\n";
|
|
xformFuncs << xform->WriteOpenCLString(eVariationType::VARTYPE_PRE, var->AssignType()) << "\n";
|
|
}
|
|
}
|
|
}
|
|
|
|
if (xform->VariationCount() > 0)
|
|
{
|
|
if (xform->NeedPrecalcSumSquares())
|
|
xformFuncs << "\tprecalcSumSquares = SQR(transX) + SQR(transY);\n";
|
|
|
|
if (xform->NeedPrecalcSqrtSumSquares())
|
|
xformFuncs << "\tprecalcSqrtSumSquares = sqrt(precalcSumSquares);\n";
|
|
|
|
if (xform->NeedPrecalcAngles())
|
|
{
|
|
xformFuncs << "\tprecalcCosa = transX / Zeps(precalcSqrtSumSquares);\n";
|
|
xformFuncs << "\tprecalcSina = transY / Zeps(precalcSqrtSumSquares);\n";
|
|
}
|
|
|
|
if (xform->NeedPrecalcAtanXY())
|
|
xformFuncs << "\tprecalcAtanxy = atan2(transX, transY);\n";
|
|
|
|
if (xform->NeedPrecalcAtanYX())
|
|
xformFuncs << "\tprecalcAtanyx = atan2(transY, transX);\n";
|
|
|
|
xformFuncs << "\n\toutPoint->m_X = 0;";
|
|
xformFuncs << "\n\toutPoint->m_Y = 0;";
|
|
xformFuncs << "\n\toutPoint->m_Z = 0;\n";
|
|
xformFuncs << "\n\t//Apply each of the " << xform->VariationCount() << " regular variations in this xform.\n\n";
|
|
xformFuncs << xform->ReadOpenCLString(eVariationType::VARTYPE_REG);
|
|
varCount += xform->VariationCount();
|
|
|
|
//Output the code for each regular variation in this xform.
|
|
for (; varIndex < varCount; varIndex++)
|
|
{
|
|
if (Variation<T>* var = xform->GetVariation(varIndex))
|
|
{
|
|
xformFuncs << "\n\t//" << var->Name() << ".\n"
|
|
<< var->OpenCLString() << (varIndex == varCount - 1 ? "\n" : "\n\n")
|
|
<< xform->WriteOpenCLString(eVariationType::VARTYPE_REG, eVariationAssignType::ASSIGNTYPE_SUM);
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
xformFuncs <<
|
|
" outPoint->m_X = transX;\n"
|
|
" outPoint->m_Y = transY;\n"
|
|
" outPoint->m_Z = transZ;\n";
|
|
}
|
|
}
|
|
else
|
|
{
|
|
xformFuncs <<
|
|
" outPoint->m_X = 0;\n"
|
|
" outPoint->m_Y = 0;\n"
|
|
" outPoint->m_Z = 0;\n";
|
|
}
|
|
|
|
if (xform->PostVariationCount() > 0)
|
|
{
|
|
varCount += xform->PostVariationCount();
|
|
xformFuncs << "\n\t//Apply each of the " << xform->PostVariationCount() << " post variations in this xform.\n";
|
|
|
|
//Output the code for each post variation in this xform.
|
|
for (; varIndex < varCount; varIndex++)
|
|
{
|
|
if (Variation<T>* var = xform->GetVariation(varIndex))
|
|
{
|
|
xformFuncs << "\n\t//" << var->Name() << ".\n";
|
|
xformFuncs << xform->ReadOpenCLString(eVariationType::VARTYPE_POST) << "\n";
|
|
xformFuncs << var->PrePostPrecalcOpenCLString();
|
|
xformFuncs << var->OpenCLString() << "\n";
|
|
xformFuncs << xform->WriteOpenCLString(eVariationType::VARTYPE_POST, var->AssignType()) << (varIndex == varCount - 1 ? "\n" : "\n\n");
|
|
}
|
|
}
|
|
}
|
|
|
|
if (xform->HasPost())
|
|
{
|
|
xformFuncs <<
|
|
"\n\t//Apply post affine transform.\n"
|
|
"\treal_t tempX = outPoint->m_X;\n"
|
|
"\n"
|
|
"\toutPoint->m_X = fma(xform->m_PostA, tempX, fma(xform->m_PostB, outPoint->m_Y, xform->m_PostC));\n" <<
|
|
"\toutPoint->m_Y = fma(xform->m_PostD, tempX, fma(xform->m_PostE, outPoint->m_Y, xform->m_PostF));\n";
|
|
}
|
|
|
|
xformFuncs << "\toutPoint->m_ColorX = fma(xform->m_DirectColor, (outPoint->m_ColorX - tempColor), tempColor);\n";
|
|
xformFuncs << "\n";
|
|
xformFuncs << "\tif (isnan(outPoint->m_ColorX))\n";
|
|
xformFuncs << "\t outPoint->m_ColorX = 0.0; \n";
|
|
xformFuncs << "}\n"
|
|
<< "\n";
|
|
i++;
|
|
}
|
|
|
|
auto varStateString = VariationStateInitString(ember);
|
|
os <<
|
|
ConstantDefinesString(doublePrecision) <<
|
|
GlobalFunctionsString(ember) <<
|
|
RandFunctionString <<
|
|
PointCLStructString <<
|
|
XformCLStructString <<
|
|
EmberCLStructString <<
|
|
UnionCLStructString <<
|
|
CarToRasCLStructString <<
|
|
CarToRasFunctionString;
|
|
|
|
if (lockAccum)
|
|
os << AtomicString();
|
|
|
|
os <<
|
|
xformFuncs.str() <<
|
|
"__kernel void " << m_IterEntryPoint << "(\n" <<
|
|
" uint iterCount,\n"
|
|
" uint fuseCount,\n"
|
|
" __global uint2* seeds,\n"
|
|
" __constant EmberCL* ember,\n"
|
|
" __constant XformCL* xforms,\n"
|
|
" __constant real_t* parVars,\n"
|
|
" __global real_t* globalShared,\n"
|
|
" __global uchar* xformDistributions,\n"//Using uchar is quicker than uint. Can't be constant because the size can be too large to fit when using xaos.
|
|
" __constant CarToRasCL* carToRas,\n"
|
|
" __global real4reals_bucket* histogram,\n";
|
|
|
|
if (hasVarState)
|
|
{
|
|
os <<
|
|
" __global VariationState* varStates,\n";
|
|
}
|
|
|
|
os <<
|
|
" uint histSize,\n"
|
|
" __read_only image2d_t palette,\n"
|
|
" __global Point* points\n"
|
|
"\t)\n"
|
|
"{\n"
|
|
" bool fuse, ok;\n"
|
|
" uint threadIndex = INDEX_IN_BLOCK_2D;\n"
|
|
" uint pointsIndex = INDEX_IN_GRID_2D;\n"
|
|
" uint blockStartIndex = BLOCK_START_INDEX_IN_GRID_2D;\n"
|
|
" uint blockStartThreadIndex = blockStartIndex + threadIndex;\n"
|
|
" uint i, itersToDo;\n"
|
|
" uint consec = 0;\n"
|
|
//" int badvals = 0;\n"
|
|
" uint histIndex;\n"
|
|
" real_t p00, p01;\n"
|
|
" Point firstPoint, secondPoint, tempPoint;\n"
|
|
" uint2 mwc = seeds[pointsIndex];\n"
|
|
" float4 palColor1;\n"
|
|
" int2 iPaletteCoord;\n"
|
|
" const sampler_t paletteSampler = CLK_NORMALIZED_COORDS_FALSE |\n"//Coords from 0 to 255.
|
|
" CLK_ADDRESS_CLAMP_TO_EDGE |\n"//Clamp to edge
|
|
" CLK_FILTER_NEAREST;\n"//Don't interpolate
|
|
" uint threadXY = (THREAD_ID_X + THREAD_ID_Y);\n"
|
|
" uint threadXDivRows = (THREAD_ID_X / NWARPS);\n"
|
|
" uint threadsMinus1 = NTHREADS - 1;\n"
|
|
" VariationState varState;\n"
|
|
"\n";
|
|
|
|
if (ember.XformCount() > 1)
|
|
{
|
|
os <<
|
|
#ifndef STRAIGHT_RAND
|
|
" __local Point swap[NTHREADS];\n"
|
|
" __local uint xfsel[NWARPS];\n";
|
|
#endif
|
|
}
|
|
|
|
os <<
|
|
" iPaletteCoord.y = 0;\n"
|
|
"\n"
|
|
" if (fuseCount > 0)\n"
|
|
" {\n"
|
|
" fuse = true;\n"
|
|
" itersToDo = fuseCount;\n"
|
|
//Calling MwcNextFRange() twice is deliberate. The first call to mwc is not very random since it just does
|
|
//an xor. So it must be called twice to get it in a good random state.
|
|
" firstPoint.m_X = MwcNextFRange(&mwc, -ember->m_RandPointRange, ember->m_RandPointRange);\n"
|
|
" firstPoint.m_X = MwcNextFRange(&mwc, -ember->m_RandPointRange, ember->m_RandPointRange);\n"
|
|
" firstPoint.m_Y = MwcNextFRange(&mwc, -ember->m_RandPointRange, ember->m_RandPointRange);\n"
|
|
" firstPoint.m_Z = 0.0;\n"
|
|
" firstPoint.m_ColorX = MwcNext01(&mwc);\n"
|
|
" firstPoint.m_LastXfUsed = 0 - 1;\n";//This ensures the first iteration chooses from the unweighted distribution array, all subsequent iterations will choose from the weighted ones.
|
|
//os <<
|
|
// varStateString << '\n';
|
|
os <<
|
|
" }\n"
|
|
" else\n"
|
|
" {\n"
|
|
" fuse = false;\n"
|
|
" itersToDo = iterCount;\n"
|
|
" firstPoint = points[blockStartThreadIndex];\n"
|
|
" }\n"
|
|
"\n"
|
|
;
|
|
|
|
if (hasVarState)
|
|
{
|
|
os <<
|
|
" varState = varStates[blockStartThreadIndex];\n";
|
|
}
|
|
|
|
//This is done once initially here and then again after each swap-sync in the main loop.
|
|
//This along with the randomness that the point shuffle provides gives sufficient randomness
|
|
//to produce results identical to those produced on the CPU.
|
|
if (ember.XformCount() > 1)
|
|
{
|
|
#ifndef STRAIGHT_RAND
|
|
os <<
|
|
"\n"
|
|
" if (THREAD_ID_Y == 0 && THREAD_ID_X < NWARPS)\n"
|
|
" xfsel[THREAD_ID_X] = MwcNext(&mwc) & " << CHOOSE_XFORM_GRAIN_M1 << ";\n"//It's faster to do the & here ahead of time than every time an xform is looked up to use inside the loop.
|
|
"\n";
|
|
#endif
|
|
}
|
|
else
|
|
{
|
|
os <<
|
|
" secondPoint.m_LastXfUsed = 0;\n";
|
|
}
|
|
|
|
os <<
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
"\n"
|
|
" for (i = 0; i < itersToDo; i++)\n"
|
|
" {\n"
|
|
" consec = 0;\n"
|
|
"\n"
|
|
" do\n"
|
|
" {\n";
|
|
|
|
if (ember.XformCount() > 1)
|
|
{
|
|
//If xaos is present, the a hybrid of the cuburn method is used.
|
|
//This makes each thread in a row pick the same offset into a distribution, using xfsel.
|
|
//However, the distribution the offset is in, is determined by firstPoint.m_LastXfUsed.
|
|
if (ember.XaosPresent())
|
|
{
|
|
os <<
|
|
#ifdef STRAIGHT_RAND
|
|
" secondPoint.m_LastXfUsed = xformDistributions[(MwcNext(&mwc) & " << CHOOSE_XFORM_GRAIN_M1 << ") + (" << CHOOSE_XFORM_GRAIN << " * (firstPoint.m_LastXfUsed + 1u))];\n\n";
|
|
#else
|
|
" secondPoint.m_LastXfUsed = xformDistributions[xfsel[THREAD_ID_Y] + (" << CHOOSE_XFORM_GRAIN << " * (firstPoint.m_LastXfUsed + 1u))];\n\n";//Partial cuburn hybrid.
|
|
#endif
|
|
}
|
|
else
|
|
{
|
|
os <<
|
|
#ifdef STRAIGHT_RAND
|
|
" secondPoint.m_LastXfUsed = xformDistributions[MwcNext(&mwc) & " << CHOOSE_XFORM_GRAIN_M1 << "];\n\n";//For testing, using straight rand flam4/fractron style instead of cuburn.
|
|
#else
|
|
" secondPoint.m_LastXfUsed = xformDistributions[xfsel[THREAD_ID_Y]];\n\n";
|
|
#endif
|
|
}
|
|
}
|
|
|
|
for (i = 0; i < ember.XformCount(); i++)
|
|
{
|
|
if (ember.XformCount() > 1)
|
|
{
|
|
if (i == 0)
|
|
{
|
|
os <<
|
|
" switch (secondPoint.m_LastXfUsed)\n"
|
|
" {\n";
|
|
}
|
|
|
|
os <<
|
|
" case " << i << ":\n"
|
|
" {\n" <<
|
|
" Xform" << i << "(&(xforms[" << i << "]), parVars, globalShared, &firstPoint, &secondPoint, &mwc, &varState);\n" <<
|
|
" break;\n"
|
|
" }\n";
|
|
|
|
if (i == ember.XformCount() - 1)
|
|
{
|
|
os <<
|
|
" }\n";
|
|
}
|
|
}
|
|
else
|
|
{
|
|
os << " Xform0(&(xforms[0]), parVars, globalShared, &firstPoint, &secondPoint, &mwc, &varState);";
|
|
}
|
|
}
|
|
|
|
os <<
|
|
"\n"
|
|
" ok = !BadVal(secondPoint.m_X) && !BadVal(secondPoint.m_Y);\n"
|
|
//" ok = !BadVal(secondPoint.m_X) && !BadVal(secondPoint.m_Y) && !isnan(secondPoint.m_Z);\n"
|
|
"\n"
|
|
" if (!ok)\n"
|
|
" {\n"
|
|
" firstPoint.m_X = MwcNextFRange(&mwc, -ember->m_RandPointRange, ember->m_RandPointRange);\n"
|
|
" firstPoint.m_Y = MwcNextFRange(&mwc, -ember->m_RandPointRange, ember->m_RandPointRange);\n"
|
|
" firstPoint.m_Z = 0.0;\n"
|
|
" firstPoint.m_ColorX = secondPoint.m_ColorX;\n"
|
|
" consec++;\n"
|
|
//" badvals++;\n"
|
|
" }\n"
|
|
" }\n"
|
|
" while (!ok && consec < 5);\n"
|
|
"\n"
|
|
" if (!ok)\n"
|
|
" {\n"
|
|
" secondPoint.m_X = MwcNextFRange(&mwc, -ember->m_RandPointRange, ember->m_RandPointRange);\n"
|
|
" secondPoint.m_Y = MwcNextFRange(&mwc, -ember->m_RandPointRange, ember->m_RandPointRange);\n"
|
|
" secondPoint.m_Z = 0.0;\n"
|
|
" }\n";
|
|
#ifndef STRAIGHT_RAND
|
|
|
|
if (ember.XformCount() > 1)
|
|
{
|
|
os <<
|
|
"\n"//Rotate points between threads. This is how randomization is achieved.
|
|
" uint swr = threadXY + ((i & 1u) * threadXDivRows);\n"
|
|
" uint sw = (swr * THREADS_PER_WARP + THREAD_ID_X) & threadsMinus1;\n"
|
|
"\n"
|
|
//Write to another thread's location.
|
|
" swap[sw] = secondPoint;\n";
|
|
os <<
|
|
"\n"
|
|
//Populate randomized xform index buffer with new random values.
|
|
" if (THREAD_ID_Y == 0 && THREAD_ID_X < NWARPS)\n"
|
|
" xfsel[THREAD_ID_X] = MwcNext(&mwc) & " << CHOOSE_XFORM_GRAIN_M1 << ";\n"
|
|
"\n"
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
|
//Another thread will have written to this thread's location, so read the new value and use it for accumulation below.
|
|
" firstPoint = swap[threadIndex];\n";
|
|
}
|
|
else
|
|
{
|
|
os <<
|
|
"\n"
|
|
" firstPoint = secondPoint;\n";
|
|
}
|
|
|
|
#else
|
|
os <<
|
|
" firstPoint = secondPoint;\n";//For testing, using straight rand flam4/fractron style instead of cuburn.
|
|
#endif
|
|
os <<
|
|
"\n"
|
|
" if (fuse)\n"
|
|
" {\n";
|
|
|
|
if (hasVarState && ember.XformCount() > 1)
|
|
{
|
|
os <<
|
|
" varStates[blockStartIndex + sw] = varState;\n\n";
|
|
}
|
|
|
|
os <<
|
|
" if (i >= fuseCount - 1)\n"
|
|
" {\n"
|
|
" i = 0;\n"
|
|
" fuse = false;\n"
|
|
" itersToDo = iterCount;\n";
|
|
|
|
if (ember.XformCount() > 1)
|
|
os <<
|
|
" barrier(CLK_LOCAL_MEM_FENCE);\n"//Sort of seems necessary, sort of doesn't. Makes no speed difference.
|
|
;
|
|
|
|
os <<
|
|
" }\n"
|
|
;
|
|
|
|
if (hasVarState && ember.XformCount() > 1)
|
|
{
|
|
os <<
|
|
"\n"
|
|
" barrier(CLK_GLOBAL_MEM_FENCE);\n"//Sort of seems necessary, sort of doesn't. Makes no speed difference.
|
|
" varState = varStates[blockStartThreadIndex];"
|
|
;
|
|
}
|
|
|
|
os <<
|
|
"\n"
|
|
" continue;\n"
|
|
" }\n"
|
|
"\n";
|
|
|
|
if (ember.UseFinalXform())
|
|
{
|
|
size_t finalIndex = ember.TotalXformCount() - 1;
|
|
//CPU takes an extra step here to preserve the opacity of the randomly selected xform, rather than the final xform's opacity.
|
|
//The same thing takes place here automatically because secondPoint.m_LastXfUsed is used below to retrieve the opacity when accumulating.
|
|
os <<
|
|
" if ((xforms[" << finalIndex << "].m_Opacity == 1) || (MwcNext01(&mwc) < xforms[" << finalIndex << "].m_Opacity))\n"
|
|
" {\n"
|
|
" tempPoint.m_LastXfUsed = secondPoint.m_LastXfUsed;\n"
|
|
" Xform" << finalIndex << "(&(xforms[" << finalIndex << "]), parVars, globalShared, &secondPoint, &tempPoint, &mwc, &varState);\n"
|
|
" secondPoint = tempPoint;\n"
|
|
" }\n"
|
|
"\n";
|
|
}
|
|
|
|
if (hasVarState && ember.XformCount() > 1)
|
|
{
|
|
os <<
|
|
" varStates[blockStartIndex + sw] = varState;\n"
|
|
;
|
|
}
|
|
|
|
os << CreateProjectionString(ember);
|
|
|
|
if (doAccum)
|
|
{
|
|
os <<
|
|
" p00 = secondPoint.m_X - ember->m_CenterX;\n"
|
|
" p01 = secondPoint.m_Y - ember->m_CenterY;\n"
|
|
" tempPoint.m_X = fma(p00, ember->m_RotA, fma(p01, ember->m_RotB, ember->m_CenterX));\n"
|
|
" tempPoint.m_Y = fma(p00, ember->m_RotD, fma(p01, ember->m_RotE, ember->m_CenterY));\n"
|
|
"\n"
|
|
//Add this point to the appropriate location in the histogram.
|
|
" if (CarToRasInBounds(carToRas, &tempPoint))\n"
|
|
" {\n"
|
|
" CarToRasConvertPointToSingle(carToRas, &tempPoint, &histIndex);\n"
|
|
"\n"
|
|
" if (histIndex < histSize)\n"//Provides an extra level of safety and makes no speed difference.
|
|
" {\n";
|
|
|
|
//Basic texture index interoplation does not produce identical results
|
|
//to the CPU. So the code here must explicitly do the same thing and not
|
|
//rely on the GPU texture coordinate lookup.
|
|
if (ember.m_PaletteMode == ePaletteMode::PALETTE_LINEAR)
|
|
{
|
|
os <<
|
|
" real_t colorIndexFrac;\n"
|
|
" real_t colorIndex = secondPoint.m_ColorX * ember->m_Psm1;\n"
|
|
" int intColorIndex;\n"
|
|
" float4 palColor2;\n"
|
|
"\n"
|
|
" if (colorIndex < 0)\n"
|
|
" {\n"
|
|
" intColorIndex = 0;\n"
|
|
" colorIndexFrac = 0;\n"
|
|
" }\n"
|
|
" else if (colorIndex >= ember->m_Psm1)\n"
|
|
" {\n"
|
|
" intColorIndex = (int)ember->m_Psm2;\n"
|
|
" colorIndexFrac = 1.0;\n"
|
|
" }\n"
|
|
" else\n"
|
|
" {\n"
|
|
" intColorIndex = (int)colorIndex;\n"
|
|
" colorIndexFrac = colorIndex - intColorIndex;\n"//Interpolate between intColorIndex and intColorIndex + 1.
|
|
" }\n"
|
|
"\n"
|
|
" iPaletteCoord.x = intColorIndex;\n"//Palette operations are strictly float because OpenCL does not support dp64 textures.
|
|
" palColor1 = read_imagef(palette, paletteSampler, iPaletteCoord);\n"
|
|
" iPaletteCoord.x += 1;\n"
|
|
" palColor2 = read_imagef(palette, paletteSampler, iPaletteCoord);\n"
|
|
" palColor1 = fma(palColor2, (float)colorIndexFrac, palColor1 * (1.0f - (float)colorIndexFrac));\n";//The 1.0f here *must* have the 'f' suffix at the end to compile.
|
|
}
|
|
else if (ember.m_PaletteMode == ePaletteMode::PALETTE_STEP)
|
|
{
|
|
os <<
|
|
" iPaletteCoord.x = (int)(secondPoint.m_ColorX * ember->m_Psm1);\n"
|
|
" palColor1 = read_imagef(palette, paletteSampler, iPaletteCoord);\n";
|
|
}
|
|
|
|
if (lockAccum)
|
|
{
|
|
os <<
|
|
" AtomicAdd(&(histogram[histIndex].m_Reals[0]), palColor1.x * (real_bucket_t)xforms[secondPoint.m_LastXfUsed].m_Opacity);\n"//Always apply opacity, even though it's usually 1.
|
|
" AtomicAdd(&(histogram[histIndex].m_Reals[1]), palColor1.y * (real_bucket_t)xforms[secondPoint.m_LastXfUsed].m_Opacity);\n"
|
|
" AtomicAdd(&(histogram[histIndex].m_Reals[2]), palColor1.z * (real_bucket_t)xforms[secondPoint.m_LastXfUsed].m_Opacity);\n"
|
|
" AtomicAdd(&(histogram[histIndex].m_Reals[3]), palColor1.w * (real_bucket_t)xforms[secondPoint.m_LastXfUsed].m_Opacity);\n";
|
|
}
|
|
else
|
|
{
|
|
os <<
|
|
" histogram[histIndex].m_Real4 += (palColor1 * (real_bucket_t)xforms[secondPoint.m_LastXfUsed].m_Opacity);\n";//real_bucket_t should always be float.
|
|
}
|
|
|
|
os <<
|
|
" }\n"//histIndex < histSize.
|
|
" }\n"//CarToRasInBounds.
|
|
"\n";
|
|
os <<
|
|
" barrier(CLK_GLOBAL_MEM_FENCE);\n";//Barrier every time, whether or not the point was in bounds, else artifacts will occur when doing strips.
|
|
|
|
if (hasVarState && ember.XformCount() > 1)
|
|
{
|
|
os <<
|
|
" varState = varStates[blockStartThreadIndex];\n"
|
|
;
|
|
}
|
|
}
|
|
|
|
os <<
|
|
" }\n"//Main for loop.
|
|
"\n"
|
|
//At this point, iterating for this round is done, so write the final points back out
|
|
//to the global points buffer to be used as inputs for the next round. This preserves point trajectory
|
|
//between kernel calls.
|
|
#ifdef TEST_CL_BUFFERS//Use this to populate with test values and read back in EmberTester.
|
|
" points[pointsIndex].m_X = MwcNextNeg1Pos1(&mwc);\n"
|
|
" points[pointsIndex].m_Y = MwcNextNeg1Pos1(&mwc);\n"
|
|
" points[pointsIndex].m_Z = MwcNextNeg1Pos1(&mwc);\n"
|
|
" points[pointsIndex].m_ColorX = MwcNextNeg1Pos1(&mwc);\n"
|
|
#else
|
|
" seeds[pointsIndex] = mwc;\n"
|
|
" points[blockStartThreadIndex] = firstPoint;\n";
|
|
|
|
if (hasVarState && ember.XformCount() == 1)
|
|
{
|
|
os <<
|
|
" varStates[blockStartThreadIndex] = varState;\n";
|
|
}
|
|
|
|
#endif
|
|
os <<
|
|
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
|
|
"}\n";
|
|
return os.str();
|
|
}
|
|
|
|
/// <summary>
|
|
/// Return a string containing all of the global functions needed by the passed in ember.
|
|
/// </summary>
|
|
/// <param name="ember">The ember to create the global function strings from</param>
|
|
/// <returns>String of all global function names and bodies</returns>
|
|
template <typename T>
|
|
string IterOpenCLKernelCreator<T>::GlobalFunctionsString(const Ember<T>& ember)
|
|
{
|
|
size_t i = 0, j;
|
|
vector<string> funcNames;//Can't use a set here because they sort and we must preserve the insertion order due to nested function calls.
|
|
ostringstream os;
|
|
static string zeps = "Zeps";
|
|
|
|
while (auto xform = ember.GetTotalXform(i++))
|
|
{
|
|
size_t varCount = xform->TotalVariationCount();
|
|
|
|
if (xform->NeedPrecalcAngles())
|
|
if (!Contains(funcNames, zeps))
|
|
funcNames.push_back(zeps);
|
|
|
|
for (j = 0; j < varCount; j++)
|
|
{
|
|
if (auto var = xform->GetVariation(j))
|
|
{
|
|
auto names = var->OpenCLGlobalFuncNames();
|
|
|
|
if (var->NeedPrecalcAngles())
|
|
if (!Contains(funcNames, zeps))
|
|
funcNames.push_back(zeps);
|
|
|
|
for (auto& name : names)
|
|
if (!Contains(funcNames, name))
|
|
funcNames.push_back(name);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (ember.ProjBits())
|
|
if (!Contains(funcNames, zeps))
|
|
funcNames.push_back(zeps);
|
|
|
|
for (auto& funcName : funcNames)
|
|
if (auto text = m_FunctionMapper.GetGlobalFunc(funcName))
|
|
os << *text << "\n";
|
|
|
|
return os.str();
|
|
}
|
|
/// <summary>
|
|
/// Create an OpenCL string of #defines and a corresponding host side vector for variation weights and parametric variation values.
|
|
/// Parametric variations present a special problem in the iteration code.
|
|
/// The values can't be passed in with the array of other xform values because
|
|
/// the length of the parametric values is unknown.
|
|
/// This is solved by passing a separate buffer of values dedicated specifically
|
|
/// to parametric variations.
|
|
/// In OpenCL, a series of #define constants are declared which specify the indices in
|
|
/// the buffer where the various values are stored.
|
|
/// The possibility of a parametric variation type being present in multiple xforms is taken
|
|
/// into account by appending the xform index to the #define, thus making each unique.
|
|
/// The kernel creator then uses these to retrieve the values in the iteration code.
|
|
/// Example:
|
|
/// Xform1: Curl (curl_c1: 1.1, curl_c2: 2.2)
|
|
/// Xform2: Curl (curl_c1: 4.4, curl_c2: 5.5)
|
|
/// Xform3: Blob (blob_low: 1, blob_high: 2, blob_waves: 3)
|
|
///
|
|
/// Host vector to be passed as arg to the iter kernel call:
|
|
/// [1.1][2.2][4.4][5.5][1][2][3]
|
|
///
|
|
/// #defines in OpenCL to access the buffer:
|
|
///
|
|
/// #define CURL_C1_1 0
|
|
/// #define CURL_C2_1 1
|
|
/// #define CURL_C1_2 2
|
|
/// #define CURL_C2_2 3
|
|
/// #define BLOB_LOW_3 4
|
|
/// #define BLOB_HIGH_3 5
|
|
/// #define BLOB_WAVES_3 6
|
|
///
|
|
/// The variations use these #defines by first looking up the index of the
|
|
/// xform they belong to in the parent ember and generating the OpenCL string based on that
|
|
/// in their overridden OpenCLString() functions.
|
|
/// Note that variation weights are also included in this buffer and are looked up in a similar manner.
|
|
/// Template argument expected to be float or double.
|
|
/// </summary>
|
|
/// <param name="ember">The ember to create the values from</param>
|
|
/// <param name="params">The string,vector pair to store the values in</param>
|
|
/// <param name="doVals">True if the vector should be populated, else false. Default: true.</param>
|
|
/// <param name="doString">True if the string should be populated, else false. Default: true.</param>
|
|
template <typename T>
|
|
void IterOpenCLKernelCreator<T>::ParVarIndexDefines(const Ember<T>& ember, pair<string, vector<T>>& params, bool doVals, bool doString)
|
|
{
|
|
size_t i = 0, size = 0;
|
|
ostringstream os;
|
|
|
|
if (doVals)
|
|
params.second.clear();
|
|
|
|
while (auto xform = ember.GetTotalXform(i))
|
|
{
|
|
size_t j = 0;
|
|
|
|
while (auto var = xform->GetVariation(j))
|
|
{
|
|
if (doString)
|
|
os << "#define WEIGHT_" << i << "_" << j << " " << size++ << "\n";//Uniquely identify the weight of this variation in this xform.
|
|
|
|
if (doVals)
|
|
params.second.push_back(var->m_Weight);
|
|
|
|
if (auto parVar = dynamic_cast<ParametricVariation<T>*>(var))
|
|
{
|
|
for (size_t k = 0; k < parVar->ParamCount(); k++)
|
|
{
|
|
if (!parVar->Params()[k].IsState())
|
|
{
|
|
if (doString)
|
|
os << "#define " << ToUpper(parVar->Params()[k].Name()) << "_" << i << " " << size << "\n";//Uniquely identify this param in this variation in this xform.
|
|
|
|
auto elements = parVar->Params()[k].Size() / sizeof(T);
|
|
|
|
if (doVals)
|
|
{
|
|
for (auto l = 0; l < elements; l++)
|
|
params.second.push_back(*(parVar->Params()[k].Param() + l));
|
|
}
|
|
|
|
size += elements;
|
|
}
|
|
}
|
|
}
|
|
|
|
j++;
|
|
}
|
|
|
|
i++;
|
|
}
|
|
|
|
if (doString)
|
|
{
|
|
os << "\n";
|
|
params.first = os.str();
|
|
}
|
|
}
|
|
/// <summary>
|
|
/// Create an OpenCL string of #defines and a corresponding host side vector for globally shared data.
|
|
/// Certain variations, such as crackle and dc_perlin use static, read-only buffers of data.
|
|
/// These need to be created separate from the buffer of parametric variation values.
|
|
/// </summary>
|
|
/// <param name="ember">The ember to create the values from</param>
|
|
/// <param name="params">The string,vector pair to store the values in</param>
|
|
/// <param name="doVals">True if the vector should be populated, else false. Default: true.</param>
|
|
/// <param name="doString">True if the string should be populated, else false. Default: true.</param>
|
|
template <typename T>
|
|
void IterOpenCLKernelCreator<T>::SharedDataIndexDefines(const Ember<T>& ember, pair<string, vector<T>>& params, bool doVals, bool doString)
|
|
{
|
|
size_t i = 0, j, offset = 0;
|
|
string s;
|
|
vector<string> dataNames;//Can't use a set here because they sort and we must preserve the insertion order due to nested function calls.
|
|
ostringstream os;
|
|
auto varFuncs = VarFuncs<T>::Instance();
|
|
|
|
if (doVals)
|
|
params.second.clear();
|
|
|
|
while (auto xform = ember.GetTotalXform(i++))
|
|
{
|
|
size_t varCount = xform->TotalVariationCount();
|
|
|
|
for (j = 0; j < varCount; j++)
|
|
{
|
|
if (auto var = xform->GetVariation(j))
|
|
{
|
|
auto names = var->OpenCLGlobalDataNames();
|
|
|
|
for (auto& name : names)
|
|
{
|
|
if (!Contains(dataNames, name))
|
|
{
|
|
s = ToUpper(name);
|
|
|
|
if (auto dataInfo = varFuncs->GetSharedData(s))///Will contain a name, pointer to data, and size of the data in units of sizeof(T).
|
|
{
|
|
if (doString)
|
|
os << "#define " << ToUpper(name) << " " << offset << '\n';
|
|
|
|
if (doVals)
|
|
params.second.insert(params.second.end(), dataInfo->first, dataInfo->first + dataInfo->second);
|
|
|
|
dataNames.push_back(name);
|
|
offset += dataInfo->second;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (doString)
|
|
{
|
|
os << "#define TOTAL_GLOBAL_SIZE_END " << offset << "\n\n";
|
|
params.first = os.str();
|
|
}
|
|
}
|
|
/// <summary>
|
|
/// Create the string needed for the struct whose values will change between each iteration.
|
|
/// This is only needed for variations whose state changes.
|
|
/// If none are present, the struct will be empty.
|
|
/// </summary>
|
|
/// <param name="ember">The ember to generate the variation state struct string for</param>
|
|
/// <returns>The variation state struct string</returns>
|
|
template <typename T>
|
|
string IterOpenCLKernelCreator<T>::VariationStateString(const Ember<T>& ember)
|
|
{
|
|
size_t i = 0;
|
|
ostringstream os;
|
|
os << "typedef struct __attribute__ " ALIGN_CL " _VariationState\n{";
|
|
|
|
while (auto xform = ember.GetTotalXform(i++))
|
|
for (size_t j = 0; j < xform->TotalVariationCount(); j++)
|
|
if (auto var = xform->GetVariation(j))
|
|
os << var->StateOpenCLString();
|
|
|
|
os << "\n} VariationState;\n\n";
|
|
return os.str();
|
|
}
|
|
/// <summary>
|
|
/// Create the string needed for the initial state of the struct whose values will change between each iteration.
|
|
/// This is only needed for variations whose state changes.
|
|
/// If none are present, the returned init string will be empty.
|
|
/// This will be called at the beginning of each kernel.
|
|
/// </summary>
|
|
/// <param name="ember">The ember to generate the variation state struct init string for</param>
|
|
/// <returns>The variation state struct init string</returns>
|
|
template <typename T>
|
|
string IterOpenCLKernelCreator<T>::VariationStateInitString(const Ember<T>& ember)
|
|
{
|
|
size_t i = 0;
|
|
ostringstream os;
|
|
|
|
while (auto xform = ember.GetTotalXform(i++))
|
|
for (size_t j = 0; j < xform->TotalVariationCount(); j++)
|
|
if (auto var = xform->GetVariation(j))
|
|
os << var->StateInitOpenCLString();
|
|
|
|
return os.str();
|
|
}
|
|
/// <summary>
|
|
/// Determine whether the two embers passed in differ enough
|
|
/// to require a rebuild of the iteration code.
|
|
/// A rebuild is required if they differ in the following ways:
|
|
/// Xform count
|
|
/// Final xform presence
|
|
/// Xaos presence
|
|
/// Palette accumulation mode
|
|
/// Xform post affine presence
|
|
/// Variation count
|
|
/// Variation type
|
|
/// Template argument expected to be float or double.
|
|
/// </summary>
|
|
/// <param name="ember1">The first ember to compare</param>
|
|
/// <param name="ember2">The second ember to compare</param>
|
|
/// <param name="optAffine">True to optimize with a simple assignment when the pre affine transform is empty, else false. True is better for final renders, false for interactive to reduce repeated compilations.</param>
|
|
/// <returns>True if a rebuild is required, else false</returns>
|
|
template <typename T>
|
|
bool IterOpenCLKernelCreator<T>::IsBuildRequired(const Ember<T>& ember1, const Ember<T>& ember2, bool optAffine)
|
|
{
|
|
size_t i, j, xformCount = ember1.TotalXformCount();
|
|
static bool lastCompat = Compat::m_Compat;
|
|
|
|
if (lastCompat != Compat::m_Compat)
|
|
{
|
|
lastCompat = Compat::m_Compat;
|
|
return true;
|
|
}
|
|
|
|
if (xformCount != ember2.TotalXformCount())
|
|
return true;
|
|
|
|
if (ember1.UseFinalXform() != ember2.UseFinalXform())
|
|
return true;
|
|
|
|
if (ember1.XaosPresent() != ember2.XaosPresent())
|
|
return true;
|
|
|
|
if (ember1.m_PaletteMode != ember2.m_PaletteMode)
|
|
return true;
|
|
|
|
if (ember1.ProjBits() != ember2.ProjBits())
|
|
return true;
|
|
|
|
for (i = 0; i < xformCount; i++)
|
|
{
|
|
auto xform1 = ember1.GetTotalXform(i);
|
|
auto xform2 = ember2.GetTotalXform(i);
|
|
auto varCount = xform1->TotalVariationCount();
|
|
|
|
if (optAffine && (xform1->m_Affine.IsID() != xform2->m_Affine.IsID()))
|
|
return true;
|
|
|
|
if (xform1->HasPost() != xform2->HasPost())
|
|
return true;
|
|
|
|
if (varCount != xform2->TotalVariationCount())
|
|
return true;
|
|
|
|
for (j = 0; j < varCount; j++)
|
|
if (xform1->GetVariation(j)->VariationId() != xform2->GetVariation(j)->VariationId())
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
/// <summary>
|
|
/// Create the zeroize kernel string.
|
|
/// OpenCL comes with no way to zeroize a buffer like memset()
|
|
/// would do on the CPU. So a special kernel must be ran to set a range
|
|
/// of memory addresses to zero.
|
|
/// </summary>
|
|
/// <returns>The kernel string</returns>
|
|
template <typename T>
|
|
string IterOpenCLKernelCreator<T>::CreateZeroizeKernelString() const
|
|
{
|
|
ostringstream os;
|
|
os <<
|
|
ConstantDefinesString(typeid(T) == typeid(double)) <<//Double precision doesn't matter here since it's not used.
|
|
"__kernel void " << m_ZeroizeEntryPoint << "(__global uchar* buffer, uint width, uint height)\n"
|
|
"{\n"
|
|
" if (GLOBAL_ID_X >= width || GLOBAL_ID_Y >= height)\n"
|
|
" return;\n"
|
|
"\n"
|
|
" buffer[(GLOBAL_ID_Y * width) + GLOBAL_ID_X] = 0;\n"//Can't use INDEX_IN_GRID_2D here because the grid might be larger than the buffer to make even dimensions.
|
|
" barrier(CLK_GLOBAL_MEM_FENCE);\n"//Just to be safe.
|
|
"}\n"
|
|
"\n";
|
|
return os.str();
|
|
}
|
|
/// <summary>
|
|
/// Create the histogram summing kernel string.
|
|
/// This is used when running with multiple GPUs. It takes
|
|
/// two histograms present on a single device, source and dest,
|
|
/// and adds the values of source to dest.
|
|
/// It optionally sets all values of source to zero.
|
|
/// </summary>
|
|
/// <returns>The kernel string</returns>
|
|
template <typename T>
|
|
string IterOpenCLKernelCreator<T>::CreateSumHistKernelString() const
|
|
{
|
|
ostringstream os;
|
|
os <<
|
|
ConstantDefinesString(typeid(T) == typeid(double)) <<//Double precision doesn't matter here since it's not used.
|
|
"__kernel void " << m_SumHistEntryPoint << "(__global real4_bucket* source, __global real4_bucket* dest, uint width, uint height, uint clear)\n"
|
|
"{\n"
|
|
" if (GLOBAL_ID_X >= width || GLOBAL_ID_Y >= height)\n"
|
|
" return;\n"
|
|
"\n"
|
|
" dest[(GLOBAL_ID_Y * width) + GLOBAL_ID_X] += source[(GLOBAL_ID_Y * width) + GLOBAL_ID_X];\n"//Can't use INDEX_IN_GRID_2D here because the grid might be larger than the buffer to make even dimensions.
|
|
"\n"
|
|
" if (clear)\n"
|
|
" source[(GLOBAL_ID_Y * width) + GLOBAL_ID_X] = 0;\n"
|
|
"\n"
|
|
" barrier(CLK_GLOBAL_MEM_FENCE);\n"//Just to be safe.
|
|
"}\n"
|
|
"\n";
|
|
return os.str();
|
|
}
|
|
/// <summary>
|
|
/// Create the string for 3D projection based on the 3D values of the ember.
|
|
/// Projection is done on the second point.
|
|
/// If any of these fields toggle between 0 and nonzero between runs, a recompile is triggered.
|
|
/// </summary>
|
|
/// <param name="ember">The ember to create the projection string for</param>
|
|
/// <returns>The kernel string</returns>
|
|
template <typename T>
|
|
string IterOpenCLKernelCreator<T>::CreateProjectionString(const Ember<T>& ember) const
|
|
{
|
|
size_t projBits = ember.ProjBits();
|
|
ostringstream os;
|
|
|
|
if (projBits)
|
|
{
|
|
if (projBits & size_t(eProjBits::PROJBITS_BLUR))
|
|
{
|
|
if (projBits & size_t(eProjBits::PROJBITS_YAW))
|
|
{
|
|
os <<
|
|
" real_t dsin, dcos;\n"
|
|
" real_t t = MwcNext01(&mwc) * M_2PI;\n"
|
|
" real_t z = secondPoint.m_Z - ember->m_CamZPos;\n"
|
|
" real_t x = fma(ember->m_C00, secondPoint.m_X, ember->m_C10 * secondPoint.m_Y);\n"
|
|
" real_t y = fma(ember->m_C01, secondPoint.m_X, fma(ember->m_C11, secondPoint.m_Y, ember->m_C21 * z));\n"
|
|
"\n"
|
|
" z = fma(ember->m_C02, secondPoint.m_X, fma(ember->m_C12, secondPoint.m_Y, ember->m_C22 * z));\n"
|
|
"\n"
|
|
" real_t zr = Zeps(1 - ember->m_CamPerspective * z);\n"
|
|
" real_t prcx = (x - carToRas->m_CarCenterX) / carToRas->m_CarHalfX;\n"
|
|
" real_t prcy = (y - carToRas->m_CarCenterY) / carToRas->m_CarHalfY;\n"
|
|
" real_t dist = sqrt(SQR(prcx) + SQR(prcy)) * (real_t)(10.0);\n"
|
|
" real_t scale = ember->m_BlurCurve ? min((real_t)(1.0), (dist * dist) / (4 * ember->m_BlurCurve)) : (real_t)(1.0);\n"
|
|
" real_t dr = MwcNext01(&mwc) * (ember->m_BlurCoef * scale) * z;\n"
|
|
"\n"
|
|
" dsin = sin(t);\n"
|
|
" dcos = cos(t);\n"
|
|
"\n"
|
|
" secondPoint.m_X = fma(dr, dcos, x) / zr;\n"
|
|
" secondPoint.m_Y = fma(dr, dsin, y) / zr;\n"
|
|
" secondPoint.m_Z -= ember->m_CamZPos;\n";
|
|
}
|
|
else
|
|
{
|
|
os <<
|
|
" real_t y, z, zr;\n"
|
|
" real_t dsin, dcos;\n"
|
|
" real_t t = MwcNext01(&mwc) * M_2PI;\n"
|
|
"\n"
|
|
" z = secondPoint.m_Z - ember->m_CamZPos;\n"
|
|
" y = fma(ember->m_C11, secondPoint.m_Y, ember->m_C21 * z);\n"
|
|
" z = fma(ember->m_C12, secondPoint.m_Y, ember->m_C22 * z);\n"
|
|
" zr = Zeps(1 - ember->m_CamPerspective * z);\n"
|
|
"\n"
|
|
" dsin = sin(t);\n"
|
|
" dcos = cos(t);\n"
|
|
"\n"
|
|
" real_t prcx = (secondPoint.m_X - carToRas->m_CarCenterX) / carToRas->m_CarHalfX;\n"
|
|
" real_t prcy = (y - carToRas->m_CarCenterY) / carToRas->m_CarHalfY;\n"
|
|
" real_t dist = sqrt(SQR(prcx) + SQR(prcy)) * (real_t)(10.0);\n"
|
|
" real_t scale = ember->m_BlurCurve ? min((real_t)(1.0), (dist * dist) / (4 * ember->m_BlurCurve)) : (real_t)(1.0);\n"
|
|
" real_t dr = MwcNext01(&mwc) * (ember->m_BlurCoef * scale) * z;\n"
|
|
"\n"
|
|
" secondPoint.m_X = fma(dr, dcos, secondPoint.m_X) / zr;\n"
|
|
" secondPoint.m_Y = fma(dr, dsin, y) / zr;\n"
|
|
" secondPoint.m_Z -= ember->m_CamZPos;\n";
|
|
}
|
|
}
|
|
else if ((projBits & size_t(eProjBits::PROJBITS_PITCH)) || (projBits & size_t(eProjBits::PROJBITS_YAW)))
|
|
{
|
|
if (projBits & size_t(eProjBits::PROJBITS_YAW))
|
|
{
|
|
os <<
|
|
" real_t z = secondPoint.m_Z - ember->m_CamZPos;\n"
|
|
" real_t x = fma(ember->m_C00, secondPoint.m_X, ember->m_C10 * secondPoint.m_Y);\n"
|
|
" real_t y = fma(ember->m_C01, secondPoint.m_X, fma(ember->m_C11, secondPoint.m_Y, ember->m_C21 * z));\n"
|
|
" real_t zr = Zeps(1 - ember->m_CamPerspective * fma(ember->m_C02, secondPoint.m_X, fma(ember->m_C12, secondPoint.m_Y, ember->m_C22 * z)));\n"
|
|
"\n"
|
|
" secondPoint.m_X = x / zr;\n"
|
|
" secondPoint.m_Y = y / zr;\n"
|
|
" secondPoint.m_Z -= ember->m_CamZPos;\n";
|
|
}
|
|
else
|
|
{
|
|
os <<
|
|
" real_t z = secondPoint.m_Z - ember->m_CamZPos;\n"
|
|
" real_t y = fma(ember->m_C11, secondPoint.m_Y, ember->m_C21 * z);\n"
|
|
" real_t zr = Zeps(1 - ember->m_CamPerspective * fma(ember->m_C12, secondPoint.m_Y, ember->m_C22 * z));\n"
|
|
"\n"
|
|
" secondPoint.m_X /= zr;\n"
|
|
" secondPoint.m_Y = y / zr;\n"
|
|
" secondPoint.m_Z -= ember->m_CamZPos;\n";
|
|
}
|
|
}
|
|
else
|
|
{
|
|
os <<
|
|
" real_t zr = Zeps(1 - ember->m_CamPerspective * (secondPoint.m_Z - ember->m_CamZPos));\n"
|
|
"\n"
|
|
" secondPoint.m_X /= zr;\n"
|
|
" secondPoint.m_Y /= zr;\n"
|
|
" secondPoint.m_Z -= ember->m_CamZPos;\n";
|
|
}
|
|
}
|
|
|
|
return os.str();
|
|
}
|
|
template EMBERCL_API class IterOpenCLKernelCreator<float>;
|
|
#ifdef DO_DOUBLE
|
|
template EMBERCL_API class IterOpenCLKernelCreator<double>;
|
|
#endif
|
|
}
|