#include "EmberCLPch.h" #include "FinalAccumOpenCLKernelCreator.h" namespace EmberCLns { ///

/// Constructor that creates all kernel strings. /// The caller will access these strings through the accessor functions. ///

template FinalAccumOpenCLKernelCreator::FinalAccumOpenCLKernelCreator() { m_GammaCorrectionWithAlphaCalcEntryPoint = "GammaCorrectionWithAlphaCalcKernel"; m_GammaCorrectionWithoutAlphaCalcEntryPoint = "GammaCorrectionWithoutAlphaCalcKernel"; m_GammaCorrectionWithAlphaCalcKernel = CreateGammaCorrectionKernelString(true); m_GammaCorrectionWithoutAlphaCalcKernel = CreateGammaCorrectionKernelString(false); m_FinalAccumEarlyClipEntryPoint = "FinalAccumEarlyClipKernel"; m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint = "FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel"; m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint = "FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel"; m_FinalAccumEarlyClipKernel = CreateFinalAccumKernelString(true, false, false); m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel = CreateFinalAccumKernelString(true, true, true); m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel = CreateFinalAccumKernelString(true, false, true); m_FinalAccumLateClipEntryPoint = "FinalAccumLateClipKernel"; m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint = "FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel"; m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint = "FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel"; m_FinalAccumLateClipKernel = CreateFinalAccumKernelString(false, false, false); m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel = CreateFinalAccumKernelString(false, true, true); m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel = CreateFinalAccumKernelString(false, false, true); } ///

/// Kernel source and entry point properties, getters only. ///

template string FinalAccumOpenCLKernelCreator::GammaCorrectionWithAlphaCalcKernel() { return m_GammaCorrectionWithAlphaCalcKernel; } template string FinalAccumOpenCLKernelCreator::GammaCorrectionWithAlphaCalcEntryPoint() { return m_GammaCorrectionWithAlphaCalcEntryPoint; } template string FinalAccumOpenCLKernelCreator::GammaCorrectionWithoutAlphaCalcKernel() { return m_GammaCorrectionWithoutAlphaCalcKernel; } template string FinalAccumOpenCLKernelCreator::GammaCorrectionWithoutAlphaCalcEntryPoint() { return m_GammaCorrectionWithoutAlphaCalcEntryPoint; } template string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipKernel() { return m_FinalAccumEarlyClipKernel; } template string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipEntryPoint() { return m_FinalAccumEarlyClipEntryPoint; } template string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel() { return m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel; } template string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint() { return m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint; } template string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel() { return m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel; } template string FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint() { return m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint; } template string FinalAccumOpenCLKernelCreator::FinalAccumLateClipKernel() { return m_FinalAccumLateClipKernel; } template string FinalAccumOpenCLKernelCreator::FinalAccumLateClipEntryPoint() { return m_FinalAccumLateClipEntryPoint; } template string FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel() { return m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel; } template string FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint() { return m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint; } template string FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel() { return m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel; } template string FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint() { return m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint; } ///

/// Get the gamma correction entry point. ///

/// The number of channels used, 3 or 4. /// True if channels equals 4 and using transparency, else false. /// The name of the gamma correction entry point kernel function template string FinalAccumOpenCLKernelCreator::GammaCorrectionEntryPoint(size_t channels, bool transparency) { bool alphaCalc = ((channels > 3) && transparency); return alphaCalc ? m_GammaCorrectionWithAlphaCalcEntryPoint : m_GammaCorrectionWithoutAlphaCalcEntryPoint; } ///

/// Get the gamma correction kernel string. ///

/// The number of channels used, 3 or 4. /// True if channels equals 4 and using transparency, else false. /// The gamma correction kernel string template string FinalAccumOpenCLKernelCreator::GammaCorrectionKernel(size_t channels, bool transparency) { bool alphaCalc = ((channels > 3) && transparency); return alphaCalc ? m_GammaCorrectionWithAlphaCalcKernel : m_GammaCorrectionWithoutAlphaCalcKernel; } ///

/// Get the final accumulation entry point. ///

/// True if early clip is desired, else false. /// The number of channels used, 3 or 4. /// True if channels equals 4 and using transparency, else false. /// Storage for the alpha base value used in the kernel. 0 if transparency is true, else 255. /// Storage for the alpha scale value used in the kernel. 255 if transparency is true, else 0. /// The name of the final accumulation entry point kernel function template string FinalAccumOpenCLKernelCreator::FinalAccumEntryPoint(bool earlyClip, size_t channels, bool transparency, T& alphaBase, T& alphaScale) { bool alphaCalc = ((channels > 3) && transparency); bool alphaAccum = channels > 3; if (alphaAccum) { alphaBase = transparency ? 0.0f : 255.0f;//See the table below. alphaScale = transparency ? 255.0f : 0.0f; } if (earlyClip) { if (!alphaCalc && !alphaAccum)//Rgb output, the most common case. return FinalAccumEarlyClipEntryPoint(); else if (alphaCalc && alphaAccum)//Rgba output and Transparency. return FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint(); else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency. return FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint(); else return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense. } else { if (!alphaCalc && !alphaAccum)//Rgb output, the most common case. return FinalAccumLateClipEntryPoint(); else if (alphaCalc && alphaAccum)//Rgba output and Transparency. return FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint(); else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency. return FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint(); else return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense. } } ///

/// Get the final accumulation kernel string. ///

/// True if early clip is desired, else false. /// The number of channels used, 3 or 4. /// True if channels equals 4 and using transparency, else false. /// The final accumulation kernel string template string FinalAccumOpenCLKernelCreator::FinalAccumKernel(bool earlyClip, size_t channels, bool transparency) { bool alphaCalc = (channels > 3 && transparency); bool alphaAccum = channels > 3; if (earlyClip) { if (!alphaCalc && !alphaAccum)//Rgb output, the most common case. return FinalAccumEarlyClipKernel(); else if (alphaCalc && alphaAccum)//Rgba output and Transparency. return FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumKernel(); else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency. return FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel(); else return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense. } else { if (!alphaCalc && !alphaAccum)//Rgb output, the most common case. return FinalAccumLateClipKernel(); else if (alphaCalc && alphaAccum)//Rgba output and Transparency. return FinalAccumLateClipWithAlphaCalcWithAlphaAccumKernel(); else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency. return FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel(); else return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense. } } ///

/// Wrapper around CreateFinalAccumKernelString(). ///

/// Create the final accumulation kernel string ///

/// True if early clip is desired, else false. /// True if channels equals 4 and transparency is desired, else false. /// True if channels equals 4 /// The final accumulation kernel string template string FinalAccumOpenCLKernelCreator::CreateFinalAccumKernelString(bool earlyClip, bool alphaCalc, bool alphaAccum) { ostringstream os; string channels = alphaAccum ? "4" : "3"; os << ConstantDefinesString(typeid(T) == typeid(double)) << ClampRealFunctionString << UnionCLStructString << RgbToHsvFunctionString << HsvToRgbFunctionString << CalcAlphaFunctionString << CurveAdjustFunctionString << SpatialFilterCLStructString; if (earlyClip) { if (!alphaCalc && !alphaAccum)//Rgb output, the most common case. os << "__kernel void " << m_FinalAccumEarlyClipEntryPoint << "(\n"; else if (alphaCalc && alphaAccum)//Rgba output and Transparency. os << "__kernel void " << m_FinalAccumEarlyClipWithAlphaCalcWithAlphaAccumEntryPoint << "(\n"; else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency. os << "__kernel void " << m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint << "(\n"; else return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense. } else { os << CreateCalcNewRgbFunctionString(false) << CreateGammaCorrectionFunctionString(false, alphaCalc, alphaAccum, true); if (!alphaCalc && !alphaAccum)//Rgb output, the most common case. os << "__kernel void " << m_FinalAccumLateClipEntryPoint << "(\n"; else if (alphaCalc && alphaAccum)//Rgba output and Transparency. os << "__kernel void " << m_FinalAccumLateClipWithAlphaCalcWithAlphaAccumEntryPoint << "(\n"; else if (!alphaCalc && alphaAccum)//Rgba output and !Transparency. os << "__kernel void " << m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint << "(\n"; else return "";//Cannot have alphaCalc and !alphaAccum, it makes no sense. } os << " const __global real4reals* accumulator,\n" " __write_only image2d_t pixels,\n" " __constant SpatialFilterCL* spatialFilter,\n" " __constant real_t* filterCoefs,\n" " __constant real4reals* csa,\n" " const uint doCurves,\n" " const real_t alphaBase,\n" " const real_t alphaScale\n" "\t)\n" "{\n" "\n" " if ((GLOBAL_ID_Y >= spatialFilter->m_FinalRasH) || (GLOBAL_ID_X >= spatialFilter->m_FinalRasW))\n" " return;\n" "\n" " uint accumX = spatialFilter->m_DensityFilterOffset + (GLOBAL_ID_X * spatialFilter->m_Supersample);\n" " uint accumY = spatialFilter->m_DensityFilterOffset + (GLOBAL_ID_Y * spatialFilter->m_Supersample);\n" " int2 finalCoord;\n" " finalCoord.x = GLOBAL_ID_X;\n" " finalCoord.y = (int)((spatialFilter->m_YAxisUp == 1) ? ((spatialFilter->m_FinalRasH - GLOBAL_ID_Y) - 1) : GLOBAL_ID_Y);\n" " float4floats finalColor;\n" " int ii, jj;\n" " uint filterKRowIndex;\n" " const __global real4reals* accumBucket;\n" " real4reals newBucket;\n" " newBucket.m_Real4 = 0;\n" "\n" " for (jj = 0; jj < spatialFilter->m_FilterWidth; jj++)\n" " {\n" " filterKRowIndex = jj * spatialFilter->m_FilterWidth;\n" "\n" " for (ii = 0; ii < spatialFilter->m_FilterWidth; ii++)\n" " {\n" " real_t k = filterCoefs[ii + filterKRowIndex];\n" "\n" " accumBucket = accumulator + (accumX + ii) + ((accumY + jj) * spatialFilter->m_SuperRasW);\n" " newBucket.m_Real4 += (k * accumBucket->m_Real4);\n" " }\n" " }\n" "\n"; //Not supporting 2 bytes per channel on the GPU. If the user wants it, run on the CPU. if (earlyClip)//If early clip, simply assign values directly to the temp float4 since they've been gamma corrected already, then write it straight to the output image below. { os << " finalColor.m_Float4.x = (float)newBucket.m_Real4.x;\n"//CPU side clamps, skip here because write_imagef() does the clamping for us. " finalColor.m_Float4.y = (float)newBucket.m_Real4.y;\n" " finalColor.m_Float4.z = (float)newBucket.m_Real4.z;\n"; if (alphaAccum) { if (alphaCalc) os << " finalColor.m_Float4.w = (float)newBucket.m_Real4.w * 255.0f;\n"; else os << " finalColor.m_Float4.w = 255;\n"; } } else { //Late clip, so must gamma correct from the temp new bucket to temp float4. if (typeid(T) == typeid(double)) { os << " real4reals realFinal;\n" "\n" " GammaCorrectionFloats(&newBucket, &(spatialFilter->m_Background[0]), spatialFilter->m_Gamma, spatialFilter->m_LinRange, spatialFilter->m_Vibrancy, spatialFilter->m_HighlightPower, alphaBase, alphaScale, &(realFinal.m_Reals[0]));\n" " finalColor.m_Float4.x = (float)realFinal.m_Real4.x;\n" " finalColor.m_Float4.y = (float)realFinal.m_Real4.y;\n" " finalColor.m_Float4.z = (float)realFinal.m_Real4.z;\n" " finalColor.m_Float4.w = (float)realFinal.m_Real4.w;\n" ; } else { os << " GammaCorrectionFloats(&newBucket, &(spatialFilter->m_Background[0]), spatialFilter->m_Gamma, spatialFilter->m_LinRange, spatialFilter->m_Vibrancy, spatialFilter->m_HighlightPower, alphaBase, alphaScale, &(finalColor.m_Floats[0]));\n"; } } os << "\n" " if (doCurves)\n" " {\n" " CurveAdjust(csa, &(finalColor.m_Floats[0]), 1);\n" " CurveAdjust(csa, &(finalColor.m_Floats[1]), 2);\n" " CurveAdjust(csa, &(finalColor.m_Floats[2]), 3);\n" " }\n" "\n" " finalColor.m_Float4 /= 255.0f;\n" " write_imagef(pixels, finalCoord, finalColor.m_Float4);\n"//Use write_imagef instead of write_imageui because only the former works when sharing with an OpenGL texture. " barrier(CLK_GLOBAL_MEM_FENCE);\n"//Required, or else page tearing will occur during interactive rendering. "}\n" ; return os.str(); } ///

/// Creates the gamma correction function string. /// This is not a full kernel, just a function that is used in the kernels. ///

/// True if writing to a global buffer (early clip), else false (late clip). /// True if channels equals 4 and transparency is desired, else false. /// True if channels equals 4 /// True if writing to global buffer (late clip), else false (early clip). /// The gamma correction function string template string FinalAccumOpenCLKernelCreator::CreateGammaCorrectionFunctionString(bool globalBucket, bool alphaCalc, bool alphaAccum, bool finalOut) { ostringstream os; string dataType; string unionMember; dataType = "real_t"; //Use real_t for all cases, early clip and final accum. os << "void GammaCorrectionFloats(" << (globalBucket ? "__global " : "") << "real4reals* bucket, __constant real_t* background, real_t g, real_t linRange, real_t vibrancy, real_t highlightPower, real_t alphaBase, real_t alphaScale, " << (finalOut ? "" : "__global") << " real_t* correctedChannels)\n"; os << "{\n" << " real_t alpha, ls, tmp, a;\n" << " real4reals newRgb;\n" << "\n" << " if (bucket->m_Reals[3] <= 0)\n" << " {\n" << " alpha = 0;\n" << " ls = 0;\n" << " }\n" << " else\n" << " {\n" << " tmp = bucket->m_Reals[3];\n" << " alpha = CalcAlpha(tmp, g, linRange);\n" << " ls = vibrancy * 256.0 * alpha / tmp;\n" << " ClampRef(&alpha, 0.0, 1.0);\n" << " }\n" << "\n" << " CalcNewRgb(bucket, ls, highlightPower, &newRgb);\n" << "\n" << " for (uint rgbi = 0; rgbi < 3; rgbi++)\n" << " {\n" << " a = newRgb.m_Reals[rgbi] + ((1.0 - vibrancy) * 256.0 * pow(bucket->m_Reals[rgbi], g));\n" << "\n"; if (!alphaCalc) { os << " a += ((1.0 - alpha) * background[rgbi]);\n"; } else { os << " if (alpha > 0)\n" << " a /= alpha;\n" << " else\n" << " a = 0;\n"; } os << "\n" " correctedChannels[rgbi] = (" << dataType << ")clamp(a, 0.0, 255.0);\n" " }\n" "\n"; //The CPU code has 3 cases for assigning alpha: //[3] = alpha.//Early clip. //[3] = alpha * 255.//Final Rgba with transparency. //[3] = 255.//Final Rgba without transparency. //Putting conditionals in GPU code is to be avoided. So do base + alpha * scale which will //work for all 3 cases without using a conditional, which should be faster on a GPU. This gives: //Base = 0, scale = 1. [3] = (0 + (alpha * 1)). [3] = alpha. //Base = 0, scale = 255. [3] = (0 + (alpha * 255)). [3] = alpha * 255. //Base = 255, scale = 0. [3] = (255 + (alpha * 0)). [3] = 255. if (alphaAccum) { os << " correctedChannels[3] = (" << dataType << ")(alphaBase + (alpha * alphaScale));\n"; } os << "}\n" "\n"; return os.str(); } ///

/// OpenCL equivalent of Palette::CalcNewRgb(). ///

/// True if writing the corrected value to a global buffer (early clip), else false (late clip). /// The CalcNewRgb function string template string FinalAccumOpenCLKernelCreator::CreateCalcNewRgbFunctionString(bool globalBucket) { ostringstream os; os << "static void CalcNewRgb(" << (globalBucket ? "__global " : "") << "real4reals* oldRgb, real_t ls, real_t highPow, real4reals* newRgb)\n" "{\n" " int rgbi;\n" " real_t newls, lsratio;\n" " real4reals newHsv;\n" " real_t maxa, maxc;\n" " real_t adjhlp;\n" "\n" " if (ls == 0 || (oldRgb->m_Real4.x == 0 && oldRgb->m_Real4.y == 0 && oldRgb->m_Real4.z == 0))\n"//Can't do a vector compare to zero. " {\n" " newRgb->m_Real4 = 0;\n" " return;\n" " }\n" "\n" //Identify the most saturated channel. " maxc = max(max(oldRgb->m_Reals[0], oldRgb->m_Reals[1]), oldRgb->m_Reals[2]);\n" " maxa = ls * maxc;\n" "\n" //If a channel is saturated and highlight power is non-negative //modify the color to prevent hue shift. " if (maxa > 255 && highPow >= 0)\n" " {\n" " newls = 255.0 / maxc;\n" " lsratio = pow(newls / ls, highPow);\n" "\n" //Calculate the max-value color (ranged 0 - 1). " for (rgbi = 0; rgbi < 3; rgbi++)\n" " newRgb->m_Reals[rgbi] = newls * oldRgb->m_Reals[rgbi] / 255.0;\n" "\n" //Reduce saturation by the lsratio. " RgbToHsv(&(newRgb->m_Real4), &(newHsv.m_Real4));\n" " newHsv.m_Real4.y *= lsratio;\n" " HsvToRgb(&(newHsv.m_Real4), &(newRgb->m_Real4));\n" "\n" " for (rgbi = 0; rgbi < 3; rgbi++)\n"//Unrolling and vectorizing makes no difference. " newRgb->m_Reals[rgbi] *= 255.0;\n" " }\n" " else\n" " {\n" " newls = 255.0 / maxc;\n" " adjhlp = -highPow;\n" "\n" " if (adjhlp > 1)\n" " adjhlp = 1;\n" "\n" " if (maxa <= 255)\n" " adjhlp = 1;\n" "\n" //Calculate the max-value color (ranged 0 - 1) interpolated with the old behavior. " for (rgbi = 0; rgbi < 3; rgbi++)\n"//Unrolling, caching and vectorizing makes no difference. " newRgb->m_Reals[rgbi] = ((1.0 - adjhlp) * newls + adjhlp * ls) * oldRgb->m_Reals[rgbi];\n" " }\n" "}\n" "\n"; return os.str(); } ///

/// Create the gamma correction kernel string used for early clipping. ///

/// True if channels equals 4 and transparency is desired, else false. /// The gamma correction kernel string used for early clipping template string FinalAccumOpenCLKernelCreator::CreateGammaCorrectionKernelString(bool alphaCalc) { ostringstream os; string dataType; os << ConstantDefinesString(typeid(T) == typeid(double)) << ClampRealFunctionString << UnionCLStructString << RgbToHsvFunctionString << HsvToRgbFunctionString << CalcAlphaFunctionString << CreateCalcNewRgbFunctionString(true) << SpatialFilterCLStructString << CreateGammaCorrectionFunctionString(true, alphaCalc, true, false);//Will only be used with float in this case, early clip. Will always alpha accum. os << "__kernel void " << (alphaCalc ? m_GammaCorrectionWithAlphaCalcEntryPoint : m_GammaCorrectionWithoutAlphaCalcEntryPoint) << "(\n" << " __global real4reals* accumulator,\n" " __constant SpatialFilterCL* spatialFilter\n" ")\n" "{\n" " int testGutter = 0;\n" "\n" " if (GLOBAL_ID_Y >= (spatialFilter->m_SuperRasH - testGutter) || GLOBAL_ID_X >= (spatialFilter->m_SuperRasW - testGutter))\n" " return;\n" "\n" " uint superIndex = (GLOBAL_ID_Y * spatialFilter->m_SuperRasW) + GLOBAL_ID_X;\n" " __global real4reals* bucket = accumulator + superIndex;\n" //Pass in an alphaBase and alphaScale of 0, 1 which means to just directly assign the computed alpha value. " GammaCorrectionFloats(bucket, &(spatialFilter->m_Background[0]), spatialFilter->m_Gamma, spatialFilter->m_LinRange, spatialFilter->m_Vibrancy, spatialFilter->m_HighlightPower, 0.0, 1.0, &(bucket->m_Reals[0]));\n" "}\n" ; return os.str(); } template EMBERCL_API class FinalAccumOpenCLKernelCreator; #ifdef DO_DOUBLE template EMBERCL_API class FinalAccumOpenCLKernelCreator; #endif }