2014-07-08 03:11:14 -04:00
# include "EmberCLPch.h"
# include "DEOpenCLKernelCreator.h"
2014-12-05 21:30:46 -05:00
namespace EmberCLns
2014-07-08 03:11:14 -04:00
{
/// <summary>
2015-08-10 23:10:23 -04:00
/// Constructor that sets all kernel entry points as well as composes
2014-07-08 03:11:14 -04:00
/// all kernel source strings.
2015-08-10 23:10:23 -04:00
/// Note that no versions of kernels that use the cache are compiled because
/// the cache is not big enough to hold double4.
2014-07-08 03:11:14 -04:00
/// No program compilation is done here, the user must explicitly do it.
/// The caller must specify whether they are using an nVidia or AMD card because it changes
/// the amount of local memory available.
/// </summary>
2015-08-10 23:10:23 -04:00
/// <param name="doublePrecision">True if double precision, else false for float.</param>
2014-07-08 03:11:14 -04:00
/// <param name="nVidia">True if running on an nVidia card, else false.</param>
2015-08-10 23:10:23 -04:00
DEOpenCLKernelCreator : : DEOpenCLKernelCreator ( bool doublePrecision , bool nVidia )
2014-07-08 03:11:14 -04:00
{
2015-08-10 23:10:23 -04:00
m_DoublePrecision = doublePrecision ;
2014-07-08 03:11:14 -04:00
m_NVidia = nVidia ;
2015-08-10 23:10:23 -04:00
2014-11-28 04:37:51 -05:00
# ifdef ROW_ONLY_DE
m_LogScaleAssignDEEntryPoint = " LogScaleAssignDensityFilterKernel " ;
m_GaussianDEWithoutSsEntryPoint = " GaussianDEWithoutSsKernel " ;
m_GaussianDESsWithScfEntryPoint = " GaussianDESsWithScfKernel " ;
m_GaussianDESsWithoutScfEntryPoint = " GaussianDESsWithoutScfKernel " ;
m_GaussianDEWithoutSsNoCacheEntryPoint = " GaussianDEWithoutSsNoCacheKernel " ;
m_GaussianDESsWithScfNoCacheEntryPoint = " GaussianDESsWithScfNoCacheKernel " ;
m_GaussianDESsWithoutScfNoCacheEntryPoint = " GaussianDESsWithoutScfNoCacheKernel " ;
m_LogScaleAssignDEKernel = CreateLogScaleAssignDEKernelString ( ) ;
m_GaussianDEWithoutSsKernel = CreateGaussianDEKernel ( 1 ) ;
m_GaussianDESsWithScfKernel = CreateGaussianDEKernel ( 2 ) ;
m_GaussianDESsWithoutScfKernel = CreateGaussianDEKernel ( 3 ) ;
m_GaussianDEWithoutSsNoCacheKernel = CreateGaussianDEKernelNoLocalCache ( 1 ) ;
m_GaussianDESsWithScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache ( 2 ) ;
m_GaussianDESsWithoutScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache ( 3 ) ;
# else
2015-08-10 23:10:23 -04:00
m_LogScaleAssignDEEntryPoint = " LogScaleAssignDensityFilterKernel " ;
m_GaussianDEWithoutSsEntryPoint = " GaussianDEWithoutSsKernel " ;
m_GaussianDESsWithScfEntryPoint = " GaussianDESsWithScfKernel " ;
m_GaussianDESsWithoutScfEntryPoint = " GaussianDESsWithoutScfKernel " ;
m_GaussianDEWithoutSsNoCacheEntryPoint = " GaussianDEWithoutSsNoCacheKernel " ;
m_GaussianDESsWithScfNoCacheEntryPoint = " GaussianDESsWithScfNoCacheKernel " ;
2014-07-08 03:11:14 -04:00
m_GaussianDESsWithoutScfNoCacheEntryPoint = " GaussianDESsWithoutScfNoCacheKernel " ;
2015-08-10 23:10:23 -04:00
m_LogScaleAssignDEKernel = CreateLogScaleAssignDEKernelString ( ) ;
m_GaussianDEWithoutSsKernel = CreateGaussianDEKernel ( 1 ) ;
m_GaussianDESsWithScfKernel = CreateGaussianDEKernel ( 2 ) ;
m_GaussianDESsWithoutScfKernel = CreateGaussianDEKernel ( 3 ) ;
m_GaussianDEWithoutSsNoCacheKernel = CreateGaussianDEKernelNoLocalCache ( 1 ) ;
m_GaussianDESsWithScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache ( 2 ) ;
m_GaussianDESsWithoutScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache ( 3 ) ;
# endif
2014-07-08 03:11:14 -04:00
}
/// <summary>
/// Kernel source and entry point properties, getters only.
/// </summary>
2015-08-12 21:51:07 -04:00
const string & DEOpenCLKernelCreator : : LogScaleAssignDEKernel ( ) const { return m_LogScaleAssignDEKernel ; }
const string & DEOpenCLKernelCreator : : LogScaleAssignDEEntryPoint ( ) const { return m_LogScaleAssignDEEntryPoint ; }
2014-07-08 03:11:14 -04:00
/// <summary>
/// Get the kernel source for the specified supersample and filterWidth.
/// </summary>
/// <param name="ss">The supersample being used</param>
/// <param name="filterWidth">Filter width</param>
/// <returns>The kernel source</returns>
2015-08-12 21:51:07 -04:00
const string & DEOpenCLKernelCreator : : GaussianDEKernel ( size_t ss , uint filterWidth ) const
2014-07-08 03:11:14 -04:00
{
2014-11-28 04:37:51 -05:00
# ifndef ROW_ONLY_DE
2015-08-10 23:10:23 -04:00
if ( filterWidth > MaxDEFilterSize ( ) )
2014-07-08 03:11:14 -04:00
{
if ( ss > 1 )
{
if ( ! ( ss & 1 ) )
return m_GaussianDESsWithScfNoCacheKernel ;
else
return m_GaussianDESsWithoutScfNoCacheKernel ;
}
else
return m_GaussianDEWithoutSsNoCacheKernel ;
}
else
2014-11-28 04:37:51 -05:00
# endif
2014-07-08 03:11:14 -04:00
{
if ( ss > 1 )
{
if ( ! ( ss & 1 ) )
return m_GaussianDESsWithScfKernel ;
else
return m_GaussianDESsWithoutScfKernel ;
}
else
return m_GaussianDEWithoutSsKernel ;
}
}
/// <summary>
/// Get the kernel entry point for the specified supersample and filterWidth.
/// </summary>
/// <param name="ss">The supersample being used</param>
/// <param name="filterWidth">Filter width</param>
/// <returns>The name of the density estimation filtering entry point kernel function</returns>
2015-08-12 21:51:07 -04:00
const string & DEOpenCLKernelCreator : : GaussianDEEntryPoint ( size_t ss , uint filterWidth ) const
2014-07-08 03:11:14 -04:00
{
2014-11-28 04:37:51 -05:00
# ifndef ROW_ONLY_DE
2015-08-10 23:10:23 -04:00
if ( filterWidth > MaxDEFilterSize ( ) )
2014-07-08 03:11:14 -04:00
{
if ( ss > 1 )
{
if ( ! ( ss & 1 ) )
return m_GaussianDESsWithScfNoCacheEntryPoint ;
else
return m_GaussianDESsWithoutScfNoCacheEntryPoint ;
}
else
return m_GaussianDEWithoutSsNoCacheEntryPoint ;
}
else
2014-11-28 04:37:51 -05:00
# endif
2014-07-08 03:11:14 -04:00
{
if ( ss > 1 )
{
if ( ! ( ss & 1 ) )
return m_GaussianDESsWithScfEntryPoint ;
else
return m_GaussianDESsWithoutScfEntryPoint ;
}
else
return m_GaussianDEWithoutSsEntryPoint ;
}
}
/// <summary>
/// Get the maximum filter size allowed for running the local memory version of density filtering
/// Filters larger than this value will run the version without local memory caching.
/// </summary>
/// <returns>The maximum filter size allowed for running the local memory version of density filtering</returns>
2015-08-10 23:10:23 -04:00
uint DEOpenCLKernelCreator : : MaxDEFilterSize ( ) { return 9 ; } //The true max would be (maxBoxSize - 1) / 2, but that's impractical because it can give us a tiny block size.
2014-07-08 03:11:14 -04:00
/// <summary>
/// Solve for the maximum filter radius.
2014-12-06 00:05:09 -05:00
/// The final filter width is calculated by: (uint)(ceil(m_MaxRad) * (T)m_Supersample) + (m_Supersample - 1);
2014-07-08 03:11:14 -04:00
/// Must solve for what max rad should be in order to give a maximum final width of (maxBoxSize - 1) / 2, assuming
/// a minimum block size of 1 which processes 1 pixel.
/// Example: If a box size of 20 was allowed, a filter
/// size of up to 9: (20 - 1) / 2 == (19 / 2) == 9 could be supported.
/// This function is deprecated, the appropriate kernels take care of this problem now.
/// </summary>
/// <param name="maxBoxSize">Maximum size of the box.</param>
/// <param name="desiredFilterSize">Size of the desired filter.</param>
/// <param name="ss">The supersample being used</param>
/// <returns>The maximum filter radius allowed</returns>
2015-08-10 23:10:23 -04:00
double DEOpenCLKernelCreator : : SolveMaxDERad ( uint maxBoxSize , double desiredFilterSize , double ss )
2014-07-08 03:11:14 -04:00
{
2014-12-07 02:51:44 -05:00
uint finalFilterSize = uint ( ( ceil ( desiredFilterSize ) * ss ) + ( ss - 1.0 ) ) ;
2014-07-08 03:11:14 -04:00
//Return the desired size if the final size of it will fit.
if ( finalFilterSize < = MaxDEFilterSize ( ) )
return desiredFilterSize ;
//The final size doesn't fit, so scale the original down until it fits.
2015-08-10 23:10:23 -04:00
return floor ( ( MaxDEFilterSize ( ) - ( ss - 1.0 ) ) / ss ) ;
2014-07-08 03:11:14 -04:00
}
/// <summary>
/// Determine the maximum filter box size based on the amount of local memory available
/// to each block.
/// </summary>
/// <param name="localMem">The local memory available to a block</param>
/// <returns>The maximum filter box size allowed</returns>
2015-08-10 23:10:23 -04:00
uint DEOpenCLKernelCreator : : SolveMaxBoxSize ( uint localMem )
2014-07-08 03:11:14 -04:00
{
2015-08-10 23:10:23 -04:00
return uint ( floor ( std : : sqrt ( floor ( localMem / 16.0 ) ) ) ) ; //Divide by 16 because each element is float4.
2014-07-08 03:11:14 -04:00
}
/// <summary>
2014-11-28 04:37:51 -05:00
/// Create the log scale kernel string, using assignment.
/// Use this when Passes == 1.
2014-07-08 03:11:14 -04:00
/// </summary>
/// <returns>The kernel string</returns>
2015-08-10 23:10:23 -04:00
string DEOpenCLKernelCreator : : CreateLogScaleAssignDEKernelString ( )
2014-07-08 03:11:14 -04:00
{
ostringstream os ;
2014-12-05 21:30:46 -05:00
2014-07-08 03:11:14 -04:00
os < <
2015-08-10 23:10:23 -04:00
ConstantDefinesString ( m_DoublePrecision ) < <
2014-07-08 03:11:14 -04:00
DensityFilterCLStructString < <
2014-11-28 04:37:51 -05:00
" __kernel void " < < m_LogScaleAssignDEEntryPoint < < " ( \n "
2015-08-10 23:10:23 -04:00
" const __global real4_bucket* histogram, \n "
" __global real4_bucket* accumulator, \n "
2014-07-08 03:11:14 -04:00
" __constant DensityFilterCL* logFilter \n "
" \t ) \n "
" { \n "
" if ((GLOBAL_ID_X < logFilter->m_SuperRasW) && (GLOBAL_ID_Y < logFilter->m_SuperRasH)) \n "
" { \n "
" uint index = (GLOBAL_ID_Y * logFilter->m_SuperRasW) + GLOBAL_ID_X; \n "
" \n "
" if (histogram[index].w != 0) \n "
" { \n "
2015-08-10 23:10:23 -04:00
" real_bucket_t logScale = (logFilter->m_K1 * log(1.0 + histogram[index].w * logFilter->m_K2)) / histogram[index].w; \n "
2014-07-08 03:11:14 -04:00
" \n "
2014-11-28 04:37:51 -05:00
" accumulator[index] = histogram[index] * logScale; \n " //Using a single real4 vector operation doubles the speed from doing each component individually.
2014-07-08 03:11:14 -04:00
" } \n "
" \n "
" barrier(CLK_GLOBAL_MEM_FENCE); \n " //Just to be safe. Makes no speed difference to do all of the time or only when there's a hit.
" } \n "
" } \n " ;
2014-12-05 21:30:46 -05:00
2014-07-08 03:11:14 -04:00
return os . str ( ) ;
}
2014-11-28 04:37:51 -05:00
# ifdef ROW_ONLY_DE
2015-08-10 23:10:23 -04:00
string DEOpenCLKernelCreator : : CreateGaussianDEKernel ( size_t ss )
2014-07-08 03:11:14 -04:00
{
2014-11-28 04:37:51 -05:00
bool doSS = ss > 1 ;
bool doScf = ! ( ss & 1 ) ;
2014-07-08 03:11:14 -04:00
ostringstream os ;
2014-11-28 04:37:51 -05:00
os < <
2015-08-10 23:10:23 -04:00
ConstantDefinesString ( m_DoublePrecision ) < <
2014-07-08 03:11:14 -04:00
DensityFilterCLStructString < <
2014-11-28 04:37:51 -05:00
UnionCLStructString < <
" __kernel void " < < GaussianDEEntryPoint ( ss , MaxDEFilterSize ( ) ) < < " ( \n " < <
2015-08-10 23:10:23 -04:00
" const __global real4_bucket* histogram, \n "
" __global real4reals_bucket* accumulator, \n "
2014-11-28 04:37:51 -05:00
" __constant DensityFilterCL* densityFilter, \n "
2015-08-10 23:10:23 -04:00
" const __global real_bucket_t* filterCoefs, \n "
" const __global real_bucket_t* filterWidths, \n "
2014-11-28 04:37:51 -05:00
" const __global uint* coefIndices, \n "
" const uint chunkSizeW, \n "
" const uint chunkSizeH, \n "
" const uint chunkW, \n "
" const uint chunkH \n "
2014-07-08 03:11:14 -04:00
" \t ) \n "
" { \n "
2014-11-28 04:37:51 -05:00
" uint rowsToProcess = 32; \n " //Rows to process.
" \n "
" if (((((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) || \n "
" ((((BLOCK_ID_Y * chunkSizeH) + chunkH) * rowsToProcess) + THREAD_ID_Y >= densityFilter->m_SuperRasH)) \n "
" return; \n "
" \n " ;
if ( doSS )
{
os < <
2015-08-10 23:10:23 -04:00
" uint ss = (uint)floor((real_bucket_t)densityFilter->m_Supersample / 2.0); \n "
2014-11-28 04:37:51 -05:00
" int densityBoxLeftX; \n "
" int densityBoxRightX; \n "
" int densityBoxTopY; \n "
" int densityBoxBottomY; \n "
" \n " ;
if ( doScf )
os < <
2015-08-10 23:10:23 -04:00
" real_bucket_t scfact = pow(densityFilter->m_Supersample / (densityFilter->m_Supersample + (real_bucket_t)1.0), (real_bucket_t)2.0); \n " ;
2014-11-28 04:37:51 -05:00
}
os < <
" uint fullTempBoxWidth; \n "
" uint leftBound, rightBound, topBound, botBound; \n "
" uint blockHistStartRow, blockHistEndRow, histCol; \n "
" uint blockHistStartCol, boxReadStartCol, boxReadEndCol; \n "
" uint accumWriteStartCol, colsToWrite, colOffset, colsToWriteOffset; \n "
" int histRow, filterRow, accumWriteOffset; \n "
" \n "
" fullTempBoxWidth = BLOCK_SIZE_X + (densityFilter->m_FilterWidth * 2); \n "
//Compute the bounds of the area to be sampled, which is just the ends minus the super sample minus 1.
" leftBound = densityFilter->m_Supersample - 1; \n "
" rightBound = densityFilter->m_SuperRasW - (densityFilter->m_Supersample - 1); \n "
" topBound = densityFilter->m_Supersample - 1; \n "
" botBound = densityFilter->m_SuperRasH - (densityFilter->m_Supersample - 1); \n "
" \n "
//Start and end values are the indices in the histogram read from
//and written to in the accumulator. They are not the indices for the local block of data.
//Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds.
" blockHistStartRow = min(botBound, topBound + (((BLOCK_ID_Y * chunkSizeH) + chunkH) * rowsToProcess)); \n " //The first histogram row this block will process.
" blockHistEndRow = min(botBound, blockHistStartRow + rowsToProcess); \n " //The last histogram row this block will process, clamped to the last row.
" blockHistStartCol = min(rightBound, leftBound + (((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X)); \n " //The first histogram column this block will process.
" boxReadStartCol = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartCol); \n " //The first box col this block will read from when copying to the accumulator.
" boxReadEndCol = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + BLOCK_SIZE_X, densityFilter->m_SuperRasW - blockHistStartCol); \n " //The last box col this block will read from when copying to the accumulator.
" \n "
//Last, the indices in the global accumulator that the local bounds will be writing to.
" accumWriteStartCol = blockHistStartCol - min(densityFilter->m_FilterWidth, blockHistStartCol); \n " //The first column in the accumulator this block will write to.
2015-08-10 23:10:23 -04:00
" colsToWrite = ceil((real_bucket_t)(boxReadEndCol - boxReadStartCol) / (real_bucket_t)BLOCK_SIZE_X); \n " //Elements per thread to be written to the accumulator.
2014-11-28 04:37:51 -05:00
" histCol = blockHistStartCol + THREAD_ID_X; \n " //The histogram column this individual thread will be reading from.
" \n "
" if (histCol >= rightBound) \n "
" return; \n "
" \n "
//Compute the col position in this local box to serve as the center position
//from which filter application offsets are computed.
//These are the local indices for the local data that are temporarily accumulated to before
//writing out to the global accumulator.
" uint boxCol = densityFilter->m_FilterWidth + THREAD_ID_X; \n "
2015-08-10 23:10:23 -04:00
" uint colsToZeroOffset, colsToZero = ceil((real_bucket_t)fullTempBoxWidth / (real_bucket_t)(BLOCK_SIZE_X)); \n " //Usually is 2.
2014-11-28 04:37:51 -05:00
" int i, j, k, jmin, jmax; \n "
" uint filterSelectInt, filterCoefIndex; \n "
2015-08-10 23:10:23 -04:00
" real_bucket_t cacheLog; \n "
" real_bucket_t filterSelect; \n "
" real4_bucket bucket; \n "
2014-11-28 04:37:51 -05:00
;
2015-08-10 23:10:23 -04:00
os < < " __local real4reals_bucket filterBox[192]; \n " ; //Must be >= fullTempBoxWidth.
2014-11-28 04:37:51 -05:00
os < <
" \n "
" colsToZeroOffset = colsToZero * THREAD_ID_X; \n "
" colsToWriteOffset = colsToWrite * THREAD_ID_X; \n "
" k = (int)densityFilter->m_FilterWidth; \n " //Need a signed int to use below, really is filter width, but reusing a variable to save space.
" \n "
" for (histRow = blockHistStartRow; histRow < blockHistEndRow; histRow++) \n " //Process pixels by row, for 32 rows.
2014-07-08 03:11:14 -04:00
" { \n "
2014-11-28 04:37:51 -05:00
" bucket = histogram[(histRow * densityFilter->m_SuperRasW) + histCol]; \n "
2014-07-08 03:11:14 -04:00
" \n "
2014-11-28 04:37:51 -05:00
" if (bucket.w != 0) \n "
" cacheLog = (densityFilter->m_K1 * log(1.0 + bucket.w * densityFilter->m_K2)) / bucket.w; \n "
" \n " ;
2014-12-05 21:30:46 -05:00
2014-11-28 04:37:51 -05:00
if ( doSS )
{
os < <
" filterSelect = 0; \n "
" densityBoxLeftX = histCol - min(histCol, ss); \n "
" densityBoxRightX = histCol + min(ss, (densityFilter->m_SuperRasW - histCol) - 1); \n "
" densityBoxTopY = histRow - min((uint)histRow, ss); \n "
" densityBoxBottomY = histRow + min(ss, (densityFilter->m_SuperRasH - histRow) - 1); \n "
" \n "
" for (j = densityBoxTopY; j <= densityBoxBottomY; j++) \n "
" { \n "
" for (i = densityBoxLeftX; i <= densityBoxRightX; i++) \n "
" { \n "
" filterSelect += histogram[(j * densityFilter->m_SuperRasW) + i].w; \n "
" } \n "
" } \n "
" \n " ;
if ( doScf )
os < < " filterSelect *= scfact; \n " ;
}
else
{
os
< < " filterSelect = bucket.w; \n " ;
}
os < <
" \n "
" if (filterSelect > densityFilter->m_MaxFilteredCounts) \n "
" filterSelectInt = densityFilter->m_MaxFilterIndex; \n "
" else if (filterSelect <= DE_THRESH) \n "
" filterSelectInt = (int)ceil(filterSelect) - 1; \n "
" else if (filterSelect != 0) \n "
2015-08-10 23:10:23 -04:00
" filterSelectInt = (int)DE_THRESH + (int)floor(pow((real_bucket_t)(filterSelect - DE_THRESH), densityFilter->m_Curve)); \n "
2014-11-28 04:37:51 -05:00
" else \n "
" filterSelectInt = 0; \n "
" \n "
" if (filterSelectInt > densityFilter->m_MaxFilterIndex) \n "
" filterSelectInt = densityFilter->m_MaxFilterIndex; \n "
" \n "
" filterCoefIndex = filterSelectInt * densityFilter->m_KernelSize; \n "
" \n "
//With this new method, only accumulate to the temp local buffer first. Write to the final accumulator last.
//For each loop through, note that there is a local memory barrier call inside of each call to AddToAccumNoCheck().
//If this isn't done, pixel errors occurr and even an out of resources error occurrs because too many writes are done to the same place in memory at once.
" jmin = min(k, histRow); \n "
" jmax = (int)min((densityFilter->m_SuperRasH - 1) - histRow, densityFilter->m_FilterWidth); \n "
" \n "
" for (j = -jmin; j <= jmax; j++) \n "
2014-07-08 03:11:14 -04:00
" { \n "
2014-11-28 04:37:51 -05:00
" for (i = 0; i < colsToZero && (colsToZeroOffset + i) < fullTempBoxWidth; i++) \n " //Each thread zeroizes a few columns.
" { \n "
" filterBox[colsToZeroOffset + i].m_Real4 = 0; \n "
" } \n "
2014-07-08 03:11:14 -04:00
" \n "
2014-11-28 04:37:51 -05:00
" barrier(CLK_LOCAL_MEM_FENCE); \n "
2014-07-08 03:11:14 -04:00
" \n "
2014-11-28 04:37:51 -05:00
" if (bucket.w != 0) \n "
" { \n "
" filterRow = abs(j) * (densityFilter->m_FilterWidth + 1); \n "
" \n "
" for (i = -k; i <= k; i++) \n "
" { \n "
" filterSelectInt = filterCoefIndex + coefIndices[filterRow + abs(i)]; \n " //Really is filterCoeffIndexPlusOffset, but reusing a variable to save space.
" filterBox[i + boxCol].m_Real4 += (bucket * (filterCoefs[filterSelectInt] * cacheLog)); \n "
" } \n "
" } \n "
" \n "
" barrier(CLK_LOCAL_MEM_FENCE); \n "
" \n "
//At this point, all threads in this block have applied the filter to their surrounding pixels and stored the results in the temp local box.
//Add the cells of it that are in bounds to the global accumulator.
//Compute offsets in local box to read from, and offsets into global accumulator to write to.
//Use a method here that is similar to the zeroization above: Each thread (column) in the first row iterates through all of the
//rows and adds a few columns to the accumulator.
//" if (THREAD_ID_X == 0)\n"
//" {\n"
//" for (int kk = boxReadStartCol, i = 0; kk < boxReadEndCol; kk++, i++)\n"//Each thread writes a few columns.//Could do away with kk//TODO//OPT
//" {\n"
//" accumulator[((histRow + j) * densityFilter->m_SuperRasW) + (accumWriteStartCol + i)].m_Real4 += filterBox[kk].m_Real4;\n"
//" }\n"
//" }\n"
" accumWriteOffset = ((histRow + j) * densityFilter->m_SuperRasW) + accumWriteStartCol; \n "
" \n "
" for (i = 0; i < colsToWrite; i++) \n " //Each thread writes a few columns.
" { \n "
" colOffset = colsToWriteOffset + i; \n "
" \n "
" if (boxReadStartCol + colOffset < boxReadEndCol) \n "
" accumulator[accumWriteOffset + colOffset].m_Real4 += filterBox[boxReadStartCol + colOffset].m_Real4; \n "
" } \n "
" } \n " //for() filter rows.
" barrier(CLK_GLOBAL_MEM_FENCE); \n "
" } \n " //for() histogram rows.
2014-07-08 03:11:14 -04:00
" } \n " ;
2014-11-28 04:37:51 -05:00
2014-07-08 03:11:14 -04:00
return os . str ( ) ;
}
2014-11-28 04:37:51 -05:00
# else
2014-07-08 03:11:14 -04:00
/// <summary>
/// Create the gaussian density filtering kernel string.
/// 6 different methods of processing were tried before settling on this final and fastest 7th one.
/// Each block processes a box and exits. No column or row advancements happen.
/// The block accumulates to a temporary box and writes the contents to the global density filter buffer when done.
/// Note this applies the filter from top to bottom row and not from the center outward like the CPU version does.
/// This allows the image to be filtered without suffering from pixel loss due to race conditions.
/// It is run in multiple passes that are spaced far enough apart on the image so as to not overlap.
/// This allows writing to the global buffer without ever overlapping or using atomics.
/// The supersample parameter will produce three different kernels.
/// SS = 1, SS > 1 && SS even, SS > 1 && SS odd.
2014-11-28 04:37:51 -05:00
/// The width of the kernel this runs in must be evenly divisible by 16 or else artifacts will occur.
2014-07-08 03:11:14 -04:00
/// Note that because this function uses so many variables and is so complex, OpenCL can easily run
/// out of resources in some cases. Certain variables had to be reused to condense the kernel footprint
/// down enough to be able to run a block size of 32x32.
/// For double precision, or for SS > 1, a size of 32x30 is used.
/// Box width = (BLOCK_SIZE_X + (fw * 2)).
/// Box height = (BLOCK_SIZE_Y + (fw * 2)).
/// </summary>
/// <param name="ss">The supersample being used</param>
/// <returns>The kernel string</returns>
2015-08-10 23:10:23 -04:00
string DEOpenCLKernelCreator : : CreateGaussianDEKernel ( size_t ss )
2014-07-08 03:11:14 -04:00
{
bool doSS = ss > 1 ;
bool doScf = ! ( ss & 1 ) ;
ostringstream os ;
os < <
2015-08-10 23:10:23 -04:00
ConstantDefinesString ( m_DoublePrecision ) < <
2014-07-08 03:11:14 -04:00
DensityFilterCLStructString < <
UnionCLStructString < <
" __kernel void " < < GaussianDEEntryPoint ( ss , MaxDEFilterSize ( ) ) < < " ( \n " < <
2015-08-10 23:10:23 -04:00
" const __global real4_bucket* histogram, \n "
" __global real4reals_bucket* accumulator, \n "
2014-07-08 03:11:14 -04:00
" __constant DensityFilterCL* densityFilter, \n "
2015-08-10 23:10:23 -04:00
" const __global real_bucket_t* filterCoefs, \n "
" const __global real_bucket_t* filterWidths, \n "
2014-07-08 03:11:14 -04:00
" const __global uint* coefIndices, \n "
" const uint chunkSizeW, \n "
" const uint chunkSizeH, \n "
2014-11-28 04:37:51 -05:00
" const uint chunkW, \n "
" const uint chunkH \n "
2014-07-08 03:11:14 -04:00
" \t ) \n "
" { \n "
2014-11-28 04:37:51 -05:00
" if (((((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) || \n "
" ((((BLOCK_ID_Y * chunkSizeH) + chunkH) * BLOCK_SIZE_Y) + THREAD_ID_Y >= densityFilter->m_SuperRasH)) \n "
2014-07-08 03:11:14 -04:00
" return; \n "
" \n " ;
2014-11-28 04:37:51 -05:00
2014-07-08 03:11:14 -04:00
if ( doSS )
{
os < <
2015-08-10 23:10:23 -04:00
" uint ss = (uint)floor((real_bucket_t)densityFilter->m_Supersample / 2.0); \n "
2014-07-08 03:11:14 -04:00
" int densityBoxLeftX; \n "
" int densityBoxRightX; \n "
" int densityBoxTopY; \n "
" int densityBoxBottomY; \n "
" \n " ;
if ( doScf )
os < <
2015-08-10 23:10:23 -04:00
" real_bucket_t scfact = pow(densityFilter->m_Supersample / (densityFilter->m_Supersample + (real_bucket_t)1.0), (real_bucket_t)2.0); \n " ;
2014-07-08 03:11:14 -04:00
}
//Compute the size of the temporary box which is the block width + 2 * filter width x block height + 2 * filter width.
//Ideally the block width and height are both 32. However, the height might be smaller if there isn't enough memory.
os < <
" uint fullTempBoxWidth, fullTempBoxHeight; \n "
" uint leftBound, rightBound, topBound, botBound; \n "
" uint blockHistStartRow, blockHistEndRow, boxReadStartRow, boxReadEndRow; \n "
" uint blockHistStartCol, boxReadStartCol, boxReadEndCol; \n "
" uint accumWriteStartRow, accumWriteStartCol, colsToWrite; \n "
//If any of the variables above end up being made __local, init them here.
//At the moment, it's slower even though it's more memory efficient.
//" if (THREAD_ID_X == 0 && THREAD_ID_Y == 0)\n"
//" {\n"
//Init local vars here.
//" }\n"
//"\n"
//" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n "
" fullTempBoxWidth = BLOCK_SIZE_X + (densityFilter->m_FilterWidth * 2); \n "
" fullTempBoxHeight = BLOCK_SIZE_Y + (densityFilter->m_FilterWidth * 2); \n "
//Compute the bounds of the area to be sampled, which is just the ends minus the super sample minus 1.
" leftBound = densityFilter->m_Supersample - 1; \n "
" rightBound = densityFilter->m_SuperRasW - (densityFilter->m_Supersample - 1); \n "
" topBound = densityFilter->m_Supersample - 1; \n "
" botBound = densityFilter->m_SuperRasH - (densityFilter->m_Supersample - 1); \n "
" \n "
//Start and end values are the indices in the histogram read from
//and written to in the accumulator. They are not the indices for the local block of data.
//Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds.
2015-07-31 22:46:53 -04:00
" blockHistStartRow = min(botBound, (uint)(topBound + (((BLOCK_ID_Y * chunkSizeH) + chunkH) * BLOCK_SIZE_Y))); \n " //The first histogram row this block will process.
" blockHistEndRow = min(botBound, (uint)(blockHistStartRow + BLOCK_SIZE_Y)); \n " //The last histogram row this block will process, clamped to the last row.
2014-07-08 03:11:14 -04:00
" boxReadStartRow = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartRow); \n " //The first row in the local box to read from when writing back to the final accumulator for this block.
2015-07-31 22:46:53 -04:00
" boxReadEndRow = densityFilter->m_FilterWidth + min((uint)(densityFilter->m_FilterWidth + BLOCK_SIZE_Y), densityFilter->m_SuperRasH - blockHistStartRow); \n " //The last row in the local box to read from when writing back to the final accumulator for this block.
" blockHistStartCol = min(rightBound, leftBound + (uint)(((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X)); \n " //The first histogram column this block will process.
2014-11-28 04:37:51 -05:00
" boxReadStartCol = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartCol); \n " //The first box col this block will read from when copying to the accumulator.
2015-07-31 22:46:53 -04:00
" boxReadEndCol = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + (uint)BLOCK_SIZE_X, densityFilter->m_SuperRasW - blockHistStartCol); \n " //The last box col this block will read from when copying to the accumulator.
2014-07-08 03:11:14 -04:00
" \n "
//Last, the indices in the global accumulator that the local bounds will be writing to.
" accumWriteStartRow = blockHistStartRow - min(densityFilter->m_FilterWidth, blockHistStartRow); \n " //Will be fw - 0 except for boundary columns, it will be less.
" accumWriteStartCol = blockHistStartCol - min(densityFilter->m_FilterWidth, blockHistStartCol); \n "
2015-08-10 23:10:23 -04:00
" colsToWrite = ceil((real_bucket_t)(boxReadEndCol - boxReadStartCol) / (real_bucket_t)BLOCK_SIZE_X); \n "
2014-07-08 03:11:14 -04:00
" \n "
" uint threadHistRow = blockHistStartRow + THREAD_ID_Y; \n " //The histogram row this individual thread will be reading from.
" uint threadHistCol = blockHistStartCol + THREAD_ID_X; \n " //The histogram column this individual thread will be reading from.
" \n "
//Compute the center position in this local box to serve as the center position
//from which filter application offsets are computed.
//These are the local indices for the local data that are temporarily accumulated to before
//writing out to the global accumulator.
" uint boxRow = densityFilter->m_FilterWidth + THREAD_ID_Y; \n "
" uint boxCol = densityFilter->m_FilterWidth + THREAD_ID_X; \n "
2015-08-10 23:10:23 -04:00
" uint colElementsToZero = ceil((real_bucket_t)fullTempBoxWidth / (real_bucket_t)(BLOCK_SIZE_X)); \n " //Usually is 2.
2014-07-08 03:11:14 -04:00
" int i, j, k; \n "
" uint filterSelectInt, filterCoefIndex; \n "
2015-08-10 23:10:23 -04:00
" real_bucket_t cacheLog; \n "
" real_bucket_t filterSelect; \n "
" real4_bucket bucket; \n "
2014-07-08 03:11:14 -04:00
;
//This will be treated as having dimensions of (BLOCK_SIZE_X + (fw * 2)) x (BLOCK_SIZE_Y + (fw * 2)).
if ( m_NVidia )
2015-08-10 23:10:23 -04:00
os < < " __local real4reals_bucket filterBox[3000]; \n " ;
2014-07-08 03:11:14 -04:00
else
2015-08-10 23:10:23 -04:00
os < < " __local real4reals_bucket filterBox[1200]; \n " ;
2014-07-08 03:11:14 -04:00
os < <
//Zero the temp buffers first. This splits the zeroization evenly across all threads (columns) in the first block row.
//This is a middle ground solution. Previous methods tried:
//Thread (0, 0) does all init. This works, but is the slowest.
//Init is divided among all threads. This is the fastest but exposes a severe flaw in OpenCL,
//in that it will not get executed by all threads before proceeding, despite the barrier statement
//below. As a result, strange artifacts will get left around because filtering gets executed on a temp
//box that has not been properly zeroized.
//The only way to do it and still achieve reasonable speed is to have the first row do it. This is
//most likely because the first row gets executed first, ensuring zeroization is done when the rest
//of the threads execute.
" \n " //Dummy test zeroization for debugging.
//" if (THREAD_ID_Y == 0 && THREAD_ID_X == 0)\n"//First thread of the block takes the responsibility of zeroizing.
//" {\n"
//" for (k = 0; k < 2 * 1024; k++)\n"
//" {\n"
//" filterBox[k].m_Real4 = 0;\n"
//" }\n"
//" }\n"
" if (THREAD_ID_Y == 0) \n " //First row of the block takes the responsibility of zeroizing.
" { \n "
" for (i = 0; i < fullTempBoxHeight; i++) \n " //Each column in the row iterates through all rows.
" { \n "
" for (j = 0; j < colElementsToZero && ((colElementsToZero * THREAD_ID_X) + j) < fullTempBoxWidth; j++) \n " //And zeroizes a few columns from that row.
" { \n "
" filterBox[(i * fullTempBoxWidth) + ((colElementsToZero * THREAD_ID_X) + j)].m_Real4 = 0; \n "
" } \n "
" } \n "
" } \n "
" \n "
" barrier(CLK_LOCAL_MEM_FENCE); \n "
" \n "
" if (threadHistRow < botBound && threadHistCol < rightBound) \n "
" { \n "
" bucket = histogram[(threadHistRow * densityFilter->m_SuperRasW) + threadHistCol]; \n "
" \n "
" if (bucket.w != 0) \n "
" { \n "
" cacheLog = (densityFilter->m_K1 * log(1.0 + bucket.w * densityFilter->m_K2)) / bucket.w; \n " ;
2014-12-05 21:30:46 -05:00
2014-07-08 03:11:14 -04:00
if ( doSS )
{
os < <
" filterSelect = 0; \n "
" densityBoxLeftX = threadHistCol - min(threadHistCol, ss); \n "
" densityBoxRightX = threadHistCol + min(ss, (densityFilter->m_SuperRasW - threadHistCol) - 1); \n "
" densityBoxTopY = threadHistRow - min(threadHistRow, ss); \n "
" densityBoxBottomY = threadHistRow + min(ss, (densityFilter->m_SuperRasH - threadHistRow) - 1); \n "
" \n "
" for (j = densityBoxTopY; j <= densityBoxBottomY; j++) \n "
" { \n "
" for (i = densityBoxLeftX; i <= densityBoxRightX; i++) \n "
" { \n "
" filterSelect += histogram[i + (j * densityFilter->m_SuperRasW)].w; \n "
" } \n "
" } \n "
" \n " ;
2014-12-05 21:30:46 -05:00
2014-07-08 03:11:14 -04:00
if ( doScf )
2015-03-30 20:46:52 -04:00
os < <
" filterSelect *= scfact; \n " ;
2014-07-08 03:11:14 -04:00
}
else
{
2015-03-30 20:46:52 -04:00
os < <
" filterSelect = bucket.w; \n " ;
2014-07-08 03:11:14 -04:00
}
2014-12-05 21:30:46 -05:00
2014-07-08 03:11:14 -04:00
os < <
" \n "
" if (filterSelect > densityFilter->m_MaxFilteredCounts) \n "
" filterSelectInt = densityFilter->m_MaxFilterIndex; \n "
" else if (filterSelect <= DE_THRESH) \n "
" filterSelectInt = (int)ceil(filterSelect) - 1; \n "
" else \n "
2015-08-10 23:10:23 -04:00
" filterSelectInt = (int)DE_THRESH + (int)floor(pow((real_bucket_t)(filterSelect - DE_THRESH), densityFilter->m_Curve)); \n "
2014-07-08 03:11:14 -04:00
" \n "
" if (filterSelectInt > densityFilter->m_MaxFilterIndex) \n "
" filterSelectInt = densityFilter->m_MaxFilterIndex; \n "
" \n "
" filterCoefIndex = filterSelectInt * densityFilter->m_KernelSize; \n "
" \n "
//With this new method, only accumulate to the temp local buffer first. Write to the final accumulator last.
//For each loop through, note that there is a local memory barrier call inside of each call to AddToAccumNoCheck().
//If this isn't done, pixel errors occurr and even an out of resources error occurrs because too many writes are done to the same place in memory at once.
" k = (int)densityFilter->m_FilterWidth; \n " //Need a signed int to use below, really is filter width, but reusing a variable to save space.
" \n "
" for (j = -k; j <= k; j++) \n "
" { \n "
" for (i = -k; i <= k; i++) \n "
" { \n "
" filterSelectInt = filterCoefIndex + coefIndices[(abs(j) * (densityFilter->m_FilterWidth + 1)) + abs(i)]; \n " //Really is filterCoeffIndexPlusOffset, but reusing a variable to save space.
" \n "
2014-11-28 04:37:51 -05:00
" if (filterCoefs[filterSelectInt] != 0) \n " //This conditional actually improves speed, despite SIMT being bad at conditionals.
2014-07-08 03:11:14 -04:00
" { \n "
" filterBox[(i + boxCol) + ((j + boxRow) * fullTempBoxWidth)].m_Real4 += (bucket * (filterCoefs[filterSelectInt] * cacheLog)); \n "
" } \n "
" } \n "
" barrier(CLK_LOCAL_MEM_FENCE); \n " //If this is the only barrier and the block size is exactly 16, it works perfectly. Otherwise, no chunks occur, but a many streaks.
" } \n "
" } \n " //bucket.w != 0.
" } \n " //In bounds.
" \n "
" \n "
" barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); \n "
" \n "
" if (THREAD_ID_Y == 0) \n "
" { \n "
2014-11-28 04:37:51 -05:00
//At this point, all threads in this block have applied the filter to their surrounding pixels and stored the results in the temp local box.
2014-07-08 03:11:14 -04:00
//Add the cells of it that are in bounds to the global accumulator.
//Compute offsets in local box to read from, and offsets into global accumulator to write to.
//Use a method here that is similar to the zeroization above: Each thread (column) in the first row iterates through all of the
//rows and adds a few columns to the accumulator.
" for (i = boxReadStartRow, j = accumWriteStartRow; i < boxReadEndRow; i++, j++) \n "
" { \n "
2014-11-28 04:37:51 -05:00
" for (k = 0; k < colsToWrite; k++) \n " //Each thread writes a few columns.
2014-07-08 03:11:14 -04:00
" { \n "
" boxCol = (colsToWrite * THREAD_ID_X) + k; \n " //Really is colOffset, but reusing a variable to save space.
" \n "
" if (boxReadStartCol + boxCol < boxReadEndCol) \n "
" accumulator[(j * densityFilter->m_SuperRasW) + (accumWriteStartCol + boxCol)].m_Real4 += filterBox[(i * fullTempBoxWidth) + (boxReadStartCol + boxCol)].m_Real4; \n "
" } \n "
" barrier(CLK_GLOBAL_MEM_FENCE); \n " //This must be here or else chunks will go missing.
" } \n "
" } \n "
" } \n " ;
return os . str ( ) ;
}
2014-11-28 04:37:51 -05:00
# endif
2014-07-08 03:11:14 -04:00
/// <summary>
/// Create the gaussian density filtering kernel string, but use no local cache and perform
/// all writes directly to the global density filtering buffer.
/// Note this applies the filter from top to bottom row and not from the center outward like the CPU version does.
/// This allows the image to be filtered without suffering from pixel loss due to race conditions.
/// This is used for when the filter box is greater than can fit in the local cache.
/// While the cached version is incredibly fast, this version offers no real gain over doing it
/// on the CPU because the frequent global memory access brings performance to a crawl.
/// The supersample parameter will produce three different kernels.
/// SS = 1, SS > 1 && SS even, SS > 1 && SS odd.
2014-11-28 04:37:51 -05:00
/// The width of the kernel this runs in must be evenly divisible by 16 or else artifacts will occur.
2014-07-08 03:11:14 -04:00
/// Note that because this function uses so many variables and is so complex, OpenCL can easily run
/// out of resources in some cases. Certain variables had to be reused to condense the kernel footprint
/// down enough to be able to run a block size of 32x32.
/// For double precision, or for SS > 1, a size of 32x30 is used.
/// </summary>
/// <param name="ss">The supersample being used</param>
/// <returns>The kernel string</returns>
2015-08-10 23:10:23 -04:00
string DEOpenCLKernelCreator : : CreateGaussianDEKernelNoLocalCache ( size_t ss )
2014-07-08 03:11:14 -04:00
{
bool doSS = ss > 1 ;
bool doScf = ! ( ss & 1 ) ;
ostringstream os ;
os < <
2015-08-10 23:10:23 -04:00
ConstantDefinesString ( m_DoublePrecision ) < <
2014-07-08 03:11:14 -04:00
DensityFilterCLStructString < <
UnionCLStructString < <
AddToAccumWithCheckFunctionString < <
" __kernel void " < < GaussianDEEntryPoint ( ss , MaxDEFilterSize ( ) + 1 ) < < " ( \n " < <
2015-08-10 23:10:23 -04:00
" const __global real4_bucket* histogram, \n "
" __global real4reals_bucket* accumulator, \n "
2014-07-08 03:11:14 -04:00
" __constant DensityFilterCL* densityFilter, \n "
2015-08-10 23:10:23 -04:00
" const __global real_bucket_t* filterCoefs, \n "
" const __global real_bucket_t* filterWidths, \n "
2014-07-08 03:11:14 -04:00
" const __global uint* coefIndices, \n "
" const uint chunkSizeW, \n "
" const uint chunkSizeH, \n "
2014-11-28 04:37:51 -05:00
" const uint chunkW, \n "
" const uint chunkH \n "
2014-07-08 03:11:14 -04:00
" \t ) \n "
" { \n "
2014-11-28 04:37:51 -05:00
" if (((((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) || \n "
" ((((BLOCK_ID_Y * chunkSizeH) + chunkH) * BLOCK_SIZE_Y) + THREAD_ID_Y >= densityFilter->m_SuperRasH)) \n "
2014-07-08 03:11:14 -04:00
" return; \n "
" \n " ;
2014-12-05 21:30:46 -05:00
2014-07-08 03:11:14 -04:00
if ( doSS )
{
os < <
2015-08-10 23:10:23 -04:00
" uint ss = (uint)floor((real_bucket_t)densityFilter->m_Supersample / 2.0); \n "
2014-07-08 03:11:14 -04:00
" int densityBoxLeftX; \n "
" int densityBoxRightX; \n "
" int densityBoxTopY; \n "
" int densityBoxBottomY; \n " ;
if ( doScf )
2015-08-10 23:10:23 -04:00
os < < " real_bucket_t scfact = pow((real_bucket_t)densityFilter->m_Supersample / ((real_bucket_t)densityFilter->m_Supersample + (real_bucket_t)1.0), (real_bucket_t)2.0); \n " ;
2014-07-08 03:11:14 -04:00
}
2014-12-05 21:30:46 -05:00
2014-07-08 03:11:14 -04:00
os < <
//Compute the bounds of the area to be sampled, which is just the ends minus the super sample minus 1.
" uint leftBound = densityFilter->m_Supersample - 1; \n "
" uint rightBound = densityFilter->m_SuperRasW - (densityFilter->m_Supersample - 1); \n "
" uint topBound = densityFilter->m_Supersample - 1; \n "
" uint botBound = densityFilter->m_SuperRasH - (densityFilter->m_Supersample - 1); \n "
" \n "
//Start and end values are the indices in the histogram read from and written to in the accumulator.
//Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds.
2015-07-31 22:46:53 -04:00
" uint blockHistStartRow = min(botBound, (uint)(topBound + (((BLOCK_ID_Y * chunkSizeH) + chunkH) * BLOCK_SIZE_Y))); \n " //The first histogram row this block will process.
2014-07-08 03:11:14 -04:00
" uint threadHistRow = blockHistStartRow + THREAD_ID_Y; \n " //The histogram row this individual thread will be reading from.
" \n "
2015-07-31 22:46:53 -04:00
" uint blockHistStartCol = min(rightBound, leftBound + (uint)(((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X)); \n " //The first histogram column this block will process.
2014-07-08 03:11:14 -04:00
" uint threadHistCol = blockHistStartCol + THREAD_ID_X; \n " //The histogram column this individual thread will be reading from.
" \n "
" int i, j; \n "
" uint filterSelectInt, filterCoefIndex; \n "
2015-08-10 23:10:23 -04:00
" real_bucket_t cacheLog; \n "
" real_bucket_t filterSelect; \n "
" real4_bucket bucket; \n "
2014-07-08 03:11:14 -04:00
" \n "
" if (threadHistRow < botBound && threadHistCol < rightBound) \n "
" { \n "
" bucket = histogram[(threadHistRow * densityFilter->m_SuperRasW) + threadHistCol]; \n "
" \n "
" if (bucket.w != 0) \n "
" { \n "
" cacheLog = (densityFilter->m_K1 * log(1.0 + bucket.w * densityFilter->m_K2)) / bucket.w; \n " ;
2014-12-05 21:30:46 -05:00
2014-07-08 03:11:14 -04:00
if ( doSS )
{
os < <
" filterSelect = 0; \n "
" densityBoxLeftX = threadHistCol - min(threadHistCol, ss); \n "
" densityBoxRightX = threadHistCol + min(ss, (densityFilter->m_SuperRasW - threadHistCol) - 1); \n "
" densityBoxTopY = threadHistRow - min(threadHistRow, ss); \n "
" densityBoxBottomY = threadHistRow + min(ss, (densityFilter->m_SuperRasH - threadHistRow) - 1); \n "
" \n "
" for (j = densityBoxTopY; j <= densityBoxBottomY; j++) \n "
" { \n "
" for (i = densityBoxLeftX; i <= densityBoxRightX; i++) \n "
" { \n "
" filterSelect += histogram[i + (j * densityFilter->m_SuperRasW)].w; \n "
" } \n "
" } \n "
" \n " ;
2014-12-05 21:30:46 -05:00
2014-07-08 03:11:14 -04:00
if ( doScf )
--User changes
-Add new variations: bubbleT3D, crob, hexaplay3D, hexcrop, hexes, hexnix3D, loonie2, loonie3, nBlur, octapol and synth.
-Allow for pre/post versions of dc_bubble, dc_cylinder and dc_linear whereas before they were omitted.
-When saving a file with multiple embers in it, detect if time values are all the same and if so, start them at zero and increment by 1 for each ember.
-Allow for numerous quality increases to be coalesced into one. It will pick up at the end of the current render.
-Show selection highlight on variations tree in response to mouse hover. This makes it easier to see for which variation or param the current mouse wheel action will apply.
-Make default temporal samples be 100, whereas before it was 1000 which was overkill.
-Require the shift key to be held with delete for deleting an ember to prevent it from triggering when the user enters delete in the edit box.
-This wasn't otherwise fixable without writing a lot more code.
--Bug fixes
-EmberGenome was crashing when generating a sequence from a source file with more than 2 embers in it.
-EmberGenome was improperly handling the first frame of a merge after the last frame of the loop.
-These bugs were due to a previous commit. Revert parts of that commit.
-Prevent a zoom value of less than 0 when reading from xml.
-Slight optimization of the crescents, and mask variations, if the compiler wasn't doing it already.
-Unique file naming was broken because it was looking for _# and the default names ended with -#.
-Disallow renaming of an ember in the library tree to an empty string.
-Severe bug that prevented some variations from being read correctly from params generated outside this program.
-Severe OpenCL randomization bug. The first x coordinates of the first points in the first kernel call of the first ember of a render since the OpenCL renderer object was created were not random and were mostly -1.
-Severe bug when populating xform selection distributions that could sometimes cause a crash due to roundoff error. Fix by using double.
-Limit the max number of variations in a random ember to MAX_CL_VARS, which is 8. This ensures they'll look the same on CPU and GPU.
-Prevent user from saving stylesheet to default.qss, it's a special reserved filename.
--Code changes
-Generalize using the running sum output point inside of a variation for all cases: pre, reg and post.
-Allow for array variables in variations where the address of each element is stored in m_Params.
-Qualify all math functions with std::
-No longer use our own Clamp() in OpenCL, instead use the standard clamp().
-Redesign how functions are used in the variations OpenCL code.
-Add tests to EmberTester to verify some of the new functionality.
-Place more const and override qualifiers on functions where appropriate.
-Add a global rand with a lock to be used very sparingly.
-Use a map instead of a vector for bad param names in Xml parsing.
-Prefix affine interpolation mode defines with "AFFINE_" to make their purpose more clear.
-Allow for variations that change state during iteration by sending a separate copy of the ember to each rendering thread.
-Implement this same functionality with a local struct in OpenCL. It's members are the total of all variables that need to change state within an ember.
-Add Contains() function to Utils.h.
-EmberRender: print names of kernels being printed with --dump_kernel option.
-Clean up EmberTester to handle some of the recent changes.
-Fix various casts.
-Replace % 2 with & 1, even though the compiler was likely doing this already.
-Add new file Variations06.h to accommodate new variations.
-General cleanup.
2015-11-22 17:15:07 -05:00
os < <
" filterSelect *= scfact; \n " ;
2014-07-08 03:11:14 -04:00
}
else
{
os
< < " filterSelect = bucket.w; \n " ;
}
2014-12-05 21:30:46 -05:00
2014-07-08 03:11:14 -04:00
os < <
" \n "
" if (filterSelect > densityFilter->m_MaxFilteredCounts) \n "
" filterSelectInt = densityFilter->m_MaxFilterIndex; \n "
" else if (filterSelect <= DE_THRESH) \n "
" filterSelectInt = (int)ceil(filterSelect) - 1; \n "
" else \n "
2015-08-10 23:10:23 -04:00
" filterSelectInt = (int)DE_THRESH + (int)floor(pow((real_bucket_t)(filterSelect - DE_THRESH), densityFilter->m_Curve)); \n "
2014-07-08 03:11:14 -04:00
" \n "
" if (filterSelectInt > densityFilter->m_MaxFilterIndex) \n "
" filterSelectInt = densityFilter->m_MaxFilterIndex; \n "
" \n "
" filterCoefIndex = filterSelectInt * densityFilter->m_KernelSize; \n "
" \n "
" int fw = (int)densityFilter->m_FilterWidth; \n " //Need a signed int to use below.
" \n "
" for (j = -fw; j <= fw; j++) \n "
" { \n "
" for (i = -fw; i <= fw; i++) \n "
" { \n "
" if (AccumCheck(densityFilter->m_SuperRasW, densityFilter->m_SuperRasH, threadHistCol, i, threadHistRow, j)) \n "
" { \n "
" filterSelectInt = filterCoefIndex + coefIndices[(abs(j) * (densityFilter->m_FilterWidth + 1)) + abs(i)]; \n " //Really is filterCoeffIndexPlusOffset, but reusing a variable to save space.
" \n "
" if (filterCoefs[filterSelectInt] != 0) \n "
" { \n "
" accumulator[(i + threadHistCol) + ((j + threadHistRow) * densityFilter->m_SuperRasW)].m_Real4 += (bucket * (filterCoefs[filterSelectInt] * cacheLog)); \n "
" } \n "
" } \n "
" \n "
" barrier(CLK_GLOBAL_MEM_FENCE); \n " //Required to avoid streaks.
" } \n "
" } \n "
" } \n " //bucket.w != 0.
" } \n " //In bounds.
--User changes
-Add new variations: bubbleT3D, crob, hexaplay3D, hexcrop, hexes, hexnix3D, loonie2, loonie3, nBlur, octapol and synth.
-Allow for pre/post versions of dc_bubble, dc_cylinder and dc_linear whereas before they were omitted.
-When saving a file with multiple embers in it, detect if time values are all the same and if so, start them at zero and increment by 1 for each ember.
-Allow for numerous quality increases to be coalesced into one. It will pick up at the end of the current render.
-Show selection highlight on variations tree in response to mouse hover. This makes it easier to see for which variation or param the current mouse wheel action will apply.
-Make default temporal samples be 100, whereas before it was 1000 which was overkill.
-Require the shift key to be held with delete for deleting an ember to prevent it from triggering when the user enters delete in the edit box.
-This wasn't otherwise fixable without writing a lot more code.
--Bug fixes
-EmberGenome was crashing when generating a sequence from a source file with more than 2 embers in it.
-EmberGenome was improperly handling the first frame of a merge after the last frame of the loop.
-These bugs were due to a previous commit. Revert parts of that commit.
-Prevent a zoom value of less than 0 when reading from xml.
-Slight optimization of the crescents, and mask variations, if the compiler wasn't doing it already.
-Unique file naming was broken because it was looking for _# and the default names ended with -#.
-Disallow renaming of an ember in the library tree to an empty string.
-Severe bug that prevented some variations from being read correctly from params generated outside this program.
-Severe OpenCL randomization bug. The first x coordinates of the first points in the first kernel call of the first ember of a render since the OpenCL renderer object was created were not random and were mostly -1.
-Severe bug when populating xform selection distributions that could sometimes cause a crash due to roundoff error. Fix by using double.
-Limit the max number of variations in a random ember to MAX_CL_VARS, which is 8. This ensures they'll look the same on CPU and GPU.
-Prevent user from saving stylesheet to default.qss, it's a special reserved filename.
--Code changes
-Generalize using the running sum output point inside of a variation for all cases: pre, reg and post.
-Allow for array variables in variations where the address of each element is stored in m_Params.
-Qualify all math functions with std::
-No longer use our own Clamp() in OpenCL, instead use the standard clamp().
-Redesign how functions are used in the variations OpenCL code.
-Add tests to EmberTester to verify some of the new functionality.
-Place more const and override qualifiers on functions where appropriate.
-Add a global rand with a lock to be used very sparingly.
-Use a map instead of a vector for bad param names in Xml parsing.
-Prefix affine interpolation mode defines with "AFFINE_" to make their purpose more clear.
-Allow for variations that change state during iteration by sending a separate copy of the ember to each rendering thread.
-Implement this same functionality with a local struct in OpenCL. It's members are the total of all variables that need to change state within an ember.
-Add Contains() function to Utils.h.
-EmberRender: print names of kernels being printed with --dump_kernel option.
-Clean up EmberTester to handle some of the recent changes.
-Fix various casts.
-Replace % 2 with & 1, even though the compiler was likely doing this already.
-Add new file Variations06.h to accommodate new variations.
-General cleanup.
2015-11-22 17:15:07 -05:00
//"\n"
2014-07-08 03:11:14 -04:00
//" barrier(CLK_GLOBAL_MEM_FENCE);\n"//Just to be safe.
" } \n " ;
return os . str ( ) ;
}
2014-12-05 21:30:46 -05:00
}