2014-07-08 03:11:14 -04:00
# include "EmberCLPch.h"
# include "DEOpenCLKernelCreator.h"
2014-12-05 21:30:46 -05:00
namespace EmberCLns
2014-07-08 03:11:14 -04:00
{
--Bug fixes
-Fix improper usage of rand() in cpow2, cpow3, hypertile1, hypertile3D1, hypertile3D2, juliac, juliaq.
-Fix program crashing during density filtering on some Nvidia cards.
-hypertile3D1 was wrong.
-Parsing phoenix_julia when coming from Apophysis was wrong.
-Density filtering was freezing on certain Nvidia cards.
--Code changes
-Optimize juliac, npolar.
-Add a new function Crand() which behaves like the legacy C rand() which returns an integer between 0 and 32766, inclusive.
-Use RandBit() in some places.
-Remove Zeps() from vignette, it's not needed.
-Restructure OpenCL code for density filtering such that it does not hang after being compiled on some Nvidia cards, such as the gtx 1660. Remove barriers from conditionals where possible.
2020-12-29 00:46:55 -05:00
/// <summary>
/// Empty constructor.
/// This is needed because this class will need to be empty before it's initialized in RendererCL
/// with specific arguments.
/// </summary>
DEOpenCLKernelCreator : : DEOpenCLKernelCreator ( )
{
}
2014-07-08 03:11:14 -04:00
/// <summary>
2015-08-10 23:10:23 -04:00
/// Constructor that sets all kernel entry points as well as composes
2014-07-08 03:11:14 -04:00
/// all kernel source strings.
/// No program compilation is done here, the user must explicitly do it.
/// The caller must specify whether they are using an nVidia or AMD card because it changes
/// the amount of local memory available.
/// </summary>
2015-08-10 23:10:23 -04:00
/// <param name="doublePrecision">True if double precision, else false for float.</param>
2014-07-08 03:11:14 -04:00
/// <param name="nVidia">True if running on an nVidia card, else false.</param>
2015-08-10 23:10:23 -04:00
DEOpenCLKernelCreator : : DEOpenCLKernelCreator ( bool doublePrecision , bool nVidia )
2014-07-08 03:11:14 -04:00
{
2015-08-10 23:10:23 -04:00
m_DoublePrecision = doublePrecision ;
2014-07-08 03:11:14 -04:00
m_NVidia = nVidia ;
2014-11-28 04:37:51 -05:00
m_LogScaleAssignDEKernel = CreateLogScaleAssignDEKernelString ( ) ;
m_GaussianDEWithoutSsKernel = CreateGaussianDEKernel ( 1 ) ;
m_GaussianDESsWithScfKernel = CreateGaussianDEKernel ( 2 ) ;
m_GaussianDESsWithoutScfKernel = CreateGaussianDEKernel ( 3 ) ;
m_GaussianDEWithoutSsNoCacheKernel = CreateGaussianDEKernelNoLocalCache ( 1 ) ;
m_GaussianDESsWithScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache ( 2 ) ;
m_GaussianDESsWithoutScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache ( 3 ) ;
2014-07-08 03:11:14 -04:00
}
/// <summary>
/// Kernel source and entry point properties, getters only.
/// </summary>
2015-08-12 21:51:07 -04:00
const string & DEOpenCLKernelCreator : : LogScaleAssignDEKernel ( ) const { return m_LogScaleAssignDEKernel ; }
const string & DEOpenCLKernelCreator : : LogScaleAssignDEEntryPoint ( ) const { return m_LogScaleAssignDEEntryPoint ; }
2014-07-08 03:11:14 -04:00
/// <summary>
/// Get the kernel source for the specified supersample and filterWidth.
/// </summary>
/// <param name="ss">The supersample being used</param>
/// <param name="filterWidth">Filter width</param>
/// <returns>The kernel source</returns>
2015-08-12 21:51:07 -04:00
const string & DEOpenCLKernelCreator : : GaussianDEKernel ( size_t ss , uint filterWidth ) const
2014-07-08 03:11:14 -04:00
{
2015-08-10 23:10:23 -04:00
if ( filterWidth > MaxDEFilterSize ( ) )
2014-07-08 03:11:14 -04:00
{
if ( ss > 1 )
{
if ( ! ( ss & 1 ) )
2016-03-12 22:25:19 -05:00
return m_GaussianDESsWithScfNoCacheKernel ; //SS 2 or 4.
2014-07-08 03:11:14 -04:00
else
2016-03-12 22:25:19 -05:00
return m_GaussianDESsWithoutScfNoCacheKernel ; //SS 3.
2014-07-08 03:11:14 -04:00
}
else
2016-03-12 22:25:19 -05:00
return m_GaussianDEWithoutSsNoCacheKernel ; //SS 1;
2014-07-08 03:11:14 -04:00
}
2016-03-12 22:25:19 -05:00
else //Use cache.
2014-07-08 03:11:14 -04:00
{
if ( ss > 1 )
{
if ( ! ( ss & 1 ) )
2016-03-12 22:25:19 -05:00
return m_GaussianDESsWithScfKernel ; //SS 2 or 4.
2014-07-08 03:11:14 -04:00
else
2016-03-12 22:25:19 -05:00
return m_GaussianDESsWithoutScfKernel ; //SS 3.
2014-07-08 03:11:14 -04:00
}
else
2016-03-12 22:25:19 -05:00
return m_GaussianDEWithoutSsKernel ; //SS 1;
2014-07-08 03:11:14 -04:00
}
}
/// <summary>
/// Get the kernel entry point for the specified supersample and filterWidth.
/// </summary>
/// <param name="ss">The supersample being used</param>
/// <param name="filterWidth">Filter width</param>
/// <returns>The name of the density estimation filtering entry point kernel function</returns>
2015-08-12 21:51:07 -04:00
const string & DEOpenCLKernelCreator : : GaussianDEEntryPoint ( size_t ss , uint filterWidth ) const
2014-07-08 03:11:14 -04:00
{
2015-08-10 23:10:23 -04:00
if ( filterWidth > MaxDEFilterSize ( ) )
2014-07-08 03:11:14 -04:00
{
if ( ss > 1 )
{
if ( ! ( ss & 1 ) )
return m_GaussianDESsWithScfNoCacheEntryPoint ;
else
return m_GaussianDESsWithoutScfNoCacheEntryPoint ;
}
else
return m_GaussianDEWithoutSsNoCacheEntryPoint ;
}
else
{
if ( ss > 1 )
{
if ( ! ( ss & 1 ) )
return m_GaussianDESsWithScfEntryPoint ;
else
return m_GaussianDESsWithoutScfEntryPoint ;
}
else
return m_GaussianDEWithoutSsEntryPoint ;
}
}
/// <summary>
/// Get the maximum filter size allowed for running the local memory version of density filtering
/// Filters larger than this value will run the version without local memory caching.
/// </summary>
/// <returns>The maximum filter size allowed for running the local memory version of density filtering</returns>
2015-08-10 23:10:23 -04:00
uint DEOpenCLKernelCreator : : MaxDEFilterSize ( ) { return 9 ; } //The true max would be (maxBoxSize - 1) / 2, but that's impractical because it can give us a tiny block size.
2014-07-08 03:11:14 -04:00
/// <summary>
/// Solve for the maximum filter radius.
2014-12-06 00:05:09 -05:00
/// The final filter width is calculated by: (uint)(ceil(m_MaxRad) * (T)m_Supersample) + (m_Supersample - 1);
2014-07-08 03:11:14 -04:00
/// Must solve for what max rad should be in order to give a maximum final width of (maxBoxSize - 1) / 2, assuming
/// a minimum block size of 1 which processes 1 pixel.
/// Example: If a box size of 20 was allowed, a filter
/// size of up to 9: (20 - 1) / 2 == (19 / 2) == 9 could be supported.
/// This function is deprecated, the appropriate kernels take care of this problem now.
/// </summary>
/// <param name="desiredFilterSize">Size of the desired filter.</param>
/// <param name="ss">The supersample being used</param>
/// <returns>The maximum filter radius allowed</returns>
2016-03-01 20:26:45 -05:00
double DEOpenCLKernelCreator : : SolveMaxDERad ( double desiredFilterSize , double ss )
2014-07-08 03:11:14 -04:00
{
2014-12-07 02:51:44 -05:00
uint finalFilterSize = uint ( ( ceil ( desiredFilterSize ) * ss ) + ( ss - 1.0 ) ) ;
2014-07-08 03:11:14 -04:00
//Return the desired size if the final size of it will fit.
if ( finalFilterSize < = MaxDEFilterSize ( ) )
return desiredFilterSize ;
//The final size doesn't fit, so scale the original down until it fits.
06/09/2017
--User changes
-dark.qss is now per-OS.
-Properly set/reload palette when coming from the palette editor. The latter must be done if they've modified the current palette even if they've clicked cancel.
--Bug fixes
-Make the following variations safer by using Zeps(): conic, bipolar, edisc, whorl, tan, csc, cot, tanh, sech, csch, coth, auger, bwraps, hypertile3d, hypertile3d1, ortho, poincare, rational3, barycentroid, sschecks, cscq, cschq, scry_3D, splitbrdr, hexcrop, nblur, crob.
-Fix bug enabling/disabling overwrite button in palette editor.
-Small optimization for gdoffs, use precalcAtanYX.
-Properly propagate z through circlesplit, cylinder2 and tile_log variations.
-Some values in truchet_fill could've been NaN.
--Code changes
-Make most installation files read only.
-Qualify many calls with std:: to ensure they're not colliding with glm::
-Use auto in more places.
2017-06-09 22:38:06 -04:00
return std : : floor ( ( MaxDEFilterSize ( ) - ( ss - 1.0 ) ) / ss ) ;
2014-07-08 03:11:14 -04:00
}
/// <summary>
/// Determine the maximum filter box size based on the amount of local memory available
/// to each block.
/// </summary>
/// <param name="localMem">The local memory available to a block</param>
/// <returns>The maximum filter box size allowed</returns>
2015-08-10 23:10:23 -04:00
uint DEOpenCLKernelCreator : : SolveMaxBoxSize ( uint localMem )
2014-07-08 03:11:14 -04:00
{
06/09/2017
--User changes
-dark.qss is now per-OS.
-Properly set/reload palette when coming from the palette editor. The latter must be done if they've modified the current palette even if they've clicked cancel.
--Bug fixes
-Make the following variations safer by using Zeps(): conic, bipolar, edisc, whorl, tan, csc, cot, tanh, sech, csch, coth, auger, bwraps, hypertile3d, hypertile3d1, ortho, poincare, rational3, barycentroid, sschecks, cscq, cschq, scry_3D, splitbrdr, hexcrop, nblur, crob.
-Fix bug enabling/disabling overwrite button in palette editor.
-Small optimization for gdoffs, use precalcAtanYX.
-Properly propagate z through circlesplit, cylinder2 and tile_log variations.
-Some values in truchet_fill could've been NaN.
--Code changes
-Make most installation files read only.
-Qualify many calls with std:: to ensure they're not colliding with glm::
-Use auto in more places.
2017-06-09 22:38:06 -04:00
return uint ( std : : floor ( std : : sqrt ( Floor ( localMem / 16.0 ) ) ) ) ; //Divide by 16 because each element is float4.
2014-07-08 03:11:14 -04:00
}
/// <summary>
2014-11-28 04:37:51 -05:00
/// Create the log scale kernel string, using assignment.
/// Use this when Passes == 1.
2014-07-08 03:11:14 -04:00
/// </summary>
/// <returns>The kernel string</returns>
2015-08-10 23:10:23 -04:00
string DEOpenCLKernelCreator : : CreateLogScaleAssignDEKernelString ( )
2014-07-08 03:11:14 -04:00
{
ostringstream os ;
os < <
2015-08-10 23:10:23 -04:00
ConstantDefinesString ( m_DoublePrecision ) < <
2014-07-08 03:11:14 -04:00
DensityFilterCLStructString < <
2014-11-28 04:37:51 -05:00
" __kernel void " < < m_LogScaleAssignDEEntryPoint < < " ( \n "
2015-08-10 23:10:23 -04:00
" const __global real4_bucket* histogram, \n "
" __global real4_bucket* accumulator, \n "
2014-07-08 03:11:14 -04:00
" __constant DensityFilterCL* logFilter \n "
" \t ) \n "
" { \n "
" if ((GLOBAL_ID_X < logFilter->m_SuperRasW) && (GLOBAL_ID_Y < logFilter->m_SuperRasH)) \n "
" { \n "
" uint index = (GLOBAL_ID_Y * logFilter->m_SuperRasW) + GLOBAL_ID_X; \n "
" \n "
" if (histogram[index].w != 0) \n "
" { \n "
--User changes:
-Show common folder locations such as documents, downloads, pictures in the sidebar in all file dialogs.
-Warning message about exceeding memory in final render dialog now suggests strips as the solution to the problem.
-Strips now has a tooltip explaining what it does.
-Allow more digits in the spinners on the color section the flame tab.
-Add manually adjustable size spinners in the final render dialog. Percentage scale and absolute size are fully synced.
-Default prefix in final render is now the filename when doing animations (coming from sequence section of the library tab).
-Changed the elliptic variation back to using a less precise version for float, and a more precise version for double. The last release had it always using double.
-New applied xaos table that shows a read-only view of actual weights by taking the base xform weights and multiplying them by the xaos values.
-New table in the xaos tab that gives a graphical representation of the probability that each xform is chosen, with and without xaos.
-Add button to transpose the xaos rows and columns.
-Add support for importing .chaos files from Chaotica.
--Pasting back to Chaotica will work for most, but not all, variations due to incompatible parameter names in some.
-Curves are now splines instead of Bezier. This adds compatibility with Chaotica, but breaks it for Apophysis. Xmls are still pastable, but the color curves will look different.
--The curve editor on the palette tab can now add points by clicking on the lines and remove points by clicking on the points themselves, just like Chaotica.
--Splines are saved in four new xml fields: overall_curve, red_curve, green_curve and blue_curve.
-Allow for specifying the percentage of a sub batch each thread should iterate through per kernel call when running with OpenCL. This gives a roughly 1% performance increase due to having to make less kernel calls while iterating.
--This field is present for interactive editing (where it's not very useful) and in the final render dialog.
--On the command line, this is specified as --sbpctth for EmberRender and EmberAnimate.
-Allow double clicking to toggle the supersample field in the flame tab between 1 and 2 for easily checking the effect of the field.
-When showing affine values as polar coordinates, show angles normalized to 360 to match Chaotica.
-Fuse Count spinner now toggles between 15 and 100 when double clicking for easily checking the effect of the field.
-Added field for limiting the range in the x and y direction that the initial points are chosen from.
-Added a field called K2 which is an alternative way to set brightness, ignored when zero.
--This has no effect for many variations, but hs a noticeable effect for some.
-Added new variations:
arcsech
arcsech2
arcsinh
arctanh
asteria
block
bwraps_rand
circlecrop2
coth_spiral
crackle2
depth_blur
depth_blur2
depth_gaussian
depth_gaussian2
depth_ngon
depth_ngon2
depth_sine
depth_sine2
dragonfire
dspherical
dust
excinis
exp2
flipx
flowerdb
foci_p
gaussian
glynnia2
glynnsim4
glynnsim5
henon
henon
hex_rand
hex_truchet
hypershift
lazyjess
lens
lozi
lozi
modulusx
modulusy
oscilloscope2
point_symmetry
pointsymmetry
projective
pulse
rotate
scry2
shift
smartshape
spher
squares
starblur2
swirl3
swirl3r
tanh_spiral
target0
target2
tile_hlp
truchet_glyph
truchet_inv
truchet_knot
unicorngaloshen
vibration
vibration2
--hex_truchet, hex_rand should always use double. They are extremely sensitive.
--Bug fixes:
-Bounds sign was flipped for x coordinate of world space when center was not zero.
-Right clicking and dragging spinner showed menu on mouse up, even if it was very far away.
-Text boxes for size in final render dialog were hard to type in. Same bug as xform weight used to be so fix the same way.
-Fix spelling to be plural in toggle color speed box.
-Stop using the blank user palette to generate flames. Either put colored palettes in it, or exclude it from randoms.
-Clicking the random palette button for a palette file with only one palette in it would freeze the program.
-Clicking none scale in final render did not re-render the preview.
-Use less precision on random xaos. No need for 12 decimal places.
-The term sub batch is overloaded in the options dialog. Change the naming and tooltip of those settings for cpu and opencl.
--Also made clear in the tooltip for the default opencl quality setting that the value is per device.
-The arrows spinner in palette editor appears like a read-only label. Made it look like a spinner.
-Fix border colors for various spin boxes and table headers in the style sheet. Requires reload.
-Fix a bug in the bwraps variation which would produce different results than Chaotica and Apophysis.
-Synth was allowed to be selected for random flame generation when using an Nvidia card but it shouldn't have been because Nvidia has a hard time compiling synth.
-A casting bug in the OpenCL kernels for log scaling and density filtering was preventing successful compilations on Intel iGPUs. Fixed even though we don't support anything other than AMD and Nvidia.
-Palette rotation (click and drag) position was not being reset when loading a new flame.
-When the xform circles were hidden, opening and closing the options dialog would improperly reshow them.
-Double click toggle was broken on integer spin boxes.
-Fixed tab order of some controls.
-Creating a palette from a jpg in the palette editor only produced a single color.
--Needed to package imageformats/qjpeg.dll with the Windows installer.
-The basic memory benchmark test flame was not really testing memory. Make it more spread out.
-Remove the temporal samples field from the flame tab, it was never used because it's only an animation parameter which is specified in the final render dialog or on the command line with EmberAnimate.
--Code changes:
-Add IsEmpty() to Palette to determine if a palette is all black.
-Attempt to avoid selecting a blank palette in PaletteList::GetRandomPalette().
-Add function ScanForChaosNodes() and some associated helper functions in XmlToEmber.
-Make variation param name correction be case insensitive in XmlToEmber.
-Report error when assigning a variation param value in XmlToEmber.
-Add SubBatchPercentPerThread() method to RendererCL.
-Override enterEvent() and leaveEvent() in DoubleSpinBox and SpinBox to prevent the context menu from showing up on right mouse up after already leaving the spinner.
-Filtering the mouse wheel event in TableWidget no longer appears to be needed. It was probably an old Qt bug that has been fixed.
-Gui/ember syncing code in the final render dialog needed to be reworked to accommodate absolute sizes.
2019-04-13 22:00:46 -04:00
" real_bucket_t logScale = (logFilter->m_K1 * log((real_bucket_t)fma(histogram[index].w, logFilter->m_K2, (real_bucket_t)1.0))) / histogram[index].w; \n "
2014-07-08 03:11:14 -04:00
" \n "
2014-11-28 04:37:51 -05:00
" accumulator[index] = histogram[index] * logScale; \n " //Using a single real4 vector operation doubles the speed from doing each component individually.
2014-07-08 03:11:14 -04:00
" } \n "
" \n "
" barrier(CLK_GLOBAL_MEM_FENCE); \n " //Just to be safe. Makes no speed difference to do all of the time or only when there's a hit.
" } \n "
" } \n " ;
return os . str ( ) ;
}
/// <summary>
/// Create the gaussian density filtering kernel string.
/// 6 different methods of processing were tried before settling on this final and fastest 7th one.
/// Each block processes a box and exits. No column or row advancements happen.
/// The block accumulates to a temporary box and writes the contents to the global density filter buffer when done.
/// Note this applies the filter from top to bottom row and not from the center outward like the CPU version does.
/// This allows the image to be filtered without suffering from pixel loss due to race conditions.
/// It is run in multiple passes that are spaced far enough apart on the image so as to not overlap.
/// This allows writing to the global buffer without ever overlapping or using atomics.
/// The supersample parameter will produce three different kernels.
/// SS = 1, SS > 1 && SS even, SS > 1 && SS odd.
2014-11-28 04:37:51 -05:00
/// The width of the kernel this runs in must be evenly divisible by 16 or else artifacts will occur.
2014-07-08 03:11:14 -04:00
/// Note that because this function uses so many variables and is so complex, OpenCL can easily run
/// out of resources in some cases. Certain variables had to be reused to condense the kernel footprint
/// down enough to be able to run a block size of 32x32.
/// For double precision, or for SS > 1, a size of 32x30 is used.
/// Box width = (BLOCK_SIZE_X + (fw * 2)).
/// Box height = (BLOCK_SIZE_Y + (fw * 2)).
/// </summary>
/// <param name="ss">The supersample being used</param>
/// <returns>The kernel string</returns>
2015-08-10 23:10:23 -04:00
string DEOpenCLKernelCreator : : CreateGaussianDEKernel ( size_t ss )
2014-07-08 03:11:14 -04:00
{
bool doSS = ss > 1 ;
bool doScf = ! ( ss & 1 ) ;
ostringstream os ;
os < <
2016-03-12 22:25:19 -05:00
ConstantDefinesString ( m_DoublePrecision ) < <
DensityFilterCLStructString < <
UnionCLStructString < <
" __kernel void " < < GaussianDEEntryPoint ( ss , MaxDEFilterSize ( ) ) < < " ( \n " < <
" const __global real4_bucket* histogram, \n "
" __global real4reals_bucket* accumulator, \n "
" __constant DensityFilterCL* densityFilter, \n "
" const __global real_bucket_t* filterCoefs, \n "
" const __global real_bucket_t* filterWidths, \n "
" const __global uint* coefIndices, \n "
" const uint chunkSizeW, \n "
" const uint chunkSizeH, \n "
" const uint colChunkPass, \n "
" const uint rowChunkPass \n "
" \t ) \n "
" { \n "
" if (((((BLOCK_ID_X * chunkSizeW) + colChunkPass) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) || \n "
" ((((BLOCK_ID_Y * chunkSizeH) + rowChunkPass) * BLOCK_SIZE_Y) + THREAD_ID_Y >= densityFilter->m_SuperRasH)) \n "
" return; \n "
" \n " ;
2014-11-28 04:37:51 -05:00
2014-07-08 03:11:14 -04:00
if ( doSS )
{
os < <
2016-03-12 22:25:19 -05:00
" uint ss = (uint)floor((real_bucket_t)densityFilter->m_Supersample / 2.0); \n "
" int densityBoxLeftX; \n "
" int densityBoxRightX; \n "
" int densityBoxTopY; \n "
" int densityBoxBottomY; \n "
" \n " ;
2014-07-08 03:11:14 -04:00
if ( doScf )
2016-03-12 22:25:19 -05:00
os < <
" real_bucket_t scfact = pow(densityFilter->m_Supersample / (densityFilter->m_Supersample + (real_bucket_t)1.0), (real_bucket_t)2.0); \n " ;
2014-07-08 03:11:14 -04:00
}
//Compute the size of the temporary box which is the block width + 2 * filter width x block height + 2 * filter width.
//Ideally the block width and height are both 32. However, the height might be smaller if there isn't enough memory.
os < <
2016-03-12 22:25:19 -05:00
" uint fullTempBoxWidth, fullTempBoxHeight; \n "
" uint leftBound, rightBound, topBound, botBound; \n "
" uint blockHistStartRow, blockHistEndRow, boxReadStartRow, boxReadEndRow; \n "
" uint blockHistStartCol, boxReadStartCol, boxReadEndCol; \n "
" uint accumWriteStartRow, accumWriteStartCol, colsToWrite; \n "
//If any of the variables above end up being made __local, init them here.
//At the moment, it's slower even though it's more memory efficient.
//" if (THREAD_ID_X == 0 && THREAD_ID_Y == 0)\n"
//" {\n"
//Init local vars here.
//" }\n"
//"\n"
//" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n "
" fullTempBoxWidth = BLOCK_SIZE_X + (densityFilter->m_FilterWidth * 2); \n "
" fullTempBoxHeight = BLOCK_SIZE_Y + (densityFilter->m_FilterWidth * 2); \n "
//Compute the bounds of the area to be sampled, which is just the ends minus the super sample minus 1.
" leftBound = densityFilter->m_Supersample - 1; \n "
" rightBound = densityFilter->m_SuperRasW - (densityFilter->m_Supersample - 1); \n "
" topBound = densityFilter->m_Supersample - 1; \n "
" botBound = densityFilter->m_SuperRasH - (densityFilter->m_Supersample - 1); \n "
" \n "
//Start and end values are the indices in the histogram read from
//and written to in the accumulator. They are not the indices for the local block of data.
//Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds.
" blockHistStartRow = min(botBound, (uint)(topBound + (((BLOCK_ID_Y * chunkSizeH) + rowChunkPass) * BLOCK_SIZE_Y))); \n " //The first histogram row this block will process.
" blockHistEndRow = min(botBound, (uint)(blockHistStartRow + BLOCK_SIZE_Y)); \n " //The last histogram row this block will process, clamped to the last row.
" boxReadStartRow = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartRow); \n " //The first row in the local box to read from when writing back to the final accumulator for this block.
" boxReadEndRow = densityFilter->m_FilterWidth + min((uint)(densityFilter->m_FilterWidth + BLOCK_SIZE_Y), densityFilter->m_SuperRasH - blockHistStartRow); \n " //The last row in the local box to read from when writing back to the final accumulator for this block.
" blockHistStartCol = min(rightBound, leftBound + (uint)(((BLOCK_ID_X * chunkSizeW) + colChunkPass) * BLOCK_SIZE_X)); \n " //The first histogram column this block will process.
" boxReadStartCol = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartCol); \n " //The first box col this block will read from when copying to the accumulator.
" boxReadEndCol = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + (uint)BLOCK_SIZE_X, densityFilter->m_SuperRasW - blockHistStartCol); \n " //The last box col this block will read from when copying to the accumulator.
" \n "
//Last, the indices in the global accumulator that the local bounds will be writing to.
" accumWriteStartRow = blockHistStartRow - min(densityFilter->m_FilterWidth, blockHistStartRow); \n " //Will be fw - 0 except for boundary columns, it will be less.
" accumWriteStartCol = blockHistStartCol - min(densityFilter->m_FilterWidth, blockHistStartCol); \n "
" colsToWrite = ceil((real_bucket_t)(boxReadEndCol - boxReadStartCol) / (real_bucket_t)BLOCK_SIZE_X); \n "
" \n "
" uint threadHistRow = blockHistStartRow + THREAD_ID_Y; \n " //The histogram row this individual thread will be reading from.
" uint threadHistCol = blockHistStartCol + THREAD_ID_X; \n " //The histogram column this individual thread will be reading from.
" \n "
//Compute the center position in this local box to serve as the center position
//from which filter application offsets are computed.
//These are the local indices for the local data that are temporarily accumulated to before
//writing out to the global accumulator.
" uint boxRow = densityFilter->m_FilterWidth + THREAD_ID_Y; \n "
" uint boxCol = densityFilter->m_FilterWidth + THREAD_ID_X; \n "
" uint colElementsToZero = ceil((real_bucket_t)fullTempBoxWidth / (real_bucket_t)(BLOCK_SIZE_X)); \n " //Usually is 2.
" int i, j, k; \n "
" uint filterSelectInt, filterCoefIndex; \n "
" real_bucket_t cacheLog; \n "
" real_bucket_t filterSelect; \n "
" real4_bucket bucket; \n "
;
//This will be treated as having dimensions of (BLOCK_SIZE_X + (fw * 2)) x (BLOCK_SIZE_Y + (fw * 2)).
os < < " __local real4reals_bucket filterBox[1200]; \n " ; //Really only need 1156
2014-07-08 03:11:14 -04:00
os < <
2016-03-12 22:25:19 -05:00
//Zero the temp buffers first. This splits the zeroization evenly across all threads (columns) in the first block row.
//This is a middle ground solution. Previous methods tried:
//Thread (0, 0) does all init. This works, but is the slowest.
//Init is divided among all threads. This is the fastest but exposes a severe flaw in OpenCL,
//in that it will not get executed by all threads before proceeding, despite the barrier statement
//below. As a result, strange artifacts will get left around because filtering gets executed on a temp
//box that has not been properly zeroized.
//The only way to do it and still achieve reasonable speed is to have the first row do it. This is
//most likely because the first row gets executed first, ensuring zeroization is done when the rest
//of the threads execute.
" \n " //Dummy test zeroization for debugging.
//" if (THREAD_ID_Y == 0 && THREAD_ID_X == 0)\n"//First thread of the block takes the responsibility of zeroizing.
//" {\n"
//" for (k = 0; k < 2 * 1024; k++)\n"
//" {\n"
//" filterBox[k].m_Real4 = 0;\n"
//" }\n"
//" }\n"
" if (THREAD_ID_Y == 0) \n " //First row of the block takes the responsibility of zeroizing.
" { \n "
" for (i = 0; i < fullTempBoxHeight; i++) \n " //Each column in the row iterates through all rows.
" { \n "
" for (j = 0; j < colElementsToZero && ((colElementsToZero * THREAD_ID_X) + j) < fullTempBoxWidth; j++) \n " //And zeroizes a few columns from that row.
" { \n "
" filterBox[(i * fullTempBoxWidth) + ((colElementsToZero * THREAD_ID_X) + j)].m_Real4 = 0; \n "
" } \n "
" } \n "
" } \n "
" \n "
" barrier(CLK_LOCAL_MEM_FENCE); \n "
" \n "
--Bug fixes
-Fix improper usage of rand() in cpow2, cpow3, hypertile1, hypertile3D1, hypertile3D2, juliac, juliaq.
-Fix program crashing during density filtering on some Nvidia cards.
-hypertile3D1 was wrong.
-Parsing phoenix_julia when coming from Apophysis was wrong.
-Density filtering was freezing on certain Nvidia cards.
--Code changes
-Optimize juliac, npolar.
-Add a new function Crand() which behaves like the legacy C rand() which returns an integer between 0 and 32766, inclusive.
-Use RandBit() in some places.
-Remove Zeps() from vignette, it's not needed.
-Restructure OpenCL code for density filtering such that it does not hang after being compiled on some Nvidia cards, such as the gtx 1660. Remove barriers from conditionals where possible.
2020-12-29 00:46:55 -05:00
" if (threadHistRow < botBound && threadHistCol < rightBound) \n " //This is done to avoid putting barriers inside of conidtionals.
2016-03-12 22:25:19 -05:00
" bucket = histogram[(threadHistRow * densityFilter->m_SuperRasW) + threadHistCol]; \n "
--Bug fixes
-Fix improper usage of rand() in cpow2, cpow3, hypertile1, hypertile3D1, hypertile3D2, juliac, juliaq.
-Fix program crashing during density filtering on some Nvidia cards.
-hypertile3D1 was wrong.
-Parsing phoenix_julia when coming from Apophysis was wrong.
-Density filtering was freezing on certain Nvidia cards.
--Code changes
-Optimize juliac, npolar.
-Add a new function Crand() which behaves like the legacy C rand() which returns an integer between 0 and 32766, inclusive.
-Use RandBit() in some places.
-Remove Zeps() from vignette, it's not needed.
-Restructure OpenCL code for density filtering such that it does not hang after being compiled on some Nvidia cards, such as the gtx 1660. Remove barriers from conditionals where possible.
2020-12-29 00:46:55 -05:00
" else \n "
" bucket = 0.0; \n "
2016-03-12 22:25:19 -05:00
" \n "
--Bug fixes
-Fix improper usage of rand() in cpow2, cpow3, hypertile1, hypertile3D1, hypertile3D2, juliac, juliaq.
-Fix program crashing during density filtering on some Nvidia cards.
-hypertile3D1 was wrong.
-Parsing phoenix_julia when coming from Apophysis was wrong.
-Density filtering was freezing on certain Nvidia cards.
--Code changes
-Optimize juliac, npolar.
-Add a new function Crand() which behaves like the legacy C rand() which returns an integer between 0 and 32766, inclusive.
-Use RandBit() in some places.
-Remove Zeps() from vignette, it's not needed.
-Restructure OpenCL code for density filtering such that it does not hang after being compiled on some Nvidia cards, such as the gtx 1660. Remove barriers from conditionals where possible.
2020-12-29 00:46:55 -05:00
" if (bucket.w != 0) \n " //This is done to avoid putting barriers inside of conidtionals.
" { \n "
" cacheLog = (densityFilter->m_K1 * log((real_bucket_t)fma(bucket.w, densityFilter->m_K2, (real_bucket_t)1.0))) / bucket.w; \n " ;
2014-12-05 21:30:46 -05:00
2014-07-08 03:11:14 -04:00
if ( doSS )
{
os < <
--Bug fixes
-Fix improper usage of rand() in cpow2, cpow3, hypertile1, hypertile3D1, hypertile3D2, juliac, juliaq.
-Fix program crashing during density filtering on some Nvidia cards.
-hypertile3D1 was wrong.
-Parsing phoenix_julia when coming from Apophysis was wrong.
-Density filtering was freezing on certain Nvidia cards.
--Code changes
-Optimize juliac, npolar.
-Add a new function Crand() which behaves like the legacy C rand() which returns an integer between 0 and 32766, inclusive.
-Use RandBit() in some places.
-Remove Zeps() from vignette, it's not needed.
-Restructure OpenCL code for density filtering such that it does not hang after being compiled on some Nvidia cards, such as the gtx 1660. Remove barriers from conditionals where possible.
2020-12-29 00:46:55 -05:00
" filterSelect = 0; \n "
" densityBoxLeftX = threadHistCol - min(threadHistCol, ss); \n "
" densityBoxRightX = threadHistCol + min(ss, (densityFilter->m_SuperRasW - threadHistCol) - 1); \n "
" densityBoxTopY = threadHistRow - min(threadHistRow, ss); \n "
" densityBoxBottomY = threadHistRow + min(ss, (densityFilter->m_SuperRasH - threadHistRow) - 1); \n "
2016-03-12 22:25:19 -05:00
" \n "
--Bug fixes
-Fix improper usage of rand() in cpow2, cpow3, hypertile1, hypertile3D1, hypertile3D2, juliac, juliaq.
-Fix program crashing during density filtering on some Nvidia cards.
-hypertile3D1 was wrong.
-Parsing phoenix_julia when coming from Apophysis was wrong.
-Density filtering was freezing on certain Nvidia cards.
--Code changes
-Optimize juliac, npolar.
-Add a new function Crand() which behaves like the legacy C rand() which returns an integer between 0 and 32766, inclusive.
-Use RandBit() in some places.
-Remove Zeps() from vignette, it's not needed.
-Restructure OpenCL code for density filtering such that it does not hang after being compiled on some Nvidia cards, such as the gtx 1660. Remove barriers from conditionals where possible.
2020-12-29 00:46:55 -05:00
" for (j = densityBoxTopY; j <= densityBoxBottomY; j++) \n "
" { \n "
" for (i = densityBoxLeftX; i <= densityBoxRightX; i++) \n "
2016-03-12 22:25:19 -05:00
" { \n "
--Bug fixes
-Fix improper usage of rand() in cpow2, cpow3, hypertile1, hypertile3D1, hypertile3D2, juliac, juliaq.
-Fix program crashing during density filtering on some Nvidia cards.
-hypertile3D1 was wrong.
-Parsing phoenix_julia when coming from Apophysis was wrong.
-Density filtering was freezing on certain Nvidia cards.
--Code changes
-Optimize juliac, npolar.
-Add a new function Crand() which behaves like the legacy C rand() which returns an integer between 0 and 32766, inclusive.
-Use RandBit() in some places.
-Remove Zeps() from vignette, it's not needed.
-Restructure OpenCL code for density filtering such that it does not hang after being compiled on some Nvidia cards, such as the gtx 1660. Remove barriers from conditionals where possible.
2020-12-29 00:46:55 -05:00
" filterSelect += histogram[i + (j * densityFilter->m_SuperRasW)].w; \n "
2016-03-12 22:25:19 -05:00
" } \n "
--Bug fixes
-Fix improper usage of rand() in cpow2, cpow3, hypertile1, hypertile3D1, hypertile3D2, juliac, juliaq.
-Fix program crashing during density filtering on some Nvidia cards.
-hypertile3D1 was wrong.
-Parsing phoenix_julia when coming from Apophysis was wrong.
-Density filtering was freezing on certain Nvidia cards.
--Code changes
-Optimize juliac, npolar.
-Add a new function Crand() which behaves like the legacy C rand() which returns an integer between 0 and 32766, inclusive.
-Use RandBit() in some places.
-Remove Zeps() from vignette, it's not needed.
-Restructure OpenCL code for density filtering such that it does not hang after being compiled on some Nvidia cards, such as the gtx 1660. Remove barriers from conditionals where possible.
2020-12-29 00:46:55 -05:00
" } \n "
2016-03-12 22:25:19 -05:00
" \n " ;
2014-12-05 21:30:46 -05:00
2014-07-08 03:11:14 -04:00
if ( doScf )
2016-03-12 22:25:19 -05:00
os < <
--Bug fixes
-Fix improper usage of rand() in cpow2, cpow3, hypertile1, hypertile3D1, hypertile3D2, juliac, juliaq.
-Fix program crashing during density filtering on some Nvidia cards.
-hypertile3D1 was wrong.
-Parsing phoenix_julia when coming from Apophysis was wrong.
-Density filtering was freezing on certain Nvidia cards.
--Code changes
-Optimize juliac, npolar.
-Add a new function Crand() which behaves like the legacy C rand() which returns an integer between 0 and 32766, inclusive.
-Use RandBit() in some places.
-Remove Zeps() from vignette, it's not needed.
-Restructure OpenCL code for density filtering such that it does not hang after being compiled on some Nvidia cards, such as the gtx 1660. Remove barriers from conditionals where possible.
2020-12-29 00:46:55 -05:00
" filterSelect *= scfact; \n " ;
2014-07-08 03:11:14 -04:00
}
else
{
2015-03-30 20:46:52 -04:00
os < <
--Bug fixes
-Fix improper usage of rand() in cpow2, cpow3, hypertile1, hypertile3D1, hypertile3D2, juliac, juliaq.
-Fix program crashing during density filtering on some Nvidia cards.
-hypertile3D1 was wrong.
-Parsing phoenix_julia when coming from Apophysis was wrong.
-Density filtering was freezing on certain Nvidia cards.
--Code changes
-Optimize juliac, npolar.
-Add a new function Crand() which behaves like the legacy C rand() which returns an integer between 0 and 32766, inclusive.
-Use RandBit() in some places.
-Remove Zeps() from vignette, it's not needed.
-Restructure OpenCL code for density filtering such that it does not hang after being compiled on some Nvidia cards, such as the gtx 1660. Remove barriers from conditionals where possible.
2020-12-29 00:46:55 -05:00
" filterSelect = bucket.w; \n " ;
2014-07-08 03:11:14 -04:00
}
2014-12-05 21:30:46 -05:00
2014-07-08 03:11:14 -04:00
os < <
--Bug fixes
-Fix improper usage of rand() in cpow2, cpow3, hypertile1, hypertile3D1, hypertile3D2, juliac, juliaq.
-Fix program crashing during density filtering on some Nvidia cards.
-hypertile3D1 was wrong.
-Parsing phoenix_julia when coming from Apophysis was wrong.
-Density filtering was freezing on certain Nvidia cards.
--Code changes
-Optimize juliac, npolar.
-Add a new function Crand() which behaves like the legacy C rand() which returns an integer between 0 and 32766, inclusive.
-Use RandBit() in some places.
-Remove Zeps() from vignette, it's not needed.
-Restructure OpenCL code for density filtering such that it does not hang after being compiled on some Nvidia cards, such as the gtx 1660. Remove barriers from conditionals where possible.
2020-12-29 00:46:55 -05:00
" } \n "
" else \n "
" { \n "
" cacheLog = 0.0; \n "
" filterSelect = 1.0; \n " //Will subtract 1 to be 0 below.
" } \n "
2016-03-12 22:25:19 -05:00
" \n "
--Bug fixes
-Fix improper usage of rand() in cpow2, cpow3, hypertile1, hypertile3D1, hypertile3D2, juliac, juliaq.
-Fix program crashing during density filtering on some Nvidia cards.
-hypertile3D1 was wrong.
-Parsing phoenix_julia when coming from Apophysis was wrong.
-Density filtering was freezing on certain Nvidia cards.
--Code changes
-Optimize juliac, npolar.
-Add a new function Crand() which behaves like the legacy C rand() which returns an integer between 0 and 32766, inclusive.
-Use RandBit() in some places.
-Remove Zeps() from vignette, it's not needed.
-Restructure OpenCL code for density filtering such that it does not hang after being compiled on some Nvidia cards, such as the gtx 1660. Remove barriers from conditionals where possible.
2020-12-29 00:46:55 -05:00
" if (filterSelect > densityFilter->m_MaxFilteredCounts) \n "
" filterSelectInt = densityFilter->m_MaxFilterIndex; \n "
" else if (filterSelect <= DE_THRESH) \n "
" filterSelectInt = (int)ceil(filterSelect) - 1; \n "
" else \n "
" filterSelectInt = (int)DE_THRESH + (int)floor(pow((real_bucket_t)(filterSelect - DE_THRESH), densityFilter->m_Curve)); \n "
2016-03-12 22:25:19 -05:00
" \n "
--Bug fixes
-Fix improper usage of rand() in cpow2, cpow3, hypertile1, hypertile3D1, hypertile3D2, juliac, juliaq.
-Fix program crashing during density filtering on some Nvidia cards.
-hypertile3D1 was wrong.
-Parsing phoenix_julia when coming from Apophysis was wrong.
-Density filtering was freezing on certain Nvidia cards.
--Code changes
-Optimize juliac, npolar.
-Add a new function Crand() which behaves like the legacy C rand() which returns an integer between 0 and 32766, inclusive.
-Use RandBit() in some places.
-Remove Zeps() from vignette, it's not needed.
-Restructure OpenCL code for density filtering such that it does not hang after being compiled on some Nvidia cards, such as the gtx 1660. Remove barriers from conditionals where possible.
2020-12-29 00:46:55 -05:00
" if (filterSelectInt > densityFilter->m_MaxFilterIndex) \n "
" filterSelectInt = densityFilter->m_MaxFilterIndex; \n "
2016-03-12 22:25:19 -05:00
" \n "
--Bug fixes
-Fix improper usage of rand() in cpow2, cpow3, hypertile1, hypertile3D1, hypertile3D2, juliac, juliaq.
-Fix program crashing during density filtering on some Nvidia cards.
-hypertile3D1 was wrong.
-Parsing phoenix_julia when coming from Apophysis was wrong.
-Density filtering was freezing on certain Nvidia cards.
--Code changes
-Optimize juliac, npolar.
-Add a new function Crand() which behaves like the legacy C rand() which returns an integer between 0 and 32766, inclusive.
-Use RandBit() in some places.
-Remove Zeps() from vignette, it's not needed.
-Restructure OpenCL code for density filtering such that it does not hang after being compiled on some Nvidia cards, such as the gtx 1660. Remove barriers from conditionals where possible.
2020-12-29 00:46:55 -05:00
" filterCoefIndex = filterSelectInt * densityFilter->m_KernelSize; \n "
2016-03-12 22:25:19 -05:00
" \n "
//With this new method, only accumulate to the temp local buffer first. Write to the final accumulator last.
//For each loop through, note that there is a local memory barrier call inside of each call to AddToAccumNoCheck().
//If this isn't done, pixel errors occurr and even an out of resources error occurrs because too many writes are done to the same place in memory at once.
--Bug fixes
-Fix improper usage of rand() in cpow2, cpow3, hypertile1, hypertile3D1, hypertile3D2, juliac, juliaq.
-Fix program crashing during density filtering on some Nvidia cards.
-hypertile3D1 was wrong.
-Parsing phoenix_julia when coming from Apophysis was wrong.
-Density filtering was freezing on certain Nvidia cards.
--Code changes
-Optimize juliac, npolar.
-Add a new function Crand() which behaves like the legacy C rand() which returns an integer between 0 and 32766, inclusive.
-Use RandBit() in some places.
-Remove Zeps() from vignette, it's not needed.
-Restructure OpenCL code for density filtering such that it does not hang after being compiled on some Nvidia cards, such as the gtx 1660. Remove barriers from conditionals where possible.
2020-12-29 00:46:55 -05:00
" k = (int)densityFilter->m_FilterWidth; \n " //Need a signed int to use below, really is filter width, but reusing a variable to save space.
2016-03-12 22:25:19 -05:00
" \n "
--Bug fixes
-Fix improper usage of rand() in cpow2, cpow3, hypertile1, hypertile3D1, hypertile3D2, juliac, juliaq.
-Fix program crashing during density filtering on some Nvidia cards.
-hypertile3D1 was wrong.
-Parsing phoenix_julia when coming from Apophysis was wrong.
-Density filtering was freezing on certain Nvidia cards.
--Code changes
-Optimize juliac, npolar.
-Add a new function Crand() which behaves like the legacy C rand() which returns an integer between 0 and 32766, inclusive.
-Use RandBit() in some places.
-Remove Zeps() from vignette, it's not needed.
-Restructure OpenCL code for density filtering such that it does not hang after being compiled on some Nvidia cards, such as the gtx 1660. Remove barriers from conditionals where possible.
2020-12-29 00:46:55 -05:00
" for (j = -k; j <= k; j++) \n "
" { \n "
" for (i = -k; i <= k; i++) \n "
" { \n "
" filterSelectInt = filterCoefIndex + coefIndices[(abs(j) * (densityFilter->m_FilterWidth + 1)) + abs(i)]; \n " //Really is filterCoeffIndexPlusOffset, but reusing a variable to save space.
2016-03-12 22:25:19 -05:00
" \n "
--Bug fixes
-Fix improper usage of rand() in cpow2, cpow3, hypertile1, hypertile3D1, hypertile3D2, juliac, juliaq.
-Fix program crashing during density filtering on some Nvidia cards.
-hypertile3D1 was wrong.
-Parsing phoenix_julia when coming from Apophysis was wrong.
-Density filtering was freezing on certain Nvidia cards.
--Code changes
-Optimize juliac, npolar.
-Add a new function Crand() which behaves like the legacy C rand() which returns an integer between 0 and 32766, inclusive.
-Use RandBit() in some places.
-Remove Zeps() from vignette, it's not needed.
-Restructure OpenCL code for density filtering such that it does not hang after being compiled on some Nvidia cards, such as the gtx 1660. Remove barriers from conditionals where possible.
2020-12-29 00:46:55 -05:00
" if (filterCoefs[filterSelectInt] != 0) \n " //This conditional actually improves speed, despite SIMT being bad at conditionals.
" { \n "
" filterBox[(i + boxCol) + ((j + boxRow) * fullTempBoxWidth)].m_Real4 += (bucket * (filterCoefs[filterSelectInt] * cacheLog)); \n "
2016-03-12 22:25:19 -05:00
" } \n "
--Bug fixes
-Fix improper usage of rand() in cpow2, cpow3, hypertile1, hypertile3D1, hypertile3D2, juliac, juliaq.
-Fix program crashing during density filtering on some Nvidia cards.
-hypertile3D1 was wrong.
-Parsing phoenix_julia when coming from Apophysis was wrong.
-Density filtering was freezing on certain Nvidia cards.
--Code changes
-Optimize juliac, npolar.
-Add a new function Crand() which behaves like the legacy C rand() which returns an integer between 0 and 32766, inclusive.
-Use RandBit() in some places.
-Remove Zeps() from vignette, it's not needed.
-Restructure OpenCL code for density filtering such that it does not hang after being compiled on some Nvidia cards, such as the gtx 1660. Remove barriers from conditionals where possible.
2020-12-29 00:46:55 -05:00
" } \n "
" barrier(CLK_LOCAL_MEM_FENCE); \n " //If this is the only barrier and the block size is exactly 16, it works perfectly. Otherwise, no chunks occur, but a many streaks.
" } \n "
2016-03-12 22:25:19 -05:00
" \n "
" barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); \n "
" \n "
" if (THREAD_ID_Y == 0) \n "
" { \n "
//At this point, all threads in this block have applied the filter to their surrounding pixels and stored the results in the temp local box.
//Add the cells of it that are in bounds to the global accumulator.
//Compute offsets in local box to read from, and offsets into global accumulator to write to.
//Use a method here that is similar to the zeroization above: Each thread (column) in the first row iterates through all of the
//rows and adds a few columns to the accumulator.
" for (i = boxReadStartRow, j = accumWriteStartRow; i < boxReadEndRow; i++, j++) \n "
" { \n "
" for (k = 0; k < colsToWrite; k++) \n " //Each thread writes a few columns.
" { \n "
" boxCol = (colsToWrite * THREAD_ID_X) + k; \n " //Really is colOffset, but reusing a variable to save space.
" \n "
" if (boxReadStartCol + boxCol < boxReadEndCol) \n "
" accumulator[(j * densityFilter->m_SuperRasW) + (accumWriteStartCol + boxCol)].m_Real4 += filterBox[(i * fullTempBoxWidth) + (boxReadStartCol + boxCol)].m_Real4; \n "
" } \n "
" barrier(CLK_GLOBAL_MEM_FENCE); \n " //This must be here or else chunks will go missing.
" } \n "
" } \n "
" } \n " ;
2014-07-08 03:11:14 -04:00
return os . str ( ) ;
}
/// <summary>
/// Create the gaussian density filtering kernel string, but use no local cache and perform
/// all writes directly to the global density filtering buffer.
/// Note this applies the filter from top to bottom row and not from the center outward like the CPU version does.
/// This allows the image to be filtered without suffering from pixel loss due to race conditions.
/// This is used for when the filter box is greater than can fit in the local cache.
/// While the cached version is incredibly fast, this version offers no real gain over doing it
/// on the CPU because the frequent global memory access brings performance to a crawl.
/// The supersample parameter will produce three different kernels.
/// SS = 1, SS > 1 && SS even, SS > 1 && SS odd.
2014-11-28 04:37:51 -05:00
/// The width of the kernel this runs in must be evenly divisible by 16 or else artifacts will occur.
2014-07-08 03:11:14 -04:00
/// Note that because this function uses so many variables and is so complex, OpenCL can easily run
/// out of resources in some cases. Certain variables had to be reused to condense the kernel footprint
/// down enough to be able to run a block size of 32x32.
/// For double precision, or for SS > 1, a size of 32x30 is used.
/// </summary>
/// <param name="ss">The supersample being used</param>
/// <returns>The kernel string</returns>
2015-08-10 23:10:23 -04:00
string DEOpenCLKernelCreator : : CreateGaussianDEKernelNoLocalCache ( size_t ss )
2014-07-08 03:11:14 -04:00
{
bool doSS = ss > 1 ;
bool doScf = ! ( ss & 1 ) ;
ostringstream os ;
os < <
2016-03-12 22:25:19 -05:00
ConstantDefinesString ( m_DoublePrecision ) < <
DensityFilterCLStructString < <
UnionCLStructString < <
AddToAccumWithCheckFunctionString < <
" __kernel void " < < GaussianDEEntryPoint ( ss , MaxDEFilterSize ( ) + 1 ) < < " ( \n " < <
" const __global real4_bucket* histogram, \n "
" __global real4reals_bucket* accumulator, \n "
" __constant DensityFilterCL* densityFilter, \n "
" const __global real_bucket_t* filterCoefs, \n "
" const __global real_bucket_t* filterWidths, \n "
" const __global uint* coefIndices, \n "
" const uint chunkSizeW, \n "
" const uint chunkSizeH, \n "
" const uint colChunkPass, \n "
" const uint rowChunkPass \n "
" \t ) \n "
" { \n "
" if (((((BLOCK_ID_X * chunkSizeW) + colChunkPass) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) || \n "
" ((((BLOCK_ID_Y * chunkSizeH) + rowChunkPass) * BLOCK_SIZE_Y) + THREAD_ID_Y >= densityFilter->m_SuperRasH)) \n "
" return; \n "
" \n " ;
2014-12-05 21:30:46 -05:00
2014-07-08 03:11:14 -04:00
if ( doSS )
{
2016-03-12 22:25:19 -05:00
os < <
" uint ss = (uint)floor((real_bucket_t)densityFilter->m_Supersample / 2.0); \n "
" int densityBoxLeftX; \n "
" int densityBoxRightX; \n "
" int densityBoxTopY; \n "
" int densityBoxBottomY; \n " ;
if ( doScf )
os < < " real_bucket_t scfact = pow((real_bucket_t)densityFilter->m_Supersample / ((real_bucket_t)densityFilter->m_Supersample + (real_bucket_t)1.0), (real_bucket_t)2.0); \n " ;
2014-07-08 03:11:14 -04:00
}
2014-12-05 21:30:46 -05:00
2014-07-08 03:11:14 -04:00
os < <
2016-03-12 22:25:19 -05:00
//Compute the bounds of the area to be sampled, which is just the ends minus the super sample minus 1.
" uint leftBound = densityFilter->m_Supersample - 1; \n "
" uint rightBound = densityFilter->m_SuperRasW - (densityFilter->m_Supersample - 1); \n "
" uint topBound = densityFilter->m_Supersample - 1; \n "
" uint botBound = densityFilter->m_SuperRasH - (densityFilter->m_Supersample - 1); \n "
" \n "
//Start and end values are the indices in the histogram read from and written to in the accumulator.
//Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds.
" uint blockHistStartRow = min(botBound, (uint)(topBound + (((BLOCK_ID_Y * chunkSizeH) + rowChunkPass) * BLOCK_SIZE_Y))); \n " //The first histogram row this block will process.
" uint threadHistRow = blockHistStartRow + THREAD_ID_Y; \n " //The histogram row this individual thread will be reading from.
" \n "
" uint blockHistStartCol = min(rightBound, leftBound + (uint)(((BLOCK_ID_X * chunkSizeW) + colChunkPass) * BLOCK_SIZE_X)); \n " //The first histogram column this block will process.
" uint threadHistCol = blockHistStartCol + THREAD_ID_X; \n " //The histogram column this individual thread will be reading from.
" \n "
" int i, j; \n "
" uint filterSelectInt, filterCoefIndex; \n "
" real_bucket_t cacheLog; \n "
" real_bucket_t filterSelect; \n "
" real4_bucket bucket; \n "
" \n "
" if (threadHistRow < botBound && threadHistCol < rightBound) \n "
" { \n "
" bucket = histogram[(threadHistRow * densityFilter->m_SuperRasW) + threadHistCol]; \n "
" \n "
" if (bucket.w != 0) \n "
" { \n "
--User changes:
-Show common folder locations such as documents, downloads, pictures in the sidebar in all file dialogs.
-Warning message about exceeding memory in final render dialog now suggests strips as the solution to the problem.
-Strips now has a tooltip explaining what it does.
-Allow more digits in the spinners on the color section the flame tab.
-Add manually adjustable size spinners in the final render dialog. Percentage scale and absolute size are fully synced.
-Default prefix in final render is now the filename when doing animations (coming from sequence section of the library tab).
-Changed the elliptic variation back to using a less precise version for float, and a more precise version for double. The last release had it always using double.
-New applied xaos table that shows a read-only view of actual weights by taking the base xform weights and multiplying them by the xaos values.
-New table in the xaos tab that gives a graphical representation of the probability that each xform is chosen, with and without xaos.
-Add button to transpose the xaos rows and columns.
-Add support for importing .chaos files from Chaotica.
--Pasting back to Chaotica will work for most, but not all, variations due to incompatible parameter names in some.
-Curves are now splines instead of Bezier. This adds compatibility with Chaotica, but breaks it for Apophysis. Xmls are still pastable, but the color curves will look different.
--The curve editor on the palette tab can now add points by clicking on the lines and remove points by clicking on the points themselves, just like Chaotica.
--Splines are saved in four new xml fields: overall_curve, red_curve, green_curve and blue_curve.
-Allow for specifying the percentage of a sub batch each thread should iterate through per kernel call when running with OpenCL. This gives a roughly 1% performance increase due to having to make less kernel calls while iterating.
--This field is present for interactive editing (where it's not very useful) and in the final render dialog.
--On the command line, this is specified as --sbpctth for EmberRender and EmberAnimate.
-Allow double clicking to toggle the supersample field in the flame tab between 1 and 2 for easily checking the effect of the field.
-When showing affine values as polar coordinates, show angles normalized to 360 to match Chaotica.
-Fuse Count spinner now toggles between 15 and 100 when double clicking for easily checking the effect of the field.
-Added field for limiting the range in the x and y direction that the initial points are chosen from.
-Added a field called K2 which is an alternative way to set brightness, ignored when zero.
--This has no effect for many variations, but hs a noticeable effect for some.
-Added new variations:
arcsech
arcsech2
arcsinh
arctanh
asteria
block
bwraps_rand
circlecrop2
coth_spiral
crackle2
depth_blur
depth_blur2
depth_gaussian
depth_gaussian2
depth_ngon
depth_ngon2
depth_sine
depth_sine2
dragonfire
dspherical
dust
excinis
exp2
flipx
flowerdb
foci_p
gaussian
glynnia2
glynnsim4
glynnsim5
henon
henon
hex_rand
hex_truchet
hypershift
lazyjess
lens
lozi
lozi
modulusx
modulusy
oscilloscope2
point_symmetry
pointsymmetry
projective
pulse
rotate
scry2
shift
smartshape
spher
squares
starblur2
swirl3
swirl3r
tanh_spiral
target0
target2
tile_hlp
truchet_glyph
truchet_inv
truchet_knot
unicorngaloshen
vibration
vibration2
--hex_truchet, hex_rand should always use double. They are extremely sensitive.
--Bug fixes:
-Bounds sign was flipped for x coordinate of world space when center was not zero.
-Right clicking and dragging spinner showed menu on mouse up, even if it was very far away.
-Text boxes for size in final render dialog were hard to type in. Same bug as xform weight used to be so fix the same way.
-Fix spelling to be plural in toggle color speed box.
-Stop using the blank user palette to generate flames. Either put colored palettes in it, or exclude it from randoms.
-Clicking the random palette button for a palette file with only one palette in it would freeze the program.
-Clicking none scale in final render did not re-render the preview.
-Use less precision on random xaos. No need for 12 decimal places.
-The term sub batch is overloaded in the options dialog. Change the naming and tooltip of those settings for cpu and opencl.
--Also made clear in the tooltip for the default opencl quality setting that the value is per device.
-The arrows spinner in palette editor appears like a read-only label. Made it look like a spinner.
-Fix border colors for various spin boxes and table headers in the style sheet. Requires reload.
-Fix a bug in the bwraps variation which would produce different results than Chaotica and Apophysis.
-Synth was allowed to be selected for random flame generation when using an Nvidia card but it shouldn't have been because Nvidia has a hard time compiling synth.
-A casting bug in the OpenCL kernels for log scaling and density filtering was preventing successful compilations on Intel iGPUs. Fixed even though we don't support anything other than AMD and Nvidia.
-Palette rotation (click and drag) position was not being reset when loading a new flame.
-When the xform circles were hidden, opening and closing the options dialog would improperly reshow them.
-Double click toggle was broken on integer spin boxes.
-Fixed tab order of some controls.
-Creating a palette from a jpg in the palette editor only produced a single color.
--Needed to package imageformats/qjpeg.dll with the Windows installer.
-The basic memory benchmark test flame was not really testing memory. Make it more spread out.
-Remove the temporal samples field from the flame tab, it was never used because it's only an animation parameter which is specified in the final render dialog or on the command line with EmberAnimate.
--Code changes:
-Add IsEmpty() to Palette to determine if a palette is all black.
-Attempt to avoid selecting a blank palette in PaletteList::GetRandomPalette().
-Add function ScanForChaosNodes() and some associated helper functions in XmlToEmber.
-Make variation param name correction be case insensitive in XmlToEmber.
-Report error when assigning a variation param value in XmlToEmber.
-Add SubBatchPercentPerThread() method to RendererCL.
-Override enterEvent() and leaveEvent() in DoubleSpinBox and SpinBox to prevent the context menu from showing up on right mouse up after already leaving the spinner.
-Filtering the mouse wheel event in TableWidget no longer appears to be needed. It was probably an old Qt bug that has been fixed.
-Gui/ember syncing code in the final render dialog needed to be reworked to accommodate absolute sizes.
2019-04-13 22:00:46 -04:00
" cacheLog = (densityFilter->m_K1 * log((real_bucket_t)fma(bucket.w, densityFilter->m_K2, (real_bucket_t)1.0))) / bucket.w; \n " ;
2014-12-05 21:30:46 -05:00
2014-07-08 03:11:14 -04:00
if ( doSS )
{
os < <
2016-03-12 22:25:19 -05:00
" filterSelect = 0; \n "
" densityBoxLeftX = threadHistCol - min(threadHistCol, ss); \n "
" densityBoxRightX = threadHistCol + min(ss, (densityFilter->m_SuperRasW - threadHistCol) - 1); \n "
" densityBoxTopY = threadHistRow - min(threadHistRow, ss); \n "
" densityBoxBottomY = threadHistRow + min(ss, (densityFilter->m_SuperRasH - threadHistRow) - 1); \n "
" \n "
" for (j = densityBoxTopY; j <= densityBoxBottomY; j++) \n "
" { \n "
" for (i = densityBoxLeftX; i <= densityBoxRightX; i++) \n "
" { \n "
" filterSelect += histogram[i + (j * densityFilter->m_SuperRasW)].w; \n "
" } \n "
" } \n "
" \n " ;
2014-12-05 21:30:46 -05:00
2014-07-08 03:11:14 -04:00
if ( doScf )
2016-03-12 22:25:19 -05:00
os < <
" filterSelect *= scfact; \n " ;
2014-07-08 03:11:14 -04:00
}
else
{
os
2016-03-12 22:25:19 -05:00
< < " filterSelect = bucket.w; \n " ;
2014-07-08 03:11:14 -04:00
}
2014-12-05 21:30:46 -05:00
2014-07-08 03:11:14 -04:00
os < <
2016-03-12 22:25:19 -05:00
" \n "
" if (filterSelect > densityFilter->m_MaxFilteredCounts) \n "
" filterSelectInt = densityFilter->m_MaxFilterIndex; \n "
" else if (filterSelect <= DE_THRESH) \n "
" filterSelectInt = (int)ceil(filterSelect) - 1; \n "
" else \n "
" filterSelectInt = (int)DE_THRESH + (int)floor(pow((real_bucket_t)(filterSelect - DE_THRESH), densityFilter->m_Curve)); \n "
" \n "
" if (filterSelectInt > densityFilter->m_MaxFilterIndex) \n "
" filterSelectInt = densityFilter->m_MaxFilterIndex; \n "
" \n "
" filterCoefIndex = filterSelectInt * densityFilter->m_KernelSize; \n "
" \n "
" int fw = (int)densityFilter->m_FilterWidth; \n " //Need a signed int to use below.
" \n "
" for (j = -fw; j <= fw; j++) \n "
" { \n "
" for (i = -fw; i <= fw; i++) \n "
" { \n "
" if (AccumCheck(densityFilter->m_SuperRasW, densityFilter->m_SuperRasH, threadHistCol, i, threadHistRow, j)) \n "
" { \n "
" filterSelectInt = filterCoefIndex + coefIndices[(abs(j) * (densityFilter->m_FilterWidth + 1)) + abs(i)]; \n " //Really is filterCoeffIndexPlusOffset, but reusing a variable to save space.
" \n "
" if (filterCoefs[filterSelectInt] != 0) \n "
" { \n "
" accumulator[(i + threadHistCol) + ((j + threadHistRow) * densityFilter->m_SuperRasW)].m_Real4 += (bucket * (filterCoefs[filterSelectInt] * cacheLog)); \n "
" } \n "
" } \n "
" \n "
" barrier(CLK_GLOBAL_MEM_FENCE); \n " //Required to avoid streaks.
" } \n "
" } \n "
" } \n " //bucket.w != 0.
" } \n " //In bounds.
//"\n"
//" barrier(CLK_GLOBAL_MEM_FENCE);\n"//Just to be safe.
" } \n " ;
2014-07-08 03:11:14 -04:00
return os . str ( ) ;
}
2014-12-05 21:30:46 -05:00
}