--User changes

 -Add new variations: crackle, dc_perlin.
 -Make the default palette interpolation mode linear instead of step.
 -Make the summary tab the selected one in the Info tab.
 -Allow for a highlight power of up to 10; it was previously limited to 2.

--Bug fixes
 -Direct color calculations were wrong.
 -Flattening was not applied to the final xform.
 -Fix "pure virtual function call" error on shutdown.

--Code changes
 -Allow for array precalc params in variations by adding a size member to the ParamWithName class.
  -In IterOpenCLKernelCreator, memcpy precalc params instead of assigning directly, since they can now be of variable length (see the first sketch below this list).
 -Add a new file, VarFuncs, to consolidate functions that are common to multiple variations. It also contains the texture data for crackle and dc_perlin.
  -Place the OpenCL versions of these functions in the FunctionMapper class in the EmberCL project.
 -Add a new Singleton class that uses CRTP, is thread safe, and deletes the instance after the last reference goes away. This fixes the usual "delete after main()" problem with singletons that use the static local function variable pattern (see the second sketch below this list).
 -Began saving files with the AStyle autoformatter turned on. This will eventually touch all files as they are worked on.
 -Add missing backslash to CUDA include and library paths for builds on Nvidia systems.
 -Add missing gl.h include for Windows.
 -Remove glew include paths from Fractorium; glew is not used.
 -Remove any Nvidia-specific #defines and build targets; they are no longer needed with OpenCL 1.2.
 -Fix bad paths in the Linux build.
 -General cleanup.
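
The two sketches below illustrate the precalc param copy and the new Singleton class mentioned above. They are minimal C++ sketches under assumed names (ParamSketch, FlattenParams, InfoExample), not the actual Ember/EmberCL code.

First, why variable-length precalc params need a memcpy instead of a direct assign: once a param can describe an array, the copy has to honor its size member rather than moving a single value.

#include <cstring>
#include <vector>

template <typename T>
struct ParamSketch
{
    T* m_Param;   //Points at a single value, or at the first element of an array.
    size_t m_Size;//Size of the param data in bytes; always sizeof(T) before, possibly larger now.
};

//Pack all precalc param values into one flat buffer for upload to the device.
template <typename T>
void FlattenParams(const std::vector<ParamSketch<T>>& params, std::vector<T>& out)
{
    for (auto& p : params)
    {
        size_t count = p.m_Size / sizeof(T);
        size_t offset = out.size();
        out.resize(offset + count);
        //A direct assign (out[offset] = *p.m_Param) would copy only one element,
        //so copy the whole run of values instead.
        std::memcpy(out.data() + offset, p.m_Param, p.m_Size);
    }
}

Second, the CRTP singleton idea: Instance() hands out a shared_ptr, so the object is destroyed as soon as the last holder releases it rather than during static destruction after main(). This is only a guess at the shape of the new class; the real Singleton (and its SINGLETON_DERIVED_IMPL macro, visible in the OpenCLInfo diff below) may differ.

#include <memory>
#include <mutex>

template <typename T>
class Singleton
{
public:
    //Thread safe creation; the instance dies with its last shared_ptr, not after main().
    static std::shared_ptr<T> Instance()
    {
        static std::mutex mtx;
        static std::weak_ptr<T> weakInst;//Does not keep the object alive by itself.
        std::lock_guard<std::mutex> lock(mtx);
        auto inst = weakInst.lock();

        if (!inst)
        {
            inst = std::shared_ptr<T>(new T());
            weakInst = inst;
        }

        return inst;
    }

protected:
    Singleton() = default;//Only derived classes construct.
};

//A class opts in via CRTP and befriends the base so its constructor can stay private.
class InfoExample : public Singleton<InfoExample>
{
    friend class Singleton<InfoExample>;

private:
    InfoExample() = default;
};

//Usage: auto info = InfoExample::Instance(); every caller shares one object,
//and it is deleted when the last shared_ptr goes away.
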
Author: mfeemster
Date:   2015-12-31 13:41:59 -08:00
Commit: 6ba16888e3
Parent: 914b5412c3
57 changed files with 3444 additions and 2433 deletions


@ -16,6 +16,7 @@
#if defined(_WIN32)
#include <windows.h>
#include <SDKDDKVer.h>
#include "GL/gl.h"
#elif defined(__APPLE__)
#include <OpenGL/gl.h>
#else
@ -23,21 +24,7 @@
#endif
#include <utility>
#ifdef NVIDIA
#ifdef CL_VERSION_1_2
#undef CL_VERSION_1_2
#endif
#if !defined(WIN32) && !defined(_WIN32)
#ifndef CL_VERSION_1_1
#define CL_VERSION_1_1
#endif
#endif
#endif
#include <CL/cl.hpp>
#include <algorithm>
#include <atomic>
#include <cstdio>


@ -23,100 +23,101 @@ static string ConstantDefinesString(bool doublePrecision)
if (doublePrecision)
{
os << "#if defined(cl_amd_fp64)\n"//AMD extension available?
" #pragma OPENCL EXTENSION cl_amd_fp64 : enable\n"
"#endif\n"
"#if defined(cl_khr_fp64)\n"//Khronos extension available?
" #pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"#endif\n"
"#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n"//Only supported on nVidia.
"typedef long intPrec;\n"
"typedef uint atomi;\n"//Same size as real_bucket_t, always 4 bytes.
"typedef double real_t;\n"
"typedef float real_bucket_t;\n"//Assume buckets are always float, even though iter calcs are in double.
"typedef double2 real2;\n"
"typedef double4 real4;\n"
"typedef float4 real4_bucket;\n"//And here too.
"#define EPS (DBL_EPSILON)\n"
"#define TLOW (DBL_MIN)\n"
"#define TMAX (DBL_MAX)\n"
;
" #pragma OPENCL EXTENSION cl_amd_fp64 : enable\n"
"#endif\n"
"#if defined(cl_khr_fp64)\n"//Khronos extension available?
" #pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"#endif\n"
"#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n"//Only supported on nVidia.
"typedef long intPrec;\n"
"typedef uint atomi;\n"//Same size as real_bucket_t, always 4 bytes.
"typedef double real_t;\n"
"typedef float real_bucket_t;\n"//Assume buckets are always float, even though iter calcs are in double.
"typedef double2 real2;\n"
"typedef double3 real3;\n"
"typedef double4 real4;\n"
"typedef float4 real4_bucket;\n"//And here too.
"#define EPS (DBL_EPSILON)\n"
"#define TLOW (DBL_MIN)\n"
"#define TMAX (DBL_MAX)\n"
;
}
else
{
os << "typedef int intPrec;\n"
"typedef uint atomi;\n"
"typedef float real_t;\n"
"typedef float real_bucket_t;\n"
"typedef float2 real2;\n"
"typedef float4 real4;\n"
"typedef float4 real4_bucket;\n"
"#define EPS (FLT_EPSILON)\n"
"#define TLOW (FLT_MIN)\n"
"#define TMAX (FLT_MAX)\n"
;
"typedef uint atomi;\n"
"typedef float real_t;\n"
"typedef float real_bucket_t;\n"
"typedef float2 real2;\n"
"typedef float3 real3;\n"
"typedef float4 real4;\n"
"typedef float4 real4_bucket;\n"
"#define EPS (FLT_EPSILON)\n"
"#define TLOW (FLT_MIN)\n"
"#define TMAX (FLT_MAX)\n"
;
}
os <<
"typedef long int int64;\n"
"typedef unsigned long int uint64;\n"
"\n"
"#define EPS6 ((1e-6))\n"
"\n"
"//The number of threads per block used in the iteration function. Don't change\n"
"//it lightly; the block size is hard coded to be exactly 32 x 8.\n"
"#define NTHREADS 256u\n"
"#define THREADS_PER_WARP 32u\n"
"#define NWARPS (NTHREADS / THREADS_PER_WARP)\n"
"#define COLORMAP_LENGTH 256u\n"
"#define COLORMAP_LENGTH_MINUS_1 255u\n"
"#define DE_THRESH 100u\n"
"#define BadVal(x) (((x) != (x)) || ((x) > 1e10) || ((x) < -1e10))\n"
"#define Rint(A) floor((A) + (((A) < 0) ? -0.5 : 0.5))\n"
"#define SQR(x) ((x) * (x))\n"
"#define CUBE(x) ((x) * (x) * (x))\n"
"#define M_2PI (M_PI * 2)\n"
"#define M_3PI (M_PI * 3)\n"
"#define SQRT5 2.2360679774997896964091736687313\n"
"#define M_PHI 1.61803398874989484820458683436563\n"
"#define DEG_2_RAD (M_PI / 180)\n"
"\n"
"//Index in each dimension of a thread within a block.\n"
"#define THREAD_ID_X (get_local_id(0))\n"
"#define THREAD_ID_Y (get_local_id(1))\n"
"#define THREAD_ID_Z (get_local_id(2))\n"
"\n"
"//Index in each dimension of a block within a grid.\n"
"#define BLOCK_ID_X (get_group_id(0))\n"
"#define BLOCK_ID_Y (get_group_id(1))\n"
"#define BLOCK_ID_Z (get_group_id(2))\n"
"\n"
"//Absolute index in each dimension of a thread within a grid.\n"
"#define GLOBAL_ID_X (get_global_id(0))\n"
"#define GLOBAL_ID_Y (get_global_id(1))\n"
"#define GLOBAL_ID_Z (get_global_id(2))\n"
"\n"
"//Dimensions of a block.\n"
"#define BLOCK_SIZE_X (get_local_size(0))\n"
"#define BLOCK_SIZE_Y (get_local_size(1))\n"
"#define BLOCK_SIZE_Z (get_local_size(2))\n"
"\n"
"//Dimensions of a grid, in terms of blocks.\n"
"#define GRID_SIZE_X (get_num_groups(0))\n"
"#define GRID_SIZE_Y (get_num_groups(1))\n"
"#define GRID_SIZE_Z (get_num_groups(2))\n"
"\n"
"//Dimensions of a grid, in terms of threads.\n"
"#define GLOBAL_SIZE_X (get_global_size(0))\n"
"#define GLOBAL_SIZE_Y (get_global_size(1))\n"
"#define GLOBAL_SIZE_Z (get_global_size(2))\n"
"\n"
"#define INDEX_IN_BLOCK_2D (THREAD_ID_Y * BLOCK_SIZE_X + THREAD_ID_X)\n"
"#define INDEX_IN_BLOCK_3D ((BLOCK_SIZE_X * BLOCK_SIZE_Y * THREAD_ID_Z) + INDEX_IN_BLOCK_2D)\n"
"\n"
"#define INDEX_IN_GRID_2D (GLOBAL_ID_Y * GLOBAL_SIZE_X + GLOBAL_ID_X)\n"
"#define INDEX_IN_GRID_3D ((GLOBAL_SIZE_X * GLOBAL_SIZE_Y * GLOBAL_ID_Z) + INDEX_IN_GRID_2D)\n"
"\n";
"typedef long int int64;\n"
"typedef unsigned long int uint64;\n"
"\n"
"#define EPS6 ((1e-6))\n"
"\n"
"//The number of threads per block used in the iteration function. Don't change\n"
"//it lightly; the block size is hard coded to be exactly 32 x 8.\n"
"#define NTHREADS 256u\n"
"#define THREADS_PER_WARP 32u\n"
"#define NWARPS (NTHREADS / THREADS_PER_WARP)\n"
"#define COLORMAP_LENGTH 256u\n"
"#define COLORMAP_LENGTH_MINUS_1 255u\n"
"#define DE_THRESH 100u\n"
"#define BadVal(x) (((x) != (x)) || ((x) > 1e10) || ((x) < -1e10))\n"
"#define Rint(A) floor((A) + (((A) < 0) ? -0.5 : 0.5))\n"
"#define SQR(x) ((x) * (x))\n"
"#define CUBE(x) ((x) * (x) * (x))\n"
"#define M_2PI (M_PI * 2)\n"
"#define M_3PI (M_PI * 3)\n"
"#define SQRT5 2.2360679774997896964091736687313\n"
"#define M_PHI 1.61803398874989484820458683436563\n"
"#define DEG_2_RAD (M_PI / 180)\n"
"\n"
"//Index in each dimension of a thread within a block.\n"
"#define THREAD_ID_X (get_local_id(0))\n"
"#define THREAD_ID_Y (get_local_id(1))\n"
"#define THREAD_ID_Z (get_local_id(2))\n"
"\n"
"//Index in each dimension of a block within a grid.\n"
"#define BLOCK_ID_X (get_group_id(0))\n"
"#define BLOCK_ID_Y (get_group_id(1))\n"
"#define BLOCK_ID_Z (get_group_id(2))\n"
"\n"
"//Absolute index in each dimension of a thread within a grid.\n"
"#define GLOBAL_ID_X (get_global_id(0))\n"
"#define GLOBAL_ID_Y (get_global_id(1))\n"
"#define GLOBAL_ID_Z (get_global_id(2))\n"
"\n"
"//Dimensions of a block.\n"
"#define BLOCK_SIZE_X (get_local_size(0))\n"
"#define BLOCK_SIZE_Y (get_local_size(1))\n"
"#define BLOCK_SIZE_Z (get_local_size(2))\n"
"\n"
"//Dimensions of a grid, in terms of blocks.\n"
"#define GRID_SIZE_X (get_num_groups(0))\n"
"#define GRID_SIZE_Y (get_num_groups(1))\n"
"#define GRID_SIZE_Z (get_num_groups(2))\n"
"\n"
"//Dimensions of a grid, in terms of threads.\n"
"#define GLOBAL_SIZE_X (get_global_size(0))\n"
"#define GLOBAL_SIZE_Y (get_global_size(1))\n"
"#define GLOBAL_SIZE_Z (get_global_size(2))\n"
"\n"
"#define INDEX_IN_BLOCK_2D (THREAD_ID_Y * BLOCK_SIZE_X + THREAD_ID_X)\n"
"#define INDEX_IN_BLOCK_3D ((BLOCK_SIZE_X * BLOCK_SIZE_Y * THREAD_ID_Z) + INDEX_IN_BLOCK_2D)\n"
"\n"
"#define INDEX_IN_GRID_2D (GLOBAL_ID_Y * GLOBAL_SIZE_X + GLOBAL_ID_X)\n"
"#define INDEX_IN_GRID_3D ((GLOBAL_SIZE_X * GLOBAL_SIZE_Y * GLOBAL_ID_Z) + INDEX_IN_GRID_2D)\n"
"\n";
return os.str();
}
@ -141,15 +142,15 @@ struct ALIGN PointCL
/// be supported, which will make it more than 4 members.
/// </summary>
static const char* PointCLStructString =
"typedef struct __attribute__ " ALIGN_CL " _Point\n"
"{\n"
" real_t m_X;\n"
" real_t m_Y;\n"
" real_t m_Z;\n"
" real_t m_ColorX;\n"
" uint m_LastXfUsed;\n"
"} Point;\n"
"\n";
"typedef struct __attribute__ " ALIGN_CL " _Point\n"
"{\n"
" real_t m_X;\n"
" real_t m_Y;\n"
" real_t m_Z;\n"
" real_t m_ColorX;\n"
" uint m_LastXfUsed;\n"
"} Point;\n"
"\n";
#define MAX_CL_VARS 8//These must always match.
#define MAX_CL_VARS_STRING "8"
@ -175,18 +176,18 @@ struct ALIGN XformCL
/// The xform structure used to iterate in OpenCL.
/// </summary>
static const char* XformCLStructString =
"typedef struct __attribute__ " ALIGN_CL " _XformCL\n"
"{\n"
" real_t m_A, m_B, m_C, m_D, m_E, m_F;\n"
" real_t m_VariationWeights[" MAX_CL_VARS_STRING "];\n"
" real_t m_PostA, m_PostB, m_PostC, m_PostD, m_PostE, m_PostF;\n"
" real_t m_DirectColor;\n"
" real_t m_ColorSpeedCache;\n"
" real_t m_OneMinusColorCache;\n"
" real_t m_Opacity;\n"
" real_t m_VizAdjusted;\n"
"} XformCL;\n"
"\n";
"typedef struct __attribute__ " ALIGN_CL " _XformCL\n"
"{\n"
" real_t m_A, m_B, m_C, m_D, m_E, m_F;\n"
" real_t m_VariationWeights[" MAX_CL_VARS_STRING "];\n"
" real_t m_PostA, m_PostB, m_PostC, m_PostD, m_PostE, m_PostF;\n"
" real_t m_DirectColor;\n"
" real_t m_ColorSpeedCache;\n"
" real_t m_OneMinusColorCache;\n"
" real_t m_Opacity;\n"
" real_t m_VizAdjusted;\n"
"} XformCL;\n"
"\n";
/// <summary>
/// A structure on the host used to hold all of the needed information for an ember used on the device to iterate in OpenCL.
@ -210,27 +211,27 @@ struct ALIGN EmberCL
/// The ember structure used to iterate in OpenCL.
/// </summary>
static const char* EmberCLStructString =
"typedef struct __attribute__ " ALIGN_CL " _EmberCL\n"
"{\n"
" real_t m_CamZPos;\n"
" real_t m_CamPerspective;\n"
" real_t m_CamYaw;\n"
" real_t m_CamPitch;\n"
" real_t m_CamDepthBlur;\n"
" real_t m_BlurCoef;\n"
" real_t m_C00;\n"
" real_t m_C01;\n"
" real_t m_C02;\n"
" real_t m_C10;\n"
" real_t m_C11;\n"
" real_t m_C12;\n"
" real_t m_C20;\n"
" real_t m_C21;\n"
" real_t m_C22;\n"
" real_t m_CenterX, m_CenterY;\n"
" real_t m_RotA, m_RotB, m_RotD, m_RotE;\n"
"} EmberCL;\n"
"\n";
"typedef struct __attribute__ " ALIGN_CL " _EmberCL\n"
"{\n"
" real_t m_CamZPos;\n"
" real_t m_CamPerspective;\n"
" real_t m_CamYaw;\n"
" real_t m_CamPitch;\n"
" real_t m_CamDepthBlur;\n"
" real_t m_BlurCoef;\n"
" real_t m_C00;\n"
" real_t m_C01;\n"
" real_t m_C02;\n"
" real_t m_C10;\n"
" real_t m_C11;\n"
" real_t m_C12;\n"
" real_t m_C20;\n"
" real_t m_C21;\n"
" real_t m_C22;\n"
" real_t m_CenterX, m_CenterY;\n"
" real_t m_RotA, m_RotB, m_RotD, m_RotE;\n"
"} EmberCL;\n"
"\n";
/// <summary>
/// A structure on the host used to hold all of the needed information for cartesian to raster mapping used on the device to iterate in OpenCL.
@ -249,14 +250,14 @@ struct ALIGN CarToRasCL
/// The cartesian to raster structure used to iterate in OpenCL.
/// </summary>
static const char* CarToRasCLStructString =
"typedef struct __attribute__ " ALIGN_CL " _CarToRasCL\n"
"{\n"
" real_t m_PixPerImageUnitW, m_RasLlX;\n"
" uint m_RasWidth;\n"
" real_t m_PixPerImageUnitH, m_RasLlY;\n"
" real_t m_CarLlX, m_CarUrX, m_CarUrY, m_CarLlY;\n"
"} CarToRasCL;\n"
"\n";
"typedef struct __attribute__ " ALIGN_CL " _CarToRasCL\n"
"{\n"
" real_t m_PixPerImageUnitW, m_RasLlX;\n"
" uint m_RasWidth;\n"
" real_t m_PixPerImageUnitH, m_RasLlY;\n"
" real_t m_CarLlX, m_CarUrX, m_CarUrY, m_CarLlY;\n"
"} CarToRasCL;\n"
"\n";
/// <summary>
/// A structure on the host used to hold all of the needed information for density filtering used on the device to iterate in OpenCL.
@ -283,20 +284,20 @@ struct ALIGN DensityFilterCL
/// Note that the actual filter buffer is held elsewhere.
/// </summary>
static const char* DensityFilterCLStructString =
"typedef struct __attribute__ " ALIGN_CL " _DensityFilterCL\n"
"{\n"
" real_bucket_t m_Curve;\n"
" real_bucket_t m_K1;\n"
" real_bucket_t m_K2;\n"
" uint m_Supersample;\n"
" uint m_SuperRasW;\n"
" uint m_SuperRasH;\n"
" uint m_KernelSize;\n"
" uint m_MaxFilterIndex;\n"
" uint m_MaxFilteredCounts;\n"
" uint m_FilterWidth;\n"
"} DensityFilterCL;\n"
"\n";
"typedef struct __attribute__ " ALIGN_CL " _DensityFilterCL\n"
"{\n"
" real_bucket_t m_Curve;\n"
" real_bucket_t m_K1;\n"
" real_bucket_t m_K2;\n"
" uint m_Supersample;\n"
" uint m_SuperRasW;\n"
" uint m_SuperRasH;\n"
" uint m_KernelSize;\n"
" uint m_MaxFilterIndex;\n"
" uint m_MaxFilteredCounts;\n"
" uint m_FilterWidth;\n"
"} DensityFilterCL;\n"
"\n";
/// <summary>
/// A structure on the host used to hold all of the needed information for spatial filtering used on the device to iterate in OpenCL.
@ -328,26 +329,26 @@ struct ALIGN SpatialFilterCL
/// Note that the actual filter buffer is held elsewhere.
/// </summary>
static const char* SpatialFilterCLStructString =
"typedef struct __attribute__ ((aligned (16))) _SpatialFilterCL\n"
"{\n"
" uint m_SuperRasW;\n"
" uint m_SuperRasH;\n"
" uint m_FinalRasW;\n"
" uint m_FinalRasH;\n"
" uint m_Supersample;\n"
" uint m_FilterWidth;\n"
" uint m_NumChannels;\n"
" uint m_BytesPerChannel;\n"
" uint m_DensityFilterOffset;\n"
" uint m_Transparency;\n"
" uint m_YAxisUp;\n"
" real_bucket_t m_Vibrancy;\n"
" real_bucket_t m_HighlightPower;\n"
" real_bucket_t m_Gamma;\n"
" real_bucket_t m_LinRange;\n"
" real_bucket_t m_Background[4];\n"//For some reason, using float4/double4 here does not align no matter what. So just use an array of 4.
"} SpatialFilterCL;\n"
"\n";
"typedef struct __attribute__ ((aligned (16))) _SpatialFilterCL\n"
"{\n"
" uint m_SuperRasW;\n"
" uint m_SuperRasH;\n"
" uint m_FinalRasW;\n"
" uint m_FinalRasH;\n"
" uint m_Supersample;\n"
" uint m_FilterWidth;\n"
" uint m_NumChannels;\n"
" uint m_BytesPerChannel;\n"
" uint m_DensityFilterOffset;\n"
" uint m_Transparency;\n"
" uint m_YAxisUp;\n"
" real_bucket_t m_Vibrancy;\n"
" real_bucket_t m_HighlightPower;\n"
" real_bucket_t m_Gamma;\n"
" real_bucket_t m_LinRange;\n"
" real_bucket_t m_Background[4];\n"//For some reason, using float4/double4 here does not align no matter what. So just use an array of 4.
"} SpatialFilterCL;\n"
"\n";
/// <summary>
/// EmberCL makes extensive use of the build in vector types, however accessing
@ -355,40 +356,40 @@ static const char* SpatialFilterCLStructString =
/// Declaring them in a union with a buffer resolves this problem.
/// </summary>
static const char* UnionCLStructString =
"typedef union\n"
"{\n"
" uchar3 m_Uchar3;\n"
" uchar m_Uchars[3];\n"
"} uchar3uchars;\n"
"\n"
"typedef union\n"
"{\n"
" uchar4 m_Uchar4;\n"
" uchar m_Uchars[4];\n"
"} uchar4uchars;\n"
"\n"
"typedef union\n"
"{\n"
" uint4 m_Uint4;\n"
" uint m_Uints[4];\n"
"} uint4uints;\n"
"\n"
"typedef union\n"//Use in places where float is required.
"{\n"
" float4 m_Float4;\n"
" float m_Floats[4];\n"
"} float4floats;\n"
"\n"
"typedef union\n"//Use in places where float or double can be used depending on the template type.
"{\n"
" real4 m_Real4;\n"
" real_t m_Reals[4];\n"
"} real4reals;\n"
"\n"
"typedef union\n"//Used to match the bucket template type.
"{\n"
" real4_bucket m_Real4;\n"
" real_bucket_t m_Reals[4];\n"
"} real4reals_bucket;\n"
"\n";
"typedef union\n"
"{\n"
" uchar3 m_Uchar3;\n"
" uchar m_Uchars[3];\n"
"} uchar3uchars;\n"
"\n"
"typedef union\n"
"{\n"
" uchar4 m_Uchar4;\n"
" uchar m_Uchars[4];\n"
"} uchar4uchars;\n"
"\n"
"typedef union\n"
"{\n"
" uint4 m_Uint4;\n"
" uint m_Uints[4];\n"
"} uint4uints;\n"
"\n"
"typedef union\n"//Use in places where float is required.
"{\n"
" float4 m_Float4;\n"
" float m_Floats[4];\n"
"} float4floats;\n"
"\n"
"typedef union\n"//Use in places where float or double can be used depending on the template type.
"{\n"
" real4 m_Real4;\n"
" real_t m_Reals[4];\n"
"} real4reals;\n"
"\n"
"typedef union\n"//Used to match the bucket template type.
"{\n"
" real4_bucket m_Real4;\n"
" real_bucket_t m_Reals[4];\n"
"} real4reals_bucket;\n"
"\n";
}


@ -15,31 +15,26 @@ FunctionMapper::FunctionMapper()
" intPrec temp = (x >= 0.0 ? (intPrec)(x + 0.5) : (intPrec)(x - 0.5));\n"
" return (real_t)temp;\n"
"}\n";
m_GlobalMap["Round"] =
"inline real_t Round(real_t r)\n"
"{\n"
" return (r > 0.0) ? floor(r + 0.5) : ceil(r - 0.5);\n"
"}\n";
m_GlobalMap["Sign"] =
"inline real_t Sign(real_t v)\n"
"{\n"
" return (v < 0.0) ? -1 : (v > 0.0) ? 1 : 0.0;\n"
"}\n";
m_GlobalMap["SignNz"] =
"inline real_t SignNz(real_t v)\n"
"{\n"
" return (v < 0.0) ? -1.0 : 1.0;\n"
"}\n";
m_GlobalMap["Sqr"] =
"inline real_t Sqr(real_t v)\n"
"{\n"
" return v * v;\n"
"}\n";
m_GlobalMap["SafeSqrt"] =
"inline real_t SafeSqrt(real_t x)\n"
"{\n"
@ -48,49 +43,41 @@ FunctionMapper::FunctionMapper()
"\n"
" return sqrt(x);\n"
"}\n";
m_GlobalMap["Cube"] =
"inline real_t Cube(real_t v)\n"
"{\n"
" return v * v * v;\n"
"}\n";
m_GlobalMap["Hypot"] =
"inline real_t Hypot(real_t x, real_t y)\n"
"{\n"
" return sqrt(SQR(x) + SQR(y));\n"
"}\n";
m_GlobalMap["Spread"] =
"inline real_t Spread(real_t x, real_t y)\n"
"{\n"
" return Hypot(x, y) * ((x) > 0.0 ? 1.0 : -1.0);\n"
"}\n";
m_GlobalMap["Powq4"] =
"inline real_t Powq4(real_t x, real_t y)\n"
"{\n"
" return pow(fabs(x), y) * SignNz(x);\n"
"}\n";
m_GlobalMap["Powq4c"] =
"inline real_t Powq4c(real_t x, real_t y)\n"
"{\n"
" return y == 1.0 ? x : Powq4(x, y);\n"
"}\n";
m_GlobalMap["Zeps"] =
"inline real_t Zeps(real_t x)\n"
"{\n"
" return x == 0.0 ? EPS : x;\n"
"}\n";
m_GlobalMap["Lerp"] =
"inline real_t Lerp(real_t a, real_t b, real_t p)\n"
"{\n"
" return a + (b - a) * p;\n"
"}\n";
m_GlobalMap["Fabsmod"] =
"inline real_t Fabsmod(real_t v)\n"
"{\n"
@ -98,37 +85,31 @@ FunctionMapper::FunctionMapper()
"\n"
" return modf(v, &dummy);\n"
"}\n";
m_GlobalMap["Fosc"] =
"inline real_t Fosc(real_t p, real_t amp, real_t ph)\n"
"{\n"
" return 0.5 - cos(p * amp + ph) * 0.5;\n"
"}\n";
m_GlobalMap["Foscn"] =
"inline real_t Foscn(real_t p, real_t ph)\n"
"{\n"
" return 0.5 - cos(p + ph) * 0.5;\n"
"}\n";
m_GlobalMap["LogScale"] =
"inline real_t LogScale(real_t x)\n"
"{\n"
" return x == 0.0 ? 0.0 : log((fabs(x) + 1) * M_E) * SignNz(x) / M_E;\n"
"}\n";
m_GlobalMap["LogMap"] =
"inline real_t LogMap(real_t x)\n"
"{\n"
" return x == 0.0 ? 0.0 : (M_E + log(x * M_E)) * 0.25 * SignNz(x);\n"
"}\n";
m_GlobalMap["ClampGte"] =
"inline real_t ClampGte(real_t val, real_t gte)\n"
"{\n"
" return (val < gte) ? gte : val;\n"
"}\n";
m_GlobalMap["Swap"] =
"inline void Swap(real_t* val1, real_t* val2)\n"
"{\n"
@ -136,13 +117,172 @@ FunctionMapper::FunctionMapper()
" *val1 = *val2;\n"
" *val2 = tmp;\n"
"}\n";
m_GlobalMap["Vratio"] =
"inline real_t Vratio(real2* p, real2* q, real2* u)\n"
"{\n"
" real_t pmQx, pmQy;\n"
"\n"
" pmQx = (*p).x - (*q).x;\n"
" pmQy = (*p).y - (*q).y;\n"
"\n"
" if (pmQx == 0 && pmQy == 0)\n"
" return 1.0;\n"
"\n"
" return 2 * (((*u).x - (*q).x) * pmQx + ((*u).y - (*q).y) * pmQy) / (pmQx * pmQx + pmQy * pmQy);\n"
"}\n";
m_GlobalMap["Closest"] =
"inline int Closest(real2* p, int n, real2* u)\n"
"{\n"
" real_t d2;\n"
" real_t d2min = TMAX;\n"
" int i, j = 0;\n"
"\n"
" for (i = 0; i < n; i++)\n"
" {\n"
" d2 = Sqr(p[i].x - (*u).x) + Sqr(p[i].y - (*u).y);\n"
"\n"
" if (d2 < d2min)\n"
" {\n"
" d2min = d2;\n"
" j = i;\n"
" }\n"
" }\n"
"\n"
" return j;\n"
"}\n";
m_GlobalMap["Voronoi"] =
"inline real_t Voronoi(real2* p, int n, int q, real2* u)\n"
"{\n"
" real_t ratio;\n"
" real_t ratiomax = TLOW;\n"
" int i;\n"
"\n"
" for (i = 0; i < n; i++)\n"
" {\n"
" if (i != q)\n"
" {\n"
" ratio = Vratio(&p[i], &p[q], u);\n"
"\n"
" if (ratio > ratiomax)\n"
" ratiomax = ratio;\n"
" }\n"
" }\n"
"\n"
" return ratiomax;\n"
"}\n";
m_GlobalMap["SimplexNoise3D"] =
"inline real_t SimplexNoise3D(real3* v, __global real_t* p, __global real3* grad)\n"
"{\n"
" real3 c[4];\n"
" real_t n = 0;\n"
" int gi[4];\n"
" real_t t;\n"
" real_t skewIn = ((*v).x + (*v).y + (*v).z) * 0.3333;\n"
" int i = (int)floor((*v).x + skewIn);\n"
" int j = (int)floor((*v).y + skewIn);\n"
" int k = (int)floor((*v).z + skewIn);\n"
" t = (i + j + k) * 0.16666;\n"
" real_t x0 = i - t;\n"
" real_t y0 = j - t;\n"
" real_t z0 = k - t;\n"
" c[0].x = (*v).x - x0;\n"
" c[0].y = (*v).y - y0;\n"
" c[0].z = (*v).z - z0;\n"
" int i1, j1, k1;\n"
" int i2, j2, k2;\n"
"\n"
" if (c[0].x >= c[0].y)\n"
" {\n"
" if (c[0].y >= c[0].z)\n"
" {\n"
" i1 = 1; j1 = 0; k1 = 0; i2 = 1; j2 = 1; k2 = 0;\n"
" }\n"
" else\n"
" {\n"
" if (c[0].x >= c[0].z)\n"
" {\n"
" i1 = 1; j1 = 0; k1 = 0; i2 = 1; j2 = 0; k2 = 1;\n"
" }\n"
" else\n"
" {\n"
" i1 = 0; j1 = 0; k1 = 1; i2 = 1; j2 = 0; k2 = 1;\n"
" }\n"
" }\n"
" }\n"
" else\n"
" {\n"
" if (c[0].y < c[0].z)\n"
" {\n"
" i1 = 0; j1 = 0; k1 = 1; i2 = 0; j2 = 1; k2 = 1;\n"
" }\n"
" else\n"
" {\n"
" if (c[0].x < c[0].z)\n"
" {\n"
" i1 = 0; j1 = 1; k1 = 0; i2 = 0; j2 = 1; k2 = 1;\n"
" }\n"
" else\n"
" {\n"
" i1 = 0; j1 = 1; k1 = 0; i2 = 1; j2 = 1; k2 = 0;\n"
" }\n"
" }\n"
" }\n"
"\n"
" c[1].x = c[0].x - i1 + 0.16666;\n"
" c[1].y = c[0].y - j1 + 0.16666;\n"
" c[1].z = c[0].z - k1 + 0.16666;\n"
" c[2].x = c[0].x - i2 + 2 * 0.16666;\n"
" c[2].y = c[0].y - j2 + 2 * 0.16666;\n"
" c[2].z = c[0].z - k2 + 2 * 0.16666;\n"
" c[3].x = c[0].x - 1 + 3 * 0.16666;\n"
" c[3].y = c[0].y - 1 + 3 * 0.16666;\n"
" c[3].z = c[0].z - 1 + 3 * 0.16666;\n"
" int ii = i & 0x3ff;\n"
" int jj = j & 0x3ff;\n"
" int kk = k & 0x3ff;\n"
" gi[0] = (int)p[ii + (int)p[jj + (int)p[kk]]];\n"
" gi[1] = (int)p[ii + i1 + (int)p[jj + j1 + (int)p[kk + k1]]];\n"
" gi[2] = (int)p[ii + i2 + (int)p[jj + j2 + (int)p[kk + k2]]];\n"
" gi[3] = (int)p[ii + 1 + (int)p[jj + 1 + (int)p[kk + 1]]];\n"
" for (uint corner = 0; corner < 4; corner++)\n"
" {\n"
" t = 0.6 - c[corner].x * c[corner].x - c[corner].y * c[corner].y - c[corner].z * c[corner].z;\n"
"\n"
" if (t > 0)\n"
" {\n"
" real3 u = grad[gi[corner]];\n"
" t *= t;\n"
" n += t * t * (u.x * c[corner].x + u.y * c[corner].y + u.z * c[corner].z);\n"
" }\n"
" }\n"
"\n"
" return 32 * n;\n"
"}\n";
m_GlobalMap["PerlinNoise3D"] =
"inline real_t PerlinNoise3D(real3* v, __global real_t* p, __global real3* grad, real_t aScale, real_t fScale, int octaves)\n"
"{\n"
" int i;\n"
" real_t n = 0, a = 1;\n"
" real3 u = *v;\n"
"\n"
" for (i = 0; i < octaves; i++)\n"
" {\n"
" n += SimplexNoise3D(&u, p, grad) / a;\n"
" a *= aScale;\n"
" u.x *= fScale;\n"
" u.y *= fScale;\n"
" u.x *= fScale;\n"
" }\n"
"\n"
" return n;\n"
"}\n";
}
}
const string* FunctionMapper::GetGlobalFunc(const string& func)
{
const auto& text = m_GlobalMap.find(func);
if (text != m_GlobalMap.end())
return &text->second;
else

File diff suppressed because it is too large.


@ -30,9 +30,10 @@ public:
const string& SumHistKernel() const;
const string& SumHistEntryPoint() const;
const string& IterEntryPoint() const;
string CreateIterKernelString(const Ember<T>& ember, string& parVarDefines, bool lockAccum = false, bool doAccum = true);
string CreateIterKernelString(const Ember<T>& ember, const string& parVarDefines, const string& globalSharedDefines, bool lockAccum = false, bool doAccum = true);
string GlobalFunctionsString(const Ember<T>& ember);
static void ParVarIndexDefines(const Ember<T>& ember, pair<string, vector<T>>& params, bool doVals = true, bool doString = true);
static void SharedDataIndexDefines(const Ember<T>& ember, pair<string, vector<T>>& params, bool doVals = true, bool doString = true);
static string VariationStateString(const Ember<T>& ember);
static string VariationStateInitString(const Ember<T>& ember);
static bool IsBuildRequired(const Ember<T>& ember1, const Ember<T>& ember2);


@ -3,17 +3,6 @@
namespace EmberCLns
{
/// <summary>
/// Initialize and return a reference to the one and only OpenCLInfo object.
/// </summary>
/// <returns>A reference to the only OpenCLInfo object.</returns>
OpenCLInfo& OpenCLInfo::Instance()
{
static OpenCLInfo instance;
return instance;
}
/// <summary>
/// Initialize the all platforms and devices and keep information about them in lists.
/// </summary>
@ -23,7 +12,6 @@ OpenCLInfo::OpenCLInfo()
vector<cl::Platform> platforms;
vector<vector<cl::Device>> devices;
intmax_t workingPlatformIndex = -1;
m_Init = false;
cl::Platform::get(&platforms);
devices.resize(platforms.size());
@ -210,39 +198,36 @@ bool OpenCLInfo::CreateContext(const cl::Platform& platform, cl::Context& contex
if (shared)
{
//Define OS-specific context properties and create the OpenCL context.
#if defined (__APPLE__) || defined(MACOSX)
CGLContextObj kCGLContext = CGLGetCurrentContext();
CGLShareGroupObj kCGLShareGroup = CGLGetShareGroup(kCGLContext);
cl_context_properties props[] =
{
CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE, (cl_context_properties)kCGLShareGroup,
0
};
context = cl::Context(CL_DEVICE_TYPE_GPU, props, nullptr, nullptr, &err);//May need to tinker with this on Mac.
#else
#if defined WIN32
cl_context_properties props[] =
{
CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(),
CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(),
CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>((platform)()),
0
};
context = cl::Context(CL_DEVICE_TYPE_GPU, props, nullptr, nullptr, &err);
#else
cl_context_properties props[] =
{
CL_GL_CONTEXT_KHR, cl_context_properties(glXGetCurrentContext()),
CL_GLX_DISPLAY_KHR, cl_context_properties(glXGetCurrentDisplay()),
CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>((platform)()),
0
};
context = cl::Context(CL_DEVICE_TYPE_GPU, props, nullptr, nullptr, &err);
#endif
#endif
#if defined (__APPLE__) || defined(MACOSX)
CGLContextObj kCGLContext = CGLGetCurrentContext();
CGLShareGroupObj kCGLShareGroup = CGLGetShareGroup(kCGLContext);
cl_context_properties props[] =
{
CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE, (cl_context_properties)kCGLShareGroup,
0
};
context = cl::Context(CL_DEVICE_TYPE_GPU, props, nullptr, nullptr, &err);//May need to tinker with this on Mac.
#else
#if defined WIN32
cl_context_properties props[] =
{
CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(),
CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(),
CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>((platform)()),
0
};
context = cl::Context(CL_DEVICE_TYPE_GPU, props, nullptr, nullptr, &err);
#else
cl_context_properties props[] =
{
CL_GL_CONTEXT_KHR, cl_context_properties(glXGetCurrentContext()),
CL_GLX_DISPLAY_KHR, cl_context_properties(glXGetCurrentDisplay()),
CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>((platform)()),
0
};
context = cl::Context(CL_DEVICE_TYPE_GPU, props, nullptr, nullptr, &err);
#endif
#endif
}
else
{
@ -252,7 +237,6 @@ bool OpenCLInfo::CreateContext(const cl::Platform& platform, cl::Context& contex
reinterpret_cast<cl_context_properties>((platform)()),
0
};
context = cl::Context(CL_DEVICE_TYPE_ALL, props, nullptr, nullptr, &err);
}
@ -276,7 +260,6 @@ string OpenCLInfo::DumpInfo() const
{
ostringstream os;
vector<size_t> sizes;
os.imbue(locale(""));
for (size_t platform = 0; platform < m_Platforms.size(); platform++)
@ -294,17 +277,14 @@ string OpenCLInfo::DumpInfo() const
os << "CL_DEVICE_MAX_WRITE_IMAGE_ARGS: " << GetInfo<cl_uint>(platform, device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS) << endl;
os << "CL_DEVICE_MAX_MEM_ALLOC_SIZE: " << GetInfo<cl_ulong>(platform, device, CL_DEVICE_MAX_MEM_ALLOC_SIZE) << endl;
os << "CL_DEVICE_ADDRESS_BITS: " << GetInfo<cl_uint>(platform, device, CL_DEVICE_ADDRESS_BITS) << endl;
os << "CL_DEVICE_GLOBAL_MEM_CACHE_TYPE: " << GetInfo<cl_uint>(platform, device, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) << endl;
os << "CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE: " << GetInfo<cl_uint>(platform, device, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) << endl;
os << "CL_DEVICE_GLOBAL_MEM_CACHE_SIZE: " << GetInfo<cl_ulong>(platform, device, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) << endl;
os << "CL_DEVICE_GLOBAL_MEM_SIZE: " << GetInfo<cl_ulong>(platform, device, CL_DEVICE_GLOBAL_MEM_SIZE) << endl;
os << "CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE: " << GetInfo<cl_ulong>(platform, device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) << endl;
os << "CL_DEVICE_MAX_CONSTANT_ARGS: " << GetInfo<cl_uint>(platform, device, CL_DEVICE_MAX_CONSTANT_ARGS) << endl;
os << "CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: " << GetInfo<cl_uint>(platform, device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) << endl;
os << "CL_DEVICE_MAX_WORK_GROUP_SIZE: " << GetInfo<size_t>(platform, device, CL_DEVICE_MAX_WORK_GROUP_SIZE) << endl;
sizes = GetInfo<vector<size_t>>(platform, device, CL_DEVICE_MAX_WORK_ITEM_SIZES);
os << "CL_DEVICE_MAX_WORK_ITEM_SIZES: " << sizes[0] << ", " << sizes[1] << ", " << sizes[2] << endl << endl;
@ -346,55 +326,105 @@ string OpenCLInfo::ErrorToStringCL(cl_int err)
switch (err)
{
case CL_SUCCESS: return "Success";
case CL_DEVICE_NOT_FOUND: return "Device not found";
case CL_DEVICE_NOT_AVAILABLE: return "Device not available";
case CL_COMPILER_NOT_AVAILABLE: return "Compiler not available";
case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "Memory object allocation failure";
case CL_OUT_OF_RESOURCES: return "Out of resources";
case CL_OUT_OF_HOST_MEMORY: return "Out of host memory";
case CL_PROFILING_INFO_NOT_AVAILABLE: return "Profiling information not available";
case CL_MEM_COPY_OVERLAP: return "Memory copy overlap";
case CL_IMAGE_FORMAT_MISMATCH: return "Image format mismatch";
case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "Image format not supported";
case CL_BUILD_PROGRAM_FAILURE: return "Program build failure";
case CL_MAP_FAILURE: return "Map failure";
case CL_MISALIGNED_SUB_BUFFER_OFFSET: return "Misaligned sub buffer offset";
case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: return "Exec status error for events in wait list";
case CL_INVALID_VALUE: return "Invalid value";
case CL_INVALID_DEVICE_TYPE: return "Invalid device type";
case CL_INVALID_PLATFORM: return "Invalid platform";
case CL_INVALID_DEVICE: return "Invalid device";
case CL_INVALID_CONTEXT: return "Invalid context";
case CL_INVALID_QUEUE_PROPERTIES: return "Invalid queue properties";
case CL_INVALID_COMMAND_QUEUE: return "Invalid command queue";
case CL_INVALID_HOST_PTR: return "Invalid host pointer";
case CL_INVALID_MEM_OBJECT: return "Invalid memory object";
case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "Invalid image format descriptor";
case CL_INVALID_IMAGE_SIZE: return "Invalid image size";
case CL_INVALID_SAMPLER: return "Invalid sampler";
case CL_INVALID_BINARY: return "Invalid binary";
case CL_INVALID_BUILD_OPTIONS: return "Invalid build options";
case CL_INVALID_PROGRAM: return "Invalid program";
case CL_INVALID_PROGRAM_EXECUTABLE: return "Invalid program executable";
case CL_INVALID_KERNEL_NAME: return "Invalid kernel name";
case CL_INVALID_KERNEL_DEFINITION: return "Invalid kernel definition";
case CL_INVALID_KERNEL: return "Invalid kernel";
case CL_INVALID_ARG_INDEX: return "Invalid argument index";
case CL_INVALID_ARG_VALUE: return "Invalid argument value";
case CL_INVALID_ARG_SIZE: return "Invalid argument size";
case CL_INVALID_KERNEL_ARGS: return "Invalid kernel arguments";
case CL_INVALID_WORK_DIMENSION: return "Invalid work dimension";
case CL_INVALID_WORK_GROUP_SIZE: return "Invalid work group size";
case CL_INVALID_WORK_ITEM_SIZE: return "Invalid work item size";
case CL_INVALID_GLOBAL_OFFSET: return "Invalid global offset";
case CL_INVALID_EVENT_WAIT_LIST: return "Invalid event wait list";
case CL_INVALID_EVENT: return "Invalid event";
case CL_INVALID_OPERATION: return "Invalid operation";
case CL_INVALID_GL_OBJECT: return "Invalid OpenGL object";
case CL_INVALID_BUFFER_SIZE: return "Invalid buffer size";
case CL_INVALID_MIP_LEVEL: return "Invalid mip-map level";
case CL_INVALID_GLOBAL_WORK_SIZE: return "Invalid global work size";
case CL_INVALID_PROPERTY: return "Invalid property";
default:
{
ostringstream ss;
@ -403,4 +433,4 @@ string OpenCLInfo::ErrorToStringCL(cl_int err)
}
}
}
}
}


@ -17,10 +17,9 @@ namespace EmberCLns
/// This class derives from EmberReport, so the caller is able
/// to retrieve a text dump of error information if any errors occur.
/// </summary>
class EMBERCL_API OpenCLInfo : public EmberReport
class EMBERCL_API OpenCLInfo : public EmberReport, public Singleton<OpenCLInfo>
{
public:
static OpenCLInfo& Instance();
const vector<cl::Platform>& Platforms() const;
const string& PlatformName(size_t platform) const;
const vector<string>& PlatformNames() const;
@ -55,6 +54,8 @@ public:
return val;
}
SINGLETON_DERIVED_IMPL(OpenCLInfo);
private:
OpenCLInfo();


@ -9,19 +9,18 @@ namespace EmberCLns
/// global OpenCLInfo object. The caller must explicitly do it.
/// </summary>
OpenCLWrapper::OpenCLWrapper()
: m_Info(OpenCLInfo::Instance())
{
m_Init = false;
m_Shared = false;
m_PlatformIndex = 0;
m_DeviceIndex = 0;
m_LocalMemSize = 0;
//Pre-allocate some space to avoid temporary copying.
m_Programs.reserve(4);
m_Buffers.reserve(4);
m_Images.reserve(4);
m_GLImages.reserve(4);
m_Info = OpenCLInfo::Instance();
}
/// <summary>
@ -35,25 +34,24 @@ OpenCLWrapper::OpenCLWrapper()
bool OpenCLWrapper::Init(size_t platformIndex, size_t deviceIndex, bool shared)
{
cl_int err;
auto& platforms = m_Info.Platforms();
auto& devices = m_Info.Devices();
auto& platforms = m_Info->Platforms();
auto& devices = m_Info->Devices();
m_Init = false;
ClearErrorReport();
if (m_Info.Ok())
if (m_Info->Ok())
{
if (platformIndex < platforms.size() && platformIndex < devices.size())
{
cl::Context context;
if (m_Info.CreateContext(platforms[platformIndex], context, shared))//Platform index is within range, now do context.
if (m_Info->CreateContext(platforms[platformIndex], context, shared))//Platform index is within range, now do context.
{
if (deviceIndex < devices[platformIndex].size())//Context is ok, now do device.
{
auto q = cl::CommandQueue(context, devices[platformIndex][deviceIndex], 0, &err);//At least one GPU device is present, so create a command queue.
if (m_Info.CheckCL(err, "cl::CommandQueue()"))//Everything was successful so assign temporaries to members.
if (m_Info->CheckCL(err, "cl::CommandQueue()"))//Everything was successful so assign temporaries to members.
{
m_Platform = platforms[platformIndex];
m_Device = devices[platformIndex][deviceIndex];
@ -63,9 +61,9 @@ bool OpenCLWrapper::Init(size_t platformIndex, size_t deviceIndex, bool shared)
m_DeviceIndex = deviceIndex;
m_DeviceVec.clear();
m_DeviceVec.push_back(m_Device);
m_LocalMemSize = size_t(m_Info.GetInfo<cl_ulong>(m_PlatformIndex, m_DeviceIndex, CL_DEVICE_LOCAL_MEM_SIZE));
m_GlobalMemSize = size_t(m_Info.GetInfo<cl_ulong>(m_PlatformIndex, m_DeviceIndex, CL_DEVICE_GLOBAL_MEM_SIZE));
m_MaxAllocSize = size_t(m_Info.GetInfo<cl_ulong>(m_PlatformIndex, m_DeviceIndex, CL_DEVICE_MAX_MEM_ALLOC_SIZE));
m_LocalMemSize = size_t(m_Info->GetInfo<cl_ulong>(m_PlatformIndex, m_DeviceIndex, CL_DEVICE_LOCAL_MEM_SIZE));
m_GlobalMemSize = size_t(m_Info->GetInfo<cl_ulong>(m_PlatformIndex, m_DeviceIndex, CL_DEVICE_GLOBAL_MEM_SIZE));
m_MaxAllocSize = size_t(m_Info->GetInfo<cl_ulong>(m_PlatformIndex, m_DeviceIndex, CL_DEVICE_MAX_MEM_ALLOC_SIZE));
m_Shared = shared;
m_Init = true;//Command queue is ok, it's now ok to begin building and running programs.
}
@ -139,28 +137,25 @@ bool OpenCLWrapper::AddBuffer(const string& name, size_t size, cl_mem_flags flag
{
cl::Buffer buff(m_Context, flags, size, nullptr, &err);
if (!m_Info.CheckCL(err, "cl::Buffer()"))
if (!m_Info->CheckCL(err, "cl::Buffer()"))
return false;
NamedBuffer nb(buff, name);
m_Buffers.push_back(nb);
}
else if (GetBufferSize(bufferIndex) != size)//If it did exist, only create and add if the sizes were different.
{
m_Buffers[bufferIndex] = NamedBuffer(cl::Buffer(m_Context, flags, 0, nullptr, &err), "emptybuffer");//First clear out the original so the two don't exist in memory at once.
cl::Buffer buff(m_Context, flags, size, nullptr, &err);//Create the new buffer.
if (!m_Info.CheckCL(err, "cl::Buffer()"))
if (!m_Info->CheckCL(err, "cl::Buffer()"))
return false;
NamedBuffer nb(buff, name);//Make a named buffer out of the new buffer.
m_Buffers[bufferIndex] = nb;//Finally, assign.
}
//If the buffer existed and the sizes were the same, take no action.
//If the buffer existed and the sizes were the same, take no action.
return true;
}
@ -199,7 +194,6 @@ bool OpenCLWrapper::AddAndWriteBuffer(const string& name, void* data, size_t siz
bool OpenCLWrapper::WriteBuffer(const string& name, void* data, size_t size)
{
int bufferIndex = FindBufferIndex(name);
return bufferIndex != -1 ? WriteBuffer(bufferIndex, data, size) : false;
}
@ -216,11 +210,10 @@ bool OpenCLWrapper::WriteBuffer(size_t bufferIndex, void* data, size_t size)
{
cl::Event e;
cl_int err = m_Queue.enqueueWriteBuffer(m_Buffers[bufferIndex].m_Buffer, CL_TRUE, 0, size, data, nullptr, &e);
e.wait();
m_Queue.finish();
if (m_Info.CheckCL(err, "cl::CommandQueue::enqueueWriteBuffer()"))
if (m_Info->CheckCL(err, "cl::CommandQueue::enqueueWriteBuffer()"))
return true;
}
@ -237,7 +230,6 @@ bool OpenCLWrapper::WriteBuffer(size_t bufferIndex, void* data, size_t size)
bool OpenCLWrapper::ReadBuffer(const string& name, void* data, size_t size)
{
int bufferIndex = FindBufferIndex(name);
return bufferIndex != -1 ? ReadBuffer(bufferIndex, data, size) : false;
}
@ -254,11 +246,10 @@ bool OpenCLWrapper::ReadBuffer(size_t bufferIndex, void* data, size_t size)
{
cl::Event e;
cl_int err = m_Queue.enqueueReadBuffer(m_Buffers[bufferIndex].m_Buffer, CL_TRUE, 0, size, data, nullptr, &e);
e.wait();
m_Queue.finish();
if (m_Info.CheckCL(err, "cl::CommandQueue::enqueueReadBuffer()"))
if (m_Info->CheckCL(err, "cl::CommandQueue::enqueueReadBuffer()"))
return true;
}
@ -287,7 +278,6 @@ int OpenCLWrapper::FindBufferIndex(const string& name)
size_t OpenCLWrapper::GetBufferSize(const string& name)
{
int bufferIndex = FindBufferIndex(name);
return bufferIndex != -1 ? GetBufferSize(bufferIndex) : 0;
}
@ -345,7 +335,7 @@ bool OpenCLWrapper::AddAndWriteImage(const string& name, cl_mem_flags flags, con
IMAGEGL2D imageGL(m_Context, flags, GL_TEXTURE_2D, 0, texName, &err);
NamedImage2DGL namedImageGL(imageGL, name);
if (m_Info.CheckCL(err, "cl::ImageGL()"))
if (m_Info->CheckCL(err, "cl::ImageGL()"))
{
m_GLImages.push_back(namedImageGL);
@ -359,7 +349,7 @@ bool OpenCLWrapper::AddAndWriteImage(const string& name, cl_mem_flags flags, con
{
NamedImage2D namedImage(cl::Image2D(m_Context, flags, format, width, height, row_pitch, data, &err), name);
if (m_Info.CheckCL(err, "cl::Image2D()"))
if (m_Info->CheckCL(err, "cl::Image2D()"))
{
m_Images.push_back(namedImage);
return true;
@ -376,7 +366,7 @@ bool OpenCLWrapper::AddAndWriteImage(const string& name, cl_mem_flags flags, con
{
NamedImage2DGL namedImageGL(IMAGEGL2D(m_Context, flags, GL_TEXTURE_2D, 0, texName, &err), name);//Sizes are different, so create new.
if (m_Info.CheckCL(err, "cl::ImageGL()"))
if (m_Info->CheckCL(err, "cl::ImageGL()"))
{
m_GLImages[imageIndex] = namedImageGL;
}
@ -395,10 +385,9 @@ bool OpenCLWrapper::AddAndWriteImage(const string& name, cl_mem_flags flags, con
if (!CompareImageParams(m_Images[imageIndex].m_Image, flags, format, width, height, row_pitch))
{
m_Images[imageIndex] = NamedImage2D();//First clear out the original so the two don't exist in memory at once.
NamedImage2D namedImage(cl::Image2D(m_Context, flags, format, width, height, row_pitch, data, &err), name);
if (m_Info.CheckCL(err, "cl::Image2D()"))
if (m_Info->CheckCL(err, "cl::Image2D()"))
{
m_Images[imageIndex] = namedImage;
return true;
@ -432,11 +421,9 @@ bool OpenCLWrapper::WriteImage2D(size_t index, bool shared, ::size_t width, ::si
cl_int err;
cl::Event e;
cl::size_t<3> origin, region;
origin[0] = 0;
origin[1] = 0;
origin[2] = 0;
region[0] = width;
region[1] = height;
region[2] = 1;
@ -450,9 +437,8 @@ bool OpenCLWrapper::WriteImage2D(size_t index, bool shared, ::size_t width, ::si
err = m_Queue.enqueueWriteImage(imageGL, CL_TRUE, origin, region, row_pitch, 0, data, nullptr, &e);
e.wait();
m_Queue.finish();
bool b = EnqueueReleaseGLObjects(imageGL);
return m_Info.CheckCL(err, "cl::enqueueWriteImage()") && b;
return m_Info->CheckCL(err, "cl::enqueueWriteImage()") && b;
}
}
else if (!shared && index < m_Images.size())
@ -460,7 +446,7 @@ bool OpenCLWrapper::WriteImage2D(size_t index, bool shared, ::size_t width, ::si
err = m_Queue.enqueueWriteImage(m_Images[index].m_Image, CL_TRUE, origin, region, row_pitch, 0, data, nullptr, &e);
e.wait();
m_Queue.finish();
return m_Info.CheckCL(err, "cl::enqueueWriteImage()");
return m_Info->CheckCL(err, "cl::enqueueWriteImage()");
}
}
@ -507,11 +493,9 @@ bool OpenCLWrapper::ReadImage(size_t imageIndex, ::size_t width, ::size_t height
cl_int err;
cl::Event e;
cl::size_t<3> origin, region;
origin[0] = 0;
origin[1] = 0;
origin[2] = 0;
region[0] = width;
region[1] = height;
region[2] = 1;
@ -524,13 +508,13 @@ bool OpenCLWrapper::ReadImage(size_t imageIndex, ::size_t width, ::size_t height
{
err = m_Queue.enqueueReadImage(m_GLImages[imageIndex].m_Image, true, origin, region, row_pitch, 0, data);
bool b = EnqueueReleaseGLObjects(m_GLImages[imageIndex].m_Image);
return m_Info.CheckCL(err, "cl::enqueueReadImage()") && b;
return m_Info->CheckCL(err, "cl::enqueueReadImage()") && b;
}
}
else if (!shared && imageIndex < m_Images.size())
{
err = m_Queue.enqueueReadImage(m_Images[imageIndex].m_Image, true, origin, region, row_pitch, 0, data);
return m_Info.CheckCL(err, "cl::enqueueReadImage()");
return m_Info->CheckCL(err, "cl::enqueueReadImage()");
}
}
@ -588,7 +572,6 @@ size_t OpenCLWrapper::GetImageSize(size_t imageIndex, bool shared)
if (shared && imageIndex < m_GLImages.size())
{
vector<cl::Memory> images;
images.push_back(m_GLImages[imageIndex].m_Image);
IMAGEGL2D image = m_GLImages[imageIndex].m_Image;
@ -620,12 +603,11 @@ size_t OpenCLWrapper::GetImageSize(size_t imageIndex, bool shared)
bool OpenCLWrapper::CompareImageParams(cl::Image& image, cl_mem_flags flags, const cl::ImageFormat& format, ::size_t width, ::size_t height, ::size_t row_pitch)
{
cl_image_format tempFormat = image.getImageInfo<CL_IMAGE_FORMAT>(nullptr);
return (/*image.getImageInfo<CL_MEM_FLAGS>() == flags &&*/
tempFormat.image_channel_data_type == format.image_channel_data_type &&
tempFormat.image_channel_order == format.image_channel_order &&
image.getImageInfo<CL_IMAGE_WIDTH>(nullptr) == width &&
image.getImageInfo<CL_IMAGE_HEIGHT>(nullptr) == height/* &&
tempFormat.image_channel_data_type == format.image_channel_data_type &&
tempFormat.image_channel_order == format.image_channel_order &&
image.getImageInfo<CL_IMAGE_WIDTH>(nullptr) == width &&
image.getImageInfo<CL_IMAGE_HEIGHT>(nullptr) == height/* &&
image.getImageInfo<CL_IMAGE_ROW_PITCH>() == row_pitch*/);//Pitch will be (width * bytes per pixel) + padding.
}
@ -657,17 +639,15 @@ bool OpenCLWrapper::CreateImage2D(cl::Image2D& image2D, cl_mem_flags flags, cl::
if (m_Init)
{
cl_int err;
image2D = cl::Image2D(m_Context,
flags,
format,
width,
height,
row_pitch,
data,
&err);
return m_Info.CheckCL(err, "cl::Image2D()");
flags,
format,
width,
height,
row_pitch,
data,
&err);
return m_Info->CheckCL(err, "cl::Image2D()");
}
return false;
@ -687,15 +667,13 @@ bool OpenCLWrapper::CreateImage2DGL(IMAGEGL2D& image2DGL, cl_mem_flags flags, GL
if (m_Init)
{
cl_int err;
image2DGL = IMAGEGL2D(m_Context,
flags,
target,
miplevel,
texobj,
&err);
return m_Info.CheckCL(err, "cl::ImageGL()");
flags,
target,
miplevel,
texobj,
&err);
return m_Info->CheckCL(err, "cl::ImageGL()");
}
return false;
@ -726,11 +704,10 @@ bool OpenCLWrapper::EnqueueAcquireGLObjects(IMAGEGL2D& image)
if (m_Init && m_Shared)
{
vector<cl::Memory> images;
images.push_back(image);
cl_int err = m_Queue.enqueueAcquireGLObjects(&images);
m_Queue.finish();
return m_Info.CheckCL(err, "cl::CommandQueue::enqueueAcquireGLObjects()");
return m_Info->CheckCL(err, "cl::CommandQueue::enqueueAcquireGLObjects()");
}
return false;
@ -761,11 +738,10 @@ bool OpenCLWrapper::EnqueueReleaseGLObjects(IMAGEGL2D& image)
if (m_Init && m_Shared)
{
vector<cl::Memory> images;
images.push_back(image);
cl_int err = m_Queue.enqueueReleaseGLObjects(&images);
m_Queue.finish();
return m_Info.CheckCL(err, "cl::CommandQueue::enqueueReleaseGLObjects()");
return m_Info->CheckCL(err, "cl::CommandQueue::enqueueReleaseGLObjects()");
}
return false;
@ -781,9 +757,8 @@ bool OpenCLWrapper::EnqueueAcquireGLObjects(const VECTOR_CLASS<cl::Memory>* memO
if (m_Init && m_Shared)
{
cl_int err = m_Queue.enqueueAcquireGLObjects(memObjects);
m_Queue.finish();
return m_Info.CheckCL(err, "cl::CommandQueue::enqueueAcquireGLObjects()");
return m_Info->CheckCL(err, "cl::CommandQueue::enqueueAcquireGLObjects()");
}
return false;
@ -799,9 +774,8 @@ bool OpenCLWrapper::EnqueueReleaseGLObjects(const VECTOR_CLASS<cl::Memory>* memO
if (m_Init && m_Shared)
{
cl_int err = m_Queue.enqueueReleaseGLObjects(memObjects);
m_Queue.finish();
return m_Info.CheckCL(err, "cl::CommandQueue::enqueueReleaseGLObjects()");
return m_Info->CheckCL(err, "cl::CommandQueue::enqueueReleaseGLObjects()");
}
return false;
@ -818,14 +792,12 @@ bool OpenCLWrapper::EnqueueReleaseGLObjects(const VECTOR_CLASS<cl::Memory>* memO
bool OpenCLWrapper::CreateSampler(cl::Sampler& sampler, cl_bool normalizedCoords, cl_addressing_mode addressingMode, cl_filter_mode filterMode)
{
cl_int err;
sampler = cl::Sampler(m_Context,
normalizedCoords,
addressingMode,
filterMode,
&err);
return m_Info.CheckCL(err, "cl::Sampler()");
normalizedCoords,
addressingMode,
filterMode,
&err);
return m_Info->CheckCL(err, "cl::Sampler()");
}
/// <summary>
@ -839,7 +811,6 @@ bool OpenCLWrapper::CreateSampler(cl::Sampler& sampler, cl_bool normalizedCoords
bool OpenCLWrapper::SetBufferArg(size_t kernelIndex, cl_uint argIndex, const string& name)
{
int bufferIndex = OpenCLWrapper::FindBufferIndex(name);
return bufferIndex != -1 ? SetBufferArg(kernelIndex, argIndex, bufferIndex) : false;
}
@ -897,12 +868,12 @@ bool OpenCLWrapper::SetImageArg(size_t kernelIndex, cl_uint argIndex, bool share
if (shared && imageIndex < m_GLImages.size())
{
err = m_Programs[kernelIndex].m_Kernel.setArg(argIndex, m_GLImages[imageIndex].m_Image);
return m_Info.CheckCL(err, "cl::Kernel::setArg()");
return m_Info->CheckCL(err, "cl::Kernel::setArg()");
}
else if (!shared && imageIndex < m_Images.size())
{
err = m_Programs[kernelIndex].m_Kernel.setArg(argIndex, m_Images[imageIndex].m_Image);
return m_Info.CheckCL(err, "cl::Kernel::setArg()");
return m_Info->CheckCL(err, "cl::Kernel::setArg()");
}
}
@ -935,21 +906,20 @@ int OpenCLWrapper::FindKernelIndex(const string& name)
/// <param name="blockDepth">Depth of each block</param>
/// <returns>True if success, else false.</returns>
bool OpenCLWrapper::RunKernel(size_t kernelIndex, size_t totalGridWidth, size_t totalGridHeight, size_t totalGridDepth,
size_t blockWidth, size_t blockHeight, size_t blockDepth)
size_t blockWidth, size_t blockHeight, size_t blockDepth)
{
if (m_Init && kernelIndex < m_Programs.size())
{
cl::Event e;
cl_int err = m_Queue.enqueueNDRangeKernel(m_Programs[kernelIndex].m_Kernel,
cl::NullRange,
cl::NDRange(totalGridWidth, totalGridHeight, totalGridDepth),
cl::NDRange(blockWidth, blockHeight, blockDepth),
nullptr,
&e);
cl::NullRange,
cl::NDRange(totalGridWidth, totalGridHeight, totalGridDepth),
cl::NDRange(blockWidth, blockHeight, blockDepth),
nullptr,
&e);
e.wait();
m_Queue.finish();
return m_Info.CheckCL(err, "cl::CommandQueue::enqueueNDRangeKernel()");
return m_Info->CheckCL(err, "cl::CommandQueue::enqueueNDRangeKernel()");
}
return false;
@ -963,7 +933,7 @@ bool OpenCLWrapper::Shared() const { return m_Shared; }
const cl::Context& OpenCLWrapper::Context() const { return m_Context; }
size_t OpenCLWrapper::PlatformIndex() const { return m_PlatformIndex; }
size_t OpenCLWrapper::DeviceIndex() const { return m_DeviceIndex; }
const string& OpenCLWrapper::DeviceName() const { return m_Info.DeviceName(m_PlatformIndex, m_DeviceIndex); }
const string& OpenCLWrapper::DeviceName() const { return m_Info->DeviceName(m_PlatformIndex, m_DeviceIndex); }
size_t OpenCLWrapper::LocalMemSize() const { return m_LocalMemSize; }
size_t OpenCLWrapper::GlobalMemSize() const { return m_GlobalMemSize; }
size_t OpenCLWrapper::MaxAllocSize() const { return m_MaxAllocSize; }
@ -997,7 +967,6 @@ bool OpenCLWrapper::CreateSPK(const string& name, const string& program, const s
if (m_Init)
{
cl_int err;
spk.m_Name = name;
spk.m_Source = cl::Program::Sources(1, std::make_pair(program.c_str(), program.length() + 1));
spk.m_Program = cl::Program(m_Context, spk.m_Source);
@ -1006,17 +975,18 @@ bool OpenCLWrapper::CreateSPK(const string& name, const string& program, const s
err = spk.m_Program.build(m_DeviceVec, "-cl-mad-enable");//Tinker with other options later.
else
err = spk.m_Program.build(m_DeviceVec, "-cl-mad-enable -cl-no-signed-zeros -cl-single-precision-constant");
//err = spk.m_Program.build(m_DeviceVec, "-cl-single-precision-constant");
//err = spk.m_Program.build(m_DeviceVec, "-cl-mad-enable -cl-single-precision-constant");
//err = spk.m_Program.build(m_DeviceVec, "-cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math -cl-single-precision-constant");//This can cause some rounding.
//err = spk.m_Program.build(m_DeviceVec, "-cl-mad-enable -cl-single-precision-constant");
if (m_Info.CheckCL(err, "cl::Program::build()"))
//err = spk.m_Program.build(m_DeviceVec, "-cl-single-precision-constant");
//err = spk.m_Program.build(m_DeviceVec, "-cl-mad-enable -cl-single-precision-constant");
//err = spk.m_Program.build(m_DeviceVec, "-cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math -cl-single-precision-constant");//This can cause some rounding.
//err = spk.m_Program.build(m_DeviceVec, "-cl-mad-enable -cl-single-precision-constant");
if (m_Info->CheckCL(err, "cl::Program::build()"))
{
//Building of program is ok, now create kernel with the specified entry point.
spk.m_Kernel = cl::Kernel(spk.m_Program, entryPoint.c_str(), &err);
if (m_Info.CheckCL(err, "cl::Kernel()"))
if (m_Info->CheckCL(err, "cl::Kernel()"))
return true;//Everything is ok.
}
else


@ -9,11 +9,7 @@
namespace EmberCLns
{
#if CL_VERSION_1_2
#define IMAGEGL2D cl::ImageGL
#else
#define IMAGEGL2D cl::Image2DGL
#endif
/// <summary>
/// Class to contain all of the things needed to store an OpenCL program.
@ -162,8 +158,7 @@ public:
if (m_Init && kernelIndex < m_Programs.size())
{
cl_int err = m_Programs[kernelIndex].m_Kernel.setArg(argIndex, arg);
return m_Info.CheckCL(err, "cl::Kernel::setArg()");
return m_Info->CheckCL(err, "cl::Kernel::setArg()");
}
return false;
@ -201,7 +196,7 @@ private:
cl::Context m_Context;
cl::Device m_Device;
cl::CommandQueue m_Queue;
OpenCLInfo& m_Info;
shared_ptr<OpenCLInfo> m_Info;
std::vector<cl::Device> m_DeviceVec;
std::vector<Spk> m_Programs;
std::vector<NamedBuffer> m_Buffers;


@ -41,11 +41,11 @@ void RendererCL<T, bucketT>::Init()
m_Init = false;
m_DoublePrecision = typeid(T) == typeid(double);
m_NumChannels = 4;
//Buffer names.
m_EmberBufferName = "Ember";
m_XformsBufferName = "Xforms";
m_ParVarsBufferName = "ParVars";
m_GlobalSharedBufferName = "GlobalShared";
m_SeedsBufferName = "Seeds";
m_DistBufferName = "Dist";
m_CarToRasBufferName = "CarToRas";
@ -60,7 +60,6 @@ void RendererCL<T, bucketT>::Init()
m_AccumBufferName = "Accum";
m_FinalImageName = "Final";
m_PointsBufferName = "Points";
//It's critical that these numbers never change. They are
//based on the cuburn model of each kernel launch containing
//256 threads. 32 wide by 8 high. Everything done in the OpenCL
@ -70,7 +69,6 @@ void RendererCL<T, bucketT>::Init()
m_IterBlockHeight = 8;
m_IterBlocksWide = 64;
m_IterBlocksHigh = 2;
m_PaletteFormat.image_channel_order = CL_RGBA;
m_PaletteFormat.image_channel_data_type = CL_FLOAT;
m_FinalFormat.image_channel_order = CL_RGBA;
@ -111,11 +109,11 @@ bool RendererCL<T, bucketT>::Init(const vector<pair<size_t, size_t>>& devices, b
auto& zeroizeProgram = m_IterOpenCLKernelCreator.ZeroizeKernel();
auto& sumHistProgram = m_IterOpenCLKernelCreator.SumHistKernel();
ostringstream os;
m_Init = false;
m_Devices.clear();
m_Devices.reserve(devices.size());
m_OutputTexID = outputTexID;
m_GlobalShared.second.resize(16);//Dummy data until a real alloc is needed.
for (size_t i = 0; i < devices.size(); i++)
{
@ -126,8 +124,11 @@ bool RendererCL<T, bucketT>::Init(const vector<pair<size_t, size_t>>& devices, b
if ((b = cld->Init()))//Build a simple program to ensure OpenCL is working right.
{
if (b && !(b = cld->m_Wrapper.AddProgram(m_IterOpenCLKernelCreator.ZeroizeEntryPoint(), zeroizeProgram, m_IterOpenCLKernelCreator.ZeroizeEntryPoint(), m_DoublePrecision))) { AddToReport(loc); }
if (b && !(b = cld->m_Wrapper.AddAndWriteImage("Palette", CL_MEM_READ_ONLY, m_PaletteFormat, 256, 1, 0, nullptr))) { AddToReport(loc); }
if (b && !(b = cld->m_Wrapper.AddAndWriteBuffer(m_GlobalSharedBufferName, m_GlobalShared.second.data(), m_GlobalShared.second.size() * sizeof(m_GlobalShared.second[0])))) { AddToReport(loc); }//Empty at start, will be filled in later if needed.
if (b)
{
m_Devices.push_back(std::move(cld));//Success, so move to the vector, else it will go out of scope and be deleted.
@ -156,9 +157,10 @@ bool RendererCL<T, bucketT>::Init(const vector<pair<size_t, size_t>>& devices, b
{
auto& firstWrapper = m_Devices[0]->m_Wrapper;
m_DEOpenCLKernelCreator = DEOpenCLKernelCreator(m_DoublePrecision, m_Devices[0]->Nvidia());
//Build a simple program to ensure OpenCL is working right.
if (b && !(b = firstWrapper.AddProgram(m_DEOpenCLKernelCreator.LogScaleAssignDEEntryPoint(), m_DEOpenCLKernelCreator.LogScaleAssignDEKernel(), m_DEOpenCLKernelCreator.LogScaleAssignDEEntryPoint(), m_DoublePrecision))) { AddToReport(loc); }
if (b && !(b = firstWrapper.AddProgram(m_IterOpenCLKernelCreator.SumHistEntryPoint(), sumHistProgram, m_IterOpenCLKernelCreator.SumHistEntryPoint(), m_DoublePrecision))) { AddToReport(loc); }
if (b)
@ -170,7 +172,6 @@ bool RendererCL<T, bucketT>::Init(const vector<pair<size_t, size_t>>& devices, b
//AMD is further limited because of less local memory so these have to be 16 on AMD.
m_MaxDEBlockSizeW = m_Devices[0]->Nvidia() ? 24 : 16;//These *must* both be divisible by 8 or else pixels will go missing.
m_MaxDEBlockSizeH = m_Devices[0]->Nvidia() ? 24 : 16;
FillSeeds();
for (size_t device = 0; device < m_Devices.size(); device++)
@ -533,7 +534,9 @@ bool RendererCL<T, bucketT>::CreateDEFilter(bool& newAlloc)
auto& wrapper = m_Devices[0]->m_Wrapper;
if (b && !(b = wrapper.AddAndWriteBuffer(m_DECoefsBufferName, reinterpret_cast<void*>(const_cast<bucketT*>(m_DensityFilter->Coefs())), m_DensityFilter->CoefsSizeBytes()))) { AddToReport(loc); }
if (b && !(b = wrapper.AddAndWriteBuffer(m_DEWidthsBufferName, reinterpret_cast<void*>(const_cast<bucketT*>(m_DensityFilter->Widths())), m_DensityFilter->WidthsSizeBytes()))) { AddToReport(loc); }
if (b && !(b = wrapper.AddAndWriteBuffer(m_DECoefIndicesBufferName, reinterpret_cast<void*>(const_cast<uint*>(m_DensityFilter->CoefIndices())), m_DensityFilter->CoefsIndicesSizeBytes()))) { AddToReport(loc); }
}
}
@ -606,7 +609,7 @@ vector<string> RendererCL<T, bucketT>::ErrorReport()
auto s = device->m_Wrapper.ErrorReport();
ours.insert(ours.end(), s.begin(), s.end());
}
return ours;
}
@ -655,28 +658,37 @@ bool RendererCL<T, bucketT>::Alloc(bool histOnly)
EnterResize();
m_XformsCL.resize(m_Ember.TotalXformCount());
bool b = true;
size_t histLength = SuperSize() * sizeof(v4bT);
size_t accumLength = SuperSize() * sizeof(v4bT);
const char* loc = __FUNCTION__;
auto& wrapper = m_Devices[0]->m_Wrapper;
if (b && !(b = wrapper.AddBuffer(m_DEFilterParamsBufferName, sizeof(m_DensityFilterCL)))) { AddToReport(loc); }
if (b && !(b = wrapper.AddBuffer(m_SpatialFilterParamsBufferName, sizeof(m_SpatialFilterCL)))) { AddToReport(loc); }
if (b && !(b = wrapper.AddBuffer(m_CurvesCsaName, SizeOf(m_Csa.m_Entries)))) { AddToReport(loc); }
if (b && !(b = wrapper.AddBuffer(m_AccumBufferName, accumLength))) { AddToReport(loc); }//Accum buffer.
for (auto& device : m_Devices)
{
if (b && !(b = device->m_Wrapper.AddBuffer(m_EmberBufferName, sizeof(m_EmberCL)))) { AddToReport(loc); break; }
if (b && !(b = device->m_Wrapper.AddBuffer(m_XformsBufferName, SizeOf(m_XformsCL)))) { AddToReport(loc); break; }
if (b && !(b = device->m_Wrapper.AddBuffer(m_ParVarsBufferName, 128 * sizeof(T)))) { AddToReport(loc); break; }
if (b && !(b = device->m_Wrapper.AddBuffer(m_DistBufferName, CHOOSE_XFORM_GRAIN))) { AddToReport(loc); break; }//Will be resized for xaos.
if (b && !(b = device->m_Wrapper.AddBuffer(m_CarToRasBufferName, sizeof(m_CarToRasCL)))) { AddToReport(loc); break; }
if (b && !(b = device->m_Wrapper.AddBuffer(m_HistBufferName, histLength))) { AddToReport(loc); break; }//Histogram. Will memset to zero later.
if (b && !(b = device->m_Wrapper.AddBuffer(m_PointsBufferName, IterGridKernelCount() * sizeof(PointCL<T>)))) { AddToReport(loc); break; }//Points between iter calls.
//Global shared is allocated once and written when building the kernel.
}
LeaveResize();
@ -739,12 +751,9 @@ eRenderStatus RendererCL<T, bucketT>::GaussianDensityFilter()
//}
//else
// return RENDER_ERROR;
//Timing t(4);
eRenderStatus status = RunDensityFilter();
//t.Toc(__FUNCTION__ " RunKernel()");
return status;
}
@ -802,36 +811,41 @@ EmberStats RendererCL<T, bucketT>::Iterate(size_t iterCount, size_t temporalSamp
if (IterOpenCLKernelCreator<T>::IsBuildRequired(m_Ember, m_LastBuiltEmber))
b = BuildIterProgramForEmber(true);
//Set up buffers on all devices.
for (auto& device : m_Devices)
if (b)
{
auto& wrapper = device->m_Wrapper;
if (b && !(b = wrapper.WriteBuffer (m_EmberBufferName, reinterpret_cast<void*>(&m_EmberCL), sizeof(m_EmberCL)))) { AddToReport(loc); }
if (b && !(b = wrapper.WriteBuffer (m_XformsBufferName, reinterpret_cast<void*>(m_XformsCL.data()), sizeof(m_XformsCL[0]) * m_XformsCL.size()))) { AddToReport(loc); }
if (b && !(b = wrapper.AddAndWriteBuffer(m_DistBufferName, reinterpret_cast<void*>(const_cast<byte*>(XformDistributions())), XformDistributionsSize()))) { AddToReport(loc); }//Will be resized for xaos.
if (b && !(b = wrapper.WriteBuffer (m_CarToRasBufferName, reinterpret_cast<void*>(&m_CarToRasCL), sizeof(m_CarToRasCL)))) { AddToReport(loc); }
if (b && !(b = wrapper.AddAndWriteImage("Palette", CL_MEM_READ_ONLY, m_PaletteFormat, m_Dmap.m_Entries.size(), 1, 0, m_Dmap.m_Entries.data()))) { AddToReport(loc); }
if (b)
//Set up buffers on all devices.
for (auto& device : m_Devices)
{
IterOpenCLKernelCreator<T>::ParVarIndexDefines(m_Ember, m_Params, true, false);//Always do this to get the values (but no string), regardless of whether a rebuild is necessary.
auto& wrapper = device->m_Wrapper;
//Don't know the size of the parametric variation parameters buffer until the ember is examined.
//So set it up right before the run.
if (!m_Params.second.empty())
if (b && !(b = wrapper.WriteBuffer(m_EmberBufferName, reinterpret_cast<void*>(&m_EmberCL), sizeof(m_EmberCL))))
break;
if (b && !(b = wrapper.WriteBuffer(m_XformsBufferName, reinterpret_cast<void*>(m_XformsCL.data()), sizeof(m_XformsCL[0]) * m_XformsCL.size())))
break;
if (b && !(b = wrapper.AddAndWriteBuffer(m_DistBufferName, reinterpret_cast<void*>(const_cast<byte*>(XformDistributions())), XformDistributionsSize())))//Will be resized for xaos.
break;
if (b && !(b = wrapper.WriteBuffer(m_CarToRasBufferName, reinterpret_cast<void*>(&m_CarToRasCL), sizeof(m_CarToRasCL))))
break;
if (b && !(b = wrapper.AddAndWriteImage("Palette", CL_MEM_READ_ONLY, m_PaletteFormat, m_Dmap.m_Entries.size(), 1, 0, m_Dmap.m_Entries.data())))
break;
if (b)
{
if (!wrapper.AddAndWriteBuffer(m_ParVarsBufferName, m_Params.second.data(), m_Params.second.size() * sizeof(m_Params.second[0])))
{
m_Abort = true;
AddToReport(loc);
return stats;
}
IterOpenCLKernelCreator<T>::ParVarIndexDefines(m_Ember, m_Params, true, false);//Always do this to get the values (but no string), regardless of whether a rebuild is necessary.
//Don't know the size of the parametric variation parameters buffer until the ember is examined.
//So set it up right before the run.
if (!m_Params.second.empty())
if (!wrapper.AddAndWriteBuffer(m_ParVarsBufferName, m_Params.second.data(), m_Params.second.size() * sizeof(m_Params.second[0])))
break;
}
else
break;
}
else
return stats;
}
}
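
The restructured per-device setup above writes the per-frame buffers to every device and bails out of the loop on the first failure. A minimal sketch of that pattern follows; Device and WriteAll are hypothetical stand-ins, not Ember API.

#include <cstddef>
#include <vector>

struct Device { bool Write(const void* data, size_t len) { return data && len; } };

bool WriteAll(std::vector<Device>& devices, const void* ember, size_t emberLen)
{
	bool b = !devices.empty();

	for (auto& dev : devices)
	{
		if (b && !(b = dev.Write(ember, emberLen)))//First failing device aborts the whole setup.
			break;
	}

	return b;
}

int main()
{
	std::vector<Device> devices(2);
	int ember = 42;
	return WriteAll(devices, &ember, sizeof(ember)) ? 0 : 1;
}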
@ -876,39 +890,53 @@ bool RendererCL<T, bucketT>::BuildIterProgramForEmber(bool doAccum)
bool b = !m_Devices.empty();
const char* loc = __FUNCTION__;
IterOpenCLKernelCreator<T>::ParVarIndexDefines(m_Ember, m_Params, false, true);//Do with string and no vals.
m_IterKernel = m_IterOpenCLKernelCreator.CreateIterKernelString(m_Ember, m_Params.first, m_LockAccum, doAccum);
//cout << "Building: " << endl << iterProgram << endl;
vector<std::thread> threads;
std::function<void(RendererClDevice*)> func = [&](RendererClDevice* dev)
{
if (!dev->m_Wrapper.AddProgram(m_IterOpenCLKernelCreator.IterEntryPoint(), m_IterKernel, m_IterOpenCLKernelCreator.IterEntryPoint(), m_DoublePrecision))
{
m_ResizeCs.Enter();//Just use the resize CS for lack of a better one.
b = false;
AddToReport(string(loc) + "()\n" + dev->m_Wrapper.DeviceName() + ":\nBuilding the following program failed: \n" + m_IterKernel + "\n");
m_ResizeCs.Leave();
}
};
threads.reserve(m_Devices.size() - 1);
for (size_t device = m_Devices.size() - 1; device >= 0 && device < m_Devices.size(); device--)//Check both extents because size_t will wrap.
{
if (!device)//Secondary devices on their own threads.
threads.push_back(std::thread([&](RendererClDevice* dev) { func(dev); }, m_Devices[device].get()));
else//Primary device on this thread.
func(m_Devices[device].get());
}
for (auto& th : threads)
if (th.joinable())
th.join();
IterOpenCLKernelCreator<T>::SharedDataIndexDefines(m_Ember, m_GlobalShared, true, true);//Do with string and vals only once on build since it won't change until another build occurs.
if (b)
{
//t.Toc(__FUNCTION__ " program build");
//cout << string(loc) << "():\nBuilding the following program succeeded: \n" << iterProgram << endl;
m_LastBuiltEmber = m_Ember;
m_IterKernel = m_IterOpenCLKernelCreator.CreateIterKernelString(m_Ember, m_Params.first, m_GlobalShared.first, m_LockAccum, doAccum);
//cout << "Building: " << endl << iterProgram << endl;
vector<std::thread> threads;
std::function<void(RendererClDevice*)> func = [&](RendererClDevice * dev)
{
if (!dev->m_Wrapper.AddProgram(m_IterOpenCLKernelCreator.IterEntryPoint(), m_IterKernel, m_IterOpenCLKernelCreator.IterEntryPoint(), m_DoublePrecision))
{
m_ResizeCs.Enter();//Just use the resize CS for lack of a better one.
b = false;
AddToReport(string(loc) + "()\n" + dev->m_Wrapper.DeviceName() + ":\nBuilding the following program failed: \n" + m_IterKernel + "\n");
m_ResizeCs.Leave();
}
else if (!m_GlobalShared.second.empty())
{
if (!dev->m_Wrapper.AddAndWriteBuffer(m_GlobalSharedBufferName, m_GlobalShared.second.data(), m_GlobalShared.second.size() * sizeof(m_GlobalShared.second[0])))
{
m_ResizeCs.Enter();//Just use the resize CS for lack of a better one.
b = false;
AddToReport(string(loc) + "()\n" + dev->m_Wrapper.DeviceName() + ":\nAdding global shared buffer failed.\n");
m_ResizeCs.Leave();
}
}
};
threads.reserve(m_Devices.size() - 1);
for (size_t device = m_Devices.size() - 1; device >= 0 && device < m_Devices.size(); device--)//Check both extents because size_t will wrap.
{
if (!device)//Secondary devices on their own threads.
threads.push_back(std::thread([&](RendererClDevice * dev) { func(dev); }, m_Devices[device].get()));
else//Primary device on this thread.
func(m_Devices[device].get());
}
for (auto& th : threads)
if (th.joinable())
th.join();
if (b)
{
//t.Toc(__FUNCTION__ " program build");
//cout << string(loc) << "():\nBuilding the following program succeeded: \n" << iterProgram << endl;
m_LastBuiltEmber = m_Ember;
}
}
return b;
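
The global shared data added in this change is produced alongside the kernel source (SharedDataIndexDefines()) and uploaded to each device right after a successful build, and only when a variation actually put something in it. A hedged sketch of that shape follows; Upload() is a hypothetical stand-in for OpenCLWrapper::AddAndWriteBuffer(), and the contents shown are illustrative.

#include <iostream>
#include <string>
#include <utility>
#include <vector>

using GlobalShared = std::pair<std::string, std::vector<float>>;

bool Upload(const char* name, const void* data, size_t bytes) { return name && data && bytes; }

int main()
{
	GlobalShared shared;
	shared.first = "#define SHARED_TABLE_SIZE 4\n";//Index #defines compiled into the kernel string.
	shared.second = { 0.1f, 0.2f, 0.3f, 0.4f };    //Precalc/texture data the kernel reads at runtime.
	bool b = true;

	if (!shared.second.empty())//Only upload when a variation actually needs shared data.
		b = Upload("GlobalShared", shared.second.data(), shared.second.size() * sizeof(shared.second[0]));

	std::cout << std::boolalpha << b << std::endl;
	return b ? 0 : 1;
}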
@ -935,7 +963,6 @@ bool RendererCL<T, bucketT>::RunIter(size_t iterCount, size_t temporalSample, si
std::atomic<size_t> atomLaunchesRan;
std::atomic<intmax_t> atomItersRan, atomItersRemaining;
size_t adjustedIterCountPerKernel = m_IterCountPerKernel;
itersRan = 0;
atomItersRan.store(0);
atomItersRemaining.store(iterCount);
@ -954,11 +981,9 @@ bool RendererCL<T, bucketT>::RunIter(size_t iterCount, size_t temporalSample, si
}
size_t fuseFreq = Renderer<T, bucketT>::SubBatchSize() / adjustedIterCountPerKernel;//Use the base sbs to determine when to fuse.
#ifdef TEST_CL
m_Abort = false;
#endif
std::function<void(size_t, int)> iterFunc = [&](size_t dev, int kernelIndex)
{
bool b = true;
@ -979,18 +1004,31 @@ bool RendererCL<T, bucketT>::RunIter(size_t iterCount, size_t temporalSample, si
size_t iterCountThisLaunch = iterCountPerKernel * IterGridKernelWidth() * IterGridKernelHeight();
//cout << "itersRemaining " << itersRemaining << ", iterCountPerKernel " << iterCountPerKernel << ", iterCountThisLaunch " << iterCountThisLaunch << endl;
if (b && !(b = wrapper.SetArg (kernelIndex, argIndex++, iterCountPerKernel))) { AddToReport(loc); }//Number of iters for each thread to run.
if (b && !(b = wrapper.SetArg (kernelIndex, argIndex++, fuse))) { AddToReport(loc); }//Number of iters to fuse.
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_SeedsBufferName))) { AddToReport(loc); }//Seeds.
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_EmberBufferName))) { AddToReport(loc); }//Ember.
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_XformsBufferName))) { AddToReport(loc); }//Xforms.
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_ParVarsBufferName))) { AddToReport(loc); }//Parametric variation parameters.
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_DistBufferName))) { AddToReport(loc); }//Xform distributions.
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_CarToRasBufferName))) { AddToReport(loc); }//Coordinate converter.
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_HistBufferName))) { AddToReport(loc); }//Histogram.
if (b && !(b = wrapper.SetArg (kernelIndex, argIndex++, histSuperSize))) { AddToReport(loc); }//Histogram size.
if (b && !(b = wrapper.SetImageArg (kernelIndex, argIndex++, false, "Palette"))) { AddToReport(loc); }//Palette.
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_PointsBufferName))) { AddToReport(loc); }//Random start points.
if (b && !(b = wrapper.SetArg (kernelIndex, argIndex++, iterCountPerKernel))) { AddToReport(loc); }//Number of iters for each thread to run.
if (b && !(b = wrapper.SetArg (kernelIndex, argIndex++, fuse))) { AddToReport(loc); }//Number of iters to fuse.
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_SeedsBufferName))) { AddToReport(loc); }//Seeds.
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_EmberBufferName))) { AddToReport(loc); }//Ember.
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_XformsBufferName))) { AddToReport(loc); }//Xforms.
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_ParVarsBufferName))) { AddToReport(loc); }//Parametric variation parameters.
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_GlobalSharedBufferName))) { AddToReport(loc); }//Global shared data.
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_DistBufferName))) { AddToReport(loc); }//Xform distributions.
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_CarToRasBufferName))) { AddToReport(loc); }//Coordinate converter.
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_HistBufferName))) { AddToReport(loc); }//Histogram.
if (b && !(b = wrapper.SetArg (kernelIndex, argIndex++, histSuperSize))) { AddToReport(loc); }//Histogram size.
if (b && !(b = wrapper.SetImageArg (kernelIndex, argIndex++, false, "Palette"))) { AddToReport(loc); }//Palette.
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_PointsBufferName))) { AddToReport(loc); }//Random start points.
if (b && !(b = wrapper.RunKernel(kernelIndex,
IterGridKernelWidth(),//Total grid dims.
@ -1014,17 +1052,16 @@ bool RendererCL<T, bucketT>::RunIter(size_t iterCount, size_t temporalSample, si
if (m_Callback && !dev)//Will only do the callback on the first device; however, it will report the progress of all devices.
{
double percent = 100.0 *
double
(
double
(
double
(
double(m_LastIter + atomItersRan.load()) / double(ItersPerTemporalSample())
) + temporalSample
) / double(TemporalSamples())
);
double
(
double
(
double
(
double(m_LastIter + atomItersRan.load()) / double(ItersPerTemporalSample())
) + temporalSample
) / double(TemporalSamples())
);
double percentDiff = percent - m_LastIterPercent;
double toc = m_ProgressTimer.Toc();
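
The nested double casts above reduce to a single ratio: the fraction of the current temporal sample completed, plus the samples already finished, over the total number of samples. A hedged restatement with hypothetical local names (itersDone, itersPerSample, sample, samples):

#include <iostream>

double Percent(double itersDone, double itersPerSample, double sample, double samples)
{
	//Fraction of the current temporal sample done, plus completed samples, over total samples.
	return 100.0 * ((itersDone / itersPerSample + sample) / samples);
}

int main()
{
	//E.g. halfway through the 3rd of 4 temporal samples -> 62.5%.
	std::cout << Percent(500.0, 1000.0, 2.0, 4.0) << std::endl;
	return 0;
}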
@ -1072,7 +1109,7 @@ bool RendererCL<T, bucketT>::RunIter(size_t iterCount, size_t temporalSample, si
if (m_Devices.size() > 1)//Determine whether/when to sum histograms of secondary devices with the primary.
{
if (((TemporalSamples() == 1) || (temporalSample == TemporalSamples() - 1)) &&//If there are no temporal samples (not animating), or the current one is the last...
((m_LastIter + itersRan) >= ItersPerTemporalSample()))//...and the required number of iters for that sample have completed...
((m_LastIter + itersRan) >= ItersPerTemporalSample()))//...and the required number of iters for that sample have completed...
if (success && !(success = SumDeviceHist())) { AddToReport(loc); }//...read the histogram from the secondary devices and sum them to the primary.
}
@ -1104,17 +1141,19 @@ eRenderStatus RendererCL<T, bucketT>::RunLogScaleFilter()
size_t blockH = 4;//A height of 4 seems to run the fastest.
size_t gridW = m_DensityFilterCL.m_SuperRasW;
size_t gridH = m_DensityFilterCL.m_SuperRasH;
OpenCLWrapper::MakeEvenGridDims(blockW, blockH, gridW, gridH);
if (b && !(b = wrapper.AddAndWriteBuffer(m_DEFilterParamsBufferName, reinterpret_cast<void*>(&m_DensityFilterCL), sizeof(m_DensityFilterCL)))) { AddToReport(loc); }
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_HistBufferName))) { AddToReport(loc); }//Histogram.
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_AccumBufferName))) { AddToReport(loc); }//Accumulator.
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_DEFilterParamsBufferName))) { AddToReport(loc); }//DensityFilterCL.
//t.Tic();
if (b && !(b = wrapper.RunKernel(kernelIndex, gridW, gridH, 1, blockW, blockH, 1))) { AddToReport(loc); }
//t.Toc(loc);
}
else
@ -1164,7 +1203,6 @@ eRenderStatus RendererCL<T, bucketT>::RunDensityFilter()
//Can't just blindly pass the dimension values in. They must first be adjusted so the block count
//divides evenly into the total grid dimensions.
OpenCLWrapper::MakeEvenGridDims(blockSizeW, blockSizeH, gridW, gridH);
//t.Tic();
//The classic problem with performing DE on adjacent pixels is that the filter will overlap.
//This can be solved in 2 ways. One is to use atomics, which is unacceptably slow.
@ -1188,7 +1226,6 @@ eRenderStatus RendererCL<T, bucketT>::RunDensityFilter()
gapH = (uint)ceil((m_DensityFilterCL.m_FilterWidth * 2.0) / (double)32);//Block height is 1, but iterates over 32 rows.
chunkSizeH = gapH + 1;
totalChunks = chunkSizeW * chunkSizeH;
OpenCLWrapper::MakeEvenGridDims(blockSizeW, blockSizeH, gridW, gridH);
gridW /= chunkSizeW;
gridH /= chunkSizeH;
@ -1199,6 +1236,7 @@ eRenderStatus RendererCL<T, bucketT>::RunDensityFilter()
{
//t2.Tic();
if (b && !(b = RunDensityFilterPrivate(kernelIndex, gridW, gridH, blockSizeW, blockSizeH, chunkSizeW, chunkSizeH, colChunk, rowChunk))) { m_Abort = true; AddToReport(loc); }
//t2.Toc(loc);
if (b && m_Callback)
@ -1211,6 +1249,7 @@ eRenderStatus RendererCL<T, bucketT>::RunDensityFilter()
}
}
}
#else
gridW /= chunkSizeW;
gridH /= chunkSizeH;
@ -1222,6 +1261,7 @@ eRenderStatus RendererCL<T, bucketT>::RunDensityFilter()
{
//t2.Tic();
if (b && !(b = RunDensityFilterPrivate(kernelIndex, gridW, gridH, blockSizeW, blockSizeH, chunkSizeW, chunkSizeH, colChunk, rowChunk))) { m_Abort = true; AddToReport(loc); }
//t2.Toc(loc);
if (b && m_Callback)
@ -1234,6 +1274,7 @@ eRenderStatus RendererCL<T, bucketT>::RunDensityFilter()
}
}
}
#endif
if (b && m_Callback)
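
The chunked loops above avoid overlapping density-filter writes without atomics: blocks processed in the same launch are spaced a whole chunk apart (gap + 1, with the gap at least as wide as the filter), and successive launches shift the grid by one block offset. The following is a hedged sketch of the traversal order only, with hypothetical names and no actual filtering.

#include <iostream>

int main()
{
	const int gridW = 8, gridH = 8;          //Blocks covering the supersampled raster.
	const int chunkSizeW = 3, chunkSizeH = 3;//gap + 1; gap chosen >= filter width in blocks.

	for (int rowChunk = 0; rowChunk < chunkSizeH; rowChunk++)
		for (int colChunk = 0; colChunk < chunkSizeW; colChunk++)
		{
			//One launch: every chunkSize-th block, offset by (colChunk, rowChunk),
			//so the filter footprints of blocks in the same launch cannot overlap.
			int launched = 0;

			for (int y = rowChunk; y < gridH; y += chunkSizeH)
				for (int x = colChunk; x < gridW; x += chunkSizeW)
					launched++;//A real launch would run the DE kernel on block (x, y) here.

			std::cout << "pass (" << colChunk << "," << rowChunk << "): " << launched << " blocks\n";
		}

	return 0;
}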
@ -1273,11 +1314,11 @@ eRenderStatus RendererCL<T, bucketT>::RunFinalAccum()
if (!m_Abort && accumKernelIndex != -1)
{
auto& wrapper = m_Devices[0]->m_Wrapper;
//This is needed with or without early clip.
ConvertSpatialFilter();
if (b && !(b = wrapper.AddAndWriteBuffer(m_SpatialFilterParamsBufferName, reinterpret_cast<void*>(&m_SpatialFilterCL), sizeof(m_SpatialFilterCL)))) { AddToReport(loc); }
if (b && !(b = wrapper.AddAndWriteBuffer(m_CurvesCsaName, m_Csa.m_Entries.data(), SizeOf(m_Csa.m_Entries)))) { AddToReport(loc); }
//Since early clip requires gamma correcting the entire accumulator first,
@ -1297,6 +1338,7 @@ eRenderStatus RendererCL<T, bucketT>::RunFinalAccum()
OpenCLWrapper::MakeEvenGridDims(blockW, blockH, gridW, gridH);
if (b && !(b = wrapper.SetBufferArg(gammaCorrectKernelIndex, argIndex++, m_AccumBufferName))) { AddToReport(loc); }//Accumulator.
if (b && !(b = wrapper.SetBufferArg(gammaCorrectKernelIndex, argIndex++, m_SpatialFilterParamsBufferName))) { AddToReport(loc); }//SpatialFilterCL.
if (b && !(b = wrapper.RunKernel(gammaCorrectKernelIndex, gridW, gridH, 1, blockW, blockH, 1))) { AddToReport(loc); }
@ -1316,13 +1358,19 @@ eRenderStatus RendererCL<T, bucketT>::RunFinalAccum()
OpenCLWrapper::MakeEvenGridDims(blockW, blockH, gridW, gridH);
if (b && !(b = wrapper.SetBufferArg(accumKernelIndex, argIndex++, m_AccumBufferName))) { AddToReport(loc); }//Accumulator.
if (b && !(b = wrapper.SetImageArg(accumKernelIndex, argIndex++, wrapper.Shared(), m_FinalImageName))) { AddToReport(loc); }//Final image.
if (b && !(b = wrapper.SetBufferArg(accumKernelIndex, argIndex++, m_SpatialFilterParamsBufferName))) { AddToReport(loc); }//SpatialFilterCL.
if (b && !(b = wrapper.SetBufferArg(accumKernelIndex, argIndex++, m_SpatialFilterCoefsBufferName))) { AddToReport(loc); }//Filter coefs.
if (b && !(b = wrapper.SetBufferArg(accumKernelIndex, argIndex++, m_CurvesCsaName))) { AddToReport(loc); }//Curve points.
if (b && !(b = wrapper.SetArg (accumKernelIndex, argIndex++, curvesSet))) { AddToReport(loc); }//Do curves.
if (b && !(b = wrapper.SetArg (accumKernelIndex, argIndex++, bucketT(alphaBase)))) { AddToReport(loc); }//Alpha base.
if (b && !(b = wrapper.SetArg (accumKernelIndex, argIndex++, bucketT(alphaScale)))) { AddToReport(loc); }//Alpha scale.
if (b && wrapper.Shared())
@ -1371,13 +1419,15 @@ bool RendererCL<T, bucketT>::ClearBuffer(size_t device, const string& bufferName
size_t blockH = m_Devices[device]->Nvidia() ? 32 : 16;
size_t gridW = width * elementSize;
size_t gridH = height;
b = true;
OpenCLWrapper::MakeEvenGridDims(blockW, blockH, gridW, gridH);
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, bufferName))) { AddToReport(loc); }//Buffer of byte.
if (b && !(b = wrapper.SetArg(kernelIndex, argIndex++, width * elementSize))) { AddToReport(loc); }//Width.
if (b && !(b = wrapper.SetArg(kernelIndex, argIndex++, height))) { AddToReport(loc); }//Height.
if (b && !(b = wrapper.RunKernel(kernelIndex, gridW, gridH, 1, blockW, blockH, 1))) { AddToReport(loc); }
}
else
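
MakeEvenGridDims() is called before every launch above because, as the earlier comment notes, the dimensions must be adjusted so the block count divides evenly into the grid. The following is only a guess at the kind of adjustment it performs (the real implementation may differ): round each grid dimension up to the next multiple of its block size so whole blocks cover the whole buffer.

#include <cstddef>
#include <iostream>

void MakeEvenGridDimsSketch(size_t blockW, size_t blockH, size_t& gridW, size_t& gridH)
{
	if (gridW % blockW != 0)
		gridW += blockW - (gridW % blockW);

	if (gridH % blockH != 0)
		gridH += blockH - (gridH % blockH);
}

int main()
{
	size_t gridW = 1921, gridH = 1081;
	MakeEvenGridDimsSketch(32, 16, gridW, gridH);
	std::cout << gridW << " x " << gridH << std::endl;//1952 x 1088.
	return 0;
}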
@ -1416,21 +1466,31 @@ bool RendererCL<T, bucketT>::RunDensityFilterPrivate(size_t kernelIndex, size_t
auto& wrapper = m_Devices[0]->m_Wrapper;
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex, m_HistBufferName))) { AddToReport(loc); } argIndex++;//Histogram.
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex, m_AccumBufferName))) { AddToReport(loc); } argIndex++;//Accumulator.
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex, m_DEFilterParamsBufferName))) { AddToReport(loc); } argIndex++;//FlameDensityFilterCL.
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex, m_DECoefsBufferName))) { AddToReport(loc); } argIndex++;//Coefs.
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex, m_DEWidthsBufferName))) { AddToReport(loc); } argIndex++;//Widths.
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex, m_DECoefIndicesBufferName))) { AddToReport(loc); } argIndex++;//Coef indices.
if (b && !(b = wrapper.SetArg(kernelIndex, argIndex, chunkSizeW))) { AddToReport(loc); } argIndex++;//Chunk size width (gapW + 1).
if (b && !(b = wrapper.SetArg(kernelIndex, argIndex, chunkSizeH))) { AddToReport(loc); } argIndex++;//Chunk size height (gapH + 1).
if (b && !(b = wrapper.SetArg(kernelIndex, argIndex, chunkW))) { AddToReport(loc); } argIndex++;//Column chunk.
if (b && !(b = wrapper.SetArg(kernelIndex, argIndex, chunkH))) { AddToReport(loc); } argIndex++;//Row chunk.
//t.Toc(__FUNCTION__ " set args");
//t.Tic();
if (b && !(b = wrapper.RunKernel(kernelIndex, gridW, gridH, 1, blockW, blockH, 1))) { AddToReport(loc); }//Method 7, accumulating to temp box area.
//t.Toc(__FUNCTION__ " RunKernel()");
//t.Toc(__FUNCTION__ " RunKernel()");
return b;
}
@ -1490,6 +1550,7 @@ int RendererCL<T, bucketT>::MakeAndGetFinalAccumProgram(double& alphaBase, doubl
if ((kernelIndex = wrapper.FindKernelIndex(finalAccumEntryPoint)) == -1)//Has not been built yet.
{
auto& kernel = m_FinalAccumOpenCLKernelCreator.FinalAccumKernel(EarlyClip(), Renderer<T, bucketT>::NumChannels(), Transparency());
if (wrapper.AddProgram(finalAccumEntryPoint, kernel, finalAccumEntryPoint, m_DoublePrecision))
kernelIndex = wrapper.FindKernelIndex(finalAccumEntryPoint);//Try to find it again, it will be present if successfully built.
else
@ -1562,10 +1623,15 @@ bool RendererCL<T, bucketT>::SumDeviceHist()
cl_uint argIndex = 0;
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_AccumBufferName))) { break; }//Source buffer of v4bT.
if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_HistBufferName))) { break; }//Dest buffer of v4bT.
if (b && !(b = wrapper.SetArg (kernelIndex, argIndex++, uint(SuperRasW())))) { break; }//Width in pixels.
if (b && !(b = wrapper.SetArg (kernelIndex, argIndex++, uint(SuperRasH())))) { break; }//Height in pixels.
if (b && !(b = wrapper.SetArg (kernelIndex, argIndex++, (device == m_Devices.size() - 1) ? 1 : 0))) { break; }//Clear the source buffer on the last device.
if (b && !(b = wrapper.RunKernel (kernelIndex, gridW, gridH, 1, blockW, blockH, 1))) { break; }
}
else
@ -1579,11 +1645,10 @@ bool RendererCL<T, bucketT>::SumDeviceHist()
}
}
}
if (!b)
{
ostringstream os;
os << loc << ": failed to sum histograms from the secondary device(s) to the primary device.";
AddToReport(os.str());
}
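
SumDeviceHist() above accumulates each secondary device's histogram into the primary's and, per the flag passed on the last device, zeroes the source buffer afterward. A hedged CPU-side analogue with hypothetical names, shown only to make the data flow concrete:

#include <iostream>
#include <vector>

void SumHist(std::vector<float>& dst, std::vector<float>& src, bool clearSrc)
{
	for (size_t i = 0; i < dst.size() && i < src.size(); i++)
	{
		dst[i] += src[i];

		if (clearSrc)
			src[i] = 0;
	}
}

int main()
{
	std::vector<float> primary = { 1, 2, 3 }, secondary = { 4, 5, 6 };
	SumHist(primary, secondary, true);
	std::cout << primary[0] << " " << secondary[0] << std::endl;//5 0
	return 0;
}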
@ -1639,7 +1704,6 @@ void RendererCL<T, bucketT>::ConvertSpatialFilter()
if (m_SpatialFilter.get())
{
this->PrepFinalAccumVals(background, g, linRange, vibrancy);
m_SpatialFilterCL.m_SuperRasW = uint(SuperRasW());
m_SpatialFilterCL.m_SuperRasH = uint(SuperRasH());
m_SpatialFilterCL.m_FinalRasW = uint(FinalRasW());
@ -1670,7 +1734,6 @@ template <typename T, typename bucketT>
void RendererCL<T, bucketT>::ConvertEmber(Ember<T>& ember, EmberCL<T>& emberCL, vector<XformCL<T>>& xformsCL)
{
memset(&emberCL, 0, sizeof(EmberCL<T>));//Might not really be needed.
emberCL.m_RotA = m_RotMat.A();
emberCL.m_RotB = m_RotMat.B();
emberCL.m_RotD = m_RotMat.D();
@ -1688,21 +1751,18 @@ void RendererCL<T, bucketT>::ConvertEmber(Ember<T>& ember, EmberCL<T>& emberCL,
for (size_t i = 0; i < ember.TotalXformCount() && i < xformsCL.size(); i++)
{
Xform<T>* xform = ember.GetTotalXform(i);
xformsCL[i].m_A = xform->m_Affine.A();
xformsCL[i].m_B = xform->m_Affine.B();
xformsCL[i].m_C = xform->m_Affine.C();
xformsCL[i].m_D = xform->m_Affine.D();
xformsCL[i].m_E = xform->m_Affine.E();
xformsCL[i].m_F = xform->m_Affine.F();
xformsCL[i].m_PostA = xform->m_Post.A();
xformsCL[i].m_PostB = xform->m_Post.B();
xformsCL[i].m_PostC = xform->m_Post.C();
xformsCL[i].m_PostD = xform->m_Post.D();
xformsCL[i].m_PostE = xform->m_Post.E();
xformsCL[i].m_PostF = xform->m_Post.F();
xformsCL[i].m_DirectColor = xform->m_DirectColor;
xformsCL[i].m_ColorSpeedCache = xform->ColorSpeedCache();
xformsCL[i].m_OneMinusColorCache = xform->OneMinusColorCache();

View File

@ -41,57 +41,57 @@ public:
template <typename T, typename bucketT>
class EMBERCL_API RendererCL : public Renderer<T, bucketT>, public RendererCLBase
{
using EmberNs::Renderer<T, bucketT>::RendererBase::Abort;
using EmberNs::Renderer<T, bucketT>::RendererBase::EarlyClip;
using EmberNs::Renderer<T, bucketT>::RendererBase::Transparency;
using EmberNs::Renderer<T, bucketT>::RendererBase::EnterResize;
using EmberNs::Renderer<T, bucketT>::RendererBase::LeaveResize;
using EmberNs::Renderer<T, bucketT>::RendererBase::FinalRasW;
using EmberNs::Renderer<T, bucketT>::RendererBase::FinalRasH;
using EmberNs::Renderer<T, bucketT>::RendererBase::SuperRasW;
using EmberNs::Renderer<T, bucketT>::RendererBase::SuperRasH;
using EmberNs::Renderer<T, bucketT>::RendererBase::SuperSize;
using EmberNs::Renderer<T, bucketT>::RendererBase::BytesPerChannel;
using EmberNs::Renderer<T, bucketT>::RendererBase::TemporalSamples;
using EmberNs::Renderer<T, bucketT>::RendererBase::ItersPerTemporalSample;
using EmberNs::Renderer<T, bucketT>::RendererBase::FuseCount;
using EmberNs::Renderer<T, bucketT>::RendererBase::DensityFilterOffset;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_ProgressParameter;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_YAxisUp;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_LockAccum;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_Abort;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_NumChannels;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_LastIter;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_LastIterPercent;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_Stats;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_Callback;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_Rand;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_RenderTimer;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_IterTimer;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_ProgressTimer;
using EmberNs::Renderer<T, bucketT>::RendererBase::EmberReport::AddToReport;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_ResizeCs;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_ProcessAction;
using EmberNs::Renderer<T, bucketT>::m_RotMat;
using EmberNs::Renderer<T, bucketT>::m_Ember;
using EmberNs::Renderer<T, bucketT>::m_Csa;
using EmberNs::Renderer<T, bucketT>::m_CurvesSet;
using EmberNs::Renderer<T, bucketT>::CenterX;
using EmberNs::Renderer<T, bucketT>::CenterY;
using EmberNs::Renderer<T, bucketT>::K1;
using EmberNs::Renderer<T, bucketT>::K2;
using EmberNs::Renderer<T, bucketT>::Supersample;
using EmberNs::Renderer<T, bucketT>::HighlightPower;
using EmberNs::Renderer<T, bucketT>::HistBuckets;
using EmberNs::Renderer<T, bucketT>::AccumulatorBuckets;
using EmberNs::Renderer<T, bucketT>::GetDensityFilter;
using EmberNs::Renderer<T, bucketT>::GetSpatialFilter;
using EmberNs::Renderer<T, bucketT>::CoordMap;
using EmberNs::Renderer<T, bucketT>::XformDistributions;
using EmberNs::Renderer<T, bucketT>::XformDistributionsSize;
using EmberNs::Renderer<T, bucketT>::m_Dmap;
using EmberNs::Renderer<T, bucketT>::m_DensityFilter;
using EmberNs::Renderer<T, bucketT>::m_SpatialFilter;
using EmberNs::Renderer<T, bucketT>::RendererBase::Abort;
using EmberNs::Renderer<T, bucketT>::RendererBase::EarlyClip;
using EmberNs::Renderer<T, bucketT>::RendererBase::Transparency;
using EmberNs::Renderer<T, bucketT>::RendererBase::EnterResize;
using EmberNs::Renderer<T, bucketT>::RendererBase::LeaveResize;
using EmberNs::Renderer<T, bucketT>::RendererBase::FinalRasW;
using EmberNs::Renderer<T, bucketT>::RendererBase::FinalRasH;
using EmberNs::Renderer<T, bucketT>::RendererBase::SuperRasW;
using EmberNs::Renderer<T, bucketT>::RendererBase::SuperRasH;
using EmberNs::Renderer<T, bucketT>::RendererBase::SuperSize;
using EmberNs::Renderer<T, bucketT>::RendererBase::BytesPerChannel;
using EmberNs::Renderer<T, bucketT>::RendererBase::TemporalSamples;
using EmberNs::Renderer<T, bucketT>::RendererBase::ItersPerTemporalSample;
using EmberNs::Renderer<T, bucketT>::RendererBase::FuseCount;
using EmberNs::Renderer<T, bucketT>::RendererBase::DensityFilterOffset;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_ProgressParameter;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_YAxisUp;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_LockAccum;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_Abort;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_NumChannels;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_LastIter;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_LastIterPercent;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_Stats;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_Callback;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_Rand;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_RenderTimer;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_IterTimer;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_ProgressTimer;
using EmberNs::Renderer<T, bucketT>::RendererBase::EmberReport::AddToReport;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_ResizeCs;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_ProcessAction;
using EmberNs::Renderer<T, bucketT>::m_RotMat;
using EmberNs::Renderer<T, bucketT>::m_Ember;
using EmberNs::Renderer<T, bucketT>::m_Csa;
using EmberNs::Renderer<T, bucketT>::m_CurvesSet;
using EmberNs::Renderer<T, bucketT>::CenterX;
using EmberNs::Renderer<T, bucketT>::CenterY;
using EmberNs::Renderer<T, bucketT>::K1;
using EmberNs::Renderer<T, bucketT>::K2;
using EmberNs::Renderer<T, bucketT>::Supersample;
using EmberNs::Renderer<T, bucketT>::HighlightPower;
using EmberNs::Renderer<T, bucketT>::HistBuckets;
using EmberNs::Renderer<T, bucketT>::AccumulatorBuckets;
using EmberNs::Renderer<T, bucketT>::GetDensityFilter;
using EmberNs::Renderer<T, bucketT>::GetSpatialFilter;
using EmberNs::Renderer<T, bucketT>::CoordMap;
using EmberNs::Renderer<T, bucketT>::XformDistributions;
using EmberNs::Renderer<T, bucketT>::XformDistributionsSize;
using EmberNs::Renderer<T, bucketT>::m_Dmap;
using EmberNs::Renderer<T, bucketT>::m_DensityFilter;
using EmberNs::Renderer<T, bucketT>::m_SpatialFilter;
public:
RendererCL(const vector<pair<size_t, size_t>>& devices, bool shared = false, GLuint outputTexID = 0);
@ -200,6 +200,7 @@ private:
string m_EmberBufferName;
string m_XformsBufferName;
string m_ParVarsBufferName;
string m_GlobalSharedBufferName;
string m_SeedsBufferName;
string m_DistBufferName;
string m_CarToRasBufferName;
@ -233,6 +234,7 @@ private:
DEOpenCLKernelCreator m_DEOpenCLKernelCreator;
FinalAccumOpenCLKernelCreator m_FinalAccumOpenCLKernelCreator;
pair<string, vector<T>> m_Params;
pair<string, vector<T>> m_GlobalShared;
vector<unique_ptr<RendererClDevice>> m_Devices;
Ember<T> m_LastBuiltEmber;
};

View File

@ -13,7 +13,6 @@ namespace EmberCLns
/// <param name="shared">True if shared with OpenGL, else false.</param>
/// <returns>True if success, else false.</returns>
RendererClDevice::RendererClDevice(bool doublePrec, size_t platform, size_t device, bool shared)
: m_Info(OpenCLInfo::Instance())
{
m_Init = false;
m_Shared = shared;
@ -22,6 +21,7 @@ RendererClDevice::RendererClDevice(bool doublePrec, size_t platform, size_t devi
m_Calls = 0;
m_PlatformIndex = platform;
m_DeviceIndex = device;
m_Info = OpenCLInfo::Instance();
}
/// <summary>
@ -40,7 +40,7 @@ bool RendererClDevice::Init()
if (b && m_Wrapper.Ok() && !m_Init)
{
m_NVidia = ToLower(m_Info.PlatformName(m_PlatformIndex)).find_first_of("nvidia") != string::npos && m_Wrapper.LocalMemSize() > (32 * 1024);
m_NVidia = ToLower(m_Info->PlatformName(m_PlatformIndex)).find_first_of("nvidia") != string::npos && m_Wrapper.LocalMemSize() > (32 * 1024);
m_WarpSize = m_NVidia ? 32 : 64;
m_Init = true;
}

View File

@ -37,6 +37,6 @@ private:
size_t m_WarpSize;
size_t m_PlatformIndex;
size_t m_DeviceIndex;
OpenCLInfo& m_Info;
shared_ptr<OpenCLInfo> m_Info;
};
}
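
The RendererClDevice changes above replace the OpenCLInfo& member with a shared_ptr<OpenCLInfo> assigned from OpenCLInfo::Instance(), so Instance() now hands out a shared_ptr. The sketch below shows one way such an accessor can be written; it is not the actual Ember Singleton class, and InfoSketch is a hypothetical name. In this sketch the static state is a weak_ptr, so the instance is released once the last caller drops its shared_ptr and recreated on the next call.

#include <iostream>
#include <memory>
#include <mutex>

class InfoSketch
{
public:
	static std::shared_ptr<InfoSketch> Instance()
	{
		static std::mutex m;
		static std::weak_ptr<InfoSketch> weak;//Non-owning, so lifetime follows the callers.
		std::lock_guard<std::mutex> lock(m);
		auto strong = weak.lock();

		if (!strong)
		{
			strong = std::shared_ptr<InfoSketch>(new InfoSketch());//Private ctor ok inside the class.
			weak = strong;
		}

		return strong;
	}

private:
	InfoSketch() = default;
};

int main()
{
	auto a = InfoSketch::Instance();
	auto b = InfoSketch::Instance();
	std::cout << std::boolalpha << (a == b) << std::endl;//true: same instance while both are held.
	return 0;
}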