--User changes

-Add new preset dimensions to the right-click menu of the width and height fields in the editor (see the sketch after this list).
-Change QSS stylesheets to properly handle tabs.
-Make tabs rectangular by default. For some reason, they had always been triangular.
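
The following is a minimal, hypothetical Qt 6 sketch of how preset dimensions could be appended to the right-click menu of a width or height spin box. It is not this project's actual code; the SizeSpinBox name and the preset values are assumptions made for illustration only.

#include <QApplication>
#include <QSpinBox>
#include <QLineEdit>
#include <QMenu>
#include <QAction>
#include <QContextMenuEvent>

class SizeSpinBox : public QSpinBox
{
public:
    using QSpinBox::QSpinBox;

protected:
    // Extend the spin box's standard edit menu with preset size entries.
    void contextMenuEvent(QContextMenuEvent* e) override
    {
        QMenu* menu = lineEdit()->createStandardContextMenu();
        menu->addSeparator();
        const int presets[] = { 1280, 1920, 2560, 3840 };// Placeholder values, not the actual presets.

        for (int preset : presets)
        {
            QAction* action = menu->addAction(QString::number(preset));
            connect(action, &QAction::triggered, this, [this, preset]() { setValue(preset); });
        }

        menu->exec(e->globalPos());
        delete menu;
    }
};

int main(int argc, char** argv)
{
    QApplication app(argc, argv);
    SizeSpinBox box;
    box.setRange(16, 16384);
    box.setValue(1920);
    box.show();
    return app.exec();
}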

--Bug fixes
-Incremental rendering times in the editor were wrong.

--Code changes
-Migrate to Qt6. There is probably more work to be done here.
-Migrate to VS2022.
-Migrate to the WiX 4 installer.
-Change the installer to install to Program Files for all users.
-Fix many VS2022 code analysis warnings.
-No longer use the byte typedef, because std::byte is now a standard type. Revert all uses back to unsigned char.
-Upgrade the OpenCL headers to version 3.0 and keep them locally in the repository rather than searching for system copies.
-No longer link to Nvidia- or AMD-specific OpenCL libraries. Link against the generic OpenCL installation located at OCL_ROOT instead.
-Add the ability to change OpenCL grid dimensions. This was done to investigate possible performance improvements, but it made no difference (see the sketch after these notes).

This has not been verified on Linux or Mac yet.
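
The following is a minimal sketch, again hypothetical rather than this project's actual code, of the two OpenCL items above: enumerating a device through the generic ICD loader (only the OpenCL library under OCL_ROOT is linked, no vendor-specific library) and launching a kernel with explicit global and local work sizes, which is what the grid dimensions option is assumed to control. The noop kernel and the 1024x1024 / 16x16 sizes are placeholders.

#define CL_TARGET_OPENCL_VERSION 300
#include <CL/cl.h>
#include <stdio.h>

int main(void)
{
    cl_uint num_platforms = 0;
    clGetPlatformIDs(0, NULL, &num_platforms);// The generic loader enumerates every installed vendor driver.
    if (num_platforms == 0) { printf("No OpenCL platforms found.\n"); return 1; }

    cl_platform_id platform;
    clGetPlatformIDs(1, &platform, NULL);
    cl_device_id device;

    if (clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL) != CL_SUCCESS)
        clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL);

    char name[256] = { 0 };
    clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(name), name, NULL);
    printf("Using device: %s\n", name);
    cl_int err = CL_SUCCESS;
    cl_context ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    cl_command_queue queue = clCreateCommandQueueWithProperties(ctx, device, NULL, &err);

    // Trivial kernel so the launch below has something to run.
    const char* src = "__kernel void noop() { }";
    cl_program prog = clCreateProgramWithSource(ctx, 1, &src, NULL, &err);
    clBuildProgram(prog, 1, &device, NULL, NULL, NULL);
    cl_kernel kernel = clCreateKernel(prog, "noop", &err);

    // The "grid dimensions": a 2D global size and an explicit 2D local size.
    size_t global[2] = { 1024, 1024 };
    size_t local[2] = { 16, 16 };
    err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, local, 0, NULL, NULL);
    clFinish(queue);
    printf("Kernel launch returned %d\n", (int)err);

    clReleaseKernel(kernel);
    clReleaseProgram(prog);
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);
    return 0;
}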
Person
2023-04-25 17:59:54 -06:00
parent 64d4470b12
commit 1dfbd4eff2
306 changed files with 514515 additions and 491207 deletions

1945
Source/EmberCL/CL/cl.h
File diff suppressed because it is too large.

Source/EmberCL/CL/cl_d3d10.h
@@ -0,0 +1,241 @@
/*******************************************************************************
* Copyright (c) 2008-2023 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef OPENCL_CL_D3D10_H_
#define OPENCL_CL_D3D10_H_
/*
** This header is generated from the Khronos OpenCL XML API Registry.
*/
#if defined(_MSC_VER)
#if _MSC_VER >=1500
#pragma warning( push )
#pragma warning( disable : 4201 )
#pragma warning( disable : 5105 )
#endif
#endif
#include <d3d10.h>
#if defined(_MSC_VER)
#if _MSC_VER >=1500
#pragma warning( pop )
#endif
#endif
#include <CL/cl.h>
/* CL_NO_PROTOTYPES implies CL_NO_EXTENSION_PROTOTYPES: */
#if defined(CL_NO_PROTOTYPES) && !defined(CL_NO_EXTENSION_PROTOTYPES)
#define CL_NO_EXTENSION_PROTOTYPES
#endif
/* CL_NO_EXTENSION_PROTOTYPES implies
CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES and
CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES: */
#if defined(CL_NO_EXTENSION_PROTOTYPES) && \
!defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)
#define CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES
#endif
#if defined(CL_NO_EXTENSION_PROTOTYPES) && \
!defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)
#define CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES
#endif
#ifdef __cplusplus
extern "C" {
#endif
/***************************************************************
* cl_khr_d3d10_sharing
***************************************************************/
#define cl_khr_d3d10_sharing 1
#define CL_KHR_D3D10_SHARING_EXTENSION_NAME \
"cl_khr_d3d10_sharing"
typedef cl_uint cl_d3d10_device_source_khr;
typedef cl_uint cl_d3d10_device_set_khr;
/* Error codes */
#define CL_INVALID_D3D10_DEVICE_KHR -1002
#define CL_INVALID_D3D10_RESOURCE_KHR -1003
#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004
#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005
/* cl_d3d10_device_source_khr */
#define CL_D3D10_DEVICE_KHR 0x4010
#define CL_D3D10_DXGI_ADAPTER_KHR 0x4011
/* cl_d3d10_device_set_khr */
#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012
#define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013
/* cl_context_info */
#define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014
#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C
/* cl_mem_info */
#define CL_MEM_D3D10_RESOURCE_KHR 0x4015
/* cl_image_info */
#define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017
#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018
typedef cl_int (CL_API_CALL *
clGetDeviceIDsFromD3D10KHR_fn)(
cl_platform_id platform,
cl_d3d10_device_source_khr d3d_device_source,
void* d3d_object,
cl_d3d10_device_set_khr d3d_device_set,
cl_uint num_entries,
cl_device_id* devices,
cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_0;
typedef cl_mem (CL_API_CALL *
clCreateFromD3D10BufferKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Buffer* resource,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef cl_mem (CL_API_CALL *
clCreateFromD3D10Texture2DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Texture2D* resource,
UINT subresource,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef cl_mem (CL_API_CALL *
clCreateFromD3D10Texture3DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Texture3D* resource,
UINT subresource,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef cl_int (CL_API_CALL *
clEnqueueAcquireD3D10ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_0;
typedef cl_int (CL_API_CALL *
clEnqueueReleaseD3D10ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_0;
#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceIDsFromD3D10KHR(
cl_platform_id platform,
cl_d3d10_device_source_khr d3d_device_source,
void* d3d_object,
cl_d3d10_device_set_khr d3d_device_set,
cl_uint num_entries,
cl_device_id* devices,
cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromD3D10BufferKHR(
cl_context context,
cl_mem_flags flags,
ID3D10Buffer* resource,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromD3D10Texture2DKHR(
cl_context context,
cl_mem_flags flags,
ID3D10Texture2D* resource,
UINT subresource,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromD3D10Texture3DKHR(
cl_context context,
cl_mem_flags flags,
ID3D10Texture3D* resource,
UINT subresource,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireD3D10ObjectsKHR(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseD3D10ObjectsKHR(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_0;
#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */
/***************************************************************
* cl_intel_sharing_format_query_d3d10
***************************************************************/
#define cl_intel_sharing_format_query_d3d10 1
#define CL_INTEL_SHARING_FORMAT_QUERY_D3D10_EXTENSION_NAME \
"cl_intel_sharing_format_query_d3d10"
/* when cl_khr_d3d10_sharing is supported */
typedef cl_int (CL_API_CALL *
clGetSupportedD3D10TextureFormatsINTEL_fn)(
cl_context context,
cl_mem_flags flags,
cl_mem_object_type image_type,
cl_uint num_entries,
DXGI_FORMAT* d3d10_formats,
cl_uint* num_texture_formats) ;
#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)
extern CL_API_ENTRY cl_int CL_API_CALL
clGetSupportedD3D10TextureFormatsINTEL(
cl_context context,
cl_mem_flags flags,
cl_mem_object_type image_type,
cl_uint num_entries,
DXGI_FORMAT* d3d10_formats,
cl_uint* num_texture_formats) ;
#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */
#ifdef __cplusplus
}
#endif
#endif /* OPENCL_CL_D3D10_H_ */

Source/EmberCL/CL/cl_d3d11.h
@@ -0,0 +1,243 @@
/*******************************************************************************
* Copyright (c) 2008-2023 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef OPENCL_CL_D3D11_H_
#define OPENCL_CL_D3D11_H_
/*
** This header is generated from the Khronos OpenCL XML API Registry.
*/
#if defined(_MSC_VER)
#if _MSC_VER >=1500
#pragma warning( push )
#pragma warning( disable : 4201 )
#pragma warning( disable : 5105 )
#endif
#endif
#include <d3d11.h>
#if defined(_MSC_VER)
#if _MSC_VER >=1500
#pragma warning( pop )
#endif
#endif
#include <CL/cl.h>
/* CL_NO_PROTOTYPES implies CL_NO_EXTENSION_PROTOTYPES: */
#if defined(CL_NO_PROTOTYPES) && !defined(CL_NO_EXTENSION_PROTOTYPES)
#define CL_NO_EXTENSION_PROTOTYPES
#endif
/* CL_NO_EXTENSION_PROTOTYPES implies
CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES and
CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES: */
#if defined(CL_NO_EXTENSION_PROTOTYPES) && \
!defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)
#define CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES
#endif
#if defined(CL_NO_EXTENSION_PROTOTYPES) && \
!defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)
#define CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES
#endif
#ifdef __cplusplus
extern "C" {
#endif
/***************************************************************
* cl_khr_d3d11_sharing
***************************************************************/
#define cl_khr_d3d11_sharing 1
#define CL_KHR_D3D11_SHARING_EXTENSION_NAME \
"cl_khr_d3d11_sharing"
typedef cl_uint cl_d3d11_device_source_khr;
typedef cl_uint cl_d3d11_device_set_khr;
/* Error codes */
#define CL_INVALID_D3D11_DEVICE_KHR -1006
#define CL_INVALID_D3D11_RESOURCE_KHR -1007
#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR -1008
#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR -1009
/* cl_d3d11_device_source_khr */
#define CL_D3D11_DEVICE_KHR 0x4019
#define CL_D3D11_DXGI_ADAPTER_KHR 0x401A
/* cl_d3d11_device_set_khr */
#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR 0x401B
#define CL_ALL_DEVICES_FOR_D3D11_KHR 0x401C
/* cl_context_info */
#define CL_CONTEXT_D3D11_DEVICE_KHR 0x401D
#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D
/* cl_mem_info */
#define CL_MEM_D3D11_RESOURCE_KHR 0x401E
/* cl_image_info */
#define CL_IMAGE_D3D11_SUBRESOURCE_KHR 0x401F
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR 0x4020
#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR 0x4021
typedef cl_int (CL_API_CALL *
clGetDeviceIDsFromD3D11KHR_fn)(
cl_platform_id platform,
cl_d3d11_device_source_khr d3d_device_source,
void* d3d_object,
cl_d3d11_device_set_khr d3d_device_set,
cl_uint num_entries,
cl_device_id* devices,
cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_2;
typedef cl_mem (CL_API_CALL *
clCreateFromD3D11BufferKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Buffer* resource,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef cl_mem (CL_API_CALL *
clCreateFromD3D11Texture2DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Texture2D* resource,
UINT subresource,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef cl_mem (CL_API_CALL *
clCreateFromD3D11Texture3DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Texture3D* resource,
UINT subresource,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef cl_int (CL_API_CALL *
clEnqueueAcquireD3D11ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_2;
typedef cl_int (CL_API_CALL *
clEnqueueReleaseD3D11ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_2;
#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceIDsFromD3D11KHR(
cl_platform_id platform,
cl_d3d11_device_source_khr d3d_device_source,
void* d3d_object,
cl_d3d11_device_set_khr d3d_device_set,
cl_uint num_entries,
cl_device_id* devices,
cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromD3D11BufferKHR(
cl_context context,
cl_mem_flags flags,
ID3D11Buffer* resource,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromD3D11Texture2DKHR(
cl_context context,
cl_mem_flags flags,
ID3D11Texture2D* resource,
UINT subresource,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromD3D11Texture3DKHR(
cl_context context,
cl_mem_flags flags,
ID3D11Texture3D* resource,
UINT subresource,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireD3D11ObjectsKHR(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseD3D11ObjectsKHR(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_2;
#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */
/***************************************************************
* cl_intel_sharing_format_query_d3d11
***************************************************************/
#define cl_intel_sharing_format_query_d3d11 1
#define CL_INTEL_SHARING_FORMAT_QUERY_D3D11_EXTENSION_NAME \
"cl_intel_sharing_format_query_d3d11"
/* when cl_khr_d3d11_sharing is supported */
typedef cl_int (CL_API_CALL *
clGetSupportedD3D11TextureFormatsINTEL_fn)(
cl_context context,
cl_mem_flags flags,
cl_mem_object_type image_type,
cl_uint plane,
cl_uint num_entries,
DXGI_FORMAT* d3d11_formats,
cl_uint* num_texture_formats) ;
#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)
extern CL_API_ENTRY cl_int CL_API_CALL
clGetSupportedD3D11TextureFormatsINTEL(
cl_context context,
cl_mem_flags flags,
cl_mem_object_type image_type,
cl_uint plane,
cl_uint num_entries,
DXGI_FORMAT* d3d11_formats,
cl_uint* num_texture_formats) ;
#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */
#ifdef __cplusplus
}
#endif
#endif /* OPENCL_CL_D3D11_H_ */

Source/EmberCL/CL/cl_dx9_media_sharing.h
@@ -0,0 +1,350 @@
/*******************************************************************************
* Copyright (c) 2008-2023 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef OPENCL_CL_DX9_MEDIA_SHARING_H_
#define OPENCL_CL_DX9_MEDIA_SHARING_H_
/*
** This header is generated from the Khronos OpenCL XML API Registry.
*/
#if defined(_WIN32)
#if defined(_MSC_VER)
#if _MSC_VER >=1500
#pragma warning( push )
#pragma warning( disable : 4201 )
#pragma warning( disable : 5105 )
#endif
#endif
#include <d3d9.h>
#if defined(_MSC_VER)
#if _MSC_VER >=1500
#pragma warning( pop )
#endif
#endif
#endif
#include <CL/cl.h>
/* CL_NO_PROTOTYPES implies CL_NO_EXTENSION_PROTOTYPES: */
#if defined(CL_NO_PROTOTYPES) && !defined(CL_NO_EXTENSION_PROTOTYPES)
#define CL_NO_EXTENSION_PROTOTYPES
#endif
/* CL_NO_EXTENSION_PROTOTYPES implies
CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES and
CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES: */
#if defined(CL_NO_EXTENSION_PROTOTYPES) && \
!defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)
#define CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES
#endif
#if defined(CL_NO_EXTENSION_PROTOTYPES) && \
!defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)
#define CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES
#endif
#ifdef __cplusplus
extern "C" {
#endif
/***************************************************************
* cl_khr_dx9_media_sharing
***************************************************************/
#define cl_khr_dx9_media_sharing 1
#define CL_KHR_DX9_MEDIA_SHARING_EXTENSION_NAME \
"cl_khr_dx9_media_sharing"
typedef cl_uint cl_dx9_media_adapter_type_khr;
typedef cl_uint cl_dx9_media_adapter_set_khr;
#if defined(_WIN32)
typedef struct _cl_dx9_surface_info_khr {
IDirect3DSurface9* resource;
HANDLE shared_handle;
} cl_dx9_surface_info_khr;
#endif /* defined(_WIN32) */
/* Error codes */
#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR -1010
#define CL_INVALID_DX9_MEDIA_SURFACE_KHR -1011
#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR -1012
#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR -1013
/* cl_media_adapter_type_khr */
#define CL_ADAPTER_D3D9_KHR 0x2020
#define CL_ADAPTER_D3D9EX_KHR 0x2021
#define CL_ADAPTER_DXVA_KHR 0x2022
/* cl_media_adapter_set_khr */
#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2023
#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2024
/* cl_context_info */
#define CL_CONTEXT_ADAPTER_D3D9_KHR 0x2025
#define CL_CONTEXT_ADAPTER_D3D9EX_KHR 0x2026
#define CL_CONTEXT_ADAPTER_DXVA_KHR 0x2027
/* cl_mem_info */
#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR 0x2028
#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR 0x2029
/* cl_image_info */
#define CL_IMAGE_DX9_MEDIA_PLANE_KHR 0x202A
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR 0x202B
#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR 0x202C
typedef cl_int (CL_API_CALL *
clGetDeviceIDsFromDX9MediaAdapterKHR_fn)(
cl_platform_id platform,
cl_uint num_media_adapters,
cl_dx9_media_adapter_type_khr* media_adapter_type,
void* media_adapters,
cl_dx9_media_adapter_set_khr media_adapter_set,
cl_uint num_entries,
cl_device_id* devices,
cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_2;
typedef cl_mem (CL_API_CALL *
clCreateFromDX9MediaSurfaceKHR_fn)(
cl_context context,
cl_mem_flags flags,
cl_dx9_media_adapter_type_khr adapter_type,
void* surface_info,
cl_uint plane,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef cl_int (CL_API_CALL *
clEnqueueAcquireDX9MediaSurfacesKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_2;
typedef cl_int (CL_API_CALL *
clEnqueueReleaseDX9MediaSurfacesKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_2;
#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceIDsFromDX9MediaAdapterKHR(
cl_platform_id platform,
cl_uint num_media_adapters,
cl_dx9_media_adapter_type_khr* media_adapter_type,
void* media_adapters,
cl_dx9_media_adapter_set_khr media_adapter_set,
cl_uint num_entries,
cl_device_id* devices,
cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromDX9MediaSurfaceKHR(
cl_context context,
cl_mem_flags flags,
cl_dx9_media_adapter_type_khr adapter_type,
void* surface_info,
cl_uint plane,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireDX9MediaSurfacesKHR(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseDX9MediaSurfacesKHR(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_2;
#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */
/***************************************************************
* cl_intel_dx9_media_sharing
***************************************************************/
#define cl_intel_dx9_media_sharing 1
#define CL_INTEL_DX9_MEDIA_SHARING_EXTENSION_NAME \
"cl_intel_dx9_media_sharing"
typedef cl_uint cl_dx9_device_source_intel;
typedef cl_uint cl_dx9_device_set_intel;
/* Error codes */
#define CL_INVALID_DX9_DEVICE_INTEL -1010
#define CL_INVALID_DX9_RESOURCE_INTEL -1011
#define CL_DX9_RESOURCE_ALREADY_ACQUIRED_INTEL -1012
#define CL_DX9_RESOURCE_NOT_ACQUIRED_INTEL -1013
/* cl_dx9_device_source_intel */
#define CL_D3D9_DEVICE_INTEL 0x4022
#define CL_D3D9EX_DEVICE_INTEL 0x4070
#define CL_DXVA_DEVICE_INTEL 0x4071
/* cl_dx9_device_set_intel */
#define CL_PREFERRED_DEVICES_FOR_DX9_INTEL 0x4024
#define CL_ALL_DEVICES_FOR_DX9_INTEL 0x4025
/* cl_context_info */
#define CL_CONTEXT_D3D9_DEVICE_INTEL 0x4026
#define CL_CONTEXT_D3D9EX_DEVICE_INTEL 0x4072
#define CL_CONTEXT_DXVA_DEVICE_INTEL 0x4073
/* cl_mem_info */
#define CL_MEM_DX9_RESOURCE_INTEL 0x4027
#define CL_MEM_DX9_SHARED_HANDLE_INTEL 0x4074
/* cl_image_info */
#define CL_IMAGE_DX9_PLANE_INTEL 0x4075
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_DX9_OBJECTS_INTEL 0x402A
#define CL_COMMAND_RELEASE_DX9_OBJECTS_INTEL 0x402B
typedef cl_int (CL_API_CALL *
clGetDeviceIDsFromDX9INTEL_fn)(
cl_platform_id platform,
cl_dx9_device_source_intel dx9_device_source,
void* dx9_object,
cl_dx9_device_set_intel dx9_device_set,
cl_uint num_entries,
cl_device_id* devices,
cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_1;
typedef cl_mem (CL_API_CALL *
clCreateFromDX9MediaSurfaceINTEL_fn)(
cl_context context,
cl_mem_flags flags,
IDirect3DSurface9* resource,
HANDLE sharedHandle,
UINT plane,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1;
typedef cl_int (CL_API_CALL *
clEnqueueAcquireDX9ObjectsINTEL_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_1;
typedef cl_int (CL_API_CALL *
clEnqueueReleaseDX9ObjectsINTEL_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_1;
#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceIDsFromDX9INTEL(
cl_platform_id platform,
cl_dx9_device_source_intel dx9_device_source,
void* dx9_object,
cl_dx9_device_set_intel dx9_device_set,
cl_uint num_entries,
cl_device_id* devices,
cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromDX9MediaSurfaceINTEL(
cl_context context,
cl_mem_flags flags,
IDirect3DSurface9* resource,
HANDLE sharedHandle,
UINT plane,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireDX9ObjectsINTEL(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseDX9ObjectsINTEL(
cl_command_queue command_queue,
cl_uint num_objects,
cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_1;
#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */
/***************************************************************
* cl_intel_sharing_format_query_dx9
***************************************************************/
#define cl_intel_sharing_format_query_dx9 1
#define CL_INTEL_SHARING_FORMAT_QUERY_DX9_EXTENSION_NAME \
"cl_intel_sharing_format_query_dx9"
/* when cl_khr_dx9_media_sharing or cl_intel_dx9_media_sharing is supported */
typedef cl_int (CL_API_CALL *
clGetSupportedDX9MediaSurfaceFormatsINTEL_fn)(
cl_context context,
cl_mem_flags flags,
cl_mem_object_type image_type,
cl_uint plane,
cl_uint num_entries,
D3DFORMAT* dx9_formats,
cl_uint* num_surface_formats) ;
#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)
extern CL_API_ENTRY cl_int CL_API_CALL
clGetSupportedDX9MediaSurfaceFormatsINTEL(
cl_context context,
cl_mem_flags flags,
cl_mem_object_type image_type,
cl_uint plane,
cl_uint num_entries,
D3DFORMAT* dx9_formats,
cl_uint* num_surface_formats) ;
#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */
#ifdef __cplusplus
}
#endif
#endif /* OPENCL_CL_DX9_MEDIA_SHARING_H_ */

Source/EmberCL/CL/cl_dx9_media_sharing_intel.h
@@ -0,0 +1,18 @@
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#include <CL/cl_dx9_media_sharing.h>
#pragma message("The Intel DX9 media sharing extensions have been moved into cl_dx9_media_sharing.h. Please include cl_dx9_media_sharing.h directly.")

167
Source/EmberCL/CL/cl_egl.h
@@ -0,0 +1,167 @@
/*******************************************************************************
* Copyright (c) 2008-2023 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef OPENCL_CL_EGL_H_
#define OPENCL_CL_EGL_H_
/*
** This header is generated from the Khronos OpenCL XML API Registry.
*/
#include <CL/cl.h>
/* CL_NO_PROTOTYPES implies CL_NO_EXTENSION_PROTOTYPES: */
#if defined(CL_NO_PROTOTYPES) && !defined(CL_NO_EXTENSION_PROTOTYPES)
#define CL_NO_EXTENSION_PROTOTYPES
#endif
/* CL_NO_EXTENSION_PROTOTYPES implies
CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES and
CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES: */
#if defined(CL_NO_EXTENSION_PROTOTYPES) && \
!defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)
#define CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES
#endif
#if defined(CL_NO_EXTENSION_PROTOTYPES) && \
!defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)
#define CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES
#endif
#ifdef __cplusplus
extern "C" {
#endif
/***************************************************************
* cl_khr_egl_image
***************************************************************/
#define cl_khr_egl_image 1
#define CL_KHR_EGL_IMAGE_EXTENSION_NAME \
"cl_khr_egl_image"
/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */
#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F
#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D
#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E
/* Error type for clCreateFromEGLImageKHR */
#define CL_INVALID_EGL_OBJECT_KHR -1093
#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092
/* CLeglImageKHR is an opaque handle to an EGLImage */
typedef void* CLeglImageKHR;
/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */
typedef void* CLeglDisplayKHR;
/* properties passed to clCreateFromEGLImageKHR */
typedef intptr_t cl_egl_image_properties_khr;
typedef cl_mem (CL_API_CALL *
clCreateFromEGLImageKHR_fn)(
cl_context context,
CLeglDisplayKHR egldisplay,
CLeglImageKHR eglimage,
cl_mem_flags flags,
const cl_egl_image_properties_khr* properties,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef cl_int (CL_API_CALL *
clEnqueueAcquireEGLObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_0;
typedef cl_int (CL_API_CALL *
clEnqueueReleaseEGLObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_0;
#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromEGLImageKHR(
cl_context context,
CLeglDisplayKHR egldisplay,
CLeglImageKHR eglimage,
cl_mem_flags flags,
const cl_egl_image_properties_khr* properties,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireEGLObjectsKHR(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseEGLObjectsKHR(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_0;
#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */
/***************************************************************
* cl_khr_egl_event
***************************************************************/
#define cl_khr_egl_event 1
#define CL_KHR_EGL_EVENT_EXTENSION_NAME \
"cl_khr_egl_event"
/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */
/* type CLeglDisplayKHR */
/* CLeglSyncKHR is an opaque handle to an EGLSync object */
typedef void* CLeglSyncKHR;
typedef cl_event (CL_API_CALL *
clCreateEventFromEGLSyncKHR_fn)(
cl_context context,
CLeglSyncKHR sync,
CLeglDisplayKHR display,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0;
#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)
extern CL_API_ENTRY cl_event CL_API_CALL
clCreateEventFromEGLSyncKHR(
cl_context context,
CLeglSyncKHR sync,
CLeglDisplayKHR display,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0;
#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */
#ifdef __cplusplus
}
#endif
#endif /* OPENCL_CL_EGL_H_ */

3223
Source/EmberCL/CL/cl_ext.h
File diff suppressed because it is too large.

Source/EmberCL/CL/cl_ext_intel.h
@@ -0,0 +1,19 @@
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
#include <CL/cl_ext.h>
#pragma message("The Intel extensions have been moved into cl_ext.h. Please include cl_ext.h directly.")

372
Source/EmberCL/CL/cl_gl.h
@@ -0,0 +1,372 @@
/*******************************************************************************
* Copyright (c) 2008-2023 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef OPENCL_CL_GL_H_
#define OPENCL_CL_GL_H_
/*
** This header is generated from the Khronos OpenCL XML API Registry.
*/
#include <CL/cl.h>
/* CL_NO_PROTOTYPES implies CL_NO_EXTENSION_PROTOTYPES: */
#if defined(CL_NO_PROTOTYPES) && !defined(CL_NO_EXTENSION_PROTOTYPES)
#define CL_NO_EXTENSION_PROTOTYPES
#endif
/* CL_NO_EXTENSION_PROTOTYPES implies
CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES and
CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES: */
#if defined(CL_NO_EXTENSION_PROTOTYPES) && \
!defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)
#define CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES
#endif
#if defined(CL_NO_EXTENSION_PROTOTYPES) && \
!defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)
#define CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES
#endif
#ifdef __cplusplus
extern "C" {
#endif
/***************************************************************
* cl_khr_gl_sharing
***************************************************************/
#define cl_khr_gl_sharing 1
#define CL_KHR_GL_SHARING_EXTENSION_NAME \
"cl_khr_gl_sharing"
typedef cl_uint cl_gl_context_info;
/* Error codes */
#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000
/* cl_gl_context_info */
#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006
#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007
/* Additional cl_context_properties */
#define CL_GL_CONTEXT_KHR 0x2008
#define CL_EGL_DISPLAY_KHR 0x2009
#define CL_GLX_DISPLAY_KHR 0x200A
#define CL_WGL_HDC_KHR 0x200B
#define CL_CGL_SHAREGROUP_KHR 0x200C
typedef cl_uint cl_gl_object_type;
typedef cl_uint cl_gl_texture_info;
typedef cl_uint cl_gl_platform_info;
/* cl_gl_object_type */
#define CL_GL_OBJECT_BUFFER 0x2000
#define CL_GL_OBJECT_TEXTURE2D 0x2001
#define CL_GL_OBJECT_TEXTURE3D 0x2002
#define CL_GL_OBJECT_RENDERBUFFER 0x2003
#if defined(CL_VERSION_1_2)
/* cl_gl_object_type */
#define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E
#define CL_GL_OBJECT_TEXTURE1D 0x200F
#define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010
#define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011
#endif /* defined(CL_VERSION_1_2) */
/* cl_gl_texture_info */
#define CL_GL_TEXTURE_TARGET 0x2004
#define CL_GL_MIPMAP_LEVEL 0x2005
typedef cl_int (CL_API_CALL *
clGetGLContextInfoKHR_fn)(
const cl_context_properties* properties,
cl_gl_context_info param_name,
size_t param_value_size,
void* param_value,
size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
typedef cl_mem (CL_API_CALL *
clCreateFromGLBuffer_fn)(
cl_context context,
cl_mem_flags flags,
cl_GLuint bufobj,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0;
#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLContextInfoKHR(
const cl_context_properties* properties,
cl_gl_context_info param_name,
size_t param_value_size,
void* param_value,
size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLBuffer(
cl_context context,
cl_mem_flags flags,
cl_GLuint bufobj,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0;
#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */
#if defined(CL_VERSION_1_2)
typedef cl_mem (CL_API_CALL *
clCreateFromGLTexture_fn)(
cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;
#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLTexture(
cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;
#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */
#endif /* defined(CL_VERSION_1_2) */
typedef cl_mem (CL_API_CALL *
clCreateFromGLRenderbuffer_fn)(
cl_context context,
cl_mem_flags flags,
cl_GLuint renderbuffer,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef cl_int (CL_API_CALL *
clGetGLObjectInfo_fn)(
cl_mem memobj,
cl_gl_object_type* gl_object_type,
cl_GLuint* gl_object_name) CL_API_SUFFIX__VERSION_1_0;
typedef cl_int (CL_API_CALL *
clGetGLTextureInfo_fn)(
cl_mem memobj,
cl_gl_texture_info param_name,
size_t param_value_size,
void* param_value,
size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
typedef cl_int (CL_API_CALL *
clEnqueueAcquireGLObjects_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_0;
typedef cl_int (CL_API_CALL *
clEnqueueReleaseGLObjects_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_0;
#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLRenderbuffer(
cl_context context,
cl_mem_flags flags,
cl_GLuint renderbuffer,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLObjectInfo(
cl_mem memobj,
cl_gl_object_type* gl_object_type,
cl_GLuint* gl_object_name) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLTextureInfo(
cl_mem memobj,
cl_gl_texture_info param_name,
size_t param_value_size,
void* param_value,
size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireGLObjects(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseGLObjects(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_0;
#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */
/* OpenCL 1.0 APIs that were deprecated in OpenCL 1.2 */
typedef cl_mem (CL_API_CALL *
clCreateFromGLTexture2D_fn)(
cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;
typedef cl_mem (CL_API_CALL *
clCreateFromGLTexture3D_fn)(
cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;
#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLTexture2D(
cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLTexture3D(
cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1_DEPRECATED;
#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */
/***************************************************************
* cl_khr_gl_event
***************************************************************/
#define cl_khr_gl_event 1
#define CL_KHR_GL_EVENT_EXTENSION_NAME \
"cl_khr_gl_event"
typedef struct __GLsync * cl_GLsync;
/* cl_command_type */
#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D
typedef cl_event (CL_API_CALL *
clCreateEventFromGLsyncKHR_fn)(
cl_context context,
cl_GLsync sync,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1;
#if !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)
extern CL_API_ENTRY cl_event CL_API_CALL
clCreateEventFromGLsyncKHR(
cl_context context,
cl_GLsync sync,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_1;
#endif /* !defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES) */
/***************************************************************
* cl_khr_gl_depth_images
***************************************************************/
#define cl_khr_gl_depth_images 1
#define CL_KHR_GL_DEPTH_IMAGES_EXTENSION_NAME \
"cl_khr_gl_depth_images"
#if !defined(CL_VERSION_1_2)
/* cl_channel_order - defined in CL.h for OpenCL 1.2 and newer */
#define CL_DEPTH_STENCIL 0x10BE
#endif /* !defined(CL_VERSION_1_2) */
#if !defined(CL_VERSION_1_2)
/* cl_channel_type - defined in CL.h for OpenCL 1.2 and newer */
#define CL_UNORM_INT24 0x10DF
#endif /* !defined(CL_VERSION_1_2) */
/***************************************************************
* cl_khr_gl_msaa_sharing
***************************************************************/
#define cl_khr_gl_msaa_sharing 1
#define CL_KHR_GL_MSAA_SHARING_EXTENSION_NAME \
"cl_khr_gl_msaa_sharing"
/* cl_gl_texture_info */
#define CL_GL_NUM_SAMPLES 0x2012
/***************************************************************
* cl_intel_sharing_format_query_gl
***************************************************************/
#define cl_intel_sharing_format_query_gl 1
#define CL_INTEL_SHARING_FORMAT_QUERY_GL_EXTENSION_NAME \
"cl_intel_sharing_format_query_gl"
/* when cl_khr_gl_sharing is supported */
typedef cl_int (CL_API_CALL *
clGetSupportedGLTextureFormatsINTEL_fn)(
cl_context context,
cl_mem_flags flags,
cl_mem_object_type image_type,
cl_uint num_entries,
cl_GLenum* gl_formats,
cl_uint* num_texture_formats) ;
#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)
extern CL_API_ENTRY cl_int CL_API_CALL
clGetSupportedGLTextureFormatsINTEL(
cl_context context,
cl_mem_flags flags,
cl_mem_object_type image_type,
cl_uint num_entries,
cl_GLenum* gl_formats,
cl_uint* num_texture_formats) ;
#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */
#ifdef __cplusplus
}
#endif
#endif /* OPENCL_CL_GL_H_ */

Source/EmberCL/CL/cl_gl_ext.h
@@ -0,0 +1,18 @@
/*******************************************************************************
* Copyright (c) 2008-2021 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#include <CL/cl_gl.h>
#pragma message("The extensions in cl_gl_ext.h have been moved into cl_gl.h. Please include cl_gl.h directly.")

440
Source/EmberCL/CL/cl_half.h
@@ -0,0 +1,440 @@
/*******************************************************************************
* Copyright (c) 2019-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
/**
* This is a header-only utility library that provides OpenCL host code with
* routines for converting to/from cl_half values.
*
* Example usage:
*
* #include <CL/cl_half.h>
* ...
* cl_half h = cl_half_from_float(0.5f, CL_HALF_RTE);
* cl_float f = cl_half_to_float(h);
*/
#ifndef OPENCL_CL_HALF_H
#define OPENCL_CL_HALF_H
#include <CL/cl_platform.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
/**
* Rounding mode used when converting to cl_half.
*/
typedef enum
{
CL_HALF_RTE, // round to nearest even
CL_HALF_RTZ, // round towards zero
CL_HALF_RTP, // round towards positive infinity
CL_HALF_RTN, // round towards negative infinity
} cl_half_rounding_mode;
/* Private utility macros. */
#define CL_HALF_EXP_MASK 0x7C00
#define CL_HALF_MAX_FINITE_MAG 0x7BFF
/*
* Utility to deal with values that overflow when converting to half precision.
*/
static inline cl_half cl_half_handle_overflow(cl_half_rounding_mode rounding_mode,
uint16_t sign)
{
if (rounding_mode == CL_HALF_RTZ)
{
// Round overflow towards zero -> largest finite number (preserving sign)
return (sign << 15) | CL_HALF_MAX_FINITE_MAG;
}
else if (rounding_mode == CL_HALF_RTP && sign)
{
// Round negative overflow towards positive infinity -> most negative finite number
return (1 << 15) | CL_HALF_MAX_FINITE_MAG;
}
else if (rounding_mode == CL_HALF_RTN && !sign)
{
// Round positive overflow towards negative infinity -> largest finite number
return CL_HALF_MAX_FINITE_MAG;
}
// Overflow to infinity
return (sign << 15) | CL_HALF_EXP_MASK;
}
/*
* Utility to deal with values that underflow when converting to half precision.
*/
static inline cl_half cl_half_handle_underflow(cl_half_rounding_mode rounding_mode,
uint16_t sign)
{
if (rounding_mode == CL_HALF_RTP && !sign)
{
// Round underflow towards positive infinity -> smallest positive value
return (sign << 15) | 1;
}
else if (rounding_mode == CL_HALF_RTN && sign)
{
// Round underflow towards negative infinity -> largest negative value
return (sign << 15) | 1;
}
// Flush to zero
return (sign << 15);
}
/**
* Convert a cl_float to a cl_half.
*/
static inline cl_half cl_half_from_float(cl_float f, cl_half_rounding_mode rounding_mode)
{
// Type-punning to get direct access to underlying bits
union
{
cl_float f;
uint32_t i;
} f32;
f32.f = f;
// Extract sign bit
uint16_t sign = f32.i >> 31;
// Extract FP32 exponent and mantissa
uint32_t f_exp = (f32.i >> (CL_FLT_MANT_DIG - 1)) & 0xFF;
uint32_t f_mant = f32.i & ((1 << (CL_FLT_MANT_DIG - 1)) - 1);
// Remove FP32 exponent bias
int32_t exp = f_exp - CL_FLT_MAX_EXP + 1;
// Add FP16 exponent bias
uint16_t h_exp = (uint16_t)(exp + CL_HALF_MAX_EXP - 1);
// Position of the bit that will become the FP16 mantissa LSB
uint32_t lsb_pos = CL_FLT_MANT_DIG - CL_HALF_MANT_DIG;
// Check for NaN / infinity
if (f_exp == 0xFF)
{
if (f_mant)
{
// NaN -> propagate mantissa and silence it
uint16_t h_mant = (uint16_t)(f_mant >> lsb_pos);
h_mant |= 0x200;
return (sign << 15) | CL_HALF_EXP_MASK | h_mant;
}
else
{
// Infinity -> zero mantissa
return (sign << 15) | CL_HALF_EXP_MASK;
}
}
// Check for zero
if (!f_exp && !f_mant)
{
return (sign << 15);
}
// Check for overflow
if (exp >= CL_HALF_MAX_EXP)
{
return cl_half_handle_overflow(rounding_mode, sign);
}
// Check for underflow
if (exp < (CL_HALF_MIN_EXP - CL_HALF_MANT_DIG - 1))
{
return cl_half_handle_underflow(rounding_mode, sign);
}
// Check for value that will become denormal
if (exp < -14)
{
// Denormal -> include the implicit 1 from the FP32 mantissa
h_exp = 0;
f_mant |= 1 << (CL_FLT_MANT_DIG - 1);
// Mantissa shift amount depends on exponent
lsb_pos = -exp + (CL_FLT_MANT_DIG - 25);
}
// Generate FP16 mantissa by shifting FP32 mantissa
uint16_t h_mant = (uint16_t)(f_mant >> lsb_pos);
// Check whether we need to round
uint32_t halfway = 1 << (lsb_pos - 1);
uint32_t mask = (halfway << 1) - 1;
switch (rounding_mode)
{
case CL_HALF_RTE:
if ((f_mant & mask) > halfway)
{
// More than halfway -> round up
h_mant += 1;
}
else if ((f_mant & mask) == halfway)
{
// Exactly halfway -> round to nearest even
if (h_mant & 0x1)
h_mant += 1;
}
break;
case CL_HALF_RTZ:
// Mantissa has already been truncated -> do nothing
break;
case CL_HALF_RTP:
if ((f_mant & mask) && !sign)
{
// Round positive numbers up
h_mant += 1;
}
break;
case CL_HALF_RTN:
if ((f_mant & mask) && sign)
{
// Round negative numbers down
h_mant += 1;
}
break;
}
// Check for mantissa overflow
if (h_mant & 0x400)
{
h_exp += 1;
h_mant = 0;
}
return (sign << 15) | (h_exp << 10) | h_mant;
}
/**
* Convert a cl_double to a cl_half.
*/
static inline cl_half cl_half_from_double(cl_double d, cl_half_rounding_mode rounding_mode)
{
// Type-punning to get direct access to underlying bits
union
{
cl_double d;
uint64_t i;
} f64;
f64.d = d;
// Extract sign bit
uint16_t sign = f64.i >> 63;
// Extract FP64 exponent and mantissa
uint64_t d_exp = (f64.i >> (CL_DBL_MANT_DIG - 1)) & 0x7FF;
uint64_t d_mant = f64.i & (((uint64_t)1 << (CL_DBL_MANT_DIG - 1)) - 1);
// Remove FP64 exponent bias
int64_t exp = d_exp - CL_DBL_MAX_EXP + 1;
// Add FP16 exponent bias
uint16_t h_exp = (uint16_t)(exp + CL_HALF_MAX_EXP - 1);
// Position of the bit that will become the FP16 mantissa LSB
uint32_t lsb_pos = CL_DBL_MANT_DIG - CL_HALF_MANT_DIG;
// Check for NaN / infinity
if (d_exp == 0x7FF)
{
if (d_mant)
{
// NaN -> propagate mantissa and silence it
uint16_t h_mant = (uint16_t)(d_mant >> lsb_pos);
h_mant |= 0x200;
return (sign << 15) | CL_HALF_EXP_MASK | h_mant;
}
else
{
// Infinity -> zero mantissa
return (sign << 15) | CL_HALF_EXP_MASK;
}
}
// Check for zero
if (!d_exp && !d_mant)
{
return (sign << 15);
}
// Check for overflow
if (exp >= CL_HALF_MAX_EXP)
{
return cl_half_handle_overflow(rounding_mode, sign);
}
// Check for underflow
if (exp < (CL_HALF_MIN_EXP - CL_HALF_MANT_DIG - 1))
{
return cl_half_handle_underflow(rounding_mode, sign);
}
// Check for value that will become denormal
if (exp < -14)
{
// Include the implicit 1 from the FP64 mantissa
h_exp = 0;
d_mant |= (uint64_t)1 << (CL_DBL_MANT_DIG - 1);
// Mantissa shift amount depends on exponent
lsb_pos = (uint32_t)(-exp + (CL_DBL_MANT_DIG - 25));
}
// Generate FP16 mantissa by shifting FP64 mantissa
uint16_t h_mant = (uint16_t)(d_mant >> lsb_pos);
// Check whether we need to round
uint64_t halfway = (uint64_t)1 << (lsb_pos - 1);
uint64_t mask = (halfway << 1) - 1;
switch (rounding_mode)
{
case CL_HALF_RTE:
if ((d_mant & mask) > halfway)
{
// More than halfway -> round up
h_mant += 1;
}
else if ((d_mant & mask) == halfway)
{
// Exactly halfway -> round to nearest even
if (h_mant & 0x1)
h_mant += 1;
}
break;
case CL_HALF_RTZ:
// Mantissa has already been truncated -> do nothing
break;
case CL_HALF_RTP:
if ((d_mant & mask) && !sign)
{
// Round positive numbers up
h_mant += 1;
}
break;
case CL_HALF_RTN:
if ((d_mant & mask) && sign)
{
// Round negative numbers down
h_mant += 1;
}
break;
}
// Check for mantissa overflow
if (h_mant & 0x400)
{
h_exp += 1;
h_mant = 0;
}
return (sign << 15) | (h_exp << 10) | h_mant;
}
/**
* Convert a cl_half to a cl_float.
*/
static inline cl_float cl_half_to_float(cl_half h)
{
// Type-punning to get direct access to underlying bits
union
{
cl_float f;
uint32_t i;
} f32;
// Extract sign bit
uint16_t sign = h >> 15;
// Extract FP16 exponent and mantissa
uint16_t h_exp = (h >> (CL_HALF_MANT_DIG - 1)) & 0x1F;
uint16_t h_mant = h & 0x3FF;
// Remove FP16 exponent bias
int32_t exp = h_exp - CL_HALF_MAX_EXP + 1;
// Add FP32 exponent bias
uint32_t f_exp = exp + CL_FLT_MAX_EXP - 1;
// Check for NaN / infinity
if (h_exp == 0x1F)
{
if (h_mant)
{
// NaN -> propagate mantissa and silence it
uint32_t f_mant = h_mant << (CL_FLT_MANT_DIG - CL_HALF_MANT_DIG);
f_mant |= 0x400000;
f32.i = (sign << 31) | 0x7F800000 | f_mant;
return f32.f;
}
else
{
// Infinity -> zero mantissa
f32.i = (sign << 31) | 0x7F800000;
return f32.f;
}
}
// Check for zero / denormal
if (h_exp == 0)
{
if (h_mant == 0)
{
// Zero -> zero exponent
f_exp = 0;
}
else
{
// Denormal -> normalize it
// - Shift mantissa to make most-significant 1 implicit
// - Adjust exponent accordingly
uint32_t shift = 0;
while ((h_mant & 0x400) == 0)
{
h_mant <<= 1;
shift++;
}
h_mant &= 0x3FF;
f_exp -= shift - 1;
}
}
f32.i = (sign << 31) | (f_exp << 23) | (h_mant << 13);
return f32.f;
}
#undef CL_HALF_EXP_MASK
#undef CL_HALF_MAX_FINITE_MAG
#ifdef __cplusplus
}
#endif
#endif /* OPENCL_CL_HALF_H */

1294
Source/EmberCL/CL/cl_icd.h
File diff suppressed because it is too large.

Source/EmberCL/CL/cl_layer.h
@@ -0,0 +1,124 @@
/*******************************************************************************
* Copyright (c) 2008-2023 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef OPENCL_CL_LAYER_H_
#define OPENCL_CL_LAYER_H_
/*
** This header is generated from the Khronos OpenCL XML API Registry.
*/
#include <CL/cl_icd.h>
#include <CL/cl.h>
/* CL_NO_PROTOTYPES implies CL_NO_EXTENSION_PROTOTYPES: */
#if defined(CL_NO_PROTOTYPES) && !defined(CL_NO_EXTENSION_PROTOTYPES)
#define CL_NO_EXTENSION_PROTOTYPES
#endif
/* CL_NO_EXTENSION_PROTOTYPES implies
CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES and
CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES: */
#if defined(CL_NO_EXTENSION_PROTOTYPES) && \
!defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)
#define CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES
#endif
#if defined(CL_NO_EXTENSION_PROTOTYPES) && \
!defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)
#define CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES
#endif
#ifdef __cplusplus
extern "C" {
#endif
/***************************************************************
* cl_loader_layers
***************************************************************/
#define cl_loader_layers 1
#define CL_LOADER_LAYERS_EXTENSION_NAME \
"cl_loader_layers"
typedef cl_uint cl_layer_info;
typedef cl_uint cl_layer_api_version;
/* cl_layer_info */
#define CL_LAYER_API_VERSION 0x4240
#define CL_LAYER_NAME 0x4241
/* Misc API enums */
#define CL_LAYER_API_VERSION_100 100
typedef cl_int (CL_API_CALL *
clGetLayerInfo_fn)(
cl_layer_info param_name,
size_t param_value_size,
void* param_value,
size_t* param_value_size_ret) ;
typedef cl_int (CL_API_CALL *
clInitLayer_fn)(
cl_uint num_entries,
const cl_icd_dispatch* target_dispatch,
cl_uint* num_entries_ret,
const cl_icd_dispatch** layer_dispatch_ret) ;
/*
** The function pointer typedefs prefixed with "pfn_" are provided for
** compatibility with earlier versions of the headers. New code is
** encouraged to use the function pointer typedefs that are suffixed with
** "_fn" instead, for consistency.
*/
typedef cl_int (CL_API_CALL *
pfn_clGetLayerInfo)(
cl_layer_info param_name,
size_t param_value_size,
void* param_value,
size_t* param_value_size_ret) ;
typedef cl_int (CL_API_CALL *
pfn_clInitLayer)(
cl_uint num_entries,
const cl_icd_dispatch* target_dispatch,
cl_uint* num_entries_ret,
const cl_icd_dispatch** layer_dispatch_ret) ;
#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)
extern CL_API_ENTRY cl_int CL_API_CALL
clGetLayerInfo(
cl_layer_info param_name,
size_t param_value_size,
void* param_value,
size_t* param_value_size_ret) ;
extern CL_API_ENTRY cl_int CL_API_CALL
clInitLayer(
cl_uint num_entries,
const cl_icd_dispatch* target_dispatch,
cl_uint* num_entries_ret,
const cl_icd_dispatch** layer_dispatch_ret) ;
#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */
#ifdef __cplusplus
}
#endif
#endif /* OPENCL_CL_LAYER_H_ */

File diff suppressed because it is too large.

Source/EmberCL/CL/cl_va_api_media_sharing_intel.h
@@ -0,0 +1,199 @@
/*******************************************************************************
* Copyright (c) 2008-2023 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H_
#define OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H_
/*
** This header is generated from the Khronos OpenCL XML API Registry.
*/
#include <va/va.h>
#include <CL/cl.h>
/* CL_NO_PROTOTYPES implies CL_NO_EXTENSION_PROTOTYPES: */
#if defined(CL_NO_PROTOTYPES) && !defined(CL_NO_EXTENSION_PROTOTYPES)
#define CL_NO_EXTENSION_PROTOTYPES
#endif
/* CL_NO_EXTENSION_PROTOTYPES implies
CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES and
CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES: */
#if defined(CL_NO_EXTENSION_PROTOTYPES) && \
!defined(CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES)
#define CL_NO_ICD_DISPATCH_EXTENSION_PROTOTYPES
#endif
#if defined(CL_NO_EXTENSION_PROTOTYPES) && \
!defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)
#define CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES
#endif
#ifdef __cplusplus
extern "C" {
#endif
/***************************************************************
* cl_intel_sharing_format_query_va_api
***************************************************************/
#define cl_intel_sharing_format_query_va_api 1
#define CL_INTEL_SHARING_FORMAT_QUERY_VA_API_EXTENSION_NAME \
"cl_intel_sharing_format_query_va_api"
/* when cl_intel_va_api_media_sharing is supported */
typedef cl_int (CL_API_CALL *
clGetSupportedVA_APIMediaSurfaceFormatsINTEL_fn)(
cl_context context,
cl_mem_flags flags,
cl_mem_object_type image_type,
cl_uint plane,
cl_uint num_entries,
VAImageFormat* va_api_formats,
cl_uint* num_surface_formats) ;
#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)
extern CL_API_ENTRY cl_int CL_API_CALL
clGetSupportedVA_APIMediaSurfaceFormatsINTEL(
cl_context context,
cl_mem_flags flags,
cl_mem_object_type image_type,
cl_uint plane,
cl_uint num_entries,
VAImageFormat* va_api_formats,
cl_uint* num_surface_formats) ;
#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */
/***************************************************************
* cl_intel_va_api_media_sharing
***************************************************************/
#define cl_intel_va_api_media_sharing 1
#define CL_INTEL_VA_API_MEDIA_SHARING_EXTENSION_NAME \
"cl_intel_va_api_media_sharing"
typedef cl_uint cl_va_api_device_source_intel;
typedef cl_uint cl_va_api_device_set_intel;
/* Error codes */
#define CL_INVALID_VA_API_MEDIA_ADAPTER_INTEL -1098
#define CL_INVALID_VA_API_MEDIA_SURFACE_INTEL -1099
#define CL_VA_API_MEDIA_SURFACE_ALREADY_ACQUIRED_INTEL -1100
#define CL_VA_API_MEDIA_SURFACE_NOT_ACQUIRED_INTEL -1101
/* cl_va_api_device_source_intel */
#define CL_VA_API_DISPLAY_INTEL 0x4094
/* cl_va_api_device_set_intel */
#define CL_PREFERRED_DEVICES_FOR_VA_API_INTEL 0x4095
#define CL_ALL_DEVICES_FOR_VA_API_INTEL 0x4096
/* cl_context_info */
#define CL_CONTEXT_VA_API_DISPLAY_INTEL 0x4097
/* cl_mem_info */
#define CL_MEM_VA_API_MEDIA_SURFACE_INTEL 0x4098
/* cl_image_info */
#define CL_IMAGE_VA_API_PLANE_INTEL 0x4099
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_VA_API_MEDIA_SURFACES_INTEL 0x409A
#define CL_COMMAND_RELEASE_VA_API_MEDIA_SURFACES_INTEL 0x409B
typedef cl_int (CL_API_CALL *
clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn)(
cl_platform_id platform,
cl_va_api_device_source_intel media_adapter_type,
void* media_adapter,
cl_va_api_device_set_intel media_adapter_set,
cl_uint num_entries,
cl_device_id* devices,
cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_2;
typedef cl_mem (CL_API_CALL *
clCreateFromVA_APIMediaSurfaceINTEL_fn)(
cl_context context,
cl_mem_flags flags,
VASurfaceID* surface,
cl_uint plane,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef cl_int (CL_API_CALL *
clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_2;
typedef cl_int (CL_API_CALL *
clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_2;
#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceIDsFromVA_APIMediaAdapterINTEL(
cl_platform_id platform,
cl_va_api_device_source_intel media_adapter_type,
void* media_adapter,
cl_va_api_device_set_intel media_adapter_set,
cl_uint num_entries,
cl_device_id* devices,
cl_uint* num_devices) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromVA_APIMediaSurfaceINTEL(
cl_context context,
cl_mem_flags flags,
VASurfaceID* surface,
cl_uint plane,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireVA_APIMediaSurfacesINTEL(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseVA_APIMediaSurfacesINTEL(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_API_SUFFIX__VERSION_1_2;
#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */
#ifdef __cplusplus
}
#endif
#endif /* OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H_ */
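A rough sketch (not part of the commit) of the usual call order for the Intel VA-API sharing entry points above. The platform, VA display, context, queue, and libva surface are assumed to already exist; only the wrapping, acquire, and release steps are shown.

#include <CL/cl.h>
#include <CL/cl_va_api_media_sharing_intel.h>

//Wrap plane 0 of an existing libva surface, acquire it for OpenCL use, then release it.
static cl_int ShareVaSurface(cl_platform_id platform, VADisplay vaDisplay,
                             cl_context context, cl_command_queue queue, VASurfaceID vaSurface)
{
    cl_int err = CL_SUCCESS;
    cl_device_id device = nullptr;
    cl_uint numDevices = 0;
    //Find the OpenCL device backing the VA-API display.
    err = clGetDeviceIDsFromVA_APIMediaAdapterINTEL(platform, CL_VA_API_DISPLAY_INTEL, vaDisplay,
            CL_PREFERRED_DEVICES_FOR_VA_API_INTEL, 1, &device, &numDevices);
    if (err != CL_SUCCESS) return err;
    //Expose the surface plane as a cl_mem image.
    cl_mem image = clCreateFromVA_APIMediaSurfaceINTEL(context, CL_MEM_READ_ONLY, &vaSurface, 0, &err);
    if (err != CL_SUCCESS) return err;
    //Acquire before any kernels touch it, release when they are done.
    err = clEnqueueAcquireVA_APIMediaSurfacesINTEL(queue, 1, &image, 0, nullptr, nullptr);
    //...enqueue kernels that read the surface here...
    clEnqueueReleaseVA_APIMediaSurfacesINTEL(queue, 1, &image, 0, nullptr, nullptr);
    clReleaseMemObject(image);
    return err;
}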


@ -0,0 +1,81 @@
/*******************************************************************************
* Copyright (c) 2018-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __CL_VERSION_H
#define __CL_VERSION_H
/* Detect which version to target */
#if !defined(CL_TARGET_OPENCL_VERSION)
#pragma message("cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 300 (OpenCL 3.0)")
#define CL_TARGET_OPENCL_VERSION 300
#endif
#if CL_TARGET_OPENCL_VERSION != 100 && \
CL_TARGET_OPENCL_VERSION != 110 && \
CL_TARGET_OPENCL_VERSION != 120 && \
CL_TARGET_OPENCL_VERSION != 200 && \
CL_TARGET_OPENCL_VERSION != 210 && \
CL_TARGET_OPENCL_VERSION != 220 && \
CL_TARGET_OPENCL_VERSION != 300
#pragma message("cl_version: CL_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220, 300). Defaulting to 300 (OpenCL 3.0)")
#undef CL_TARGET_OPENCL_VERSION
#define CL_TARGET_OPENCL_VERSION 300
#endif
/* OpenCL Version */
#if CL_TARGET_OPENCL_VERSION >= 300 && !defined(CL_VERSION_3_0)
#define CL_VERSION_3_0 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 220 && !defined(CL_VERSION_2_2)
#define CL_VERSION_2_2 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 210 && !defined(CL_VERSION_2_1)
#define CL_VERSION_2_1 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 200 && !defined(CL_VERSION_2_0)
#define CL_VERSION_2_0 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 120 && !defined(CL_VERSION_1_2)
#define CL_VERSION_1_2 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 110 && !defined(CL_VERSION_1_1)
#define CL_VERSION_1_1 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 100 && !defined(CL_VERSION_1_0)
#define CL_VERSION_1_0 1
#endif
/* Allow deprecated APIs for older OpenCL versions. */
#if CL_TARGET_OPENCL_VERSION <= 220 && !defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS)
#define CL_USE_DEPRECATED_OPENCL_2_2_APIS
#endif
#if CL_TARGET_OPENCL_VERSION <= 210 && !defined(CL_USE_DEPRECATED_OPENCL_2_1_APIS)
#define CL_USE_DEPRECATED_OPENCL_2_1_APIS
#endif
#if CL_TARGET_OPENCL_VERSION <= 200 && !defined(CL_USE_DEPRECATED_OPENCL_2_0_APIS)
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
#endif
#if CL_TARGET_OPENCL_VERSION <= 120 && !defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#endif
#if CL_TARGET_OPENCL_VERSION <= 110 && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
#endif
#if CL_TARGET_OPENCL_VERSION <= 100 && !defined(CL_USE_DEPRECATED_OPENCL_1_0_APIS)
#define CL_USE_DEPRECATED_OPENCL_1_0_APIS
#endif
#endif /* __CL_VERSION_H */
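In practice, the consumer defines the target version before its first OpenCL include so the fallback and pragma message above never fire; the EmberCLPch.h change later in this commit does exactly that for version 300. A minimal stand-alone example of the same pattern:

//Request OpenCL 3.0 symbols and silence the default-version warning above.
#define CL_TARGET_OPENCL_VERSION 300
#include <CL/opencl.h>

int main()
{
    cl_uint numPlatforms = 0;
    clGetPlatformIDs(0, nullptr, &numPlatforms);//Available because CL_VERSION_1_0 ends up defined.
    return numPlatforms ? 0 : 1;
}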


@ -0,0 +1,32 @@
/*******************************************************************************
* Copyright (c) 2008-2021 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_H
#define __OPENCL_H
#ifdef __cplusplus
extern "C" {
#endif
#include <CL/cl.h>
#include <CL/cl_gl.h>
#include <CL/cl_ext.h>
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_H */

11103
Source/EmberCL/CL/opencl.hpp Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large

@ -1,76 +1,76 @@
#pragma once
#include "EmberCLPch.h"
#include "EmberCLStructs.h"
#include "EmberCLFunctions.h"
/// <summary>
/// DEOpenCLKernelCreator class.
/// </summary>
namespace EmberCLns
{
/// <summary>
/// Kernel creator for density filtering.
/// This implements both basic log scale filtering
/// and the full flam3 density estimation filtering
/// in OpenCL.
/// Several conditionals are present in the CPU version. They
/// are stripped out of the kernels and instead a separate kernel
/// is created for every possible case.
/// If the filter width is 9 or less, then the entire process can be
/// done in shared memory which is very fast.
/// However, if the filter width is greater than 9, shared memory is not
/// used and all filtering is done directly with main global VRAM. This
/// ends up being not much faster than doing it on the CPU.
/// String members are kept for the program source and entry points
/// for each version of the program.
/// </summary>
class EMBERCL_API DEOpenCLKernelCreator
{
public:
DEOpenCLKernelCreator();
DEOpenCLKernelCreator(bool doublePrecision, bool nVidia);
//Accessors.
const string& LogScaleAssignDEKernel() const;
const string& LogScaleAssignDEEntryPoint() const;
const string& GaussianDEKernel(size_t ss, uint filterWidth) const;
const string& GaussianDEEntryPoint(size_t ss, uint filterWidth) const;
//Miscellaneous static functions.
static uint MaxDEFilterSize();
static double SolveMaxDERad(double desiredFilterSize, double ss);
static uint SolveMaxBoxSize(uint localMem);
private:
//Kernel creators.
string CreateLogScaleAssignDEKernelString();
string CreateGaussianDEKernel(size_t ss);
string CreateGaussianDEKernelNoLocalCache(size_t ss);
string m_LogScaleAssignDEKernel;
string m_LogScaleAssignDEEntryPoint = "LogScaleAssignDensityFilterKernel";
string m_GaussianDEWithoutSsKernel;
string m_GaussianDEWithoutSsEntryPoint = "GaussianDEWithoutSsKernel";
string m_GaussianDESsWithScfKernel;
string m_GaussianDESsWithScfEntryPoint = "GaussianDESsWithScfKernel";
string m_GaussianDESsWithoutScfKernel;
string m_GaussianDESsWithoutScfEntryPoint = "GaussianDESsWithoutScfKernel";
string m_GaussianDEWithoutSsNoCacheKernel;
string m_GaussianDEWithoutSsNoCacheEntryPoint = "GaussianDEWithoutSsNoCacheKernel";
string m_GaussianDESsWithScfNoCacheKernel;
string m_GaussianDESsWithScfNoCacheEntryPoint = "GaussianDESsWithScfNoCacheKernel";
string m_GaussianDESsWithoutScfNoCacheKernel;
string m_GaussianDESsWithoutScfNoCacheEntryPoint = "GaussianDESsWithoutScfNoCacheKernel";
bool m_DoublePrecision;
bool m_NVidia;
};
}
#pragma once
#include "EmberCLPch.h"
#include "EmberCLStructs.h"
#include "EmberCLFunctions.h"
/// <summary>
/// DEOpenCLKernelCreator class.
/// </summary>
namespace EmberCLns
{
/// <summary>
/// Kernel creator for density filtering.
/// This implements both basic log scale filtering
/// and the full flam3 density estimation filtering
/// in OpenCL.
/// Several conditionals are present in the CPU version. They
/// are stripped out of the kernels and instead a separate kernel
/// is created for every possible case.
/// If the filter width is 9 or less, then the entire process can be
/// done in shared memory which is very fast.
/// However, if the filter width is greater than 9, shared memory is not
/// used and all filtering is done directly with main global VRAM. This
/// ends up being not much faster than doing it on the CPU.
/// String members are kept for the program source and entry points
/// for each version of the program.
/// </summary>
class EMBERCL_API DEOpenCLKernelCreator
{
public:
DEOpenCLKernelCreator();
DEOpenCLKernelCreator(bool doublePrecision, bool nVidia);
//Accessors.
const string& LogScaleAssignDEKernel() const;
const string& LogScaleAssignDEEntryPoint() const;
const string& GaussianDEKernel(size_t ss, uint filterWidth) const;
const string& GaussianDEEntryPoint(size_t ss, uint filterWidth) const;
//Miscellaneous static functions.
static uint MaxDEFilterSize();
static double SolveMaxDERad(double desiredFilterSize, double ss);
static uint SolveMaxBoxSize(uint localMem);
private:
//Kernel creators.
string CreateLogScaleAssignDEKernelString();
string CreateGaussianDEKernel(size_t ss);
string CreateGaussianDEKernelNoLocalCache(size_t ss);
string m_LogScaleAssignDEKernel;
string m_LogScaleAssignDEEntryPoint = "LogScaleAssignDensityFilterKernel";
string m_GaussianDEWithoutSsKernel;
string m_GaussianDEWithoutSsEntryPoint = "GaussianDEWithoutSsKernel";
string m_GaussianDESsWithScfKernel;
string m_GaussianDESsWithScfEntryPoint = "GaussianDESsWithScfKernel";
string m_GaussianDESsWithoutScfKernel;
string m_GaussianDESsWithoutScfEntryPoint = "GaussianDESsWithoutScfKernel";
string m_GaussianDEWithoutSsNoCacheKernel;
string m_GaussianDEWithoutSsNoCacheEntryPoint = "GaussianDEWithoutSsNoCacheKernel";
string m_GaussianDESsWithScfNoCacheKernel;
string m_GaussianDESsWithScfNoCacheEntryPoint = "GaussianDESsWithScfNoCacheKernel";
string m_GaussianDESsWithoutScfNoCacheKernel;
string m_GaussianDESsWithoutScfNoCacheEntryPoint = "GaussianDESsWithoutScfNoCacheKernel";
bool m_DoublePrecision = false;
bool m_NVidia = false;
};
}
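A hedged usage sketch, not from the repository, showing how the accessors above are meant to be paired: the same supersample and filter width arguments select both the kernel source and its entry point, and the creator hands back the shared-memory or no-cache variant internally, as described in the comment. The header filename and the uint typedef are assumptions taken from the surrounding Ember code.

#include "DEOpenCLKernelCreator.h"//Assumed header name for the class above.
#include <string>
#include <utility>

//Pick the density estimation program source and entry point for one render setup.
//Compiling and launching the kernel is left to the caller.
static std::pair<std::string, std::string> PickDEKernel(size_t ss, uint filterWidth,
        bool doublePrecision, bool nVidia)
{
    EmberCLns::DEOpenCLKernelCreator creator(doublePrecision, nVidia);
    //Small widths resolve to the fast shared-memory kernels, larger ones to the no-cache
    //variants; either way, source and entry point must come from the same pair of calls.
    return { creator.GaussianDEKernel(ss, filterWidth), creator.GaussianDEEntryPoint(ss, filterWidth) };
}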


@ -1,22 +1,22 @@
#include "EmberCLPch.h"
#ifdef _WIN32
/// <summary>
/// Generated by Visual Studio to make the DLL run properly.
/// </summary>
BOOL APIENTRY DllMain( HMODULE hModule,
DWORD ul_reason_for_call,
LPVOID lpReserved
)
{
switch (ul_reason_for_call)
{
case DLL_PROCESS_ATTACH:
case DLL_THREAD_ATTACH:
case DLL_THREAD_DETACH:
case DLL_PROCESS_DETACH:
break;
}
return TRUE;
}
#endif
#include "EmberCLPch.h"
#ifdef _WIN32
/// <summary>
/// Generated by Visual Studio to make the DLL run properly.
/// </summary>
BOOL APIENTRY DllMain( HMODULE hModule,
DWORD ul_reason_for_call,
LPVOID lpReserved
)
{
switch (ul_reason_for_call)
{
case DLL_PROCESS_ATTACH:
case DLL_THREAD_ATTACH:
case DLL_THREAD_DETACH:
case DLL_PROCESS_DETACH:
break;
}
return TRUE;
}
#endif


@ -1,274 +1,274 @@
#pragma once
#include "EmberCLPch.h"
#include "EmberCLStructs.h"
#define USEFMA 1
/// <summary>
/// OpenCL global function strings.
/// </summary>
namespace EmberCLns
{
/// <summary>
/// OpenCL equivalent of Palette::RgbToHsv().
/// </summary>
static constexpr char RgbToHsvFunctionString[] =
//rgb 0 - 1,
//h 0 - 6, s 0 - 1, v 0 - 1
"static inline void RgbToHsv(real4_bucket* rgb, real4_bucket* hsv)\n"
"{\n"
" real_bucket_t max, min, del, rc, gc, bc;\n"
"\n"
//Compute maximum of r, g, b.
" if ((*rgb).x >= (*rgb).y)\n"
" {\n"
" if ((*rgb).x >= (*rgb).z)\n"
" max = (*rgb).x;\n"
" else\n"
" max = (*rgb).z;\n"
" }\n"
" else\n"
" {\n"
" if ((*rgb).y >= (*rgb).z)\n"
" max = (*rgb).y;\n"
" else\n"
" max = (*rgb).z;\n"
" }\n"
"\n"
//Compute minimum of r, g, b.
" if ((*rgb).x <= (*rgb).y)\n"
" {\n"
" if ((*rgb).x <= (*rgb).z)\n"
" min = (*rgb).x;\n"
" else\n"
" min = (*rgb).z;\n"
" }\n"
" else\n"
" {\n"
" if ((*rgb).y <= (*rgb).z)\n"
" min = (*rgb).y;\n"
" else\n"
" min = (*rgb).z;\n"
" }\n"
"\n"
" del = max - min;\n"
" (*hsv).z = max;\n"
"\n"
" if (max != 0)\n"
" (*hsv).y = del / max;\n"
" else\n"
" (*hsv).y = 0;\n"
"\n"
" (*hsv).x = 0;\n"
" if ((*hsv).y != 0)\n"
" {\n"
" rc = (max - (*rgb).x) / del;\n"
" gc = (max - (*rgb).y) / del;\n"
" bc = (max - (*rgb).z) / del;\n"
"\n"
" if ((*rgb).x == max)\n"
" (*hsv).x = bc - gc;\n"
" else if ((*rgb).y == max)\n"
" (*hsv).x = 2 + rc - bc;\n"
" else if ((*rgb).z == max)\n"
" (*hsv).x = 4 + gc - rc;\n"
"\n"
" if ((*hsv).x < 0)\n"
" (*hsv).x += 6;\n"
" }\n"
"}\n"
"\n";
/// <summary>
/// OpenCL equivalent of Palette::HsvToRgb().
/// </summary>
static constexpr char HsvToRgbFunctionString[] =
//h 0 - 6, s 0 - 1, v 0 - 1
//rgb 0 - 1
"static inline void HsvToRgb(real4_bucket* hsv, real4_bucket* rgb)\n"
"{\n"
" int j;\n"
" real_bucket_t f, p, q, t;\n"
"\n"
" while ((*hsv).x >= 6)\n"
" (*hsv).x = (*hsv).x - 6;\n"
"\n"
" while ((*hsv).x < 0)\n"
" (*hsv).x = (*hsv).x + 6;\n"
"\n"
" j = (int)floor((*hsv).x);\n"
" f = (*hsv).x - j;\n"
" p = (*hsv).z * (1 - (*hsv).y);\n"
" q = (*hsv).z * (1 - ((*hsv).y * f));\n"
" t = (*hsv).z * (1 - ((*hsv).y * (1 - f)));\n"
"\n"
" switch (j)\n"
" {\n"
" case 0: (*rgb).x = (*hsv).z; (*rgb).y = t; (*rgb).z = p; break;\n"
" case 1: (*rgb).x = q; (*rgb).y = (*hsv).z; (*rgb).z = p; break;\n"
" case 2: (*rgb).x = p; (*rgb).y = (*hsv).z; (*rgb).z = t; break;\n"
" case 3: (*rgb).x = p; (*rgb).y = q; (*rgb).z = (*hsv).z; break;\n"
" case 4: (*rgb).x = t; (*rgb).y = p; (*rgb).z = (*hsv).z; break;\n"
" case 5: (*rgb).x = (*hsv).z; (*rgb).y = p; (*rgb).z = q; break;\n"
" default: (*rgb).x = (*hsv).z; (*rgb).y = t; (*rgb).z = p; break;\n"
" }\n"
"}\n"
"\n";
/// <summary>
/// OpenCL equivalent of Palette::CalcAlpha().
/// </summary>
static constexpr char CalcAlphaFunctionString[] =
"static inline real_t CalcAlpha(real_bucket_t density, real_bucket_t gamma, real_bucket_t linrange)\n"//Not the slightest clue what this is doing.//DOC
"{\n"
" real_bucket_t frac, alpha, funcval = pow(linrange, gamma);\n"
"\n"
" if (density > 0)\n"
" {\n"
" if (density < linrange)\n"
" {\n"
" frac = density / linrange;\n"
" alpha = (1.0 - frac) * density * (funcval / linrange) + frac * pow(density, gamma);\n"
" }\n"
" else\n"
" alpha = pow(density, gamma);\n"
" }\n"
" else\n"
" alpha = 0;\n"
"\n"
" return alpha;\n"
"}\n"
"\n";
/// <summary>
/// OpenCL equivalent of Renderer::CurveAdjust().
/// Only use float here instead of real_t because the output will be passed to write_imagef()
/// during final accumulation, which only takes floats.
/// </summary>
static constexpr char CurveAdjustFunctionString[] =
"static inline void CurveAdjust(__global real4reals_bucket* csa, float* a, uint index)\n"
"{\n"
" uint tempIndex = (uint)clamp(*a * CURVES_LENGTH_M1, 0.0f, CURVES_LENGTH_M1);\n"
" uint tempIndex2 = (uint)clamp(csa[tempIndex].m_Real4.x * CURVES_LENGTH_M1, 0.0f, CURVES_LENGTH_M1);\n"
"\n"
" *a = (float)csa[tempIndex2].m_Reals[index];\n"
"}\n"
"\n";
/// <summary>
/// Use MWC 64 from David Thomas at the Imperial College of London for
/// random numbers in OpenCL, instead of ISAAC which was used
/// for CPU rendering.
/// </summary>
static constexpr char RandFunctionString[] =
"enum { MWC64X_A = 4294883355u };\n\n"
"inline uint MwcNext(uint2* s)\n"
"{\n"
" uint res = (*s).x ^ (*s).y; \n"//Calculate the result.
" uint hi = mul_hi((*s).x, MWC64X_A); \n"//Step the RNG.
" (*s).x = (*s).x * MWC64X_A + (*s).y;\n"//Pack the state back up.
" (*s).y = hi + ((*s).x < (*s).y); \n"
" return res; \n"//Return the next result.
"}\n"
"\n"
"inline uint MwcNextRange(uint2* s, uint val)\n"
"{\n"
" return (val == 0) ? MwcNext(s) : (uint)(((ulong)MwcNext(s) * (ulong)val) >> 32);\n"
"}\n"
"\n"
"inline real_t MwcNext01(uint2* s)\n"
"{\n"
" return MwcNext(s) * (real_t)(1.0 / 4294967296.0);\n"
"}\n"
"\n"
"inline uint MwcNextCrand(uint2* s)\n"
"{\n"
" return MwcNextRange(s, 32767u);\n"
"}\n"
"\n"
"inline real_t MwcNextFRange(uint2* s, real_t lower, real_t upper)\n"
"{\n"
" real_t f = (real_t)MwcNext(s) / (real_t)UINT_MAX;\n"
#ifdef USEFMA
" return fma(f, upper - lower, lower);\n"
#else
" return (f * (upper - lower) + lower);\n"
#endif
"}\n"
"\n"
"inline real_t MwcNextNeg1Pos1(uint2* s)\n"
"{\n"
" real_t f = (real_t)MwcNext(s) / (real_t)UINT_MAX;\n"
#ifdef USEFMA
" return fma(f, (real_t)2.0, (real_t)-1.0);\n"
#else
" return (f * (real_t)2.0 + (real_t)-1.0);\n"
#endif
"}\n"
"\n"
"inline real_t MwcNext0505(uint2* s)\n"
"{\n"
" real_t f = (real_t)MwcNext(s) / (real_t)UINT_MAX;\n"
" return -0.5 + f;\n"
"}\n"
"\n";
/// <summary>
/// OpenCL equivalent of Renderer::AddToAccum().
/// </summary>
static constexpr char AddToAccumWithCheckFunctionString[] =
"inline bool AccumCheck(int superRasW, int superRasH, int i, int ii, int j, int jj)\n"
"{\n"
" return (j + jj >= 0 && j + jj < superRasH && i + ii >= 0 && i + ii < superRasW);\n"
"}\n"
"\n";
/// <summary>
/// OpenCL equivalent of various CarToRas member functions.
/// Normally this would subtract m_RasLlX and m_RasLlY, but they were negated in RendererCL before being passed in, so they could be used with fma().
/// </summary>
static constexpr char CarToRasFunctionString[] =
"inline void CarToRasConvertPointToSingle(__constant CarToRasCL* carToRas, Point* point, uint* singleBufferIndex)\n"
"{\n"
#ifdef USEFMA
" *singleBufferIndex = (uint)fma(carToRas->m_PixPerImageUnitW, point->m_X, carToRas->m_RasLlX) + (carToRas->m_RasWidth * (uint)fma(carToRas->m_PixPerImageUnitH, point->m_Y, carToRas->m_RasLlY));\n"
#else
" *singleBufferIndex = (uint)(carToRas->m_PixPerImageUnitW * point->m_X + carToRas->m_RasLlX) + (carToRas->m_RasWidth * (uint)(carToRas->m_PixPerImageUnitH * point->m_Y + carToRas->m_RasLlY));\n"
#endif
"}\n"
"\n"
"inline bool CarToRasInBounds(__constant CarToRasCL* carToRas, Point* point)\n"
"{\n"
" return point->m_X >= carToRas->m_CarLlX &&\n"
" point->m_X < carToRas->m_CarUrX &&\n"
" point->m_Y < carToRas->m_CarUrY &&\n"
" point->m_Y >= carToRas->m_CarLlY;\n"
"}\n"
"\n";
static constexpr char AtomicString[] =
"void AtomicAdd(volatile __global real_bucket_t* source, const real_bucket_t operand)\n"
"{\n"
" union\n"
" {\n"
" atomi intVal;\n"
" real_bucket_t realVal;\n"
" } newVal;\n"
"\n"
" union\n"
" {\n"
" atomi intVal;\n"
" real_bucket_t realVal;\n"
" } prevVal;\n"
"\n"
" do\n"
" {\n"
" prevVal.realVal = *source;\n"
" newVal.realVal = prevVal.realVal + operand;\n"
" } while (atomic_cmpxchg((volatile __global atomi*)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);\n"
"}\n";
}
#pragma once
#include "EmberCLPch.h"
#include "EmberCLStructs.h"
#define USEFMA 1
/// <summary>
/// OpenCL global function strings.
/// </summary>
namespace EmberCLns
{
/// <summary>
/// OpenCL equivalent of Palette::RgbToHsv().
/// </summary>
static constexpr char RgbToHsvFunctionString[] =
//rgb 0 - 1,
//h 0 - 6, s 0 - 1, v 0 - 1
"static inline void RgbToHsv(real4_bucket* rgb, real4_bucket* hsv)\n"
"{\n"
" real_bucket_t max, min, del, rc, gc, bc;\n"
"\n"
//Compute maximum of r, g, b.
" if ((*rgb).x >= (*rgb).y)\n"
" {\n"
" if ((*rgb).x >= (*rgb).z)\n"
" max = (*rgb).x;\n"
" else\n"
" max = (*rgb).z;\n"
" }\n"
" else\n"
" {\n"
" if ((*rgb).y >= (*rgb).z)\n"
" max = (*rgb).y;\n"
" else\n"
" max = (*rgb).z;\n"
" }\n"
"\n"
//Compute minimum of r, g, b.
" if ((*rgb).x <= (*rgb).y)\n"
" {\n"
" if ((*rgb).x <= (*rgb).z)\n"
" min = (*rgb).x;\n"
" else\n"
" min = (*rgb).z;\n"
" }\n"
" else\n"
" {\n"
" if ((*rgb).y <= (*rgb).z)\n"
" min = (*rgb).y;\n"
" else\n"
" min = (*rgb).z;\n"
" }\n"
"\n"
" del = max - min;\n"
" (*hsv).z = max;\n"
"\n"
" if (max != 0)\n"
" (*hsv).y = del / max;\n"
" else\n"
" (*hsv).y = 0;\n"
"\n"
" (*hsv).x = 0;\n"
" if ((*hsv).y != 0)\n"
" {\n"
" rc = (max - (*rgb).x) / del;\n"
" gc = (max - (*rgb).y) / del;\n"
" bc = (max - (*rgb).z) / del;\n"
"\n"
" if ((*rgb).x == max)\n"
" (*hsv).x = bc - gc;\n"
" else if ((*rgb).y == max)\n"
" (*hsv).x = 2 + rc - bc;\n"
" else if ((*rgb).z == max)\n"
" (*hsv).x = 4 + gc - rc;\n"
"\n"
" if ((*hsv).x < 0)\n"
" (*hsv).x += 6;\n"
" }\n"
"}\n"
"\n";
/// <summary>
/// OpenCL equivalent of Palette::HsvToRgb().
/// </summary>
static constexpr char HsvToRgbFunctionString[] =
//h 0 - 6, s 0 - 1, v 0 - 1
//rgb 0 - 1
"static inline void HsvToRgb(real4_bucket* hsv, real4_bucket* rgb)\n"
"{\n"
" int j;\n"
" real_bucket_t f, p, q, t;\n"
"\n"
" while ((*hsv).x >= 6)\n"
" (*hsv).x = (*hsv).x - 6;\n"
"\n"
" while ((*hsv).x < 0)\n"
" (*hsv).x = (*hsv).x + 6;\n"
"\n"
" j = (int)floor((*hsv).x);\n"
" f = (*hsv).x - j;\n"
" p = (*hsv).z * (1 - (*hsv).y);\n"
" q = (*hsv).z * (1 - ((*hsv).y * f));\n"
" t = (*hsv).z * (1 - ((*hsv).y * (1 - f)));\n"
"\n"
" switch (j)\n"
" {\n"
" case 0: (*rgb).x = (*hsv).z; (*rgb).y = t; (*rgb).z = p; break;\n"
" case 1: (*rgb).x = q; (*rgb).y = (*hsv).z; (*rgb).z = p; break;\n"
" case 2: (*rgb).x = p; (*rgb).y = (*hsv).z; (*rgb).z = t; break;\n"
" case 3: (*rgb).x = p; (*rgb).y = q; (*rgb).z = (*hsv).z; break;\n"
" case 4: (*rgb).x = t; (*rgb).y = p; (*rgb).z = (*hsv).z; break;\n"
" case 5: (*rgb).x = (*hsv).z; (*rgb).y = p; (*rgb).z = q; break;\n"
" default: (*rgb).x = (*hsv).z; (*rgb).y = t; (*rgb).z = p; break;\n"
" }\n"
"}\n"
"\n";
/// <summary>
/// OpenCL equivalent of Palette::CalcAlpha().
/// </summary>
static constexpr char CalcAlphaFunctionString[] =
"static inline real_t CalcAlpha(real_bucket_t density, real_bucket_t gamma, real_bucket_t linrange)\n"//Not the slightest clue what this is doing.//DOC
"{\n"
" real_bucket_t frac, alpha, funcval = pow(linrange, gamma);\n"
"\n"
" if (density > 0)\n"
" {\n"
" if (density < linrange)\n"
" {\n"
" frac = density / linrange;\n"
" alpha = (1.0 - frac) * density * (funcval / linrange) + frac * pow(density, gamma);\n"
" }\n"
" else\n"
" alpha = pow(density, gamma);\n"
" }\n"
" else\n"
" alpha = 0;\n"
"\n"
" return alpha;\n"
"}\n"
"\n";
/// <summary>
/// OpenCL equivalent of Renderer::CurveAdjust().
/// Only use float here instead of real_t because the output will be passed to write_imagef()
/// during final accumulation, which only takes floats.
/// </summary>
static constexpr char CurveAdjustFunctionString[] =
"static inline void CurveAdjust(__global real4reals_bucket* csa, float* a, uint index)\n"
"{\n"
" uint tempIndex = (uint)clamp(*a * CURVES_LENGTH_M1, 0.0f, CURVES_LENGTH_M1);\n"
" uint tempIndex2 = (uint)clamp(csa[tempIndex].m_Real4.x * CURVES_LENGTH_M1, 0.0f, CURVES_LENGTH_M1);\n"
"\n"
" *a = (float)csa[tempIndex2].m_Reals[index];\n"
"}\n"
"\n";
/// <summary>
/// Use MWC 64 from David Thomas at the Imperial College of London for
/// random numbers in OpenCL, instead of ISAAC which was used
/// for CPU rendering.
/// </summary>
static constexpr char RandFunctionString[] =
"enum { MWC64X_A = 4294883355u };\n\n"
"inline uint MwcNext(uint2* s)\n"
"{\n"
" uint res = (*s).x ^ (*s).y; \n"//Calculate the result.
" uint hi = mul_hi((*s).x, MWC64X_A); \n"//Step the RNG.
" (*s).x = (*s).x * MWC64X_A + (*s).y;\n"//Pack the state back up.
" (*s).y = hi + ((*s).x < (*s).y); \n"
" return res; \n"//Return the next result.
"}\n"
"\n"
"inline uint MwcNextRange(uint2* s, uint val)\n"
"{\n"
" return (val == 0) ? MwcNext(s) : (uint)(((ulong)MwcNext(s) * (ulong)val) >> 32);\n"
"}\n"
"\n"
"inline real_t MwcNext01(uint2* s)\n"
"{\n"
" return MwcNext(s) * (real_t)(1.0 / 4294967296.0);\n"
"}\n"
"\n"
"inline uint MwcNextCrand(uint2* s)\n"
"{\n"
" return MwcNextRange(s, 32767u);\n"
"}\n"
"\n"
"inline real_t MwcNextFRange(uint2* s, real_t lower, real_t upper)\n"
"{\n"
" real_t f = (real_t)MwcNext(s) / (real_t)UINT_MAX;\n"
#ifdef USEFMA
" return fma(f, upper - lower, lower);\n"
#else
" return (f * (upper - lower) + lower);\n"
#endif
"}\n"
"\n"
"inline real_t MwcNextNeg1Pos1(uint2* s)\n"
"{\n"
" real_t f = (real_t)MwcNext(s) / (real_t)UINT_MAX;\n"
#ifdef USEFMA
" return fma(f, (real_t)2.0, (real_t)-1.0);\n"
#else
" return (f * (real_t)2.0 + (real_t)-1.0);\n"
#endif
"}\n"
"\n"
"inline real_t MwcNext0505(uint2* s)\n"
"{\n"
" real_t f = (real_t)MwcNext(s) / (real_t)UINT_MAX;\n"
" return -0.5 + f;\n"
"}\n"
"\n";
/// <summary>
/// OpenCL equivalent of Renderer::AddToAccum().
/// </summary>
static constexpr char AddToAccumWithCheckFunctionString[] =
"inline bool AccumCheck(int superRasW, int superRasH, int i, int ii, int j, int jj)\n"
"{\n"
" return (j + jj >= 0 && j + jj < superRasH && i + ii >= 0 && i + ii < superRasW);\n"
"}\n"
"\n";
/// <summary>
/// OpenCL equivalent of various CarToRas member functions.
/// Normally this would subtract m_RasLlX and m_RasLlY, but they were negated in RendererCL before being passed in, so they could be used with fma().
/// </summary>
static constexpr char CarToRasFunctionString[] =
"inline void CarToRasConvertPointToSingle(__constant CarToRasCL* carToRas, Point* point, uint* singleBufferIndex)\n"
"{\n"
#ifdef USEFMA
" *singleBufferIndex = (uint)fma(carToRas->m_PixPerImageUnitW, point->m_X, carToRas->m_RasLlX) + (carToRas->m_RasWidth * (uint)fma(carToRas->m_PixPerImageUnitH, point->m_Y, carToRas->m_RasLlY));\n"
#else
" *singleBufferIndex = (uint)(carToRas->m_PixPerImageUnitW * point->m_X + carToRas->m_RasLlX) + (carToRas->m_RasWidth * (uint)(carToRas->m_PixPerImageUnitH * point->m_Y + carToRas->m_RasLlY));\n"
#endif
"}\n"
"\n"
"inline bool CarToRasInBounds(__constant CarToRasCL* carToRas, Point* point)\n"
"{\n"
" return point->m_X >= carToRas->m_CarLlX &&\n"
" point->m_X < carToRas->m_CarUrX &&\n"
" point->m_Y < carToRas->m_CarUrY &&\n"
" point->m_Y >= carToRas->m_CarLlY;\n"
"}\n"
"\n";
static constexpr char AtomicString[] =
"void AtomicAdd(volatile __global real_bucket_t* source, const real_bucket_t operand)\n"
"{\n"
" union\n"
" {\n"
" atomi intVal;\n"
" real_bucket_t realVal;\n"
" } newVal;\n"
"\n"
" union\n"
" {\n"
" atomi intVal;\n"
" real_bucket_t realVal;\n"
" } prevVal;\n"
"\n"
" do\n"
" {\n"
" prevVal.realVal = *source;\n"
" newVal.realVal = prevVal.realVal + operand;\n"
" } while (atomic_cmpxchg((volatile __global atomi*)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);\n"
"}\n";
}
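These fragments only do something once they are concatenated into a single program source and compiled. Below is a rough sketch of that step using the opencl.hpp wrapper this commit switches to; the header filename is assumed, the kernel body is a placeholder, and the renderer's real assembly order and build options are more involved.

#include "EmberCLFunctions.h"//Assumed filename for the header above; it pulls in EmberCLPch.h and EmberCLStructs.h.
#include <iostream>
#include <string>

//Concatenate the shared fragments into one source string, append a kernel, and build it.
static cl::Program BuildFragments(const cl::Context& context, const cl::Device& device)
{
    std::string src = EmberCLns::ConstantDefinesString(false);//Typedefs and constants must come first.
    src += EmberCLns::RandFunctionString;
    src += EmberCLns::AtomicString;
    src += "__kernel void Noop() { }\n";//Placeholder; the real iteration and filtering kernels are assembled elsewhere.
    cl::Program program(context, src);

    if (program.build({ device }) != CL_SUCCESS)
        std::cerr << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device) << "\n";

    return program;
}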


@ -1,66 +1,73 @@
#ifdef _WIN32
#pragma once
#endif
/// <summary>
/// Precompiled header file. Place all system includes here with appropriate #defines for different operating systems and compilers.
/// </summary>
#define NOMINMAX
#define WIN32_LEAN_AND_MEAN//Exclude rarely-used stuff from Windows headers.
#define _USE_MATH_DEFINES
//#define CL_USE_DEPRECATED_OPENCL_1_2_APIS 1
#include "Timing.h"
#include "Renderer.h"
#if defined(_WIN32)
#pragma warning(disable : 4251; disable : 4661; disable : 4100)
#include <windows.h>
#include <SDKDDKVer.h>
#include "GL/gl.h"
#elif defined(__APPLE__)
#include <OpenGL/gl.h>
#else
#include "GL/glx.h"
#endif
#include <utility>
#include <CL/cl.hpp>
#include <algorithm>
#include <atomic>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <set>
#include <string>
#include <iterator>
#include <time.h>
#include <unordered_map>
#ifdef _WIN32
#if defined(BUILDING_EMBERCL)
#define EMBERCL_API __declspec(dllexport)
#else
#define EMBERCL_API __declspec(dllimport)
#endif
#else
#define EMBERCL_API
#endif
using namespace std;
using namespace EmberNs;
//#define TEST_CL 1
//#define TEST_CL_BUFFERS 1
//This special define is made to fix buggy OpenCL compilers on Mac.
//Rendering is much slower there for unknown reasons. Michel traced it down
//to the consec variable which keeps track of how many tries are needed to compute
//a point which is not a bad value. Strangely, keeping this as a local variable
//is slower than keeping it as an element in a global array.
//This is counterintuitive, and lends further weight to the idea that OpenCL on Mac
//is horribly broken.
#ifdef __APPLE__
#define KNL_USE_GLOBAL_CONSEC
#endif
#ifdef _WIN32
#pragma once
#endif
/// <summary>
/// Precompiled header file. Place all system includes here with appropriate #defines for different operating systems and compilers.
/// </summary>
#define NOMINMAX
#define WIN32_LEAN_AND_MEAN//Exclude rarely-used stuff from Windows headers.
#define _USE_MATH_DEFINES
//#define CL_USE_DEPRECATED_OPENCL_1_2_APIS 1
//#define CL_USE_DEPRECATED_OPENCL_2_0_APIS 1
//For reasons unknown, QtCreator cannot use any value higher than 120 with these, because
//doing so causes errors when compiling opencl.hpp. This happens even though QtCreator uses MSVC under the hood,
//and the same code compiles in MSVC when building with Visual Studio.
#define CL_TARGET_OPENCL_VERSION 300
#define CL_HPP_TARGET_OPENCL_VERSION 300
#define CL_HPP_MINIMUM_OPENCL_VERSION 300
#include "Timing.h"
#include "Renderer.h"
#if defined(_WIN32)
#pragma warning(disable : 4251; disable : 4661; disable : 4100)
#include <windows.h>
#include <SDKDDKVer.h>
#include "GL/gl.h"
#elif defined(__APPLE__)
#include <OpenGL/gl.h>
#else
#include "GL/glx.h"
#endif
#include <utility>
#include <CL/opencl.hpp>
#include <algorithm>
#include <atomic>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <set>
#include <string>
#include <iterator>
#include <time.h>
#include <unordered_map>
#ifdef _WIN32
#if defined(BUILDING_EMBERCL)
#define EMBERCL_API __declspec(dllexport)
#else
#define EMBERCL_API __declspec(dllimport)
#endif
#else
#define EMBERCL_API
#endif
using namespace std;
using namespace EmberNs;
//#define TEST_CL 1
//#define TEST_CL_BUFFERS 1
//This special define is made to fix buggy OpenCL compilers on Mac.
//Rendering is much slower there for unknown reasons. Michel traced it down
//to the consec variable which keeps track of how many tries are needed to compute
//a point which is not a bad value. Strangely, keeping this as a local variable
//is slower than keeping it as an element in a global array.
//This is counterintuitive, and lends further weight to the idea that OpenCL on Mac
//is horribly broken.
#ifdef __APPLE__
#define KNL_USE_GLOBAL_CONSEC
#endif


@ -1,405 +1,405 @@
#pragma once
#include "EmberCLPch.h"
/// <summary>
/// Various data structures defined for the CPU and OpenCL.
/// These are stripped-down versions of the classes in Ember, for use with OpenCL.
/// Their sole purpose is to pass values from the host to the device.
/// They retain most of the member variables, but do not contain the functions.
/// Visual Studio defaults to alignment of 16, but it's made explicit in case another compiler is used.
/// This must match the alignment specified in the kernel.
/// </summary>
namespace EmberCLns
{
/// <summary>
/// Various constants needed for rendering.
/// </summary>
static string ConstantDefinesString(bool doublePrecision)
{
ostringstream os;
os << "#if defined(cl_amd_fp64)\n"//AMD extension available?
" #pragma OPENCL EXTENSION cl_amd_fp64 : enable\n"
"#endif\n"
"#if defined(cl_khr_fp64)\n"//Khronos extension available?
" #pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"#endif\n"
"#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n";//Only supported on nVidia.
if (doublePrecision)
{
os <<
"typedef long intPrec;\n"
"typedef uint atomi;\n"//Same size as real_bucket_t, always 4 bytes.
"typedef double real_t;\n"
"typedef float real_bucket_t;\n"//Assume buckets are always float, even though iter calcs are in double.
"typedef double2 real2;\n"
"typedef double3 real3;\n"
"typedef double4 real4;\n"
"typedef float4 real4_bucket;\n"//And here too.
"#define EPS (DBL_EPSILON)\n"
"#define TLOW (DBL_MIN)\n"
"#define TMAX (DBL_MAX)\n"
;
}
else
{
os << "typedef int intPrec;\n"
"typedef uint atomi;\n"
"typedef float real_t;\n"
"typedef float real_bucket_t;\n"
"typedef float2 real2;\n"
"typedef float3 real3;\n"
"typedef float4 real4;\n"
"typedef float4 real4_bucket;\n"
"#define EPS (FLT_EPSILON)\n"
"#define TLOW (FLT_MIN)\n"
"#define TMAX (FLT_MAX)\n"
;
}
os <<
"typedef long int int64;\n"
"typedef unsigned long int uint64;\n"
"\n"
"#define EPS6 ((1e-6))\n"
"\n"
"//The number of threads per block used in the iteration function. Don't change\n"
"//it lightly; the block size is hard coded to be exactly 32 x 8.\n"
"#define NTHREADS 256u\n"
"#define THREADS_PER_WARP 32u\n"
"#define NWARPS (NTHREADS / THREADS_PER_WARP)\n"
"#define DE_THRESH 100u\n"
"#define BadVal(x) (isnan(x))\n"
"#define SQR(x) ((x) * (x))\n"
"#define CUBE(x) ((x) * (x) * (x))\n"
"#define MPI ((real_t)M_PI)\n"
"#define MPI2 ((real_t)M_PI_2)\n"
"#define MPI4 ((real_t)M_PI_4)\n"
"#define M1PI ((real_t)M_1_PI)\n"
"#define M2PI ((real_t)M_2_PI)\n"
"#define M_2PI (MPI * 2)\n"
"#define M_3PI (MPI * 3)\n"
"#define M_SQRT3 ((real_t)(1.7320508075688772935274463415059))\n"
"#define M_SQRT3_2 ((real_t)(0.86602540378443864676372317075294))\n"
"#define M_SQRT3_3 ((real_t)(0.57735026918962576450914878050196))\n"
"#define M_SQRT5 ((real_t)(2.2360679774997896964091736687313))\n"
"#define M_PHI ((real_t)(1.61803398874989484820458683436563))\n"
"#define M_1_2PI ((real_t)(0.15915494309189533576888376337251))\n"
"#define M_PI3 ((real_t)(1.0471975511965977461542144610932))\n"
"#define M_PI6 ((real_t)(0.52359877559829887307710723054658))\n"
"#define DEG_2_RAD (MPI / 180)\n"
"#define CURVES_LENGTH_M1 ((real_bucket_t)" << CURVES_LENGTH_M1 << ")\n" <<
"#define ONE_OVER_CURVES_LENGTH_M1 ((real_bucket_t)" << ONE_OVER_CURVES_LENGTH_M1 << ")\n" <<
"\n"
"//Index in each dimension of a thread within a block.\n"
"#define THREAD_ID_X (get_local_id(0))\n"
"#define THREAD_ID_Y (get_local_id(1))\n"
"#define THREAD_ID_Z (get_local_id(2))\n"
"\n"
"//Index in each dimension of a block within a grid.\n"
"#define BLOCK_ID_X (get_group_id(0))\n"
"#define BLOCK_ID_Y (get_group_id(1))\n"
"#define BLOCK_ID_Z (get_group_id(2))\n"
"\n"
"//Absolute index in each dimension of a thread within a grid.\n"
"#define GLOBAL_ID_X (get_global_id(0))\n"
"#define GLOBAL_ID_Y (get_global_id(1))\n"
"#define GLOBAL_ID_Z (get_global_id(2))\n"
"\n"
"//Dimensions of a block.\n"
"#define BLOCK_SIZE_X (get_local_size(0))\n"
"#define BLOCK_SIZE_Y (get_local_size(1))\n"
"#define BLOCK_SIZE_Z (get_local_size(2))\n"
"\n"
"//Dimensions of a grid, in terms of blocks.\n"
"#define GRID_SIZE_X (get_num_groups(0))\n"
"#define GRID_SIZE_Y (get_num_groups(1))\n"
"#define GRID_SIZE_Z (get_num_groups(2))\n"
"\n"
"//Dimensions of a grid, in terms of threads.\n"
"#define GLOBAL_SIZE_X (get_global_size(0))\n"
"#define GLOBAL_SIZE_Y (get_global_size(1))\n"
"#define GLOBAL_SIZE_Z (get_global_size(2))\n"
"\n"
"#define INDEX_IN_BLOCK_2D (THREAD_ID_Y * BLOCK_SIZE_X + THREAD_ID_X)\n"
"#define INDEX_IN_BLOCK_3D ((BLOCK_SIZE_X * BLOCK_SIZE_Y * THREAD_ID_Z) + INDEX_IN_BLOCK_2D)\n"
"\n"
"#define INDEX_IN_GRID_2D (GLOBAL_ID_Y * GLOBAL_SIZE_X + GLOBAL_ID_X)\n"
"#define INDEX_IN_GRID_3D ((GLOBAL_SIZE_X * GLOBAL_SIZE_Y * GLOBAL_ID_Z) + INDEX_IN_GRID_2D)\n"
"\n"
"#define BLOCK_START_INDEX_IN_GRID_2D ((BLOCK_ID_Y * GRID_SIZE_X * BLOCK_SIZE_Y * BLOCK_SIZE_X) + (BLOCK_ID_X * BLOCK_SIZE_X * BLOCK_SIZE_Y))\n"
"\n";
return os.str();
}
/// <summary>
/// A point structure on the host that maps to the one used on the device to iterate in OpenCL.
/// It might seem better to use vec4; however, 2D palettes and even 3D coordinates may eventually
/// be supported, which would require more than 4 members.
/// </summary>
template <typename T>
struct ALIGN PointCL
{
T m_X;
T m_Y;
T m_Z;
T m_ColorX;
uint m_LastXfUsed;
};
/// <summary>
/// The point structure used to iterate in OpenCL.
/// It might seem better to use float4; however, 2D palettes and even 3D coordinates may eventually
/// be supported, which would require more than 4 members.
/// </summary>
static constexpr char PointCLStructString[] =
"typedef struct __attribute__ " ALIGN_CL " _Point\n"
"{\n"
" real_t m_X;\n"
" real_t m_Y;\n"
" real_t m_Z;\n"
" real_t m_ColorX;\n"
" uint m_LastXfUsed;\n"
"} Point;\n"
"\n";
/// <summary>
/// A structure on the host used to hold all of the needed information for an xform used on the device to iterate in OpenCL.
/// Template argument expected to be float or double.
/// </summary>
template <typename T>
struct ALIGN XformCL
{
T m_A, m_B, m_C, m_D, m_E, m_F;//24 (48)
T m_PostA, m_PostB, m_PostC, m_PostD, m_PostE, m_PostF;//48 (96)
T m_DirectColor;//52 (104)
T m_ColorSpeedCache;//56 (112)
T m_OneMinusColorCache;//60 (120)
T m_Opacity;//64 (128)
};
/// <summary>
/// The xform structure used to iterate in OpenCL.
/// </summary>
static constexpr char XformCLStructString[] =
"typedef struct __attribute__ " ALIGN_CL " _XformCL\n"
"{\n"
" real_t m_A, m_B, m_C, m_D, m_E, m_F;\n"
" real_t m_PostA, m_PostB, m_PostC, m_PostD, m_PostE, m_PostF;\n"
" real_t m_DirectColor;\n"
" real_t m_ColorSpeedCache;\n"
" real_t m_OneMinusColorCache;\n"
" real_t m_Opacity;\n"
"} XformCL;\n"
"\n";
/// <summary>
/// A structure on the host used to hold all of the needed information for an ember used on the device to iterate in OpenCL.
/// Template argument expected to be float or double.
/// </summary>
template <typename T>
struct ALIGN EmberCL
{
T m_RandPointRange;
T m_CamZPos;
T m_CamPerspective;
T m_CamYaw;
T m_CamPitch;
T m_BlurCurve;
T m_CamDepthBlur;
T m_BlurCoef;
m3T m_CamMat;
T m_CenterX, m_CenterY;
T m_RotA, m_RotB, m_RotD, m_RotE;
T m_Psm1;
T m_Psm2;
};
/// <summary>
/// The ember structure used to iterate in OpenCL.
/// </summary>
static constexpr char EmberCLStructString[] =
"typedef struct __attribute__ " ALIGN_CL " _EmberCL\n"
"{\n"
" real_t m_RandPointRange;\n"
" real_t m_CamZPos;\n"
" real_t m_CamPerspective;\n"
" real_t m_CamYaw;\n"
" real_t m_CamPitch;\n"
" real_t m_BlurCurve;\n"
" real_t m_CamDepthBlur;\n"
" real_t m_BlurCoef;\n"
" real_t m_C00;\n"
" real_t m_C01;\n"
" real_t m_C02;\n"
" real_t m_C10;\n"
" real_t m_C11;\n"
" real_t m_C12;\n"
" real_t m_C20;\n"
" real_t m_C21;\n"
" real_t m_C22;\n"
" real_t m_CenterX, m_CenterY;\n"
" real_t m_RotA, m_RotB, m_RotD, m_RotE;\n"
" real_t m_Psm1;\n"
" real_t m_Psm2;\n"
"} EmberCL;\n"
"\n";
/// <summary>
/// A structure on the host used to hold all of the needed information for cartesian to raster mapping used on the device to iterate in OpenCL.
/// Template argument expected to be float or double.
/// </summary>
template <typename T>
struct ALIGN CarToRasCL
{
T m_PixPerImageUnitW, m_RasLlX;
uint m_RasWidth;
T m_PixPerImageUnitH, m_RasLlY;
T m_CarLlX, m_CarUrX, m_CarUrY, m_CarLlY;
T m_CarHalfX, m_CarHalfY, m_CarCenterX, m_CarCenterY;
};
/// <summary>
/// The cartesian to raster structure used to iterate in OpenCL.
/// </summary>
static constexpr char CarToRasCLStructString[] =
"typedef struct __attribute__ " ALIGN_CL " _CarToRasCL\n"
"{\n"
" real_t m_PixPerImageUnitW, m_RasLlX;\n"
" uint m_RasWidth;\n"
" real_t m_PixPerImageUnitH, m_RasLlY;\n"
" real_t m_CarLlX, m_CarUrX, m_CarUrY, m_CarLlY;\n"
" real_t m_CarHalfX, m_CarHalfY, m_CarCenterX, m_CarCenterY;\n"
"} CarToRasCL;\n"
"\n";
/// <summary>
/// A structure on the host used to hold all of the needed information for density filtering used on the device to iterate in OpenCL.
/// Note that the actual filter buffer is held elsewhere.
/// Template argument expected to be float or double.
/// </summary>
template <typename T>
struct ALIGN DensityFilterCL
{
T m_Curve;
T m_K1;
T m_K2;
uint m_Supersample;
uint m_SuperRasW;
uint m_SuperRasH;
uint m_KernelSize;
uint m_MaxFilterIndex;
uint m_MaxFilteredCounts;
uint m_FilterWidth;
};
/// <summary>
/// The density filtering structure used to iterate in OpenCL.
/// Note that the actual filter buffer is held elsewhere.
/// </summary>
static constexpr char DensityFilterCLStructString[] =
"typedef struct __attribute__ " ALIGN_CL " _DensityFilterCL\n"
"{\n"
" real_bucket_t m_Curve;\n"
" real_bucket_t m_K1;\n"
" real_bucket_t m_K2;\n"
" uint m_Supersample;\n"
" uint m_SuperRasW;\n"
" uint m_SuperRasH;\n"
" uint m_KernelSize;\n"
" uint m_MaxFilterIndex;\n"
" uint m_MaxFilteredCounts;\n"
" uint m_FilterWidth;\n"
"} DensityFilterCL;\n"
"\n";
/// <summary>
/// A structure on the host used to hold all of the needed information for spatial filtering used on the device to iterate in OpenCL.
/// Note that the actual filter buffer is held elsewhere.
/// </summary>
template <typename T>
struct ALIGN SpatialFilterCL
{
uint m_SuperRasW;
uint m_SuperRasH;
uint m_FinalRasW;
uint m_FinalRasH;
uint m_Supersample;
uint m_FilterWidth;
uint m_DensityFilterOffset;
uint m_YAxisUp;
T m_Vibrancy;
T m_HighlightPower;
T m_Gamma;
T m_LinRange;
Color<T> m_Background;
};
/// <summary>
/// The spatial filtering structure used to iterate in OpenCL.
/// Note that the actual filter buffer is held elsewhere.
/// </summary>
static constexpr char SpatialFilterCLStructString[] =
"typedef struct __attribute__ ((aligned (16))) _SpatialFilterCL\n"
"{\n"
" uint m_SuperRasW;\n"
" uint m_SuperRasH;\n"
" uint m_FinalRasW;\n"
" uint m_FinalRasH;\n"
" uint m_Supersample;\n"
" uint m_FilterWidth;\n"
" uint m_DensityFilterOffset;\n"
" uint m_YAxisUp;\n"
" real_bucket_t m_Vibrancy;\n"
" real_bucket_t m_HighlightPower;\n"
" real_bucket_t m_Gamma;\n"
" real_bucket_t m_LinRange;\n"
" real_bucket_t m_Background[4];\n"//For some reason, using float4/double4 here does not align no matter what. So just use an array of 4.
"} SpatialFilterCL;\n"
"\n";
/// <summary>
/// EmberCL makes extensive use of the built-in vector types; however, accessing
/// their members as a buffer is not natively supported.
/// Declaring them in a union with a buffer resolves this problem.
/// </summary>
static constexpr char UnionCLStructString[] =
"typedef union\n"
"{\n"
" uchar3 m_Uchar3;\n"
" uchar m_Uchars[3];\n"
"} uchar3uchars;\n"
"\n"
"typedef union\n"
"{\n"
" uchar4 m_Uchar4;\n"
" uchar m_Uchars[4];\n"
"} uchar4uchars;\n"
"\n"
"typedef union\n"
"{\n"
" uint4 m_Uint4;\n"
" uint m_Uints[4];\n"
"} uint4uints;\n"
"\n"
"typedef union\n"//Use in places where float is required.
"{\n"
" float4 m_Float4;\n"
" float m_Floats[4];\n"
"} float4floats;\n"
"\n"
"typedef union\n"//Use in places where float or double can be used depending on the template type.
"{\n"
" real4 m_Real4;\n"
" real_t m_Reals[4];\n"
"} real4reals;\n"
"\n"
"typedef union\n"//Used to match the bucket template type.
"{\n"
" real4_bucket m_Real4;\n"
" real_bucket_t m_Reals[4];\n"
"} real4reals_bucket;\n"
"\n";
}
#pragma once
#include "EmberCLPch.h"
/// <summary>
/// Various data structures defined for the CPU and OpenCL.
/// These are stripped-down versions of the classes in Ember, for use with OpenCL.
/// Their sole purpose is to pass values from the host to the device.
/// They retain most of the member variables, but do not contain the functions.
/// Visual Studio defaults to alignment of 16, but it's made explicit in case another compiler is used.
/// This must match the alignment specified in the kernel.
/// </summary>
namespace EmberCLns
{
/// <summary>
/// Various constants needed for rendering.
/// </summary>
static string ConstantDefinesString(bool doublePrecision)
{
ostringstream os;
os << "#if defined(cl_amd_fp64)\n"//AMD extension available?
" #pragma OPENCL EXTENSION cl_amd_fp64 : enable\n"
"#endif\n"
"#if defined(cl_khr_fp64)\n"//Khronos extension available?
" #pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
"#endif\n"
"#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n";//Only supported on nVidia.
if (doublePrecision)
{
os <<
"typedef long intPrec;\n"
"typedef uint atomi;\n"//Same size as real_bucket_t, always 4 bytes.
"typedef double real_t;\n"
"typedef float real_bucket_t;\n"//Assume buckets are always float, even though iter calcs are in double.
"typedef double2 real2;\n"
"typedef double3 real3;\n"
"typedef double4 real4;\n"
"typedef float4 real4_bucket;\n"//And here too.
"#define EPS (DBL_EPSILON)\n"
"#define TLOW (DBL_MIN)\n"
"#define TMAX (DBL_MAX)\n"
;
}
else
{
os << "typedef int intPrec;\n"
"typedef uint atomi;\n"
"typedef float real_t;\n"
"typedef float real_bucket_t;\n"
"typedef float2 real2;\n"
"typedef float3 real3;\n"
"typedef float4 real4;\n"
"typedef float4 real4_bucket;\n"
"#define EPS (FLT_EPSILON)\n"
"#define TLOW (FLT_MIN)\n"
"#define TMAX (FLT_MAX)\n"
;
}
os <<
"typedef long int int64;\n"
"typedef unsigned long int uint64;\n"
"\n"
"#define EPS6 ((1e-6))\n"
"\n"
"//The number of threads per block used in the iteration function. Don't change\n"
"//it lightly; the block size is hard coded to be exactly 32 x 8.\n"
"#define NTHREADS 256u\n"
"#define THREADS_PER_WARP 32u\n"
"#define NWARPS (NTHREADS / THREADS_PER_WARP)\n"
"#define DE_THRESH 100u\n"
"#define BadVal(x) (isnan(x))\n"
"#define SQR(x) ((x) * (x))\n"
"#define CUBE(x) ((x) * (x) * (x))\n"
"#define MPI ((real_t)M_PI)\n"
"#define MPI2 ((real_t)M_PI_2)\n"
"#define MPI4 ((real_t)M_PI_4)\n"
"#define M1PI ((real_t)M_1_PI)\n"
"#define M2PI ((real_t)M_2_PI)\n"
"#define M_2PI (MPI * 2)\n"
"#define M_3PI (MPI * 3)\n"
"#define M_SQRT3 ((real_t)(1.7320508075688772935274463415059))\n"
"#define M_SQRT3_2 ((real_t)(0.86602540378443864676372317075294))\n"
"#define M_SQRT3_3 ((real_t)(0.57735026918962576450914878050196))\n"
"#define M_SQRT5 ((real_t)(2.2360679774997896964091736687313))\n"
"#define M_PHI ((real_t)(1.61803398874989484820458683436563))\n"
"#define M_1_2PI ((real_t)(0.15915494309189533576888376337251))\n"
"#define M_PI3 ((real_t)(1.0471975511965977461542144610932))\n"
"#define M_PI6 ((real_t)(0.52359877559829887307710723054658))\n"
"#define DEG_2_RAD (MPI / 180)\n"
"#define CURVES_LENGTH_M1 ((real_bucket_t)" << CURVES_LENGTH_M1 << ")\n" <<
"#define ONE_OVER_CURVES_LENGTH_M1 ((real_bucket_t)" << ONE_OVER_CURVES_LENGTH_M1 << ")\n" <<
"\n"
"//Index in each dimension of a thread within a block.\n"
"#define THREAD_ID_X (get_local_id(0))\n"
"#define THREAD_ID_Y (get_local_id(1))\n"
"#define THREAD_ID_Z (get_local_id(2))\n"
"\n"
"//Index in each dimension of a block within a grid.\n"
"#define BLOCK_ID_X (get_group_id(0))\n"
"#define BLOCK_ID_Y (get_group_id(1))\n"
"#define BLOCK_ID_Z (get_group_id(2))\n"
"\n"
"//Absolute index in each dimension of a thread within a grid.\n"
"#define GLOBAL_ID_X (get_global_id(0))\n"
"#define GLOBAL_ID_Y (get_global_id(1))\n"
"#define GLOBAL_ID_Z (get_global_id(2))\n"
"\n"
"//Dimensions of a block.\n"
"#define BLOCK_SIZE_X (get_local_size(0))\n"
"#define BLOCK_SIZE_Y (get_local_size(1))\n"
"#define BLOCK_SIZE_Z (get_local_size(2))\n"
"\n"
"//Dimensions of a grid, in terms of blocks.\n"
"#define GRID_SIZE_X (get_num_groups(0))\n"
"#define GRID_SIZE_Y (get_num_groups(1))\n"
"#define GRID_SIZE_Z (get_num_groups(2))\n"
"\n"
"//Dimensions of a grid, in terms of threads.\n"
"#define GLOBAL_SIZE_X (get_global_size(0))\n"
"#define GLOBAL_SIZE_Y (get_global_size(1))\n"
"#define GLOBAL_SIZE_Z (get_global_size(2))\n"
"\n"
"#define INDEX_IN_BLOCK_2D (THREAD_ID_Y * BLOCK_SIZE_X + THREAD_ID_X)\n"
"#define INDEX_IN_BLOCK_3D ((BLOCK_SIZE_X * BLOCK_SIZE_Y * THREAD_ID_Z) + INDEX_IN_BLOCK_2D)\n"
"\n"
"#define INDEX_IN_GRID_2D (GLOBAL_ID_Y * GLOBAL_SIZE_X + GLOBAL_ID_X)\n"
"#define INDEX_IN_GRID_3D ((GLOBAL_SIZE_X * GLOBAL_SIZE_Y * GLOBAL_ID_Z) + INDEX_IN_GRID_2D)\n"
"\n"
"#define BLOCK_START_INDEX_IN_GRID_2D ((BLOCK_ID_Y * GRID_SIZE_X * BLOCK_SIZE_Y * BLOCK_SIZE_X) + (BLOCK_ID_X * BLOCK_SIZE_X * BLOCK_SIZE_Y))\n"
"\n";
return os.str();
}
/// <summary>
/// A point structure on the host that maps to the one used on the device to iterate in OpenCL.
/// It might seem better to use vec4; however, 2D palettes and even 3D coordinates may eventually
/// be supported, which would require more than 4 members.
/// </summary>
template <typename T>
struct ALIGN PointCL
{
T m_X;
T m_Y;
T m_Z;
T m_ColorX;
uint m_LastXfUsed;
};
/// <summary>
/// The point structure used to iterate in OpenCL.
/// It might seem better to use float4; however, 2D palettes and even 3D coordinates may eventually
/// be supported, which would require more than 4 members.
/// </summary>
static constexpr char PointCLStructString[] =
"typedef struct __attribute__ " ALIGN_CL " _Point\n"
"{\n"
" real_t m_X;\n"
" real_t m_Y;\n"
" real_t m_Z;\n"
" real_t m_ColorX;\n"
" uint m_LastXfUsed;\n"
"} Point;\n"
"\n";
/// <summary>
/// A structure on the host used to hold all of the needed information for an xform used on the device to iterate in OpenCL.
/// Template argument expected to be float or double.
/// </summary>
template <typename T>
struct ALIGN XformCL
{
T m_A, m_B, m_C, m_D, m_E, m_F;//24 (48)
T m_PostA, m_PostB, m_PostC, m_PostD, m_PostE, m_PostF;//48 (96)
T m_DirectColor;//52 (104)
T m_ColorSpeedCache;//56 (112)
T m_OneMinusColorCache;//60 (120)
T m_Opacity;//64 (128)
};
/// <summary>
/// The xform structure used to iterate in OpenCL.
/// </summary>
static constexpr char XformCLStructString[] =
"typedef struct __attribute__ " ALIGN_CL " _XformCL\n"
"{\n"
" real_t m_A, m_B, m_C, m_D, m_E, m_F;\n"
" real_t m_PostA, m_PostB, m_PostC, m_PostD, m_PostE, m_PostF;\n"
" real_t m_DirectColor;\n"
" real_t m_ColorSpeedCache;\n"
" real_t m_OneMinusColorCache;\n"
" real_t m_Opacity;\n"
"} XformCL;\n"
"\n";
/// <summary>
/// A structure on the host used to hold all of the needed information for an ember used on the device to iterate in OpenCL.
/// Template argument expected to be float or double.
/// </summary>
template <typename T>
struct ALIGN EmberCL
{
T m_RandPointRange;
T m_CamZPos;
T m_CamPerspective;
T m_CamYaw;
T m_CamPitch;
T m_BlurCurve;
T m_CamDepthBlur;
T m_BlurCoef;
m3T m_CamMat;
T m_CenterX, m_CenterY;
T m_RotA, m_RotB, m_RotD, m_RotE;
T m_Psm1;
T m_Psm2;
};
/// <summary>
/// The ember structure used to iterate in OpenCL.
/// </summary>
static constexpr char EmberCLStructString[] =
"typedef struct __attribute__ " ALIGN_CL " _EmberCL\n"
"{\n"
" real_t m_RandPointRange;\n"
" real_t m_CamZPos;\n"
" real_t m_CamPerspective;\n"
" real_t m_CamYaw;\n"
" real_t m_CamPitch;\n"
" real_t m_BlurCurve;\n"
" real_t m_CamDepthBlur;\n"
" real_t m_BlurCoef;\n"
" real_t m_C00;\n"
" real_t m_C01;\n"
" real_t m_C02;\n"
" real_t m_C10;\n"
" real_t m_C11;\n"
" real_t m_C12;\n"
" real_t m_C20;\n"
" real_t m_C21;\n"
" real_t m_C22;\n"
" real_t m_CenterX, m_CenterY;\n"
" real_t m_RotA, m_RotB, m_RotD, m_RotE;\n"
" real_t m_Psm1;\n"
" real_t m_Psm2;\n"
"} EmberCL;\n"
"\n";
/// <summary>
/// A structure on the host used to hold all of the needed information for cartesian to raster mapping used on the device to iterate in OpenCL.
/// Template argument expected to be float or double.
/// </summary>
template <typename T>
struct ALIGN CarToRasCL
{
T m_PixPerImageUnitW, m_RasLlX;
uint m_RasWidth;
T m_PixPerImageUnitH, m_RasLlY;
T m_CarLlX, m_CarUrX, m_CarUrY, m_CarLlY;
T m_CarHalfX, m_CarHalfY, m_CarCenterX, m_CarCenterY;
};
/// <summary>
/// The cartesian to raster structure used to iterate in OpenCL.
/// </summary>
static constexpr char CarToRasCLStructString[] =
"typedef struct __attribute__ " ALIGN_CL " _CarToRasCL\n"
"{\n"
" real_t m_PixPerImageUnitW, m_RasLlX;\n"
" uint m_RasWidth;\n"
" real_t m_PixPerImageUnitH, m_RasLlY;\n"
" real_t m_CarLlX, m_CarUrX, m_CarUrY, m_CarLlY;\n"
" real_t m_CarHalfX, m_CarHalfY, m_CarCenterX, m_CarCenterY;\n"
"} CarToRasCL;\n"
"\n";
/// <summary>
/// A structure on the host used to hold all of the needed information for density filtering used on the device to iterate in OpenCL.
/// Note that the actual filter buffer is held elsewhere.
/// Template argument expected to be float or double.
/// </summary>
template <typename T>
struct ALIGN DensityFilterCL
{
T m_Curve;
T m_K1;
T m_K2;
uint m_Supersample;
uint m_SuperRasW;
uint m_SuperRasH;
uint m_KernelSize;
uint m_MaxFilterIndex;
uint m_MaxFilteredCounts;
uint m_FilterWidth;
};
/// <summary>
/// The density filtering structure used to iterate in OpenCL.
/// Note that the actual filter buffer is held elsewhere.
/// </summary>
static constexpr char DensityFilterCLStructString[] =
"typedef struct __attribute__ " ALIGN_CL " _DensityFilterCL\n"
"{\n"
" real_bucket_t m_Curve;\n"
" real_bucket_t m_K1;\n"
" real_bucket_t m_K2;\n"
" uint m_Supersample;\n"
" uint m_SuperRasW;\n"
" uint m_SuperRasH;\n"
" uint m_KernelSize;\n"
" uint m_MaxFilterIndex;\n"
" uint m_MaxFilteredCounts;\n"
" uint m_FilterWidth;\n"
"} DensityFilterCL;\n"
"\n";
/// <summary>
/// A structure on the host used to hold all of the needed information for spatial filtering used on the device to iterate in OpenCL.
/// Note that the actual filter buffer is held elsewhere.
/// </summary>
template <typename T>
struct ALIGN SpatialFilterCL
{
uint m_SuperRasW = 0;
uint m_SuperRasH = 0;
uint m_FinalRasW = 0;
uint m_FinalRasH = 0;
uint m_Supersample = 0;
uint m_FilterWidth = 0;
uint m_DensityFilterOffset = 0;
uint m_YAxisUp = 0;
T m_Vibrancy = 0;
T m_HighlightPower = 0;
T m_Gamma = 0;
T m_LinRange = 0;
Color<T> m_Background;
};
/// <summary>
/// The spatial filtering structure used to iterate in OpenCL.
/// Note that the actual filter buffer is held elsewhere.
/// </summary>
static constexpr char SpatialFilterCLStructString[] =
"typedef struct __attribute__ ((aligned (16))) _SpatialFilterCL\n"
"{\n"
" uint m_SuperRasW;\n"
" uint m_SuperRasH;\n"
" uint m_FinalRasW;\n"
" uint m_FinalRasH;\n"
" uint m_Supersample;\n"
" uint m_FilterWidth;\n"
" uint m_DensityFilterOffset;\n"
" uint m_YAxisUp;\n"
" real_bucket_t m_Vibrancy;\n"
" real_bucket_t m_HighlightPower;\n"
" real_bucket_t m_Gamma;\n"
" real_bucket_t m_LinRange;\n"
" real_bucket_t m_Background[4];\n"//For some reason, using float4/double4 here does not align no matter what. So just use an array of 4.
"} SpatialFilterCL;\n"
"\n";
/// <summary>
/// EmberCL makes extensive use of the built-in vector types; however, accessing
/// their members as a buffer is not natively supported.
/// Declaring them in a union with a buffer resolves this problem.
/// </summary>
static constexpr char UnionCLStructString[] =
"typedef union\n"
"{\n"
" uchar3 m_Uchar3;\n"
" uchar m_Uchars[3];\n"
"} uchar3uchars;\n"
"\n"
"typedef union\n"
"{\n"
" uchar4 m_Uchar4;\n"
" uchar m_Uchars[4];\n"
"} uchar4uchars;\n"
"\n"
"typedef union\n"
"{\n"
" uint4 m_Uint4;\n"
" uint m_Uints[4];\n"
"} uint4uints;\n"
"\n"
"typedef union\n"//Use in places where float is required.
"{\n"
" float4 m_Float4;\n"
" float m_Floats[4];\n"
"} float4floats;\n"
"\n"
"typedef union\n"//Use in places where float or double can be used depending on the template type.
"{\n"
" real4 m_Real4;\n"
" real_t m_Reals[4];\n"
"} real4reals;\n"
"\n"
"typedef union\n"//Used to match the bucket template type.
"{\n"
" real4_bucket m_Real4;\n"
" real_bucket_t m_Reals[4];\n"
"} real4reals_bucket;\n"
"\n";
}
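
The pattern throughout this header is that every host-side struct has a string counterpart that is concatenated into the OpenCL program source, so the host and device layouts must be kept in sync by hand. The following standalone sketch is illustrative only; ExampleCL and the ALIGN/ALIGN_CL stand-ins below are hypothetical examples, not code from this commit.

#include <iostream>
#include <sstream>
#include <string>

//Hypothetical stand-ins for the real ALIGN/ALIGN_CL macros used above.
#define ALIGN alignas(16)
#define ALIGN_CL "((aligned (16)))"

//Host-side mirror. Members must match the device typedef below in order and size.
struct ALIGN ExampleCL
{
	float m_K1;
	float m_K2;
	unsigned int m_Width;
	unsigned int m_Height;
};

//Device-side counterpart, appended to the kernel source as plain text.
static const std::string ExampleCLStructString =
	"typedef struct __attribute__ " ALIGN_CL " _ExampleCL\n"
	"{\n"
	"	real_t m_K1;\n"
	"	real_t m_K2;\n"
	"	uint m_Width;\n"
	"	uint m_Height;\n"
	"} ExampleCL;\n";

int main()
{
	std::ostringstream os;
	os << "typedef float real_t;\n"//What a defines block would normally emit for single precision.
	   << ExampleCLStructString
	   << "__kernel void Example(__constant ExampleCL* p) { /* ... */ }\n";
	std::cout << os.str();//Composed program source, ready to hand to the OpenCL compiler.
	return 0;
}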


@ -1,316 +1,316 @@
#include "EmberCLPch.h"
#include "FinalAccumOpenCLKernelCreator.h"
namespace EmberCLns
{
/// <summary>
/// Constructor that creates all kernel strings.
/// The caller will access these strings through the accessor functions.
/// </summary>
FinalAccumOpenCLKernelCreator::FinalAccumOpenCLKernelCreator(bool doublePrecision)
{
m_DoublePrecision = doublePrecision;
m_GammaCorrectionWithoutAlphaCalcKernel = CreateGammaCorrectionKernelString();
m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel = CreateFinalAccumKernelString(true);
m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel = CreateFinalAccumKernelString(false);
}
/// <summary>
/// Kernel source and entry point properties, getters only.
/// </summary>
const string& FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel() const { return m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel; }
const string& FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint() const { return m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint; }
const string& FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel() const { return m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel; }
const string& FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint() const { return m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint; }
const string& FinalAccumOpenCLKernelCreator::GammaCorrectionEntryPoint() const { return m_GammaCorrectionWithoutAlphaCalcEntryPoint; }
const string& FinalAccumOpenCLKernelCreator::GammaCorrectionKernel() const { return m_GammaCorrectionWithoutAlphaCalcKernel; }
/// <summary>
/// Get the final accumulation entry point.
/// </summary>
/// <param name="earlyClip">True if early clip is desired, else false.</param>
/// <returns>The name of the final accumulation entry point kernel function</returns>
const string& FinalAccumOpenCLKernelCreator::FinalAccumEntryPoint(bool earlyClip) const
{
if (earlyClip)
return FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint();
else
return FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint();
}
/// <summary>
/// Get the final accumulation kernel string.
/// </summary>
/// <param name="earlyClip">True if early clip is desired, else false.</param>
/// <returns>The final accumulation kernel string</returns>
const string& FinalAccumOpenCLKernelCreator::FinalAccumKernel(bool earlyClip) const
{
if (earlyClip)
return FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel();
else
return FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel();
}
/// <summary>
/// Create the final accumulation kernel string
/// </summary>
/// <param name="earlyClip">True if early clip is desired, else false.</param>
/// <returns>The final accumulation kernel string</returns>
string FinalAccumOpenCLKernelCreator::CreateFinalAccumKernelString(bool earlyClip)
{
ostringstream os;
os <<
ConstantDefinesString(m_DoublePrecision) <<
UnionCLStructString <<
RgbToHsvFunctionString <<
HsvToRgbFunctionString <<
CalcAlphaFunctionString <<
CurveAdjustFunctionString <<
SpatialFilterCLStructString;
if (earlyClip)
{
os << "__kernel void " << m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint << "(\n";
}
else
{
os <<
CreateCalcNewRgbFunctionString(false) <<
CreateGammaCorrectionFunctionString(false, true) <<
"__kernel void " << m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint << "(\n";
}
os <<
" const __global real4reals_bucket* accumulator,\n"
" __write_only image2d_t pixels,\n"
" __constant SpatialFilterCL* spatialFilter,\n"
" __constant real_bucket_t* filterCoefs,\n"
" __global real4reals_bucket* csa,\n"
" const uint doCurves\n"
"\t)\n"
"{\n"
"\n"
" if ((GLOBAL_ID_Y >= spatialFilter->m_FinalRasH) || (GLOBAL_ID_X >= spatialFilter->m_FinalRasW))\n"
" return;\n"
"\n"
" uint accumX = spatialFilter->m_DensityFilterOffset + (GLOBAL_ID_X * spatialFilter->m_Supersample);\n"
" uint accumY = spatialFilter->m_DensityFilterOffset + (GLOBAL_ID_Y * spatialFilter->m_Supersample);\n"
" uint clampedFilterH = min((uint)spatialFilter->m_FilterWidth, spatialFilter->m_SuperRasH - accumY);"
" uint clampedFilterW = min((uint)spatialFilter->m_FilterWidth, spatialFilter->m_SuperRasW - accumX);"
" int2 finalCoord;\n"
" finalCoord.x = GLOBAL_ID_X;\n"
" finalCoord.y = (int)((spatialFilter->m_YAxisUp == 1) ? ((spatialFilter->m_FinalRasH - GLOBAL_ID_Y) - 1) : GLOBAL_ID_Y);\n"
" float4floats finalColor;\n"
" int ii, jj;\n"
" uint filterKRowIndex;\n"
" const __global real4reals_bucket* accumBucket;\n"
" real4reals_bucket newBucket;\n"
" newBucket.m_Real4 = 0;\n"
"\n"
" for (jj = 0; jj < clampedFilterH; jj++)\n"
" {\n"
" filterKRowIndex = jj * spatialFilter->m_FilterWidth;\n"//Use the full, non-clamped width to get the filter value.
"\n"
" for (ii = 0; ii < clampedFilterW; ii++)\n"
" {\n"
" real_bucket_t k = filterCoefs[filterKRowIndex + ii];\n"
"\n"
" accumBucket = accumulator + ((accumY + jj) * spatialFilter->m_SuperRasW) + (accumX + ii);\n"
" newBucket.m_Real4 += (k * accumBucket->m_Real4);\n"
" }\n"
" }\n"
"\n";
if (earlyClip)//If early clip, simply assign values directly to the temp float4 since they've been gamma corrected already, then write it straight to the output image below.
{
os <<
" finalColor.m_Float4.x = (float)newBucket.m_Real4.x;\n"//CPU side clamps, skip here because write_imagef() does the clamping for us.
" finalColor.m_Float4.y = (float)newBucket.m_Real4.y;\n"
" finalColor.m_Float4.z = (float)newBucket.m_Real4.z;\n"
" finalColor.m_Float4.w = (float)newBucket.m_Real4.w;\n";
}
else
{
//Late clip, so must gamma correct from the temp newBucket to temp finalColor float4.
if (m_DoublePrecision)
{
os <<
" real4reals_bucket realFinal;\n"
"\n"
" GammaCorrectionFloats(&newBucket, &(spatialFilter->m_Background[0]), spatialFilter->m_Gamma, spatialFilter->m_LinRange, spatialFilter->m_Vibrancy, spatialFilter->m_HighlightPower, &(realFinal.m_Reals[0]));\n"
" finalColor.m_Float4.x = (float)realFinal.m_Real4.x;\n"
" finalColor.m_Float4.y = (float)realFinal.m_Real4.y;\n"
" finalColor.m_Float4.z = (float)realFinal.m_Real4.z;\n"
" finalColor.m_Float4.w = (float)realFinal.m_Real4.w;\n"
;
}
else
{
os <<
" GammaCorrectionFloats(&newBucket, &(spatialFilter->m_Background[0]), spatialFilter->m_Gamma, spatialFilter->m_LinRange, spatialFilter->m_Vibrancy, spatialFilter->m_HighlightPower, &(finalColor.m_Floats[0]));\n";
}
}
os <<
"\n"
" if (doCurves)\n"
" {\n"
" CurveAdjust(csa, &(finalColor.m_Floats[0]), 1);\n"
" CurveAdjust(csa, &(finalColor.m_Floats[1]), 2);\n"
" CurveAdjust(csa, &(finalColor.m_Floats[2]), 3);\n"
" }\n"
"\n"
" write_imagef(pixels, finalCoord, finalColor.m_Float4);\n"//Use write_imagef instead of write_imageui because only the former works when sharing with an OpenGL texture.
" barrier(CLK_GLOBAL_MEM_FENCE);\n"//Required, or else page tearing will occur during interactive rendering.
"}\n"
;
return os.str();
}
/// <summary>
/// Creates the gamma correction function string.
/// This is not a full kernel, just a function that is used in the kernels.
/// </summary>
/// <param name="globalBucket">True if writing to a global buffer (early clip), else false (late clip).</param>
/// <param name="finalOut">True if writing to global buffer (late clip), else false (early clip).</param>
/// <returns>The gamma correction function string</returns>
string FinalAccumOpenCLKernelCreator::CreateGammaCorrectionFunctionString(bool globalBucket, bool finalOut)
{
ostringstream os;
string dataType = "real_bucket_t";//Use real_bucket_t for all cases, early clip and final accum.
os << "void GammaCorrectionFloats(" << (globalBucket ? "__global " : "") << "real4reals_bucket* bucket, __constant real_bucket_t* background, real_bucket_t g, real_bucket_t linRange, real_bucket_t vibrancy, real_bucket_t highlightPower, " << (finalOut ? "" : "__global") << " real_bucket_t* correctedChannels)\n";
os << "{\n"
<< " real_bucket_t alpha, ls, tmp, a;\n"
<< " real4reals_bucket newRgb;\n"
<< "\n"
<< " if (bucket->m_Reals[3] <= 0)\n"
<< " {\n"
<< " alpha = 0;\n"
<< " ls = 0;\n"
<< " }\n"
<< " else\n"
<< " {\n"
<< " tmp = bucket->m_Reals[3];\n"
<< " alpha = CalcAlpha(tmp, g, linRange);\n"
<< " ls = vibrancy * alpha / tmp;\n"
<< " alpha = clamp(alpha, (real_bucket_t)0.0, (real_bucket_t)1.0);\n"
<< " }\n"
<< "\n"
<< " CalcNewRgb(bucket, ls, highlightPower, &newRgb);\n"
<< "\n"
<< " for (uint rgbi = 0; rgbi < 3; rgbi++)\n"
<< " {\n"
<< " a = newRgb.m_Reals[rgbi] + ((1.0 - vibrancy) * pow(fabs(bucket->m_Reals[rgbi]), g));\n"
<< " a += ((1.0 - alpha) * background[rgbi]);\n"
<< " correctedChannels[rgbi] = (" << dataType << ")clamp(a, (real_bucket_t)0.0, (real_bucket_t)1.0);\n"
<< " }\n"
<< "\n"
<< " correctedChannels[3] = (" << dataType << ")alpha;\n"
<< "}\n"
<< "\n";
return os.str();
}
/// <summary>
/// OpenCL equivalent of Palette::CalcNewRgb().
/// </summary>
/// <param name="globalBucket">True if writing the corrected value to a global buffer (early clip), else false (late clip).</param>
/// <returns>The CalcNewRgb function string</returns>
string FinalAccumOpenCLKernelCreator::CreateCalcNewRgbFunctionString(bool globalBucket)
{
ostringstream os;
os <<
"static void CalcNewRgb(" << (globalBucket ? "__global " : "") << "real4reals_bucket* oldRgb, real_bucket_t ls, real_bucket_t highPow, real4reals_bucket* newRgb)\n"
"{\n"
" int rgbi;\n"
" real_bucket_t lsratio;\n"
" real4reals_bucket newHsv;\n"
" real_bucket_t maxa, maxc, newls;\n"
" real_bucket_t adjhlp;\n"
"\n"
" if (ls == 0 || (oldRgb->m_Real4.x == 0 && oldRgb->m_Real4.y == 0 && oldRgb->m_Real4.z == 0))\n"//Can't do a vector compare to zero.
" {\n"
" newRgb->m_Real4 = 0;\n"
" return;\n"
" }\n"
"\n"
//Identify the most saturated channel.
" maxc = max(max(oldRgb->m_Reals[0], oldRgb->m_Reals[1]), oldRgb->m_Reals[2]);\n"
" maxa = ls * maxc;\n"
" newls = 1 / maxc;\n"
"\n"
//If a channel is saturated and highlight power is non-negative
//modify the color to prevent hue shift.
" if (maxa > 1 && highPow >= 0)\n"
" {\n"
" lsratio = pow(newls / ls, highPow);\n"
"\n"
//Calculate the max-value color (ranged 0 - 1).
" for (rgbi = 0; rgbi < 3; rgbi++)\n"
" newRgb->m_Reals[rgbi] = newls * oldRgb->m_Reals[rgbi];\n"
"\n"
//Reduce saturation by the lsratio.
" RgbToHsv(&(newRgb->m_Real4), &(newHsv.m_Real4));\n"
" newHsv.m_Real4.y *= lsratio;\n"
" HsvToRgb(&(newHsv.m_Real4), &(newRgb->m_Real4));\n"
" }\n"
" else\n"
" {\n"
" adjhlp = -highPow;\n"
"\n"
" if (adjhlp > 1)\n"
" adjhlp = 1;\n"
"\n"
" if (maxa <= 1)\n"
" adjhlp = 1;\n"
"\n"
//Calculate the max-value color (ranged 0 - 1) interpolated with the old behavior.
" for (rgbi = 0; rgbi < 3; rgbi++)\n"//Unrolling, caching and vectorizing makes no difference.
" newRgb->m_Reals[rgbi] = ((1.0 - adjhlp) * newls + adjhlp * ls) * oldRgb->m_Reals[rgbi];\n"
" }\n"
"}\n"
"\n";
return os.str();
}
/// <summary>
/// Create the gamma correction kernel string used for early clipping.
/// </summary>
/// <returns>The gamma correction kernel string used for early clipping</returns>
string FinalAccumOpenCLKernelCreator::CreateGammaCorrectionKernelString()
{
ostringstream os;
os <<
ConstantDefinesString(m_DoublePrecision) <<
UnionCLStructString <<
RgbToHsvFunctionString <<
HsvToRgbFunctionString <<
CalcAlphaFunctionString <<
CreateCalcNewRgbFunctionString(true) <<
SpatialFilterCLStructString <<
CreateGammaCorrectionFunctionString(true, false);//Will only be used with float in this case, early clip. Will always alpha accum.
os << "__kernel void " << m_GammaCorrectionWithoutAlphaCalcEntryPoint << "(\n" <<
" __global real4reals_bucket* accumulator,\n"
" __constant SpatialFilterCL* spatialFilter\n"
")\n"
"{\n"
" int testGutter = 0;\n"
"\n"
" if (GLOBAL_ID_Y >= (spatialFilter->m_SuperRasH - testGutter) || GLOBAL_ID_X >= (spatialFilter->m_SuperRasW - testGutter))\n"
" return;\n"
"\n"
" uint superIndex = (GLOBAL_ID_Y * spatialFilter->m_SuperRasW) + GLOBAL_ID_X;\n"
" __global real4reals_bucket* bucket = accumulator + superIndex;\n"
" GammaCorrectionFloats(bucket, &(spatialFilter->m_Background[0]), spatialFilter->m_Gamma, spatialFilter->m_LinRange, spatialFilter->m_Vibrancy, spatialFilter->m_HighlightPower, &(bucket->m_Reals[0]));\n"
"}\n"
;
return os.str();
}
}
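
For reference, the per-channel arithmetic that the generated GammaCorrectionFloats() string performs can be written out on the host as follows. This is an illustrative sketch only: GammaCorrectSketch is a hypothetical name, and alpha and the highlight-adjusted newRgb are taken as inputs rather than recomputed via CalcAlpha()/CalcNewRgb().

#include <algorithm>
#include <cmath>

//Mirrors the loop emitted by CreateGammaCorrectionFunctionString():
//blend the highlight-corrected color with a gamma curve of the raw channel,
//composite over the background by (1 - alpha), then clamp to [0, 1].
static void GammaCorrectSketch(const double bucket[4], const double background[3],
	double g, double vibrancy, double alpha,
	const double newRgb[3], double corrected[4])
{
	for (int rgbi = 0; rgbi < 3; rgbi++)
	{
		double a = newRgb[rgbi] + ((1.0 - vibrancy) * std::pow(std::fabs(bucket[rgbi]), g));
		a += ((1.0 - alpha) * background[rgbi]);
		corrected[rgbi] = std::clamp(a, 0.0, 1.0);
	}

	corrected[3] = alpha;
}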
#include "EmberCLPch.h"
#include "FinalAccumOpenCLKernelCreator.h"
namespace EmberCLns
{
/// <summary>
/// Constructor that creates all kernel strings.
/// The caller will access these strings through the accessor functions.
/// </summary>
FinalAccumOpenCLKernelCreator::FinalAccumOpenCLKernelCreator(bool doublePrecision)
{
m_DoublePrecision = doublePrecision;
m_GammaCorrectionWithoutAlphaCalcKernel = CreateGammaCorrectionKernelString();
m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel = CreateFinalAccumKernelString(true);
m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel = CreateFinalAccumKernelString(false);
}
/// <summary>
/// Kernel source and entry point properties, getters only.
/// </summary>
const string& FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel() const { return m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel; }
const string& FinalAccumOpenCLKernelCreator::FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint() const { return m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint; }
const string& FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel() const { return m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel; }
const string& FinalAccumOpenCLKernelCreator::FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint() const { return m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint; }
const string& FinalAccumOpenCLKernelCreator::GammaCorrectionEntryPoint() const { return m_GammaCorrectionWithoutAlphaCalcEntryPoint; }
const string& FinalAccumOpenCLKernelCreator::GammaCorrectionKernel() const { return m_GammaCorrectionWithoutAlphaCalcKernel; }
/// <summary>
/// Get the final accumulation entry point.
/// </summary>
/// <param name="earlyClip">True if early clip is desired, else false.</param>
/// <returns>The name of the final accumulation entry point kernel function</returns>
const string& FinalAccumOpenCLKernelCreator::FinalAccumEntryPoint(bool earlyClip) const
{
if (earlyClip)
return FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint();
else
return FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint();
}
/// <summary>
/// Get the final accumulation kernel string.
/// </summary>
/// <param name="earlyClip">True if early clip is desired, else false.</param>
/// <returns>The final accumulation kernel string</returns>
const string& FinalAccumOpenCLKernelCreator::FinalAccumKernel(bool earlyClip) const
{
if (earlyClip)
return FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel();
else
return FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel();
}
/// <summary>
/// Create the final accumulation kernel string
/// </summary>
/// <param name="earlyClip">True if early clip is desired, else false.</param>
/// <returns>The final accumulation kernel string</returns>
string FinalAccumOpenCLKernelCreator::CreateFinalAccumKernelString(bool earlyClip)
{
ostringstream os;
os <<
ConstantDefinesString(m_DoublePrecision) <<
UnionCLStructString <<
RgbToHsvFunctionString <<
HsvToRgbFunctionString <<
CalcAlphaFunctionString <<
CurveAdjustFunctionString <<
SpatialFilterCLStructString;
if (earlyClip)
{
os << "__kernel void " << m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint << "(\n";
}
else
{
os <<
CreateCalcNewRgbFunctionString(false) <<
CreateGammaCorrectionFunctionString(false, true) <<
"__kernel void " << m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint << "(\n";
}
os <<
" const __global real4reals_bucket* accumulator,\n"
" __write_only image2d_t pixels,\n"
" __constant SpatialFilterCL* spatialFilter,\n"
" __constant real_bucket_t* filterCoefs,\n"
" __global real4reals_bucket* csa,\n"
" const uint doCurves\n"
"\t)\n"
"{\n"
"\n"
" if ((GLOBAL_ID_Y >= spatialFilter->m_FinalRasH) || (GLOBAL_ID_X >= spatialFilter->m_FinalRasW))\n"
" return;\n"
"\n"
" uint accumX = spatialFilter->m_DensityFilterOffset + (GLOBAL_ID_X * spatialFilter->m_Supersample);\n"
" uint accumY = spatialFilter->m_DensityFilterOffset + (GLOBAL_ID_Y * spatialFilter->m_Supersample);\n"
" uint clampedFilterH = min((uint)spatialFilter->m_FilterWidth, spatialFilter->m_SuperRasH - accumY);"
" uint clampedFilterW = min((uint)spatialFilter->m_FilterWidth, spatialFilter->m_SuperRasW - accumX);"
" int2 finalCoord;\n"
" finalCoord.x = GLOBAL_ID_X;\n"
" finalCoord.y = (int)((spatialFilter->m_YAxisUp == 1) ? ((spatialFilter->m_FinalRasH - GLOBAL_ID_Y) - 1) : GLOBAL_ID_Y);\n"
" float4floats finalColor;\n"
" int ii, jj;\n"
" uint filterKRowIndex;\n"
" const __global real4reals_bucket* accumBucket;\n"
" real4reals_bucket newBucket;\n"
" newBucket.m_Real4 = 0;\n"
"\n"
" for (jj = 0; jj < clampedFilterH; jj++)\n"
" {\n"
" filterKRowIndex = jj * spatialFilter->m_FilterWidth;\n"//Use the full, non-clamped width to get the filter value.
"\n"
" for (ii = 0; ii < clampedFilterW; ii++)\n"
" {\n"
" real_bucket_t k = filterCoefs[filterKRowIndex + ii];\n"
"\n"
" accumBucket = accumulator + ((accumY + jj) * spatialFilter->m_SuperRasW) + (accumX + ii);\n"
" newBucket.m_Real4 += (k * accumBucket->m_Real4);\n"
" }\n"
" }\n"
"\n";
if (earlyClip)//If early clip, simply assign values directly to the temp float4 since they've been gamma corrected already, then write it straight to the output image below.
{
os <<
" finalColor.m_Float4.x = (float)newBucket.m_Real4.x;\n"//CPU side clamps, skip here because write_imagef() does the clamping for us.
" finalColor.m_Float4.y = (float)newBucket.m_Real4.y;\n"
" finalColor.m_Float4.z = (float)newBucket.m_Real4.z;\n"
" finalColor.m_Float4.w = (float)newBucket.m_Real4.w;\n";
}
else
{
//Late clip, so must gamma correct from the temp newBucket to temp finalColor float4.
if (m_DoublePrecision)
{
os <<
" real4reals_bucket realFinal;\n"
"\n"
" GammaCorrectionFloats(&newBucket, &(spatialFilter->m_Background[0]), spatialFilter->m_Gamma, spatialFilter->m_LinRange, spatialFilter->m_Vibrancy, spatialFilter->m_HighlightPower, &(realFinal.m_Reals[0]));\n"
" finalColor.m_Float4.x = (float)realFinal.m_Real4.x;\n"
" finalColor.m_Float4.y = (float)realFinal.m_Real4.y;\n"
" finalColor.m_Float4.z = (float)realFinal.m_Real4.z;\n"
" finalColor.m_Float4.w = (float)realFinal.m_Real4.w;\n"
;
}
else
{
os <<
" GammaCorrectionFloats(&newBucket, &(spatialFilter->m_Background[0]), spatialFilter->m_Gamma, spatialFilter->m_LinRange, spatialFilter->m_Vibrancy, spatialFilter->m_HighlightPower, &(finalColor.m_Floats[0]));\n";
}
}
os <<
"\n"
" if (doCurves)\n"
" {\n"
" CurveAdjust(csa, &(finalColor.m_Floats[0]), 1);\n"
" CurveAdjust(csa, &(finalColor.m_Floats[1]), 2);\n"
" CurveAdjust(csa, &(finalColor.m_Floats[2]), 3);\n"
" }\n"
"\n"
" write_imagef(pixels, finalCoord, finalColor.m_Float4);\n"//Use write_imagef instead of write_imageui because only the former works when sharing with an OpenGL texture.
" barrier(CLK_GLOBAL_MEM_FENCE);\n"//Required, or else page tearing will occur during interactive rendering.
"}\n"
;
return os.str();
}
/// <summary>
/// Creates the gamma correction function string.
/// This is not a full kernel, just a function that is used in the kernels.
/// </summary>
/// <param name="globalBucket">True if writing to a global buffer (early clip), else false (late clip).</param>
/// <param name="finalOut">True if writing to global buffer (late clip), else false (early clip).</param>
/// <returns>The gamma correction function string</returns>
string FinalAccumOpenCLKernelCreator::CreateGammaCorrectionFunctionString(bool globalBucket, bool finalOut)
{
ostringstream os;
string dataType;
string unionMember;
dataType = "real_bucket_t";
//Use real_t for all cases, early clip and final accum.
os << "void GammaCorrectionFloats(" << (globalBucket ? "__global " : "") << "real4reals_bucket* bucket, __constant real_bucket_t* background, real_bucket_t g, real_bucket_t linRange, real_bucket_t vibrancy, real_bucket_t highlightPower, " << (finalOut ? "" : "__global") << " real_bucket_t* correctedChannels)\n";
os << "{\n"
<< " real_bucket_t alpha, ls, tmp, a;\n"
<< " real4reals_bucket newRgb;\n"
<< "\n"
<< " if (bucket->m_Reals[3] <= 0)\n"
<< " {\n"
<< " alpha = 0;\n"
<< " ls = 0;\n"
<< " }\n"
<< " else\n"
<< " {\n"
<< " tmp = bucket->m_Reals[3];\n"
<< " alpha = CalcAlpha(tmp, g, linRange);\n"
<< " ls = vibrancy * alpha / tmp;\n"
<< " alpha = clamp(alpha, (real_bucket_t)0.0, (real_bucket_t)1.0);\n"
<< " }\n"
<< "\n"
<< " CalcNewRgb(bucket, ls, highlightPower, &newRgb);\n"
<< "\n"
<< " for (uint rgbi = 0; rgbi < 3; rgbi++)\n"
<< " {\n"
<< " a = newRgb.m_Reals[rgbi] + ((1.0 - vibrancy) * pow(fabs(bucket->m_Reals[rgbi]), g));\n"
<< " a += ((1.0 - alpha) * background[rgbi]);\n"
<< " correctedChannels[rgbi] = (" << dataType << ")clamp(a, (real_bucket_t)0.0, (real_bucket_t)1.0);\n"
<< " }\n"
<< "\n"
<< " correctedChannels[3] = (" << dataType << ")alpha;\n"
<< "}\n"
<< "\n";
return os.str();
}
/// <summary>
/// OpenCL equivalent of Palette::CalcNewRgb().
/// </summary>
/// <param name="globalBucket">True if writing the corrected value to a global buffer (early clip), else false (late clip).</param>
/// <returns>The CalcNewRgb function string</returns>
string FinalAccumOpenCLKernelCreator::CreateCalcNewRgbFunctionString(bool globalBucket)
{
ostringstream os;
os <<
"static void CalcNewRgb(" << (globalBucket ? "__global " : "") << "real4reals_bucket* oldRgb, real_bucket_t ls, real_bucket_t highPow, real4reals_bucket* newRgb)\n"
"{\n"
" int rgbi;\n"
" real_bucket_t lsratio;\n"
" real4reals_bucket newHsv;\n"
" real_bucket_t maxa, maxc, newls;\n"
" real_bucket_t adjhlp;\n"
"\n"
" if (ls == 0 || (oldRgb->m_Real4.x == 0 && oldRgb->m_Real4.y == 0 && oldRgb->m_Real4.z == 0))\n"//Can't do a vector compare to zero.
" {\n"
" newRgb->m_Real4 = 0;\n"
" return;\n"
" }\n"
"\n"
//Identify the most saturated channel.
" maxc = max(max(oldRgb->m_Reals[0], oldRgb->m_Reals[1]), oldRgb->m_Reals[2]);\n"
" maxa = ls * maxc;\n"
" newls = 1 / maxc;\n"
"\n"
//If a channel is saturated and highlight power is non-negative
//modify the color to prevent hue shift.
" if (maxa > 1 && highPow >= 0)\n"
" {\n"
" lsratio = pow(newls / ls, highPow);\n"
"\n"
//Calculate the max-value color (ranged 0 - 1).
" for (rgbi = 0; rgbi < 3; rgbi++)\n"
" newRgb->m_Reals[rgbi] = newls * oldRgb->m_Reals[rgbi];\n"
"\n"
//Reduce saturation by the lsratio.
" RgbToHsv(&(newRgb->m_Real4), &(newHsv.m_Real4));\n"
" newHsv.m_Real4.y *= lsratio;\n"
" HsvToRgb(&(newHsv.m_Real4), &(newRgb->m_Real4));\n"
" }\n"
" else\n"
" {\n"
" adjhlp = -highPow;\n"
"\n"
" if (adjhlp > 1)\n"
" adjhlp = 1;\n"
"\n"
" if (maxa <= 1)\n"
" adjhlp = 1;\n"
"\n"
//Calculate the max-value color (ranged 0 - 1) interpolated with the old behavior.
" for (rgbi = 0; rgbi < 3; rgbi++)\n"//Unrolling, caching and vectorizing makes no difference.
" newRgb->m_Reals[rgbi] = ((1.0 - adjhlp) * newls + adjhlp * ls) * oldRgb->m_Reals[rgbi];\n"
" }\n"
"}\n"
"\n";
return os.str();
}
/// <summary>
/// Create the gamma correction kernel string used for early clipping.
/// </summary>
/// <returns>The gamma correction kernel string used for early clipping</returns>
string FinalAccumOpenCLKernelCreator::CreateGammaCorrectionKernelString()
{
ostringstream os;
string dataType;
os <<
ConstantDefinesString(m_DoublePrecision) <<
UnionCLStructString <<
RgbToHsvFunctionString <<
HsvToRgbFunctionString <<
CalcAlphaFunctionString <<
CreateCalcNewRgbFunctionString(true) <<
SpatialFilterCLStructString <<
CreateGammaCorrectionFunctionString(true, false);//Will only be used with float in this case, early clip. Will always alpha accum.
os << "__kernel void " << m_GammaCorrectionWithoutAlphaCalcEntryPoint << "(\n" <<
" __global real4reals_bucket* accumulator,\n"
" __constant SpatialFilterCL* spatialFilter\n"
")\n"
"{\n"
" int testGutter = 0;\n"
"\n"
" if (GLOBAL_ID_Y >= (spatialFilter->m_SuperRasH - testGutter) || GLOBAL_ID_X >= (spatialFilter->m_SuperRasW - testGutter))\n"
" return;\n"
"\n"
" uint superIndex = (GLOBAL_ID_Y * spatialFilter->m_SuperRasW) + GLOBAL_ID_X;\n"
" __global real4reals_bucket* bucket = accumulator + superIndex;\n"
" GammaCorrectionFloats(bucket, &(spatialFilter->m_Background[0]), spatialFilter->m_Gamma, spatialFilter->m_LinRange, spatialFilter->m_Vibrancy, spatialFilter->m_HighlightPower, &(bucket->m_Reals[0]));\n"
"}\n"
;
return os.str();
}
}


@ -1,54 +1,54 @@
#pragma once
#include "EmberCLPch.h"
#include "EmberCLStructs.h"
#include "EmberCLFunctions.h"
/// <summary>
/// FinalAccumOpenCLKernelCreator class.
/// </summary>
namespace EmberCLns
{
/// <summary>
/// Class for creating the final accumulation code in OpenCL.
/// There are many conditionals in the CPU code to create the
/// final output image. This class creates many different kernels
/// with all conditionals and unnecessary calculations stripped out.
/// The conditionals are:
/// Early clip/late clip
/// </summary>
class EMBERCL_API FinalAccumOpenCLKernelCreator
{
public:
FinalAccumOpenCLKernelCreator(bool doublePrecision);
const string& FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel() const;
const string& FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint() const;
const string& FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel() const;
const string& FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint() const;
const string& GammaCorrectionEntryPoint() const;
const string& GammaCorrectionKernel() const;
const string& FinalAccumEntryPoint(bool earlyClip) const;
const string& FinalAccumKernel(bool earlyClip) const;
private:
string CreateFinalAccumKernelString(bool earlyClip);
string CreateGammaCorrectionKernelString();
string CreateGammaCorrectionFunctionString(bool globalBucket, bool finalOut);
string CreateCalcNewRgbFunctionString(bool globalBucket);
string m_GammaCorrectionWithoutAlphaCalcKernel;
string m_GammaCorrectionWithoutAlphaCalcEntryPoint = "GammaCorrectionWithoutAlphaCalcKernel";
string m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel;
string m_FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumEntryPoint = "FinalAccumEarlyClipWithoutAlphaCalcWithAlphaAccumKernel";
string m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel;
string m_FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumEntryPoint = "FinalAccumLateClipWithoutAlphaCalcWithAlphaAccumKernel";
string m_Empty;
bool m_DoublePrecision;
};
}
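
A usage sketch of this class follows. It is illustrative only: BuildFinalAccumSource is a hypothetical helper and assumes the EmberCL headers are on the include path. The same creator hands back the matching entry point name and kernel body, so callers make the early/late clip decision exactly once.

#include <string>
#include "FinalAccumOpenCLKernelCreator.h"

//Pick the kernel source and entry point for the requested clipping mode.
std::string BuildFinalAccumSource(bool doublePrecision, bool earlyClip, std::string& entryPointOut)
{
	EmberCLns::FinalAccumOpenCLKernelCreator creator(doublePrecision);
	entryPointOut = creator.FinalAccumEntryPoint(earlyClip);//Kernel function name to look up after building.
	return creator.FinalAccumKernel(earlyClip);//Full program source for that mode.
}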

File diff suppressed because it is too large Load Diff


@ -1,22 +1,22 @@
#pragma once
#include "EmberCLPch.h"
namespace EmberCLns
{
/// <summary>
/// Functionality to map OpenCL function names to their full function body program strings.
/// This is used to ensure that only the functions a program needs are included in the
/// program string, and that each is included only once.
/// </summary>
class EMBERCL_API FunctionMapper
{
public:
FunctionMapper();
static const string* GetGlobalFunc(const string& func);
static const std::unordered_map<string, string> GetGlobalMapCopy();
private:
static std::unordered_map<string, string> s_GlobalMap;
};
}
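
The de-duplication idea described in the class summary can be illustrated with a standalone sketch. It is hypothetical: the Round/Sqr bodies and the EmitOnce helper below are examples, not the project's actual global map.

#include <iostream>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>

//Name -> full function body, analogous to FunctionMapper's global map.
static const std::unordered_map<std::string, std::string> globalFuncs =
{
	{ "Round", "real_t Round(real_t r) { return rint(r); }\n" },
	{ "Sqr",   "real_t Sqr(real_t t) { return t * t; }\n" }
};

//Append each requested function body to the program string exactly once.
static std::string EmitOnce(const std::vector<std::string>& needed)
{
	std::string out;
	std::set<std::string> emitted;

	for (const auto& name : needed)
	{
		if (!emitted.insert(name).second)
			continue;//Already emitted.

		auto it = globalFuncs.find(name);

		if (it != globalFuncs.end())
			out += it->second;
	}

	return out;
}

int main()
{
	std::cout << EmitOnce({ "Sqr", "Round", "Sqr" });//"Sqr" appears only once in the output.
	return 0;
}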

File diff suppressed because it is too large Load Diff


@ -1,83 +1,83 @@
#pragma once
#include "EmberCLPch.h"
#include "EmberCLStructs.h"
#include "EmberCLFunctions.h"
#include "FunctionMapper.h"
/// <summary>
/// IterOpenCLKernelCreator class.
/// </summary>
namespace EmberCLns
{
/// <summary>
/// Class for creating the main iteration code in OpenCL.
/// It uses the Cuburn method of iterating where all conditionals
/// are stripped out and a specific kernel is compiled at run-time.
/// It uses a very sophisticated method for randomization that avoids
/// the problem of warp/wavefront divergence that would occur if every
/// thread selected a random xform to apply.
/// This only works with embers of type float; double is not supported.
/// </summary>
template <typename T>
class EMBERCL_API IterOpenCLKernelCreator
{
public:
IterOpenCLKernelCreator();
const string& ZeroizeKernel() const;
const string& ZeroizeEntryPoint() const;
const string& SumHistKernel() const;
const string& SumHistEntryPoint() const;
const string& IterEntryPoint() const;
string CreateIterKernelString(const Ember<T>& ember, const string& parVarDefines, const string& globalSharedDefines, bool optAffine, bool lockAccum = false, bool doAccum = true);
string GlobalFunctionsString(const Ember<T>& ember);
static void ParVarIndexDefines(const Ember<T>& ember, pair<string, vector<T>>& params, bool doVals = true, bool doString = true);
static void SharedDataIndexDefines(const Ember<T>& ember, pair<string, vector<T>>& params, bool doVals = true, bool doString = true);
static string VariationStateString(const Ember<T>& ember);
static string VariationStateInitString(const Ember<T>& ember);
static bool AnyZeroOpacity(const Ember<T>& ember);
static bool IsBuildRequired(const Ember<T>& ember1, const Ember<T>& ember2, bool optAffine);
private:
string CreateZeroizeKernelString() const;
string CreateSumHistKernelString() const;
string CreateProjectionString(const Ember<T>& ember) const;
string m_IterEntryPoint = "IterateKernel";
string m_ZeroizeKernel;
string m_ZeroizeEntryPoint = "ZeroizeKernel";
string m_SumHistKernel;
string m_SumHistEntryPoint = "SumHisteKernel";
FunctionMapper m_FunctionMapper;
};
#ifdef OPEN_CL_TEST_AREA
typedef void (*KernelFuncPointer) (size_t gridWidth, size_t gridHeight, size_t blockWidth, size_t blockHeight,
size_t BLOCK_ID_X, size_t BLOCK_ID_Y, size_t THREAD_ID_X, size_t THREAD_ID_Y);
static void OpenCLSim(size_t gridWidth, size_t gridHeight, size_t blockWidth, size_t blockHeight, KernelFuncPointer func)
{
cout << "OpenCLSim(): ";
cout << "\n Params: ";
cout << "\n gridW: " << gridWidth;
cout << "\n gridH: " << gridHeight;
cout << "\n blockW: " << blockWidth;
cout << "\n blockH: " << blockHeight;
for (size_t i = 0; i < gridHeight; i += blockHeight)
{
for (size_t j = 0; j < gridWidth; j += blockWidth)
{
for (size_t k = 0; k < blockHeight; k++)
{
for (size_t l = 0; l < blockWidth; l++)
{
func(gridWidth, gridHeight, blockWidth, blockHeight, j / blockWidth, i / blockHeight, l, k);
}
}
}
}
}
#endif
}
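
The OpenCLSim() helper above only exists when OPEN_CL_TEST_AREA is defined; it walks a simulated grid of blocks and threads on the CPU. An illustrative callback is sketched below (PrintIds is a hypothetical name, not part of the commit).

#include <cstddef>
#include <iostream>

//Matches KernelFuncPointer: grid/block dimensions followed by block and thread indices.
static void PrintIds(size_t /*gridW*/, size_t /*gridH*/, size_t /*blockW*/, size_t /*blockH*/,
	size_t blockX, size_t blockY, size_t threadX, size_t threadY)
{
	std::cout << "block (" << blockX << ", " << blockY << ") thread (" << threadX << ", " << threadY << ")\n";
}

//Somewhere in test code, when OPEN_CL_TEST_AREA is defined:
//	EmberCLns::OpenCLSim(4, 4, 2, 2, PrintIds);//A 2x2 grid of 2x2 blocks, 16 callbacks total.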


@ -1,460 +1,460 @@
#include "EmberCLPch.h"
#include "OpenCLInfo.h"
namespace EmberCLns
{
/// <summary>
/// Initialize all platforms and devices and keep information about them in lists.
/// </summary>
OpenCLInfo::OpenCLInfo()
{
cl_int err;
vector<cl::Platform> platforms;
vector<vector<cl::Device>> devices;
intmax_t workingPlatformIndex = -1;
m_Init = false;
cl::Platform::get(&platforms);
devices.resize(platforms.size());
m_Platforms.reserve(platforms.size());
m_Devices.reserve(platforms.size());
m_DeviceNames.reserve(platforms.size());
m_AllDeviceNames.reserve(platforms.size());
m_DeviceIndices.reserve(platforms.size());
for (size_t i = 0; i < platforms.size(); i++)
platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &devices[i]);
for (size_t platform = 0; platform < platforms.size(); platform++)
{
bool platformOk = false;
bool deviceOk = false;
cl::Context context;
if (CreateContext(platforms[platform], context, false))//Platform is ok, now do context. Unshared by default.
{
size_t workingDeviceIndex = 0;
for (size_t device = 0; device < devices[platform].size(); device++)//Context is ok, now do devices.
{
auto q = cl::CommandQueue(context, devices[platform][device], 0, &err);//A device is present, so attempt to create a command queue for it.
if (CheckCL(err, "cl::CommandQueue()"))
{
if (!platformOk)
{
m_Platforms.push_back(platforms[platform]);
m_PlatformNames.push_back(platforms[platform].getInfo<CL_PLATFORM_VENDOR>(nullptr).c_str() + " "s + platforms[platform].getInfo<CL_PLATFORM_NAME>(nullptr).c_str() + " "s + platforms[platform].getInfo<CL_PLATFORM_VERSION>(nullptr).c_str());
workingPlatformIndex++;
platformOk = true;
}
if (!deviceOk)
{
m_Devices.push_back(vector<cl::Device>());
m_DeviceNames.push_back(vector<string>());
m_Devices.back().reserve(devices[platform].size());
m_DeviceNames.back().reserve(devices[platform].size());
deviceOk = true;
}
m_Devices.back().push_back(devices[platform][device]);
m_DeviceNames.back().push_back(devices[platform][device].getInfo<CL_DEVICE_VENDOR>(nullptr).c_str() + " "s + devices[platform][device].getInfo<CL_DEVICE_NAME>(nullptr).c_str());// + " " + devices[platform][device].getInfo<CL_DEVICE_VERSION>().c_str());
m_AllDeviceNames.push_back(m_DeviceNames.back().back());
m_DeviceIndices.push_back(pair<size_t, size_t>(workingPlatformIndex, workingDeviceIndex++));
m_Init = true;//If at least one platform and device succeeded, OpenCL is ok. It's now ok to begin building and running programs.
}
}
}
}
}
/// <summary>
/// Get a const reference to the vector of available platforms.
/// </summary>
/// <returns>A const reference to the vector of available platforms</returns>
const vector<cl::Platform>& OpenCLInfo::Platforms() const
{
return m_Platforms;
}
/// <summary>
/// Get a const reference to the platform name at the specified index.
/// </summary>
/// <param name="i">The platform index to get the name of</param>
/// <returns>The platform name if found, else empty string</returns>
const string& OpenCLInfo::PlatformName(size_t platform) const
{
static string s;
return platform < m_PlatformNames.size() ? m_PlatformNames[platform] : s;
}
/// <summary>
/// Get a const reference to a vector of all available platform names on the system as a vector of strings.
/// </summary>
/// <returns>All available platform names on the system as a vector of strings</returns>
const vector<string>& OpenCLInfo::PlatformNames() const
{
return m_PlatformNames;
}
/// <summary>
/// Get a const reference to a vector of vectors of all available devices on the system.
/// Each outer vector is a different platform.
/// </summary>
/// <returns>All available devices on the system, grouped by platform.</returns>
const vector<vector<cl::Device>>& OpenCLInfo::Devices() const
{
return m_Devices;
}
/// <summary>
/// Get a const reference to the device name at the specified index on the platform
/// at the specified index.
/// </summary>
/// <param name="platform">The platform index of the device</param>
/// <param name="device">The device index</param>
/// <returns>The name of the device if found, else empty string</returns>
const string& OpenCLInfo::DeviceName(size_t platform, size_t device) const
{
static string s;
if (platform < m_Platforms.size() && platform < m_Devices.size())
if (device < m_Devices[platform].size())
return m_DeviceNames[platform][device];
return s;
}
/// <summary>
/// Get a const reference to a vector of index pairs which contain the platform and device
/// indices of all available devices on the system.
/// </summary>
/// <returns>All available devices on the system as platform,device index pairs</returns>
const vector<pair<size_t, size_t>>& OpenCLInfo::DeviceIndices() const
{
return m_DeviceIndices;
}
/// <summary>
/// Get a const reference to a vector of all available device names on the system as a vector of strings.
/// </summary>
/// <returns>All available device names on the system as a vector of strings</returns>
const vector<string>& OpenCLInfo::AllDeviceNames() const
{
return m_AllDeviceNames;
}
/// <summary>
/// Get a const reference to a vector of all available device names on the platform
/// at the specified index as a vector of strings.
/// </summary>
/// <param name="platform">The platform index whose devices names will be returned</param>
/// <returns>All available device names on the platform at the specified index as a vector of strings if within range, else empty vector.</returns>
const vector<string>& OpenCLInfo::DeviceNames(size_t platform) const
{
static vector<string> v;
if (platform < m_DeviceNames.size())
return m_DeviceNames[platform];
return v;
}
/// <summary>
/// Get the total device index at the specified platform and device index.
/// </summary>
/// <param name="platform">The platform index of the device</param>
/// <param name="device">The device index within the platform</param>
/// <returns>The total device index if found, else 0</returns>
size_t OpenCLInfo::TotalDeviceIndex(size_t platform, size_t device) const
{
size_t index = 0;
pair<size_t, size_t> p{ platform, device };
for (size_t i = 0; i < m_DeviceIndices.size(); i++)
{
if (p == m_DeviceIndices[i])
{
index = i;
break;
}
}
return index;
}
/// <summary>
/// Get a pointer to a device based on its ID.
/// </summary>
/// <param name="id">The device ID</param>
/// <param name="platform">Stores the platform index of the device if found.</param>
/// <param name="device">Stores the device index of the device if found.</param>
/// <returns>A pointer to the device if found, else nullptr.</returns>
const cl::Device* OpenCLInfo::DeviceFromId(cl_device_id id, size_t& platform, size_t& device) const
{
for (auto& p : m_DeviceIndices)
{
if (m_Devices[p.first][p.second]() == id)
{
platform = p.first;
device = p.second;
return &(m_Devices[p.first][p.second]);
}
}
platform = device = 0;
return nullptr;
}
/// <summary>
/// Create a context that is optionally shared with OpenGL and place it in the
/// passed in context ref parameter.
/// </summary>
/// <param name="platform">The platform object to create the context on</param>
/// <param name="context">The context object to store the result in</param>
/// <param name="shared">True if shared with OpenGL, else not shared.</param>
/// <returns>True if success, else false.</returns>
bool OpenCLInfo::CreateContext(const cl::Platform& platform, cl::Context& context, bool shared)
{
cl_int err;
if (shared)
{
//Define OS-specific context properties and create the OpenCL context.
#if defined (__APPLE__) || defined(MACOSX)
CGLContextObj kCGLContext = CGLGetCurrentContext();
CGLShareGroupObj kCGLShareGroup = CGLGetShareGroup(kCGLContext);
cl_context_properties props[] =
{
CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE, (cl_context_properties)kCGLShareGroup,
0
};
context = cl::Context(CL_DEVICE_TYPE_GPU, props, nullptr, nullptr, &err);//May need to tinker with this on Mac.
#else
#if defined WIN32
//::wglMakeCurrent(wglGetCurrentDC(), wglGetCurrentContext());
cl_context_properties props[] =
{
CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(),
CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(),
CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>((platform)()),
0
};
context = cl::Context(CL_DEVICE_TYPE_GPU, props, nullptr, nullptr, &err);
#else
cl_context_properties props[] =
{
CL_GL_CONTEXT_KHR, cl_context_properties(glXGetCurrentContext()),
CL_GLX_DISPLAY_KHR, cl_context_properties(glXGetCurrentDisplay()),
CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>((platform)()),
0
};
context = cl::Context(CL_DEVICE_TYPE_GPU, props, nullptr, nullptr, &err);
#endif
#endif
}
else
{
cl_context_properties props[3] =
{
CL_CONTEXT_PLATFORM,
reinterpret_cast<cl_context_properties>((platform)()),
0
};
context = cl::Context(CL_DEVICE_TYPE_ALL, props, nullptr, nullptr, &err);
}
return CheckCL(err, "cl::Context()");
}
/// <summary>
/// Return whether at least one device has been found and properly initialized.
/// </summary>
/// <returns>True if success, else false.</returns>
bool OpenCLInfo::Ok() const
{
return m_Init;
}
/// <summary>
/// Get all information about all platforms and devices.
/// </summary>
/// <returns>A string with all information about all platforms and devices</returns>
string OpenCLInfo::DumpInfo() const
{
ostringstream os;
vector<size_t> sizes;
os.imbue(locale(""));
for (size_t platform = 0; platform < m_Platforms.size(); platform++)
{
os << "Platform " << platform << ": " << PlatformName(platform) << "\n";
for (size_t device = 0; device < m_Devices[platform].size(); device++)
{
os << "Device " << device << ": " << DeviceName(platform, device);
os << "\nCL_DEVICE_OPENCL_C_VERSION: " << GetInfo<string>(platform, device, CL_DEVICE_OPENCL_C_VERSION).c_str();
os << "\nCL_DEVICE_LOCAL_MEM_SIZE: " << GetInfo<cl_ulong>(platform, device, CL_DEVICE_LOCAL_MEM_SIZE);
os << "\nCL_DEVICE_LOCAL_MEM_TYPE: " << GetInfo<cl_uint>(platform, device, CL_DEVICE_LOCAL_MEM_TYPE);
os << "\nCL_DEVICE_MAX_COMPUTE_UNITS: " << GetInfo<cl_uint>(platform, device, CL_DEVICE_MAX_COMPUTE_UNITS);
os << "\nCL_DEVICE_MAX_READ_IMAGE_ARGS: " << GetInfo<cl_uint>(platform, device, CL_DEVICE_MAX_READ_IMAGE_ARGS);
os << "\nCL_DEVICE_MAX_WRITE_IMAGE_ARGS: " << GetInfo<cl_uint>(platform, device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS);
os << "\nCL_DEVICE_MAX_MEM_ALLOC_SIZE: " << GetInfo<cl_ulong>(platform, device, CL_DEVICE_MAX_MEM_ALLOC_SIZE);
os << "\nCL_DEVICE_ADDRESS_BITS: " << GetInfo<cl_uint>(platform, device, CL_DEVICE_ADDRESS_BITS);
os << "\nCL_DEVICE_GLOBAL_MEM_CACHE_TYPE: " << GetInfo<cl_uint>(platform, device, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE);
os << "\nCL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE: " << GetInfo<cl_uint>(platform, device, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE);
os << "\nCL_DEVICE_GLOBAL_MEM_CACHE_SIZE: " << GetInfo<cl_ulong>(platform, device, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE);
os << "\nCL_DEVICE_GLOBAL_MEM_SIZE: " << GetInfo<cl_ulong>(platform, device, CL_DEVICE_GLOBAL_MEM_SIZE);
os << "\nCL_DEVICE_MAX_CONSTANT_BUFFER_SIZE: " << GetInfo<cl_ulong>(platform, device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE);
os << "\nCL_DEVICE_MAX_CONSTANT_ARGS: " << GetInfo<cl_uint>(platform, device, CL_DEVICE_MAX_CONSTANT_ARGS);
os << "\nCL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: " << GetInfo<cl_uint>(platform, device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS);
os << "\nCL_DEVICE_MAX_WORK_GROUP_SIZE: " << GetInfo<size_t>(platform, device, CL_DEVICE_MAX_WORK_GROUP_SIZE);
sizes = GetInfo<vector<size_t>>(platform, device, CL_DEVICE_MAX_WORK_ITEM_SIZES);
os << "\nCL_DEVICE_MAX_WORK_ITEM_SIZES: " << sizes[0] << ", " << sizes[1] << ", " << sizes[2] << "\n" << "\n";
if (device != m_Devices[platform].size() - 1 && platform != m_Platforms.size() - 1)
os << "\n";
}
os << "\n";
}
return os.str();
}
/// <summary>
/// Check an OpenCL return value for errors.
/// </summary>
/// <param name="err">The error code to inspect</param>
/// <param name="name">A description of where the value was gotten from</param>
/// <returns>True if success, else false.</returns>
bool OpenCLInfo::CheckCL(cl_int err, const char* name)
{
if (err != CL_SUCCESS)
{
ostringstream ss;
ss << "ERROR: " << ErrorToStringCL(err) << " in " << name << ".\n";
AddToReport(ss.str());
}
return err == CL_SUCCESS;
}
/// <summary>
/// Translate an OpenCL error code into a human readable string.
/// </summary>
/// <param name="err">The error code to translate</param>
/// <returns>A human readable description of the error passed in</returns>
string OpenCLInfo::ErrorToStringCL(cl_int err)
{
switch (err)
{
case CL_SUCCESS: return "Success";
case CL_DEVICE_NOT_FOUND: return "Device not found";
case CL_DEVICE_NOT_AVAILABLE: return "Device not available";
case CL_COMPILER_NOT_AVAILABLE: return "Compiler not available";
case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "Memory object allocation failure";
case CL_OUT_OF_RESOURCES: return "Out of resources";
case CL_OUT_OF_HOST_MEMORY: return "Out of host memory";
case CL_PROFILING_INFO_NOT_AVAILABLE: return "Profiling information not available";
case CL_MEM_COPY_OVERLAP: return "Memory copy overlap";
case CL_IMAGE_FORMAT_MISMATCH: return "Image format mismatch";
case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "Image format not supported";
case CL_BUILD_PROGRAM_FAILURE: return "Program build failure";
case CL_MAP_FAILURE: return "Map failure";
case CL_MISALIGNED_SUB_BUFFER_OFFSET: return "Misaligned sub buffer offset";
case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: return "Exec status error for events in wait list";
case CL_INVALID_VALUE: return "Invalid value";
case CL_INVALID_DEVICE_TYPE: return "Invalid device type";
case CL_INVALID_PLATFORM: return "Invalid platform";
case CL_INVALID_DEVICE: return "Invalid device";
case CL_INVALID_CONTEXT: return "Invalid context";
case CL_INVALID_QUEUE_PROPERTIES: return "Invalid queue properties";
case CL_INVALID_COMMAND_QUEUE: return "Invalid command queue";
case CL_INVALID_HOST_PTR: return "Invalid host pointer";
case CL_INVALID_MEM_OBJECT: return "Invalid memory object";
case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "Invalid image format descriptor";
case CL_INVALID_IMAGE_SIZE: return "Invalid image size";
case CL_INVALID_SAMPLER: return "Invalid sampler";
case CL_INVALID_BINARY: return "Invalid binary";
case CL_INVALID_BUILD_OPTIONS: return "Invalid build options";
case CL_INVALID_PROGRAM: return "Invalid program";
case CL_INVALID_PROGRAM_EXECUTABLE: return "Invalid program executable";
case CL_INVALID_KERNEL_NAME: return "Invalid kernel name";
case CL_INVALID_KERNEL_DEFINITION: return "Invalid kernel definition";
case CL_INVALID_KERNEL: return "Invalid kernel";
case CL_INVALID_ARG_INDEX: return "Invalid argument index";
case CL_INVALID_ARG_VALUE: return "Invalid argument value";
case CL_INVALID_ARG_SIZE: return "Invalid argument size";
case CL_INVALID_KERNEL_ARGS: return "Invalid kernel arguments";
case CL_INVALID_WORK_DIMENSION: return "Invalid work dimension";
case CL_INVALID_WORK_GROUP_SIZE: return "Invalid work group size";
case CL_INVALID_WORK_ITEM_SIZE: return "Invalid work item size";
case CL_INVALID_GLOBAL_OFFSET: return "Invalid global offset";
case CL_INVALID_EVENT_WAIT_LIST: return "Invalid event wait list";
case CL_INVALID_EVENT: return "Invalid event";
case CL_INVALID_OPERATION: return "Invalid operation";
case CL_INVALID_GL_OBJECT: return "Invalid OpenGL object";
case CL_INVALID_BUFFER_SIZE: return "Invalid buffer size";
case CL_INVALID_MIP_LEVEL: return "Invalid mip-map level";
case CL_INVALID_GLOBAL_WORK_SIZE: return "Invalid global work size";
case CL_INVALID_PROPERTY: return "Invalid property";
default:
{
ostringstream ss;
ss << "<Unknown error code> " << err;
return ss.str();
}
}
}
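
A usage sketch of OpenCLInfo follows. It is illustrative only: ReportOpenCL is a hypothetical helper, and it assumes an initialized OpenCLInfo instance is available however the application obtains one.

#include <iostream>
#include "OpenCLInfo.h"

//List every detected platform/device pair and dump full capabilities.
static bool ReportOpenCL(const EmberCLns::OpenCLInfo& info)
{
	if (!info.Ok())//No platform and device combination was successfully initialized.
	{
		std::cout << "No usable OpenCL devices were found.\n";
		return false;
	}

	for (const auto& p : info.DeviceIndices())//(platform index, device index) pairs.
		std::cout << info.PlatformName(p.first) << " -> " << info.DeviceName(p.first, p.second) << "\n";

	std::cout << info.DumpInfo();//Detailed capability report for every device.
	return true;
}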
#include "EmberCLPch.h"
#include "OpenCLInfo.h"
namespace EmberCLns
{
/// <summary>
/// Initialize the all platforms and devices and keep information about them in lists.
/// </summary>
OpenCLInfo::OpenCLInfo()
{
cl_int err;
vector<cl::Platform> platforms;
vector<vector<cl::Device>> devices;
intmax_t workingPlatformIndex = -1;
m_Init = false;
cl::Platform::get(&platforms);
devices.resize(platforms.size());
m_Platforms.reserve(platforms.size());
m_Devices.reserve(platforms.size());
m_DeviceNames.reserve(platforms.size());
m_AllDeviceNames.reserve(platforms.size());
m_DeviceIndices.reserve(platforms.size());
for (size_t i = 0; i < platforms.size(); i++)
platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &devices[i]);
for (size_t platform = 0; platform < platforms.size(); platform++)
{
bool platformOk = false;
bool deviceOk = false;
cl::Context context;
if (CreateContext(platforms[platform], context, false))//Platform is ok, now do context. Unshared by default.
{
size_t workingDeviceIndex = 0;
for (size_t device = 0; device < devices[platform].size(); device++)//Context is ok, now do devices.
{
auto q = cl::CommandQueue(context, devices[platform][device], 0, &err);//At least one GPU device is present, so create a command queue.
if (CheckCL(err, "cl::CommandQueue()"))
{
if (!platformOk)
{
m_Platforms.push_back(platforms[platform]);
m_PlatformNames.push_back(platforms[platform].getInfo<CL_PLATFORM_VENDOR>(nullptr).c_str() + " "s + platforms[platform].getInfo<CL_PLATFORM_NAME>(nullptr).c_str() + " "s + platforms[platform].getInfo<CL_PLATFORM_VERSION>(nullptr).c_str());
workingPlatformIndex++;
platformOk = true;
}
if (!deviceOk)
{
m_Devices.push_back(vector<cl::Device>());
m_DeviceNames.push_back(vector<string>());
m_Devices.back().reserve(devices[platform].size());
m_DeviceNames.back().reserve(devices[platform].size());
deviceOk = true;
}
m_Devices.back().push_back(devices[platform][device]);
m_DeviceNames.back().push_back(devices[platform][device].getInfo<CL_DEVICE_VENDOR>(nullptr).c_str() + " "s + devices[platform][device].getInfo<CL_DEVICE_NAME>(nullptr).c_str());// + " " + devices[platform][device].getInfo<CL_DEVICE_VERSION>().c_str());
m_AllDeviceNames.push_back(m_DeviceNames.back().back());
m_DeviceIndices.push_back(pair<size_t, size_t>(workingPlatformIndex, workingDeviceIndex++));
m_Init = true;//If at least one platform and device succeeded, OpenCL is ok. It's now ok to begin building and running programs.
}
}
}
}
}
/// <summary>
/// Get a const reference to the vector of available platforms.
/// </summary>
/// <returns>A const reference to the vector of available platforms</returns>
const vector<cl::Platform>& OpenCLInfo::Platforms() const
{
return m_Platforms;
}
/// <summary>
/// Get a const reference to the platform name at the specified index.
/// </summary>
/// <param name="i">The platform index to get the name of</param>
/// <returns>The platform name if found, else empty string</returns>
const string& OpenCLInfo::PlatformName(size_t platform) const
{
static string s;
return platform < m_PlatformNames.size() ? m_PlatformNames[platform] : s;
}
/// <summary>
/// Get a const reference to a vector of all available platform names on the system as a vector of strings.
/// </summary>
/// <returns>All available platform names on the system as a vector of strings</returns>
const vector<string>& OpenCLInfo::PlatformNames() const
{
return m_PlatformNames;
}
/// <summary>
/// Get a const reference to a vector of vectors of all available devices on the system.
/// Each outer vector is a different platform.
/// </summary>
/// <returns>All available devices on the system, grouped by platform.</returns>
const vector<vector<cl::Device>>& OpenCLInfo::Devices() const
{
return m_Devices;
}
/// <summary>
/// Get a const reference to the device name at the specified index on the platform
/// at the specified index.
/// </summary>
/// <param name="platform">The platform index of the device</param>
/// <param name="device">The device index</param>
/// <returns>The name of the device if found, else empty string</returns>
const string& OpenCLInfo::DeviceName(size_t platform, size_t device) const
{
static string s;
if (platform < m_Platforms.size() && platform < m_Devices.size())
if (device < m_Devices[platform].size())
return m_DeviceNames[platform][device];
return s;
}
/// <summary>
/// Get a const reference to a vector of pairs of uints which contain the platform,device
/// indices of all available devices on the system.
/// </summary>
/// <returns>All available devices on the system as platform,device index pairs</returns>
const vector<pair<size_t, size_t>>& OpenCLInfo::DeviceIndices() const
{
return m_DeviceIndices;
}
/// <summary>
/// Get a const reference to a vector of all available device names on the system as a vector of strings.
/// </summary>
/// <returns>All available device names on the system as a vector of strings</returns>
const vector<string>& OpenCLInfo::AllDeviceNames() const
{
return m_AllDeviceNames;
}
/// <summary>
/// Get a const reference to a vector of all available device names on the platform
/// at the specified index as a vector of strings.
/// </summary>
/// <param name="platform">The platform index whose devices names will be returned</param>
/// <returns>All available device names on the platform at the specified index as a vector of strings if within range, else empty vector.</returns>
const vector<string>& OpenCLInfo::DeviceNames(size_t platform) const
{
static vector<string> v;
if (platform < m_DeviceNames.size())
return m_DeviceNames[platform];
return v;
}
/// <summary>
/// Get the total device index at the specified platform and device index.
/// </summary>
/// <param name="platform">The platform index of the device</param>
/// <param name="device">The device index within the platform</param>
/// <returns>The total device index if found, else 0</returns>
size_t OpenCLInfo::TotalDeviceIndex(size_t platform, size_t device) const
{
size_t index = 0;
pair<size_t, size_t> p{ platform, device };
for (size_t i = 0; i < m_DeviceIndices.size(); i++)
{
if (p == m_DeviceIndices[i])
{
index = i;
break;
}
}
return index;
}
/// <summary>
/// Get a pointer to a device based on its ID.
/// </summary>
/// <param name="id">The device ID</param>
/// <param name="platform">Stores the platform index of the device if found.</param>
/// <param name="device">Stores the device index of the device if found.</param>
/// <returns>A pointer to the device if found, else nullptr.</returns>
const cl::Device* OpenCLInfo::DeviceFromId(cl_device_id id, size_t& platform, size_t& device) const
{
for (auto& p : m_DeviceIndices)
{
if (m_Devices[p.first][p.second]() == id)
{
platform = p.first;
device = p.second;
return &(m_Devices[p.first][p.second]);
}
}
platform = device = 0;
return nullptr;
}
/// <summary>
/// Create a context that is optionally shared with OpenGL and place it in the
/// passed in context ref parameter.
/// </summary>
/// <param name="platform">The platform object to create the context on</param>
/// <param name="context">The context object to store the result in</param>
/// <param name="shared">True if shared with OpenGL, else not shared.</param>
/// <returns>True if success, else false.</returns>
bool OpenCLInfo::CreateContext(const cl::Platform& platform, cl::Context& context, bool shared)
{
cl_int err;
if (shared)
{
//Define OS-specific context properties and create the OpenCL context.
#if defined (__APPLE__) || defined(MACOSX)
CGLContextObj kCGLContext = CGLGetCurrentContext();
CGLShareGroupObj kCGLShareGroup = CGLGetShareGroup(kCGLContext);
cl_context_properties props[] =
{
CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE, (cl_context_properties)kCGLShareGroup,
0
};
context = cl::Context(CL_DEVICE_TYPE_GPU, props, nullptr, nullptr, &err);//May need to tinker with this on Mac.
#else
#if defined WIN32
//::wglMakeCurrent(wglGetCurrentDC(), wglGetCurrentContext());
cl_context_properties props[] =
{
CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(),
CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(),
CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>((platform)()),
0
};
context = cl::Context(CL_DEVICE_TYPE_GPU, props, nullptr, nullptr, &err);
#else
cl_context_properties props[] =
{
CL_GL_CONTEXT_KHR, cl_context_properties(glXGetCurrentContext()),
CL_GLX_DISPLAY_KHR, cl_context_properties(glXGetCurrentDisplay()),
CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>((platform)()),
0
};
context = cl::Context(CL_DEVICE_TYPE_GPU, props, nullptr, nullptr, &err);
#endif
#endif
}
else
{
cl_context_properties props[3] =
{
CL_CONTEXT_PLATFORM,
reinterpret_cast<cl_context_properties>((platform)()),
0
};
context = cl::Context(CL_DEVICE_TYPE_ALL, props, nullptr, nullptr, &err);
}
return CheckCL(err, "cl::Context()");
}
/// <summary>
/// Return whether at least one device has been found and properly initialized.
/// </summary>
/// <returns>True if success, else false.</returns>
bool OpenCLInfo::Ok() const
{
return m_Init;
}
/// <summary>
/// Get all information about all platforms and devices.
/// </summary>
/// <returns>A string with all information about all platforms and devices</returns>
string OpenCLInfo::DumpInfo() const
{
ostringstream os;
vector<size_t> sizes;
os.imbue(locale(""));
for (size_t platform = 0; platform < m_Platforms.size(); platform++)
{
os << "Platform " << platform << ": " << PlatformName(platform) << "\n";
for (size_t device = 0; device < m_Devices[platform].size(); device++)
{
os << "Device " << device << ": " << DeviceName(platform, device);
os << "\nCL_DEVICE_OPENCL_C_VERSION: " << GetInfo<string>(platform, device, CL_DEVICE_OPENCL_C_VERSION).c_str();
os << "\nCL_DEVICE_LOCAL_MEM_SIZE: " << GetInfo<cl_ulong>(platform, device, CL_DEVICE_LOCAL_MEM_SIZE);
os << "\nCL_DEVICE_LOCAL_MEM_TYPE: " << GetInfo<cl_uint>(platform, device, CL_DEVICE_LOCAL_MEM_TYPE);
os << "\nCL_DEVICE_MAX_COMPUTE_UNITS: " << GetInfo<cl_uint>(platform, device, CL_DEVICE_MAX_COMPUTE_UNITS);
os << "\nCL_DEVICE_MAX_READ_IMAGE_ARGS: " << GetInfo<cl_uint>(platform, device, CL_DEVICE_MAX_READ_IMAGE_ARGS);
os << "\nCL_DEVICE_MAX_WRITE_IMAGE_ARGS: " << GetInfo<cl_uint>(platform, device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS);
os << "\nCL_DEVICE_MAX_MEM_ALLOC_SIZE: " << GetInfo<cl_ulong>(platform, device, CL_DEVICE_MAX_MEM_ALLOC_SIZE);
os << "\nCL_DEVICE_ADDRESS_BITS: " << GetInfo<cl_uint>(platform, device, CL_DEVICE_ADDRESS_BITS);
os << "\nCL_DEVICE_GLOBAL_MEM_CACHE_TYPE: " << GetInfo<cl_uint>(platform, device, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE);
os << "\nCL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE: " << GetInfo<cl_uint>(platform, device, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE);
os << "\nCL_DEVICE_GLOBAL_MEM_CACHE_SIZE: " << GetInfo<cl_ulong>(platform, device, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE);
os << "\nCL_DEVICE_GLOBAL_MEM_SIZE: " << GetInfo<cl_ulong>(platform, device, CL_DEVICE_GLOBAL_MEM_SIZE);
os << "\nCL_DEVICE_MAX_CONSTANT_BUFFER_SIZE: " << GetInfo<cl_ulong>(platform, device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE);
os << "\nCL_DEVICE_MAX_CONSTANT_ARGS: " << GetInfo<cl_uint>(platform, device, CL_DEVICE_MAX_CONSTANT_ARGS);
os << "\nCL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: " << GetInfo<cl_uint>(platform, device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS);
os << "\nCL_DEVICE_MAX_WORK_GROUP_SIZE: " << GetInfo<size_t>(platform, device, CL_DEVICE_MAX_WORK_GROUP_SIZE);
sizes = GetInfo<vector<size_t>>(platform, device, CL_DEVICE_MAX_WORK_ITEM_SIZES);
os << "\nCL_DEVICE_MAX_WORK_ITEM_SIZES: " << sizes[0] << ", " << sizes[1] << ", " << sizes[2] << "\n" << "\n";
if (device != m_Devices[platform].size() - 1 && platform != m_Platforms.size() - 1)
os << "\n";
}
os << "\n";
}
return os.str();
}
/// <summary>
/// Check an OpenCL return value for errors.
/// </summary>
/// <param name="err">The error code to inspect</param>
/// <param name="name">A description of where the value was gotten from</param>
/// <returns>True if success, else false.</returns>
bool OpenCLInfo::CheckCL(cl_int err, const char* name)
{
if (err != CL_SUCCESS)
{
ostringstream ss;
ss << "ERROR: " << ErrorToStringCL(err) << " in " << name << ".\n";
AddToReport(ss.str());
}
return err == CL_SUCCESS;
}
/// <summary>
/// Translate an OpenCL error code into a human readable string.
/// </summary>
/// <param name="err">The error code to translate</param>
/// <returns>A human readable description of the error passed in</returns>
string OpenCLInfo::ErrorToStringCL(cl_int err)
{
switch (err)
{
case CL_SUCCESS: return "Success";
case CL_DEVICE_NOT_FOUND: return "Device not found";
case CL_DEVICE_NOT_AVAILABLE: return "Device not available";
case CL_COMPILER_NOT_AVAILABLE: return "Compiler not available";
case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "Memory object allocation failure";
case CL_OUT_OF_RESOURCES: return "Out of resources";
case CL_OUT_OF_HOST_MEMORY: return "Out of host memory";
case CL_PROFILING_INFO_NOT_AVAILABLE: return "Profiling information not available";
case CL_MEM_COPY_OVERLAP: return "Memory copy overlap";
case CL_IMAGE_FORMAT_MISMATCH: return "Image format mismatch";
case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "Image format not supported";
case CL_BUILD_PROGRAM_FAILURE: return "Program build failure";
case CL_MAP_FAILURE: return "Map failure";
case CL_MISALIGNED_SUB_BUFFER_OFFSET: return "Misaligned sub buffer offset";
case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: return "Exec status error for events in wait list";
case CL_INVALID_VALUE: return "Invalid value";
case CL_INVALID_DEVICE_TYPE: return "Invalid device type";
case CL_INVALID_PLATFORM: return "Invalid platform";
case CL_INVALID_DEVICE: return "Invalid device";
case CL_INVALID_CONTEXT: return "Invalid context";
case CL_INVALID_QUEUE_PROPERTIES: return "Invalid queue properties";
case CL_INVALID_COMMAND_QUEUE: return "Invalid command queue";
case CL_INVALID_HOST_PTR: return "Invalid host pointer";
case CL_INVALID_MEM_OBJECT: return "Invalid memory object";
case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "Invalid image format descriptor";
case CL_INVALID_IMAGE_SIZE: return "Invalid image size";
case CL_INVALID_SAMPLER: return "Invalid sampler";
case CL_INVALID_BINARY: return "Invalid binary";
case CL_INVALID_BUILD_OPTIONS: return "Invalid build options";
case CL_INVALID_PROGRAM: return "Invalid program";
case CL_INVALID_PROGRAM_EXECUTABLE: return "Invalid program executable";
case CL_INVALID_KERNEL_NAME: return "Invalid kernel name";
case CL_INVALID_KERNEL_DEFINITION: return "Invalid kernel definition";
case CL_INVALID_KERNEL: return "Invalid kernel";
case CL_INVALID_ARG_INDEX: return "Invalid argument index";
case CL_INVALID_ARG_VALUE: return "Invalid argument value";
case CL_INVALID_ARG_SIZE: return "Invalid argument size";
case CL_INVALID_KERNEL_ARGS: return "Invalid kernel arguments";
case CL_INVALID_WORK_DIMENSION: return "Invalid work dimension";
case CL_INVALID_WORK_GROUP_SIZE: return "Invalid work group size";
case CL_INVALID_WORK_ITEM_SIZE: return "Invalid work item size";
case CL_INVALID_GLOBAL_OFFSET: return "Invalid global offset";
case CL_INVALID_EVENT_WAIT_LIST: return "Invalid event wait list";
case CL_INVALID_EVENT: return "Invalid event";
case CL_INVALID_OPERATION: return "Invalid operation";
case CL_INVALID_GL_OBJECT: return "Invalid OpenGL object";
case CL_INVALID_BUFFER_SIZE: return "Invalid buffer size";
case CL_INVALID_MIP_LEVEL: return "Invalid mip-map level";
case CL_INVALID_GLOBAL_WORK_SIZE: return "Invalid global work size";
case CL_INVALID_PROPERTY: return "Invalid property";
default:
{
ostringstream ss;
ss << "<Unknown error code> " << err;
return ss.str();
}
}
}
}
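For reference, a minimal usage sketch of this class (editor-added, not part of EmberCL; the PrintOpenCLSummary name and the iostream output are illustrative assumptions, and only the accessors declared above are used):

#include <iostream>
#include "OpenCLInfo.h"

void PrintOpenCLSummary()
{
    auto info = EmberCLns::OpenCLInfo::Instance();//Singleton access, returns a shared_ptr.

    if (!info->Ok())//True only if at least one platform/device pair created a test command queue.
    {
        std::cout << "No usable OpenCL devices found.\n";
        return;
    }

    const auto& platforms = info->PlatformNames();

    for (size_t p = 0; p < platforms.size(); p++)
        for (const auto& dev : info->DeviceNames(p))//Device names grouped by platform.
            std::cout << platforms[p] << " -> " << dev << "\n";

    std::cout << info->DumpInfo();//Full capability dump of every device.
}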

View File

@ -1,72 +1,72 @@
#pragma once
#include "EmberCLPch.h"
/// <summary>
/// OpenCLInfo class.
/// </summary>
namespace EmberCLns
{
/// <summary>
/// Keeps information about all valid OpenCL devices on this system.
/// Devices which do not successfully create a test command queue are not
/// added to the list.
/// The pattern is singleton, so there is only one instance per program,
/// retrievable by reference via the Instance() function.
/// This class derives from EmberReport, so the caller is able
/// to retrieve a text dump of error information if any errors occur.
/// </summary>
class EMBERCL_API OpenCLInfo : public EmberReport, public Singleton<OpenCLInfo>
{
public:
const vector<cl::Platform>& Platforms() const;
const string& PlatformName(size_t platform) const;
const vector<string>& PlatformNames() const;
const vector<vector<cl::Device>>& Devices() const;
const string& DeviceName(size_t platform, size_t device) const;
const vector<pair<size_t, size_t>>& DeviceIndices() const;
const vector<string>& AllDeviceNames() const;
const vector<string>& DeviceNames(size_t platform) const;
size_t TotalDeviceIndex(size_t platform, size_t device) const;
const cl::Device* DeviceFromId(cl_device_id id, size_t& platform, size_t& device) const;
string DumpInfo() const;
bool Ok() const;
bool CreateContext(const cl::Platform& platform, cl::Context& context, bool shared);
bool CheckCL(cl_int err, const char* name);
string ErrorToStringCL(cl_int err);
/// <summary>
/// Get device information for the specified field.
/// Template argument is expected to match the type of the queried field,
/// such as cl_ulong, cl_uint, size_t or string.
/// </summary>
/// <param name="platform">The index of the platform to use</param>
/// <param name="device">The index of the device to use</param>
/// <param name="name">The device field/feature to query</param>
/// <returns>The value of the field</returns>
template<typename T>
T GetInfo(size_t platform, size_t device, cl_device_info name) const
{
T val = T();
if (platform < m_Devices.size() && device < m_Devices[platform].size())
m_Devices[platform][device].getInfo(name, &val);
return val;
}
SINGLETON_DERIVED_IMPL(OpenCLInfo);
private:
OpenCLInfo();
bool m_Init;
vector<cl::Platform> m_Platforms;
vector<vector<cl::Device>> m_Devices;
vector<string> m_PlatformNames;
vector<vector<string>> m_DeviceNames;
vector<pair<size_t, size_t>> m_DeviceIndices;
vector<string> m_AllDeviceNames;
};
}
#pragma once
#include "EmberCLPch.h"
/// <summary>
/// OpenCLInfo class.
/// </summary>
namespace EmberCLns
{
/// <summary>
/// Keeps information about all valid OpenCL devices on this system.
/// Devices which do not successfully create a test command queue are not
/// added to the list.
/// The pattern is singleton, so there is only one instance per program,
/// retrievable by reference via the Instance() function.
/// This class derives from EmberReport, so the caller is able
/// to retrieve a text dump of error information if any errors occur.
/// </summary>
class EMBERCL_API OpenCLInfo : public EmberReport, public Singleton<OpenCLInfo>
{
public:
const vector<cl::Platform>& Platforms() const;
const string& PlatformName(size_t platform) const;
const vector<string>& PlatformNames() const;
const vector<vector<cl::Device>>& Devices() const;
const string& DeviceName(size_t platform, size_t device) const;
const vector<pair<size_t, size_t>>& DeviceIndices() const;
const vector<string>& AllDeviceNames() const;
const vector<string>& DeviceNames(size_t platform) const;
size_t TotalDeviceIndex(size_t platform, size_t device) const;
const cl::Device* DeviceFromId(cl_device_id id, size_t& platform, size_t& device) const;
string DumpInfo() const;
bool Ok() const;
bool CreateContext(const cl::Platform& platform, cl::Context& context, bool shared);
bool CheckCL(cl_int err, const char* name);
string ErrorToStringCL(cl_int err);
/// <summary>
/// Get device information for the specified field.
/// Template argument is expected to match the type of the queried field,
/// such as cl_ulong, cl_uint, size_t or string.
/// </summary>
/// <param name="platform">The index of the platform to use</param>
/// <param name="device">The index of the device to use</param>
/// <param name="name">The device field/feature to query</param>
/// <returns>The value of the field</returns>
template<typename T>
T GetInfo(size_t platform, size_t device, cl_device_info name) const
{
T val = T();
if (platform < m_Devices.size() && device < m_Devices[platform].size())
m_Devices[platform][device].getInfo(name, &val);
return val;
}
SINGLETON_DERIVED_IMPL(OpenCLInfo);
private:
OpenCLInfo();
bool m_Init;
vector<cl::Platform> m_Platforms;
vector<vector<cl::Device>> m_Devices;
vector<string> m_PlatformNames;
vector<vector<string>> m_DeviceNames;
vector<pair<size_t, size_t>> m_DeviceIndices;
vector<string> m_AllDeviceNames;
};
}
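A hedged sketch of the templated GetInfo member above (editor-added; the QueryDeviceCaps name and the chosen indices are illustrative assumptions, and out-of-range indices simply yield a default-constructed value):

#include <iostream>
#include "OpenCLInfo.h"

void QueryDeviceCaps()
{
    auto info = EmberCLns::OpenCLInfo::Instance();
    //Global memory size of device 0 on platform 0; returns 0 if either index is out of range.
    auto globalMem = info->GetInfo<cl_ulong>(0, 0, CL_DEVICE_GLOBAL_MEM_SIZE);
    //Max work group size is queried as size_t, matching the usage in DumpInfo().
    auto maxWorkGroup = info->GetInfo<size_t>(0, 0, CL_DEVICE_MAX_WORK_GROUP_SIZE);
    //Map the (platform, device) pair back to its flat index into AllDeviceNames().
    auto flatIndex = info->TotalDeviceIndex(0, 0);
    std::cout << "Global mem: " << globalMem << ", max work group: " << maxWorkGroup
              << ", flat device index: " << flatIndex << "\n";
}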

File diff suppressed because it is too large Load Diff

View File

@ -1,210 +1,210 @@
#pragma once
#include "EmberCLPch.h"
#include "OpenCLInfo.h"
/// <summary>
/// OpenCLWrapper, Spk, NamedBuffer, NamedImage2D, NamedImage2DGL classes.
/// </summary>
namespace EmberCLns
{
/// <summary>
/// Class to contain all of the things needed to store an OpenCL program.
/// The name of it, the source, the compiled program object and the kernel.
/// </summary>
class EMBERCL_API Spk
{
public:
string m_Name;
cl::Program::Sources m_Source;
cl::Program m_Program;
cl::Kernel m_Kernel;
};
/// <summary>
/// Class to hold an OpenCL buffer with a name to identify it by.
/// </summary>
class EMBERCL_API NamedBuffer
{
public:
NamedBuffer()
{
}
NamedBuffer(const cl::Buffer& buff, const string& name)
{
m_Buffer = buff;
m_Name = name;
}
cl::Buffer m_Buffer;
string m_Name;
};
/// <summary>
/// Class to hold a 2D image with a name to identify it by.
/// </summary>
class EMBERCL_API NamedImage2D
{
public:
NamedImage2D()
{
}
NamedImage2D(const cl::Image2D& image, const string& name)
{
m_Image = image;
m_Name = name;
}
cl::Image2D m_Image;
string m_Name;
};
/// <summary>
/// Class to hold a 2D image that is mapped to an OpenGL texture
/// and a name to identify it by.
/// </summary>
class EMBERCL_API NamedImage2DGL
{
public:
NamedImage2DGL()
{
}
NamedImage2DGL(const cl::ImageGL& image, const string& name)
{
m_Image = image;
m_Name = name;
}
cl::ImageGL m_Image;
string m_Name;
};
/// <summary>
/// Running kernels in OpenCL can require quite a bit of setup, tear down and
/// general housekeeping. This class helps shield the user from such hassles.
/// Its main utility is in holding collections of programs, buffers and images
/// all identified by names. That way, a user can access them as needed without
/// having to pollute their code.
/// In addition, writing to an existing object by name determines if the object
/// can be overwritten, or if it needs to be deleted and replaced by the new one.
/// This class derives from EmberReport, so the caller is able
/// to retrieve a text dump of error information if any errors occur.
/// </summary>
class EMBERCL_API OpenCLWrapper : public EmberReport
{
public:
OpenCLWrapper();
bool Init(size_t platformIndex, size_t deviceIndex, bool shared = false);
//Programs.
bool AddProgram(const string& name, const string& program, const string& entryPoint, bool doublePrecision);
void ClearPrograms();
//Buffers.
bool AddBuffer(const string& name, size_t size, cl_mem_flags flags = CL_MEM_READ_WRITE);
bool AddHostBuffer(const string& name, size_t size, void* data);
bool AddAndWriteBuffer(const string& name, void* data, size_t size, cl_mem_flags flags = CL_MEM_READ_WRITE);
bool WriteBuffer(const string& name, void* data, size_t size);
bool WriteBuffer(size_t bufferIndex, void* data, size_t size);
bool ReadBuffer(const string& name, void* data, size_t size);
bool ReadBuffer(size_t bufferIndex, void* data, size_t size);
int FindBufferIndex(const string& name);
size_t GetBufferSize(const string& name);
size_t GetBufferSize(size_t bufferIndex);
void ClearBuffers();
//Images.
bool AddAndWriteImage(const string& name, cl_mem_flags flags, const cl::ImageFormat& format, ::size_t width, ::size_t height, ::size_t row_pitch, void* data = nullptr, bool shared = false, GLuint texName = 0);
bool WriteImage2D(size_t index, bool shared, size_t width, size_t height, size_t row_pitch, void* data);
bool ReadImage(const string& name, ::size_t width, ::size_t height, ::size_t row_pitch, bool shared, void* data);
bool ReadImage(size_t imageIndex, ::size_t width, ::size_t height, ::size_t row_pitch, bool shared, void* data);
int FindImageIndex(const string& name, bool shared);
size_t GetImageSize(const string& name, bool shared);
size_t GetImageSize(size_t imageIndex, bool shared);
bool CompareImageParams(cl::Image& image, cl_mem_flags flags, const cl::ImageFormat& format, ::size_t width, ::size_t height, ::size_t row_pitch);
void ClearImages(bool shared);
bool CreateImage2D(cl::Image2D& image2D, cl_mem_flags flags, cl::ImageFormat format, ::size_t width, ::size_t height, ::size_t row_pitch = 0, void* data = nullptr);
bool CreateImage2DGL(cl::ImageGL& image2DGL, cl_mem_flags flags, GLenum target, GLint miplevel, GLuint texobj);
bool EnqueueAcquireGLObjects(const string& name);
bool EnqueueAcquireGLObjects(cl::ImageGL& image);
bool EnqueueReleaseGLObjects(const string& name);
bool EnqueueReleaseGLObjects(cl::ImageGL& image);
bool EnqueueAcquireGLObjects(const VECTOR_CLASS<cl::Memory>* memObjects = nullptr);
bool EnqueueReleaseGLObjects(const VECTOR_CLASS<cl::Memory>* memObjects = nullptr);
bool CreateSampler(cl::Sampler& sampler, cl_bool normalizedCoords, cl_addressing_mode addressingMode, cl_filter_mode filterMode);
//Arguments.
bool SetBufferArg(size_t kernelIndex, cl_uint argIndex, const string& name);
bool SetBufferArg(size_t kernelIndex, cl_uint argIndex, size_t bufferIndex);
bool SetImageArg(size_t kernelIndex, cl_uint argIndex, bool shared, const string& name);
bool SetImageArg(size_t kernelIndex, cl_uint argIndex, bool shared, size_t imageIndex);
/// <summary>
/// Set an argument in the specified kernel, at the specified argument index.
/// Must keep this here in the .h because it's templated.
/// </summary>
/// <param name="kernelIndex">Index of the kernel whose argument will be set</param>
/// <param name="argIndex">Index of the argument to set</param>
/// <param name="arg">The argument value to set</param>
/// <returns>True if success, else false</returns>
template <typename T>
bool SetArg(size_t kernelIndex, cl_uint argIndex, T arg)
{
if (m_Init && kernelIndex < m_Programs.size())
{
cl_int err = m_Programs[kernelIndex].m_Kernel.setArg(argIndex, arg);
return m_Info->CheckCL(err, "cl::Kernel::setArg()");
}
return false;
}
//Kernels.
int FindKernelIndex(const string& name);
bool RunKernel(size_t kernelIndex, size_t totalGridWidth, size_t totalGridHeight, size_t totalGridDepth, size_t blockWidth, size_t blockHeight, size_t blockDepth);
//Accessors.
bool Ok() const;
bool Shared() const;
const cl::Context& Context() const;
size_t PlatformIndex() const;
size_t DeviceIndex() const;
size_t TotalDeviceIndex() const;
const string& DeviceName() const;
size_t LocalMemSize() const;
size_t GlobalMemSize() const;
size_t MaxAllocSize() const;
//Public virtual functions overridden from base classes.
virtual void ClearErrorReport() override;
virtual string ErrorReportString() override;
virtual vector<string> ErrorReport() override;
static void MakeEvenGridDims(size_t blockW, size_t blockH, size_t& gridW, size_t& gridH);
private:
bool CreateSPK(const string& name, const string& program, const string& entryPoint, Spk& spk, bool doublePrecision);
bool m_Init = false;
bool m_Shared = false;
size_t m_PlatformIndex = 0;
size_t m_DeviceIndex = 0;
size_t m_LocalMemSize = 0;
size_t m_GlobalMemSize;
size_t m_MaxAllocSize;
cl::Platform m_Platform;
cl::Context m_Context;
cl::Device m_Device;
cl::CommandQueue m_Queue;
shared_ptr<OpenCLInfo> m_Info = OpenCLInfo::Instance();
std::vector<cl::Device> m_DeviceVec;
std::vector<Spk> m_Programs;
std::vector<NamedBuffer> m_Buffers;
std::vector<NamedImage2D> m_Images;
std::vector<NamedImage2DGL> m_GLImages;
};
}
#pragma once
#include "EmberCLPch.h"
#include "OpenCLInfo.h"
/// <summary>
/// OpenCLWrapper, Spk, NamedBuffer, NamedImage2D, NamedImage2DGL classes.
/// </summary>
namespace EmberCLns
{
/// <summary>
/// Class to contain all of the things needed to store an OpenCL program.
/// The name of it, the source, the compiled program object and the kernel.
/// </summary>
class EMBERCL_API Spk
{
public:
string m_Name;
cl::Program::Sources m_Source;
cl::Program m_Program;
cl::Kernel m_Kernel;
};
/// <summary>
/// Class to hold an OpenCL buffer with a name to identify it by.
/// </summary>
class EMBERCL_API NamedBuffer
{
public:
NamedBuffer() noexcept
{
}
NamedBuffer(const cl::Buffer& buff, const string& name)
{
m_Buffer = buff;
m_Name = name;
}
cl::Buffer m_Buffer;
string m_Name;
};
/// <summary>
/// Class to hold a 2D image with a name to identify it by.
/// </summary>
class EMBERCL_API NamedImage2D
{
public:
NamedImage2D() noexcept
{
}
NamedImage2D(const cl::Image2D& image, const string& name)
{
m_Image = image;
m_Name = name;
}
cl::Image2D m_Image;
string m_Name;
};
/// <summary>
/// Class to hold a 2D image that is mapped to an OpenGL texture
/// and a name to identify it by.
/// </summary>
class EMBERCL_API NamedImage2DGL
{
public:
NamedImage2DGL() noexcept
{
}
NamedImage2DGL(const cl::ImageGL& image, const string& name)
{
m_Image = image;
m_Name = name;
}
cl::ImageGL m_Image;
string m_Name;
};
/// <summary>
/// Running kernels in OpenCL can require quite a bit of setup, tear down and
/// general housekeeping. This class helps shield the user from such hassles.
/// Its main utility is in holding collections of programs, buffers and images
/// all identified by names. That way, a user can access them as needed without
/// having to pollute their code.
/// In addition, writing to an existing object by name determines if the object
/// can be overwritten, or if it needs to be deleted and replaced by the new one.
/// This class derives from EmberReport, so the caller is able
/// to retrieve a text dump of error information if any errors occur.
/// </summary>
class EMBERCL_API OpenCLWrapper : public EmberReport
{
public:
OpenCLWrapper();
bool Init(size_t platformIndex, size_t deviceIndex, bool shared = false);
//Programs.
bool AddProgram(const string& name, const string& program, const string& entryPoint, bool doublePrecision);
void ClearPrograms();
//Buffers.
bool AddBuffer(const string& name, size_t size, cl_mem_flags flags = CL_MEM_READ_WRITE);
bool AddHostBuffer(const string& name, size_t size, void* data);
bool AddAndWriteBuffer(const string& name, void* data, size_t size, cl_mem_flags flags = CL_MEM_READ_WRITE);
bool WriteBuffer(const string& name, void* data, size_t size);
bool WriteBuffer(size_t bufferIndex, void* data, size_t size);
bool ReadBuffer(const string& name, void* data, size_t size);
bool ReadBuffer(size_t bufferIndex, void* data, size_t size);
int FindBufferIndex(const string& name);
size_t GetBufferSize(const string& name);
size_t GetBufferSize(size_t bufferIndex);
void ClearBuffers();
//Images.
bool AddAndWriteImage(const string& name, cl_mem_flags flags, const cl::ImageFormat& format, ::size_t width, ::size_t height, ::size_t row_pitch, void* data = nullptr, bool shared = false, GLuint texName = 0);
bool WriteImage2D(size_t index, bool shared, size_t width, size_t height, size_t row_pitch, void* data);
bool ReadImage(const string& name, ::size_t width, ::size_t height, ::size_t row_pitch, bool shared, void* data);
bool ReadImage(size_t imageIndex, ::size_t width, ::size_t height, ::size_t row_pitch, bool shared, void* data);
int FindImageIndex(const string& name, bool shared);
size_t GetImageSize(const string& name, bool shared);
size_t GetImageSize(size_t imageIndex, bool shared);
bool CompareImageParams(cl::Image& image, cl_mem_flags flags, const cl::ImageFormat& format, ::size_t width, ::size_t height, ::size_t row_pitch);
void ClearImages(bool shared);
bool CreateImage2D(cl::Image2D& image2D, cl_mem_flags flags, cl::ImageFormat format, ::size_t width, ::size_t height, ::size_t row_pitch = 0, void* data = nullptr);
bool CreateImage2DGL(cl::ImageGL& image2DGL, cl_mem_flags flags, GLenum target, GLint miplevel, GLuint texobj);
bool EnqueueAcquireGLObjects(const string& name);
bool EnqueueAcquireGLObjects(cl::ImageGL& image);
bool EnqueueReleaseGLObjects(const string& name);
bool EnqueueReleaseGLObjects(cl::ImageGL& image);
bool EnqueueAcquireGLObjects(const cl::vector<cl::Memory>* memObjects = nullptr);
bool EnqueueReleaseGLObjects(const cl::vector<cl::Memory>* memObjects = nullptr);
bool CreateSampler(cl::Sampler& sampler, cl_bool normalizedCoords, cl_addressing_mode addressingMode, cl_filter_mode filterMode);
//Arguments.
bool SetBufferArg(size_t kernelIndex, cl_uint argIndex, const string& name);
bool SetBufferArg(size_t kernelIndex, cl_uint argIndex, size_t bufferIndex);
bool SetImageArg(size_t kernelIndex, cl_uint argIndex, bool shared, const string& name);
bool SetImageArg(size_t kernelIndex, cl_uint argIndex, bool shared, size_t imageIndex);
/// <summary>
/// Set an argument in the specified kernel, at the specified argument index.
/// Must keep this here in the .h because it's templated.
/// </summary>
/// <param name="kernelIndex">Index of the kernel whose argument will be set</param>
/// <param name="argIndex">Index of the argument to set</param>
/// <param name="arg">The argument value to set</param>
/// <returns>True if success, else false</returns>
template <typename T>
bool SetArg(size_t kernelIndex, cl_uint argIndex, T arg)
{
if (m_Init && kernelIndex < m_Programs.size())
{
cl_int err = m_Programs[kernelIndex].m_Kernel.setArg(argIndex, arg);
return m_Info->CheckCL(err, "cl::Kernel::setArg()");
}
return false;
}
//Kernels.
int FindKernelIndex(const string& name);
bool RunKernel(size_t kernelIndex, size_t totalGridWidth, size_t totalGridHeight, size_t totalGridDepth, size_t blockWidth, size_t blockHeight, size_t blockDepth);
//Accessors.
bool Ok() const;
bool Shared() const;
const cl::Context& Context() const;
size_t PlatformIndex() const;
size_t DeviceIndex() const;
size_t TotalDeviceIndex() const;
const string& DeviceName() const;
size_t LocalMemSize() const;
size_t GlobalMemSize() const;
size_t MaxAllocSize() const;
//Public virtual functions overridden from base classes.
void ClearErrorReport() noexcept override;
string ErrorReportString() override;
vector<string> ErrorReport() override;
static void MakeEvenGridDims(size_t blockW, size_t blockH, size_t& gridW, size_t& gridH);
private:
bool CreateSPK(const string& name, const string& program, const string& entryPoint, Spk& spk, bool doublePrecision);
bool m_Init = false;
bool m_Shared = false;
size_t m_PlatformIndex = 0;
size_t m_DeviceIndex = 0;
size_t m_LocalMemSize = 0;
size_t m_GlobalMemSize = 0;
size_t m_MaxAllocSize = 0;
cl::Platform m_Platform;
cl::Context m_Context;
cl::Device m_Device;
cl::CommandQueue m_Queue;
shared_ptr<OpenCLInfo> m_Info = OpenCLInfo::Instance();
std::vector<cl::Device> m_DeviceVec;
std::vector<Spk> m_Programs;
std::vector<NamedBuffer> m_Buffers;
std::vector<NamedImage2D> m_Images;
std::vector<NamedImage2DGL> m_GLImages;
};
}
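A hedged end-to-end sketch of the name-based workflow described above (editor-added; the kernel source, the names "Double" and "Vals", and the block size of 64 are illustrative assumptions, and error handling is collapsed into early returns):

#include <string>
#include <vector>
#include "OpenCLWrapper.h"

bool DoubleOnGpu(std::vector<float>& vals)
{
    EmberCLns::OpenCLWrapper wrapper;

    if (!wrapper.Init(0, 0))//First platform, first device, not shared with OpenGL.
        return false;

    //Tiny kernel used only for this sketch; it doubles each element it owns.
    static const std::string src =
        "__kernel void Double(__global float* v, int n)\n"
        "{\n"
        "    int i = get_global_id(0);\n"
        "    if (i < n)\n"
        "        v[i] *= 2.0f;\n"
        "}\n";

    if (!wrapper.AddProgram("Double", src, "Double", false))//Programs, buffers and kernels are all looked up by name.
        return false;

    const size_t bytes = vals.size() * sizeof(float);

    if (!wrapper.AddAndWriteBuffer("Vals", vals.data(), bytes))//Creates the buffer and copies the host data in one call.
        return false;

    const auto kernelIndex = wrapper.FindKernelIndex("Double");

    if (kernelIndex < 0 ||
        !wrapper.SetBufferArg(size_t(kernelIndex), 0, "Vals") ||
        !wrapper.SetArg<int>(size_t(kernelIndex), 1, int(vals.size())))
        return false;

    //Adjust the grid to fit the block size (assumed behavior of MakeEvenGridDims); the kernel guards against any overshoot.
    size_t gridW = vals.size(), gridH = 1, blockW = 64, blockH = 1;
    EmberCLns::OpenCLWrapper::MakeEvenGridDims(blockW, blockH, gridW, gridH);

    if (!wrapper.RunKernel(size_t(kernelIndex), gridW, gridH, 1, blockW, blockH, 1))
        return false;

    return wrapper.ReadBuffer("Vals", vals.data(), bytes);//Copy the result back to the host.
}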

View File

@ -33,7 +33,7 @@ RendererCL<T, bucketT>::RendererCL(const vector<pair<size_t, size_t>>& devices,
m_FinalFormat.image_channel_order = CL_RGBA;
m_FinalFormat.image_channel_data_type = CL_FLOAT;
m_CompileBegun = [&]() { };
m_IterCountPerKernel = size_t(m_SubBatchPercentPerThread * m_Ember.m_SubBatchSize);
m_IterCountPerKernel = size_t(double(m_SubBatchPercentPerThread) * m_Ember.m_SubBatchSize);
Init(devices, shared, outputTexID);
}
@ -183,24 +183,29 @@ bool RendererCL<T, bucketT>::SetOutputTexture(GLuint outputTexID)
/// </summary>
//Iters per kernel/block/grid.
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterCountPerKernel() const { return m_IterCountPerKernel; }
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterCountPerBlock() const { return IterCountPerKernel() * IterBlockKernelCount(); }
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterCountPerGrid() const { return IterCountPerKernel() * IterGridKernelCount(); }
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterCountPerKernel() const noexcept { return m_IterCountPerKernel; }
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterCountPerBlock() const noexcept { return IterCountPerKernel() * IterBlockKernelCount(); }
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterCountPerGrid() const noexcept { return IterCountPerKernel() * IterGridKernelCount(); }
//Kernels per block.
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterBlockKernelWidth() const { return m_IterBlockWidth; }
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterBlockKernelHeight() const { return m_IterBlockHeight; }
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterBlockKernelCount() const { return IterBlockKernelWidth() * IterBlockKernelHeight(); }
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterBlockKernelWidth() const noexcept { return m_IterBlockWidth; }
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterBlockKernelHeight() const noexcept { return m_IterBlockHeight; }
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterBlockKernelCount() const noexcept { return IterBlockKernelWidth() * IterBlockKernelHeight(); }
//Kernels per grid.
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterGridKernelWidth() const { return IterGridBlockWidth() * IterBlockKernelWidth(); }
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterGridKernelHeight() const { return IterGridBlockHeight() * IterBlockKernelHeight(); }
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterGridKernelCount() const { return IterGridKernelWidth() * IterGridKernelHeight(); }
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterGridKernelWidth() const noexcept { return IterGridBlockWidth() * IterBlockKernelWidth(); }
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterGridKernelHeight() const noexcept { return IterGridBlockHeight() * IterBlockKernelHeight(); }
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterGridKernelCount() const noexcept { return IterGridKernelWidth() * IterGridKernelHeight(); }
//Blocks per grid.
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterGridBlockWidth() const { return m_IterBlocksWide; }
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterGridBlockHeight() const { return m_IterBlocksHigh; }
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterGridBlockCount() const { return IterGridBlockWidth() * IterGridBlockHeight(); }
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterGridBlockWidth() const noexcept { return m_IterBlocksWide; }
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterGridBlockHeight() const noexcept { return m_IterBlocksHigh; }
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterGridBlockCount() const noexcept { return IterGridBlockWidth() * IterGridBlockHeight(); }
//Allow for setting the number of blocks in each grid dimension.
//These should only be called before a run starts.
template <typename T, typename bucketT> void RendererCL<T, bucketT>::IterBlocksWide(size_t w) noexcept { m_IterBlocksWide = w; }
template <typename T, typename bucketT> void RendererCL<T, bucketT>::IterBlocksHigh(size_t h) noexcept { m_IterBlocksHigh = h; }
/// <summary>
/// Read the histogram of the specified device into the host side CPU buffer.
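To make the accessor arithmetic in the hunk above concrete, here is a small editor-added sketch using the default dimensions declared in RendererCL.h (256 iters per kernel, 32x8 kernels per block, 64x2 blocks per grid); the totals change if IterBlocksWide()/IterBlocksHigh() are used to alter the grid:

//Dimension arithmetic with the default values from RendererCL.h.
constexpr size_t iterCountPerKernel = 256;    //m_IterCountPerKernel.
constexpr size_t blockKernels       = 32 * 8; //IterBlockKernelWidth() * IterBlockKernelHeight() = 256.
constexpr size_t gridBlocks         = 64 * 2; //IterGridBlockWidth() * IterGridBlockHeight() = 128.
constexpr size_t itersPerBlock      = iterCountPerKernel * blockKernels;              //65,536.
constexpr size_t itersPerGrid       = iterCountPerKernel * blockKernels * gridBlocks; //8,388,608 iters per launch.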
@ -590,7 +595,7 @@ bool RendererCL<T, bucketT>::Shared() const { return m_Shared; }
/// Clear the error report for this class as well as the OpenCLWrapper members of each device.
/// </summary>
template <typename T, typename bucketT>
void RendererCL<T, bucketT>::ClearErrorReport()
void RendererCL<T, bucketT>::ClearErrorReport() noexcept
{
EmberReport::ClearErrorReport();
@ -669,7 +674,7 @@ bool RendererCL<T, bucketT>::RandVec(vector<QTIsaac<ISAAC_SIZE, ISAAC_INT>>& ran
/// </summary>
/// <returns>True if any devices are from Nvidia, else false.</returns>
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::AnyNvidia() const
bool RendererCL<T, bucketT>::AnyNvidia() const noexcept
{
for (auto& dev : m_Devices)
if (dev->Nvidia())
@ -701,7 +706,7 @@ bool RendererCL<T, bucketT>::Alloc(bool histOnly)
static std::string loc = __FUNCTION__;
auto& wrapper = m_Devices[0]->m_Wrapper;
InitStateVec();
m_IterCountPerKernel = size_t(m_SubBatchPercentPerThread * m_Ember.m_SubBatchSize);//This isn't the greatest place to put this, but it must be computed before the number of iters to do is computed in the base.
m_IterCountPerKernel = size_t(double(m_SubBatchPercentPerThread) * m_Ember.m_SubBatchSize);//This isn't the greatest place to put this, but it must be computed before the number of iters to do is computed in the base.
if (b && !(b = wrapper.AddBuffer(m_DEFilterParamsBufferName, sizeof(m_DensityFilterCL)))) { ErrorStr(loc, "Failed to set DE filter parameters buffer", m_Devices[0].get()); }
@ -1305,11 +1310,11 @@ eRenderStatus RendererCL<T, bucketT>::RunDensityFilter()
//that are far enough apart such that their filters do not overlap.
//Do the latter.
//Gap is in terms of blocks and specifies how many blocks must separate two blocks running at the same time.
const auto gapW = static_cast<uint>(ceil(fw2 / blockSizeW));
const auto gapW = static_cast<size_t>(ceil(fw2 / blockSizeW));
const auto chunkSizeW = gapW + 1;//Chunk size is also in terms of blocks and is one block (the one running) plus the gap to the right of it.
const auto gapH = static_cast<uint>(ceil(fw2 / blockSizeH));
const auto gapH = static_cast<size_t>(ceil(fw2 / blockSizeH));
const auto chunkSizeH = gapH + 1;//Chunk size is also in terms of blocks and is one block (the one running) plus the gap below it.
double totalChunks = chunkSizeW * chunkSizeH;
double totalChunks = double(chunkSizeW * chunkSizeH);
if (b && !(b = wrapper.AddAndWriteBuffer(m_DEFilterParamsBufferName, reinterpret_cast<void*>(&m_DensityFilterCL), sizeof(m_DensityFilterCL)))) { ErrorStr(loc, "Writing DE filter parameters buffer failed", m_Devices[0].get()); }
@ -1350,12 +1355,12 @@ eRenderStatus RendererCL<T, bucketT>::RunDensityFilter()
gridH /= chunkSizeH;
OpenCLWrapper::MakeEvenGridDims(blockSizeW, blockSizeH, gridW, gridH);
for (uint rowChunkPass = 0; b && !m_Abort && rowChunkPass < chunkSizeH; rowChunkPass++)//Number of vertical passes.
for (size_t rowChunkPass = 0; b && !m_Abort && rowChunkPass < chunkSizeH; rowChunkPass++)//Number of vertical passes.
{
for (uint colChunkPass = 0; b && !m_Abort && colChunkPass < chunkSizeW; colChunkPass++)//Number of horizontal passes.
for (size_t colChunkPass = 0; b && !m_Abort && colChunkPass < chunkSizeW; colChunkPass++)//Number of horizontal passes.
{
//t2.Tic();
if (b && !(b = RunDensityFilterPrivate(kernelIndex, gridW, gridH, blockSizeW, blockSizeH, chunkSizeW, chunkSizeH, colChunkPass, rowChunkPass)))
if (b && !(b = RunDensityFilterPrivate(kernelIndex, gridW, gridH, blockSizeW, blockSizeH, uint(chunkSizeW), uint(chunkSizeH), uint(colChunkPass), uint(rowChunkPass))))
{
ErrorStr(loc, "Running DE filter program for row chunk "s + std::to_string(rowChunkPass) + ", col chunk "s + std::to_string(colChunkPass) + " failed", m_Devices[0].get());
}
@ -1510,7 +1515,7 @@ bool RendererCL<T, bucketT>::ClearBuffer(size_t device, const string& bufferName
{
size_t blockW = m_Devices[device]->Nvidia() ? 32 : 16;//Max work group size is 256 on AMD, which means 16x16.
size_t blockH = m_Devices[device]->Nvidia() ? 32 : 16;
size_t gridW = width * elementSize;
size_t gridW = size_t(width) * elementSize;
size_t gridH = height;
b = true;
OpenCLWrapper::MakeEvenGridDims(blockW, blockH, gridW, gridH);
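The gap/chunk computation in the RunDensityFilter() hunks above determines how many passes the DE filter needs; the following editor-added sketch walks through the arithmetic with made-up numbers (the filter half-width of 20 pixels and the block width of 16 are assumptions, not values taken from the code):

#include <cmath>
#include <cstddef>

void ChunkArithmeticExample()
{
    const double fw2 = 20.0;        //Assumed filter half-width in pixels.
    const double blockSizeW = 16.0; //Assumed block width for this sketch.
    //Blocks that run at the same time must be separated by enough blocks that their filters cannot overlap.
    const auto gapW = static_cast<size_t>(std::ceil(fw2 / blockSizeW));//ceil(20 / 16) = 2 blocks of separation.
    const auto chunkSizeW = gapW + 1;                                  //The running block plus its gap = 3.
    //The same applies vertically, so the grid is covered in chunkSizeW * chunkSizeH passes
    //(3 * 3 = 9 here), with one block out of every chunk running per pass.
    const auto passes = chunkSizeW * chunkSizeW;
    (void)passes;
}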

View File

@ -1,270 +1,274 @@
#pragma once
#include "EmberCLPch.h"
#include "OpenCLWrapper.h"
#include "DEOpenCLKernelCreator.h"
#include "FinalAccumOpenCLKernelCreator.h"
#include "RendererClDevice.h"
/// <summary>
/// RendererCLBase and RendererCL classes.
/// </summary>
namespace EmberCLns
{
/// <summary>
/// Serves only as an interface for OpenCL specific rendering functions.
/// </summary>
class EMBERCL_API RendererCLBase
{
public:
virtual ~RendererCLBase() { }
virtual bool ReadFinal(v4F* pixels) { return false; }
virtual bool ClearFinal() { return false; }
virtual bool AnyNvidia() const { return false; }
bool OptAffine() const { return m_OptAffine; }
void OptAffine(bool optAffine) { m_OptAffine = optAffine; }
std::function<void(void)> m_CompileBegun;
protected:
bool m_OptAffine = false;
};
/// <summary>
/// RendererCL is a derivation of the basic CPU renderer which
/// overrides various functions to render on the GPU using OpenCL.
/// This supports multi-GPU rendering, which is done in the following manner:
/// -When rendering a single image, the iterations will be split between devices in sub batches.
/// -When animating, a renderer for each device will be created by the calling code,
/// and the frames will each be rendered by a single device as available.
/// The synchronization across devices is done through a single atomic counter.
/// Since this class derives from EmberReport and also contains an
/// OpenCLWrapper member which also derives from EmberReport, the
/// reporting functions are overridden to aggregate the errors from
/// both sources.
/// Template argument T expected to be float or double.
/// Template argument bucketT must always be float.
/// </summary>
template <typename T, typename bucketT>
class EMBERCL_API RendererCL : public Renderer<T, bucketT>, public RendererCLBase
{
using EmberNs::Renderer<T, bucketT>::RendererBase::Abort;
using EmberNs::Renderer<T, bucketT>::RendererBase::EarlyClip;
using EmberNs::Renderer<T, bucketT>::RendererBase::EnterResize;
using EmberNs::Renderer<T, bucketT>::RendererBase::LeaveResize;
using EmberNs::Renderer<T, bucketT>::RendererBase::FinalRasW;
using EmberNs::Renderer<T, bucketT>::RendererBase::FinalRasH;
using EmberNs::Renderer<T, bucketT>::RendererBase::SuperRasW;
using EmberNs::Renderer<T, bucketT>::RendererBase::SuperRasH;
using EmberNs::Renderer<T, bucketT>::RendererBase::SuperSize;
using EmberNs::Renderer<T, bucketT>::RendererBase::BytesPerChannel;
using EmberNs::Renderer<T, bucketT>::RendererBase::TemporalSamples;
using EmberNs::Renderer<T, bucketT>::RendererBase::ItersPerTemporalSample;
using EmberNs::Renderer<T, bucketT>::RendererBase::FuseCount;
using EmberNs::Renderer<T, bucketT>::RendererBase::DensityFilterOffset;
using EmberNs::Renderer<T, bucketT>::RendererBase::PrepFinalAccumVector;
using EmberNs::Renderer<T, bucketT>::RendererBase::Paused;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_ProgressParameter;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_YAxisUp;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_LockAccum;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_Abort;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_LastIter;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_LastIterPercent;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_Stats;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_Callback;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_Rand;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_RenderTimer;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_IterTimer;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_ProgressTimer;
using EmberNs::Renderer<T, bucketT>::RendererBase::EmberReport::AddToReport;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_ResizeCs;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_ProcessAction;
using EmberNs::Renderer<T, bucketT>::m_RotMat;
using EmberNs::Renderer<T, bucketT>::m_Ember;
using EmberNs::Renderer<T, bucketT>::m_Csa;
using EmberNs::Renderer<T, bucketT>::m_CurvesSet;
using EmberNs::Renderer<T, bucketT>::CenterX;
using EmberNs::Renderer<T, bucketT>::CenterY;
using EmberNs::Renderer<T, bucketT>::K1;
using EmberNs::Renderer<T, bucketT>::K2;
using EmberNs::Renderer<T, bucketT>::Supersample;
using EmberNs::Renderer<T, bucketT>::HighlightPower;
using EmberNs::Renderer<T, bucketT>::HistBuckets;
using EmberNs::Renderer<T, bucketT>::AccumulatorBuckets;
using EmberNs::Renderer<T, bucketT>::GetDensityFilter;
using EmberNs::Renderer<T, bucketT>::GetSpatialFilter;
using EmberNs::Renderer<T, bucketT>::CoordMap;
using EmberNs::Renderer<T, bucketT>::XformDistributions;
using EmberNs::Renderer<T, bucketT>::XformDistributionsSize;
using EmberNs::Renderer<T, bucketT>::m_Dmap;
using EmberNs::Renderer<T, bucketT>::m_DensityFilter;
using EmberNs::Renderer<T, bucketT>::m_SpatialFilter;
public:
RendererCL(const vector<pair<size_t, size_t>>& devices, bool shared = false, GLuint outputTexID = 0);
RendererCL(const RendererCL<T, bucketT>& renderer) = delete;
RendererCL<T, bucketT>& operator = (const RendererCL<T, bucketT>& renderer) = delete;
virtual ~RendererCL() = default;
//Non-virtual member functions for OpenCL specific tasks.
bool Init(const vector<pair<size_t, size_t>>& devices, bool shared, GLuint outputTexID);
bool SetOutputTexture(GLuint outputTexID);
//Iters per kernel/block/grid.
inline size_t IterCountPerKernel() const;
inline size_t IterCountPerBlock() const;
inline size_t IterCountPerGrid() const;
//Kernels per block.
inline size_t IterBlockKernelWidth() const;
inline size_t IterBlockKernelHeight() const;
inline size_t IterBlockKernelCount() const;
//Kernels per grid.
inline size_t IterGridKernelWidth() const;
inline size_t IterGridKernelHeight() const;
inline size_t IterGridKernelCount() const;
//Blocks per grid.
inline size_t IterGridBlockWidth() const;
inline size_t IterGridBlockHeight() const;
inline size_t IterGridBlockCount() const;
bool ReadHist(size_t device);
bool ReadAccum();
bool ReadPoints(size_t device, vector<PointCL<T>>& vec);
bool ClearHist();
bool ClearHist(size_t device);
bool ClearAccum();
bool WritePoints(size_t device, vector<PointCL<T>>& vec);
#ifdef TEST_CL
bool WriteRandomPoints(size_t device);
#endif
void InitStateVec();
void SubBatchPercentPerThread(float f);
float SubBatchPercentPerThread() const;
const string& IterKernel() const;
const string& DEKernel() const;
const string& FinalAccumKernel() const;
//Access to underlying OpenCL structures. Use cautiously.
const vector<unique_ptr<RendererClDevice>>& Devices() const;
//Virtual functions overridden from RendererCLBase.
virtual bool ReadFinal(v4F* pixels);
virtual bool ClearFinal();
//Public virtual functions overridden from Renderer or RendererBase.
size_t MemoryAvailable() override;
bool Ok() const override;
size_t SubBatchSize() const override;
size_t ThreadCount() const override;
bool CreateDEFilter(bool& newAlloc) override;
bool CreateSpatialFilter(bool& newAlloc) override;
eRendererType RendererType() const override;
bool Shared() const override;
void ClearErrorReport() override;
string ErrorReportString() override;
vector<string> ErrorReport() override;
bool RandVec(vector<QTIsaac<ISAAC_SIZE, ISAAC_INT>>& randVec) override;
bool AnyNvidia() const override;
#ifndef TEST_CL
protected:
#endif
//Protected virtual functions overridden from Renderer.
bool Alloc(bool histOnly = false) override;
bool ResetBuckets(bool resetHist = true, bool resetAccum = true) override;
eRenderStatus LogScaleDensityFilter(bool forceOutput = false) override;
eRenderStatus GaussianDensityFilter() override;
eRenderStatus AccumulatorToFinalImage(vector<v4F>& pixels, size_t finalOffset) override;
EmberStats Iterate(size_t iterCount, size_t temporalSample) override;
#ifndef TEST_CL
private:
#endif
//Private functions for making and running OpenCL programs.
bool BuildIterProgramForEmber(bool doAccum = true);
bool RunIter(size_t iterCount, size_t temporalSample, size_t& itersRan);
eRenderStatus RunLogScaleFilter();
eRenderStatus RunDensityFilter();
eRenderStatus RunFinalAccum();
bool ClearBuffer(size_t device, const string& bufferName, uint width, uint height, uint elementSize);
bool RunDensityFilterPrivate(size_t kernelIndex, size_t gridW, size_t gridH, size_t blockW, size_t blockH, uint chunkSizeW, uint chunkSizeH, uint colChunkPass, uint rowChunkPass);
int MakeAndGetDensityFilterProgram(size_t ss, uint filterWidth);
int MakeAndGetFinalAccumProgram();
int MakeAndGetGammaCorrectionProgram();
bool CreateHostBuffer();
bool SumDeviceHist();
void FillSeeds();
//Private functions passing data to OpenCL programs.
void ConvertDensityFilter();
void ConvertSpatialFilter();
void ConvertEmber(Ember<T>& ember, EmberCL<T>& emberCL, vector<XformCL<T>>& xformsCL);
void ConvertCarToRas(const CarToRas<T>& carToRas);
std::string ErrorStr(const std::string& loc, const std::string& error, RendererClDevice* dev);
bool m_Init = false;
bool m_Shared = false;
bool m_DoublePrecision = typeid(T) == typeid(double);
float m_SubBatchPercentPerThread = 0.025f;//0.025 * 10,240 gives a default value of 256 iters per thread for the default sub batch size of 10,240 which almost all flames will use.
//It's critical that these numbers never change. They are
//based on the cuburn model of each kernel launch containing
//256 threads. 32 wide by 8 high. Everything done in the OpenCL
//iteration kernel depends on these dimensions.
size_t m_IterCountPerKernel = 256;
size_t m_IterBlocksWide = 64, m_IterBlockWidth = 32;
size_t m_IterBlocksHigh = 2, m_IterBlockHeight = 8;
size_t m_MaxDEBlockSizeW;
size_t m_MaxDEBlockSizeH;
//Buffer names.
string m_EmberBufferName = "Ember";
string m_XformsBufferName = "Xforms";
string m_ParVarsBufferName = "ParVars";
string m_GlobalSharedBufferName = "GlobalShared";
string m_SeedsBufferName = "Seeds";
string m_DistBufferName = "Dist";
string m_CarToRasBufferName = "CarToRas";
string m_DEFilterParamsBufferName = "DEFilterParams";
string m_SpatialFilterParamsBufferName = "SpatialFilterParams";
string m_DECoefsBufferName = "DECoefs";
string m_DEWidthsBufferName = "DEWidths";
string m_DECoefIndicesBufferName = "DECoefIndices";
string m_SpatialFilterCoefsBufferName = "SpatialFilterCoefs";
string m_CurvesCsaName = "CurvesCsa";
string m_HostBufferName = "Host";
string m_HistBufferName = "Hist";
string m_AccumBufferName = "Accum";
string m_FinalImageName = "Final";
string m_PointsBufferName = "Points";
#ifdef KNL_USE_GLOBAL_CONSEC
string m_ConsecBufferName = "Consec";
#endif
string m_VarStateBufferName = "VarState";
//Kernels.
string m_IterKernel;
cl::ImageFormat m_PaletteFormat;
cl::ImageFormat m_FinalFormat;
cl::Image2D m_Palette;
cl::ImageGL m_AccumImage;
GLuint m_OutputTexID;
EmberCL<T> m_EmberCL;
vector<XformCL<T>> m_XformsCL;
vector<vector<glm::highp_uvec2>> m_Seeds;
CarToRasCL<T> m_CarToRasCL;
DensityFilterCL<bucketT> m_DensityFilterCL;
SpatialFilterCL<bucketT> m_SpatialFilterCL;
IterOpenCLKernelCreator<T> m_IterOpenCLKernelCreator;
DEOpenCLKernelCreator m_DEOpenCLKernelCreator;
FinalAccumOpenCLKernelCreator m_FinalAccumOpenCLKernelCreator;
pair<string, vector<T>> m_Params;
pair<string, vector<T>> m_GlobalShared;
vector<T> m_VarStates;
vector<unique_ptr<RendererClDevice>> m_Devices;
Ember<T> m_LastBuiltEmber;
};
}
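A hedged sketch of how the multi-GPU constructor described above might be fed from OpenCLInfo (editor-added; float precision for both template arguments, no OpenGL sharing, and the variable names are illustrative):

#include "OpenCLInfo.h"
#include "RendererCL.h"

void MakeRendererForAllDevices()
{
    auto info = EmberCLns::OpenCLInfo::Instance();
    //Each pair is a (platform index, device index), exactly what the constructor expects.
    auto devices = info->DeviceIndices();
    //Unshared, no output texture; iterations will be split between the listed devices in sub batches.
    EmberCLns::RendererCL<float, float> renderer(devices);

    if (!renderer.Ok())
    {
        //Fall back to the CPU renderer; the aggregated error report says which device failed.
    }
}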
#pragma once
#include "EmberCLPch.h"
#include "OpenCLWrapper.h"
#include "DEOpenCLKernelCreator.h"
#include "FinalAccumOpenCLKernelCreator.h"
#include "RendererClDevice.h"
/// <summary>
/// RendererCLBase and RendererCL classes.
/// </summary>
namespace EmberCLns
{
/// <summary>
/// Serves only as an interface for OpenCL specific rendering functions.
/// </summary>
class EMBERCL_API RendererCLBase
{
public:
virtual ~RendererCLBase() { }
virtual bool ReadFinal(v4F* pixels) { return false; }
virtual bool ClearFinal() { return false; }
virtual bool AnyNvidia() const noexcept { return false; }
bool OptAffine() const noexcept { return m_OptAffine; }
void OptAffine(bool optAffine) noexcept { m_OptAffine = optAffine; }
std::function<void(void)> m_CompileBegun;
protected:
bool m_OptAffine = false;
};
/// <summary>
/// RendererCL is a derivation of the basic CPU renderer which
/// overrides various functions to render on the GPU using OpenCL.
/// This supports multi-GPU rendering, which is done in the following manner:
/// -When rendering a single image, the iterations will be split between devices in sub batches.
/// -When animating, a renderer for each device will be created by the calling code,
/// and the frames will each be rendered by a single device as available.
/// The synchronization across devices is done through a single atomic counter.
/// Since this class derives from EmberReport and also contains an
/// OpenCLWrapper member which also derives from EmberReport, the
/// reporting functions are overridden to aggregate the errors from
/// both sources.
/// Template argument T expected to be float or double.
/// Template argument bucketT must always be float.
/// </summary>
template <typename T, typename bucketT>
class EMBERCL_API RendererCL : public Renderer<T, bucketT>, public RendererCLBase
{
using EmberNs::Renderer<T, bucketT>::RendererBase::Abort;
using EmberNs::Renderer<T, bucketT>::RendererBase::EarlyClip;
using EmberNs::Renderer<T, bucketT>::RendererBase::EnterResize;
using EmberNs::Renderer<T, bucketT>::RendererBase::LeaveResize;
using EmberNs::Renderer<T, bucketT>::RendererBase::FinalRasW;
using EmberNs::Renderer<T, bucketT>::RendererBase::FinalRasH;
using EmberNs::Renderer<T, bucketT>::RendererBase::SuperRasW;
using EmberNs::Renderer<T, bucketT>::RendererBase::SuperRasH;
using EmberNs::Renderer<T, bucketT>::RendererBase::SuperSize;
using EmberNs::Renderer<T, bucketT>::RendererBase::BytesPerChannel;
using EmberNs::Renderer<T, bucketT>::RendererBase::TemporalSamples;
using EmberNs::Renderer<T, bucketT>::RendererBase::ItersPerTemporalSample;
using EmberNs::Renderer<T, bucketT>::RendererBase::FuseCount;
using EmberNs::Renderer<T, bucketT>::RendererBase::DensityFilterOffset;
using EmberNs::Renderer<T, bucketT>::RendererBase::PrepFinalAccumVector;
using EmberNs::Renderer<T, bucketT>::RendererBase::Paused;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_ProgressParameter;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_YAxisUp;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_LockAccum;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_Abort;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_LastIter;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_LastIterPercent;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_Stats;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_Callback;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_Rand;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_RenderTimer;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_IterTimer;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_ProgressTimer;
using EmberNs::Renderer<T, bucketT>::RendererBase::EmberReport::AddToReport;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_ResizeCs;
using EmberNs::Renderer<T, bucketT>::RendererBase::m_ProcessAction;
using EmberNs::Renderer<T, bucketT>::m_RotMat;
using EmberNs::Renderer<T, bucketT>::m_Ember;
using EmberNs::Renderer<T, bucketT>::m_Csa;
using EmberNs::Renderer<T, bucketT>::m_CurvesSet;
using EmberNs::Renderer<T, bucketT>::CenterX;
using EmberNs::Renderer<T, bucketT>::CenterY;
using EmberNs::Renderer<T, bucketT>::K1;
using EmberNs::Renderer<T, bucketT>::K2;
using EmberNs::Renderer<T, bucketT>::Supersample;
using EmberNs::Renderer<T, bucketT>::HighlightPower;
using EmberNs::Renderer<T, bucketT>::HistBuckets;
using EmberNs::Renderer<T, bucketT>::AccumulatorBuckets;
using EmberNs::Renderer<T, bucketT>::GetDensityFilter;
using EmberNs::Renderer<T, bucketT>::GetSpatialFilter;
using EmberNs::Renderer<T, bucketT>::CoordMap;
using EmberNs::Renderer<T, bucketT>::XformDistributions;
using EmberNs::Renderer<T, bucketT>::XformDistributionsSize;
using EmberNs::Renderer<T, bucketT>::m_Dmap;
using EmberNs::Renderer<T, bucketT>::m_DensityFilter;
using EmberNs::Renderer<T, bucketT>::m_SpatialFilter;
public:
RendererCL(const vector<pair<size_t, size_t>>& devices, bool shared = false, GLuint outputTexID = 0);
RendererCL(const RendererCL<T, bucketT>& renderer) = delete;
RendererCL<T, bucketT>& operator = (const RendererCL<T, bucketT>& renderer) = delete;
virtual ~RendererCL() = default;
//Non-virtual member functions for OpenCL specific tasks.
bool Init(const vector<pair<size_t, size_t>>& devices, bool shared, GLuint outputTexID);
bool SetOutputTexture(GLuint outputTexID);
//Iters per kernel/block/grid.
inline size_t IterCountPerKernel() const noexcept;
inline size_t IterCountPerBlock() const noexcept;
inline size_t IterCountPerGrid() const noexcept;
//Kernels per block.
inline size_t IterBlockKernelWidth() const noexcept;
inline size_t IterBlockKernelHeight() const noexcept;
inline size_t IterBlockKernelCount() const noexcept;
//Kernels per grid.
inline size_t IterGridKernelWidth() const noexcept;
inline size_t IterGridKernelHeight() const noexcept;
inline size_t IterGridKernelCount() const noexcept;
//Blocks per grid.
inline size_t IterGridBlockWidth() const noexcept;
inline size_t IterGridBlockHeight() const noexcept;
inline size_t IterGridBlockCount() const noexcept;
//Allow for changing the number of blocks in each dimension of the grid.
void IterBlocksWide(size_t w) noexcept;
void IterBlocksHigh(size_t h) noexcept;
bool ReadHist(size_t device);
bool ReadAccum();
bool ReadPoints(size_t device, vector<PointCL<T>>& vec);
bool ClearHist();
bool ClearHist(size_t device);
bool ClearAccum();
bool WritePoints(size_t device, vector<PointCL<T>>& vec);
#ifdef TEST_CL
bool WriteRandomPoints(size_t device);
#endif
void InitStateVec();
void SubBatchPercentPerThread(float f);
float SubBatchPercentPerThread() const;
const string& IterKernel() const;
const string& DEKernel() const;
const string& FinalAccumKernel() const;
//Access to underlying OpenCL structures. Use cautiously.
const vector<unique_ptr<RendererClDevice>>& Devices() const;
//Virtual functions overridden from RendererCLBase.
virtual bool ReadFinal(v4F* pixels);
virtual bool ClearFinal();
//Public virtual functions overridden from Renderer or RendererBase.
size_t MemoryAvailable() override;
bool Ok() const override;
size_t SubBatchSize() const override;
size_t ThreadCount() const override;
bool CreateDEFilter(bool& newAlloc) override;
bool CreateSpatialFilter(bool& newAlloc) override;
eRendererType RendererType() const override;
bool Shared() const override;
void ClearErrorReport() noexcept override;
string ErrorReportString() override;
vector<string> ErrorReport() override;
bool RandVec(vector<QTIsaac<ISAAC_SIZE, ISAAC_INT>>& randVec) override;
bool AnyNvidia() const noexcept override;
#ifndef TEST_CL
protected:
#endif
//Protected virtual functions overridden from Renderer.
bool Alloc(bool histOnly = false) override;
bool ResetBuckets(bool resetHist = true, bool resetAccum = true) override;
eRenderStatus LogScaleDensityFilter(bool forceOutput = false) override;
eRenderStatus GaussianDensityFilter() override;
eRenderStatus AccumulatorToFinalImage(vector<v4F>& pixels, size_t finalOffset) override;
EmberStats Iterate(size_t iterCount, size_t temporalSample) override;
#ifndef TEST_CL
private:
#endif
//Private functions for making and running OpenCL programs.
bool BuildIterProgramForEmber(bool doAccum = true);
bool RunIter(size_t iterCount, size_t temporalSample, size_t& itersRan);
eRenderStatus RunLogScaleFilter();
eRenderStatus RunDensityFilter();
eRenderStatus RunFinalAccum();
bool ClearBuffer(size_t device, const string& bufferName, uint width, uint height, uint elementSize);
bool RunDensityFilterPrivate(size_t kernelIndex, size_t gridW, size_t gridH, size_t blockW, size_t blockH, uint chunkSizeW, uint chunkSizeH, uint colChunkPass, uint rowChunkPass);
int MakeAndGetDensityFilterProgram(size_t ss, uint filterWidth);
int MakeAndGetFinalAccumProgram();
int MakeAndGetGammaCorrectionProgram();
bool CreateHostBuffer();
bool SumDeviceHist();
void FillSeeds();
//Private functions passing data to OpenCL programs.
void ConvertDensityFilter();
void ConvertSpatialFilter();
void ConvertEmber(Ember<T>& ember, EmberCL<T>& emberCL, vector<XformCL<T>>& xformsCL);
void ConvertCarToRas(const CarToRas<T>& carToRas);
std::string ErrorStr(const std::string& loc, const std::string& error, RendererClDevice* dev);
bool m_Init = false;
bool m_Shared = false;
bool m_DoublePrecision = typeid(T) == typeid(double);
float m_SubBatchPercentPerThread = 0.025f;//0.025 * 10,240 = 256 iters per thread for the default sub batch size of 10,240, which almost all flames use.
//It's critical that these numbers never change. They are
//based on the cuburn model of each kernel launch containing
//256 threads, 32 wide by 8 high. Everything done in the OpenCL
//iteration kernel depends on these dimensions. (A worked example
//of how these defaults multiply out follows this class.)
size_t m_IterCountPerKernel = 256;
size_t m_IterBlocksWide = 64, m_IterBlockWidth = 32;
size_t m_IterBlocksHigh = 2, m_IterBlockHeight = 8;
size_t m_MaxDEBlockSizeW;
size_t m_MaxDEBlockSizeH;
//Buffer names.
string m_EmberBufferName = "Ember";
string m_XformsBufferName = "Xforms";
string m_ParVarsBufferName = "ParVars";
string m_GlobalSharedBufferName = "GlobalShared";
string m_SeedsBufferName = "Seeds";
string m_DistBufferName = "Dist";
string m_CarToRasBufferName = "CarToRas";
string m_DEFilterParamsBufferName = "DEFilterParams";
string m_SpatialFilterParamsBufferName = "SpatialFilterParams";
string m_DECoefsBufferName = "DECoefs";
string m_DEWidthsBufferName = "DEWidths";
string m_DECoefIndicesBufferName = "DECoefIndices";
string m_SpatialFilterCoefsBufferName = "SpatialFilterCoefs";
string m_CurvesCsaName = "CurvesCsa";
string m_HostBufferName = "Host";
string m_HistBufferName = "Hist";
string m_AccumBufferName = "Accum";
string m_FinalImageName = "Final";
string m_PointsBufferName = "Points";
#ifdef KNL_USE_GLOBAL_CONSEC
string m_ConsecBufferName = "Consec";
#endif
string m_VarStateBufferName = "VarState";
//Kernels.
string m_IterKernel;
cl::ImageFormat m_PaletteFormat;
cl::ImageFormat m_FinalFormat;
cl::Image2D m_Palette;
cl::ImageGL m_AccumImage;
GLuint m_OutputTexID;
EmberCL<T> m_EmberCL;
vector<XformCL<T>> m_XformsCL;
vector<vector<glm::highp_uvec2>> m_Seeds;
CarToRasCL<T> m_CarToRasCL;
DensityFilterCL<bucketT> m_DensityFilterCL;
SpatialFilterCL<bucketT> m_SpatialFilterCL;
IterOpenCLKernelCreator<T> m_IterOpenCLKernelCreator;
DEOpenCLKernelCreator m_DEOpenCLKernelCreator;
FinalAccumOpenCLKernelCreator m_FinalAccumOpenCLKernelCreator;
pair<string, vector<T>> m_Params;
pair<string, vector<T>> m_GlobalShared;
vector<T> m_VarStates;
vector<unique_ptr<RendererClDevice>> m_Devices;
Ember<T> m_LastBuiltEmber;
};
}
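The comment block in RendererCL above fixes the iteration launch shape to the cuburn model: 256 iters per thread, blocks 32 threads wide by 8 high, and a default grid of 64 x 2 blocks. The accessor bodies (IterCountPerBlock(), IterCountPerGrid(), etc.) are not part of this diff, so the following is only a sketch of how those defaults would be expected to multiply out; the derived relationships are assumptions for illustration, not code from this file.

//A minimal, standalone sketch of the default launch arithmetic described in the
//comments above. The derived quantities are assumptions about what the accessors
//return; the actual accessor implementations are not shown in this diff.
#include <cstddef>
#include <iostream>

int main()
{
	const size_t iterCountPerKernel = 256;        //m_IterCountPerKernel: iters each thread performs per launch.
	const size_t blockWidth = 32, blockHeight = 8;//m_IterBlockWidth/Height: threads per block (cuburn model).
	const size_t blocksWide = 64, blocksHigh = 2; //m_IterBlocksWide/High: default grid size, adjustable via IterBlocksWide()/IterBlocksHigh().
	const size_t kernelsPerBlock = blockWidth * blockHeight;          //256 threads per block.
	const size_t blocksPerGrid = blocksWide * blocksHigh;             //128 blocks per grid.
	const size_t itersPerBlock = iterCountPerKernel * kernelsPerBlock;//65,536 iters per block.
	const size_t itersPerGrid = itersPerBlock * blocksPerGrid;        //8,388,608 iters per full grid launch.
	std::cout << itersPerBlock << " iters/block, " << itersPerGrid << " iters/grid\n";
	return 0;
}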


@@ -1,93 +1,93 @@
#include "EmberCLPch.h"
#include "RendererClDevice.h"
namespace EmberCLns
{
/// <summary>
/// Constructor that assigns members.
/// The object is not fully initialized at this point; the caller
/// must manually call Init().
/// </summary>
/// <param name="platform">The index of the platform to use.</param>
/// <param name="device">The index of the device to use.</param>
/// <param name="shared">True if shared with OpenGL, else false.</param>
RendererClDevice::RendererClDevice(size_t platform, size_t device, bool shared)
{
m_Init = false;
m_Shared = shared;
m_NVidia = false;
m_WarpSize = 0;
m_Calls = 0;
m_PlatformIndex = platform;
m_DeviceIndex = device;
m_Info = OpenCLInfo::Instance();
}
/// <summary>
/// Initialization of the OpenCLWrapper member.
/// </summary>
/// <returns>True if success, else false.</returns>
bool RendererClDevice::Init()
{
bool b = true;
if (!m_Wrapper.Ok())
{
m_Init = false;
b = m_Wrapper.Init(m_PlatformIndex, m_DeviceIndex, m_Shared);
}
if (b && m_Wrapper.Ok() && !m_Init)
{
m_NVidia = Find(ToLower(m_Info->PlatformName(m_PlatformIndex)), "nvidia") && m_Wrapper.LocalMemSize() > (32 * 1024);
m_WarpSize = m_NVidia ? 32 : 64;
m_Init = true;
}
return b;
}
/// <summary>
/// OpenCL property accessors, getters only.
/// </summary>
bool RendererClDevice::Ok() const { return m_Init; }
bool RendererClDevice::Shared() const { return m_Shared; }
bool RendererClDevice::Nvidia() const { return m_NVidia; }
size_t RendererClDevice::WarpSize() const { return m_WarpSize; }
size_t RendererClDevice::PlatformIndex() const { return m_PlatformIndex; }
size_t RendererClDevice::DeviceIndex() const { return m_DeviceIndex; }
/// <summary>
/// Clear the error report for this class as well as the wrapper.
/// </summary>
void RendererClDevice::ClearErrorReport()
{
EmberReport::ClearErrorReport();
m_Wrapper.ClearErrorReport();
}
/// <summary>
/// Concatenate and return the error report for this class and the
/// wrapper as a single string.
/// </summary>
/// <returns>The concatenated error report string</returns>
string RendererClDevice::ErrorReportString()
{
const auto s = EmberReport::ErrorReportString();
return s + m_Wrapper.ErrorReportString();
}
/// <summary>
/// Concatenate and return the error report for this class and the
/// wrapper as a vector of strings.
/// </summary>
/// <returns>The concatenated error report vector of strings</returns>
vector<string> RendererClDevice::ErrorReport()
{
auto ours = EmberReport::ErrorReport();
const auto s = m_Wrapper.ErrorReport();
ours.insert(ours.end(), s.begin(), s.end());
return ours;
}
}
#include "EmberCLPch.h"
#include "RendererClDevice.h"
namespace EmberCLns
{
/// <summary>
/// Constructor that assigns members.
/// The object is not fully initialized at this point; the caller
/// must manually call Init().
/// </summary>
/// <param name="platform">The index of the platform to use.</param>
/// <param name="device">The index of the device to use.</param>
/// <param name="shared">True if shared with OpenGL, else false.</param>
RendererClDevice::RendererClDevice(size_t platform, size_t device, bool shared)
{
m_Init = false;
m_Shared = shared;
m_NVidia = false;
m_WarpSize = 0;
m_Calls = 0;
m_PlatformIndex = platform;
m_DeviceIndex = device;
m_Info = OpenCLInfo::Instance();
}
/// <summary>
/// Initialization of the OpenCLWrapper member.
/// </summary>
/// <returns>True if success, else false.</returns>
bool RendererClDevice::Init()
{
bool b = true;
if (!m_Wrapper.Ok())
{
m_Init = false;
b = m_Wrapper.Init(m_PlatformIndex, m_DeviceIndex, m_Shared);
}
if (b && m_Wrapper.Ok() && !m_Init)
{
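//Treat the device as NVidia only if the platform name contains "nvidia" and the device
//exposes more than 32KB of local memory; warps are 32 threads on NVidia, and a 64-wide
//wavefront is assumed otherwise.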
m_NVidia = Find(ToLower(m_Info->PlatformName(m_PlatformIndex)), "nvidia") && m_Wrapper.LocalMemSize() > (32 * 1024);
m_WarpSize = m_NVidia ? 32 : 64;
m_Init = true;
}
return b;
}
/// <summary>
/// OpenCL property accessors, getters only.
/// </summary>
bool RendererClDevice::Ok() const noexcept { return m_Init; }
bool RendererClDevice::Shared() const noexcept { return m_Shared; }
bool RendererClDevice::Nvidia() const noexcept { return m_NVidia; }
size_t RendererClDevice::WarpSize() const noexcept { return m_WarpSize; }
size_t RendererClDevice::PlatformIndex() const noexcept { return m_PlatformIndex; }
size_t RendererClDevice::DeviceIndex() const noexcept { return m_DeviceIndex; }
/// <summary>
/// Clear the error report for this class as well as the wrapper.
/// </summary>
void RendererClDevice::ClearErrorReport() noexcept
{
EmberReport::ClearErrorReport();
m_Wrapper.ClearErrorReport();
}
/// <summary>
/// Concatenate and return the error report for this class and the
/// wrapper as a single string.
/// </summary>
/// <returns>The concatenated error report string</returns>
string RendererClDevice::ErrorReportString()
{
const auto s = EmberReport::ErrorReportString();
return s + m_Wrapper.ErrorReportString();
}
/// <summary>
/// Concatenate and return the error report for this class and the
/// wrapper as a vector of strings.
/// </summary>
/// <returns>The concatenated error report vector of strings</returns>
vector<string> RendererClDevice::ErrorReport()
{
auto ours = EmberReport::ErrorReport();
const auto s = m_Wrapper.ErrorReport();
ours.insert(ours.end(), s.begin(), s.end());
return ours;
}
}
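The constructor summary above describes a two-phase pattern: construction only stores the platform/device indices, and no OpenCL work happens until Init() is called and Ok() confirms it succeeded. Below is a minimal usage sketch; the (0, 0) indices, the wrapping function, and the console output are illustrative assumptions, not code from this repository.

#include <iostream>
#include <memory>
#include "RendererClDevice.h"

//A minimal sketch of the construct-then-Init() pattern, assuming platform 0 / device 0
//exist on the machine; the indices are placeholders.
bool MakeDevice(std::unique_ptr<EmberCLns::RendererClDevice>& dev)
{
	dev = std::make_unique<EmberCLns::RendererClDevice>(0, 0, false);//Constructor only assigns members.

	if (!dev->Init() || !dev->Ok())//Init() sets up the OpenCLWrapper; Ok() reports whether that succeeded.
	{
		std::cout << dev->ErrorReportString();//This class' errors plus the wrapper's, concatenated.
		return false;
	}

	std::cout << "Warp size: " << dev->WarpSize() << (dev->Nvidia() ? " (NVidia)\n" : "\n");
	return true;
}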


@@ -1,47 +1,47 @@
#pragma once
#include "EmberCLPch.h"
#include "OpenCLWrapper.h"
#include "IterOpenCLKernelCreator.h"
/// <summary>
/// RendererClDevice class.
/// </summary>
namespace EmberCLns
{
/// <summary>
/// Class to manage a device that does the iteration portion of
/// the rendering process. Having a separate class for this purpose
/// enables multi-GPU support.
/// </summary>
class EMBERCL_API RendererClDevice : public EmberReport
{
public:
RendererClDevice(size_t platform, size_t device, bool shared);
bool Init();
bool Ok() const;
bool Shared() const;
bool Nvidia() const;
size_t WarpSize() const;
size_t PlatformIndex() const;
size_t DeviceIndex() const;
//Public virtual functions overridden from base classes.
void ClearErrorReport() override;
string ErrorReportString() override;
vector<string> ErrorReport() override;
size_t m_Calls;
OpenCLWrapper m_Wrapper;
private:
bool m_Init;
bool m_Shared;
bool m_NVidia;
size_t m_WarpSize;
size_t m_PlatformIndex;
size_t m_DeviceIndex;
shared_ptr<OpenCLInfo> m_Info;
};
}
#pragma once
#include "EmberCLPch.h"
#include "OpenCLWrapper.h"
#include "IterOpenCLKernelCreator.h"
/// <summary>
/// RendererClDevice class.
/// </summary>
namespace EmberCLns
{
/// <summary>
/// Class to manage a device that does the iteration portion of
/// the rendering process. Having a separate class for this purpose
/// enables multi-GPU support.
/// </summary>
class EMBERCL_API RendererClDevice : public EmberReport
{
public:
RendererClDevice(size_t platform, size_t device, bool shared);
bool Init();
bool Ok() const noexcept;
bool Shared() const noexcept;
bool Nvidia() const noexcept;
size_t WarpSize() const noexcept;
size_t PlatformIndex() const noexcept;
size_t DeviceIndex() const noexcept;
//Public virtual functions overridden from base classes.
void ClearErrorReport() noexcept override;
string ErrorReportString() override;
vector<string> ErrorReport() override;
size_t m_Calls;
OpenCLWrapper m_Wrapper;
private:
bool m_Init;
bool m_Shared;
bool m_NVidia;
size_t m_WarpSize;
size_t m_PlatformIndex;
size_t m_DeviceIndex;
shared_ptr<OpenCLInfo> m_Info;
};
}
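The class summary above is the multi-GPU hook: RendererCL, declared earlier in this diff, takes a vector of (platform index, device index) pairs and drives one RendererClDevice per entry. The sketch below assembles such a list; the concrete index pairs and the commented RendererCL<float, float> instantiation are assumptions for illustration, not values taken from this repository.

#include <cstddef>
#include <utility>
#include <vector>

//A minimal sketch: the (platform, device) pairs are placeholders for whatever indices
//the host application chooses. Multi-GPU rendering is simply more entries in this list.
std::vector<std::pair<std::size_t, std::size_t>> MakeDeviceList()
{
	return
	{
		{ 0, 0 },//First GPU on platform 0.
		{ 0, 1 } //A second GPU on the same platform, if present.
	};
}

//RendererCL then constructs one RendererClDevice per pair, e.g. (template arguments assumed):
//RendererCL<float, float> renderer(MakeDeviceList());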