#include "EmberCLPch.h" #include "RendererCL.h" namespace EmberCLns { /// /// Constructor that inintializes various buffer names, block dimensions, image formats /// and finally initializes one or more OpenCL devices using the passed in parameters. /// When running with multiple devices, the first device is considered the "primary", while /// others are "secondary". /// The differences are: /// -Only the primary device will report progress, however the progress count will contain the combined progress of all devices. /// -The primary device runs in this thread, while others run on their own threads. /// -The primary device does density filtering and final accumulation, while the others only iterate. /// -Upon completion of iteration, the histograms from the secondary devices are: /// Copied to a temporary host side buffer. /// Copied from the host side buffer to the primary device's density filtering buffer as a temporary device storage area. /// Summed from the density filtering buffer, to the primary device's histogram. /// When this process happens for the last device, the density filtering buffer is cleared since it will be used shortly. /// Kernel creators are set to be non-nvidia by default. Will be properly set in Init(). /// /// A vector of the platform,device index pairs to use. The first device will be the primary and will run non-threaded. /// True if shared with OpenGL, else false. Default: false. /// The texture ID of the shared OpenGL texture if shared. Default: 0. template RendererCL::RendererCL(const vector>& devices, bool shared, GLuint outputTexID) : m_IterOpenCLKernelCreator(), m_FinalAccumOpenCLKernelCreator(typeid(T) == typeid(double)) { m_PaletteFormat.image_channel_order = CL_RGBA; m_PaletteFormat.image_channel_data_type = CL_FLOAT; m_FinalFormat.image_channel_order = CL_RGBA; m_FinalFormat.image_channel_data_type = CL_FLOAT; m_CompileBegun = [&]() { }; m_IterCountPerKernel = size_t(m_SubBatchPercentPerThread * m_Ember.m_SubBatchSize); Init(devices, shared, outputTexID); } /// /// Non-virtual member functions for OpenCL specific tasks. /// /// /// Initialize OpenCL. /// In addition to initializing, this function will create the zeroization program, /// as well as the basic log scale filtering programs. This is done to ensure basic /// compilation works. Further compilation will be done later for iteration, density filtering, /// and final accumulation. /// /// A vector of the platform,device index pairs to use. The first device will be the primary and will run non-threaded. /// True if shared with OpenGL, else false. /// The texture ID of the shared OpenGL texture if shared /// True if success, else false. template bool RendererCL::Init(const vector>& devices, bool shared, GLuint outputTexID) { if (devices.empty()) return false; bool b = false; static std::string loc = __FUNCTION__; auto& zeroizeProgram = m_IterOpenCLKernelCreator.ZeroizeKernel(); auto& sumHistProgram = m_IterOpenCLKernelCreator.SumHistKernel(); ostringstream os; m_Init = false; m_Shared = false; m_Devices.clear(); m_Devices.reserve(devices.size()); m_OutputTexID = outputTexID; m_GlobalShared.second.resize(16);//Dummy data until a real alloc is needed. for (size_t i = 0; i < devices.size(); i++) { try { unique_ptr cld(new RendererClDevice(devices[i].first, devices[i].second, i == 0 ? shared : false)); if ((b = cld->Init()))//Build a simple program to ensure OpenCL is working right. 
/// <summary>
/// Non-virtual member functions for OpenCL specific tasks.
/// </summary>

/// <summary>
/// Initialize OpenCL.
/// In addition to initializing, this function will create the zeroization program,
/// as well as the basic log scale filtering programs. This is done to ensure basic
/// compilation works. Further compilation will be done later for iteration, density filtering,
/// and final accumulation.
/// </summary>
/// <param name="devices">A vector of the platform,device index pairs to use. The first device will be the primary and will run non-threaded.</param>
/// <param name="shared">True if shared with OpenGL, else false.</param>
/// <param name="outputTexID">The texture ID of the shared OpenGL texture if shared.</param>
/// <returns>True if success, else false.</returns>
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::Init(const vector<pair<size_t, size_t>>& devices, bool shared, GLuint outputTexID)
{
	if (devices.empty())
		return false;

	bool b = false;
	static std::string loc = __FUNCTION__;
	auto& zeroizeProgram = m_IterOpenCLKernelCreator.ZeroizeKernel();
	auto& sumHistProgram = m_IterOpenCLKernelCreator.SumHistKernel();
	ostringstream os;
	m_Init = false;
	m_Shared = false;
	m_Devices.clear();
	m_Devices.reserve(devices.size());
	m_OutputTexID = outputTexID;
	m_GlobalShared.second.resize(16);//Dummy data until a real alloc is needed.

	for (size_t i = 0; i < devices.size(); i++)
	{
		try
		{
			unique_ptr<RendererClDevice> cld(new RendererClDevice(devices[i].first, devices[i].second, i == 0 ? shared : false));

			if ((b = cld->Init()))//Build a simple program to ensure OpenCL is working right.
			{
				if (b && !(b = cld->m_Wrapper.AddProgram(m_IterOpenCLKernelCreator.ZeroizeEntryPoint(), zeroizeProgram, m_IterOpenCLKernelCreator.ZeroizeEntryPoint(), m_DoublePrecision)))
				{ ErrorStr(loc, "Failed to init zeroize program: "s + cld->ErrorReportString(), cld.get()); }

				if (b && !(b = cld->m_Wrapper.AddAndWriteImage("Palette", CL_MEM_READ_ONLY, m_PaletteFormat, m_Ember.m_Palette.Size(), 1, 0, nullptr)))
				{ ErrorStr(loc, "Failed to init palette buffer: "s + cld->ErrorReportString(), cld.get()); }

				if (b && !(b = cld->m_Wrapper.AddAndWriteBuffer(m_GlobalSharedBufferName, m_GlobalShared.second.data(), m_GlobalShared.second.size() * sizeof(m_GlobalShared.second[0]))))
				{ ErrorStr(loc, "Failed to init global shared buffer: "s + cld->ErrorReportString(), cld.get()); }//Empty at start, will be filled in later if needed.

				if (b)
				{
					m_Devices.push_back(std::move(cld));//Success, so move to the device vector, else it will go out of scope and be deleted.
				}
				else
				{
					ErrorStr(loc, "Failed to init programs for platform", cld.get());
					break;
				}
			}
			else
			{
				ErrorStr(loc, "Failed to init device, "s + cld->ErrorReportString(), cld.get());
				break;
			}
		}
		catch (const std::exception& e)
		{
			ErrorStr(loc, "Failed to init platform: "s + e.what(), nullptr);
		}
		catch (...)
		{
			ErrorStr(loc, "Failed to init platform with unknown exception", nullptr);
		}
	}

	if (b && (m_Devices.size() == devices.size()))
	{
		auto& firstWrapper = m_Devices[0]->m_Wrapper;
		m_DEOpenCLKernelCreator = DEOpenCLKernelCreator(m_DoublePrecision, m_Devices[0]->Nvidia());//This will cause it to be created a second time, because it was already done once in the constructor.

		//Build a simple program to ensure OpenCL is working right.
		if (b && !(b = firstWrapper.AddProgram(m_DEOpenCLKernelCreator.LogScaleAssignDEEntryPoint(), m_DEOpenCLKernelCreator.LogScaleAssignDEKernel(), m_DEOpenCLKernelCreator.LogScaleAssignDEEntryPoint(), m_DoublePrecision)))
		{ ErrorStr(loc, "Failed to init log scale program", m_Devices[0].get()); }

		if (b && !(b = firstWrapper.AddProgram(m_IterOpenCLKernelCreator.SumHistEntryPoint(), sumHistProgram, m_IterOpenCLKernelCreator.SumHistEntryPoint(), m_DoublePrecision)))
		{ ErrorStr(loc, "Failed to init sum histogram program", m_Devices[0].get()); }

		if (b)
		{
			//This is the maximum box dimension for density filtering which consists of (blockSize * blockSize) + (2 * filterWidth).
			//These blocks should be square, and ideally, 32x32.
			//Sadly, at the moment, the GPU runs out of resources at that block size because the DE filter function is so complex.
			//The next best block size seems to be 24x24.
			//AMD is further limited because of less local memory, so these have to be 16 on AMD.
			//Users have reported crashes on Nvidia cards even at size 24, so just to be safe, make them both 16 for all manufacturers.
			m_MaxDEBlockSizeW = 16;
			m_MaxDEBlockSizeH = 16;
			FillSeeds();

			for (size_t device = 0; device < m_Devices.size(); device++)
				if (b && !(b = m_Devices[device]->m_Wrapper.AddAndWriteBuffer(m_SeedsBufferName, reinterpret_cast<void*>(m_Seeds[device].data()), SizeOf(m_Seeds[device]))))
				{
					ErrorStr(loc, "Failed to init seeds buffer", m_Devices[device].get());
					break;
				}
		}

		m_Shared = shared;
		m_Init = b;
	}
	else
	{
		ErrorStr(loc, "Failed to init all devices and platforms", nullptr);
	}

	return m_Init;
}

/// <summary>
/// Set the shared output texture of the primary device where final accumulation will be written to.
/// </summary>
/// <param name="outputTexID">The texture ID of the shared OpenGL texture if shared.</param>
/// <returns>True if success, else false.</returns>
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::SetOutputTexture(GLuint outputTexID)
{
	bool success = true;
	static std::string loc = __FUNCTION__;

	if (!m_Devices.empty())
	{
		OpenCLWrapper& firstWrapper = m_Devices[0]->m_Wrapper;
		m_OutputTexID = outputTexID;
		EnterResize();

		if (!firstWrapper.AddAndWriteImage(m_FinalImageName, CL_MEM_WRITE_ONLY, m_FinalFormat, FinalRasW(), FinalRasH(), 0, nullptr, firstWrapper.Shared(), m_OutputTexID))
		{
			ErrorStr(loc, "Failed to set output texture", m_Devices[0].get());
			success = false;
		}

		LeaveResize();
	}
	else
		success = false;

	return success;
}

/// <summary>
/// OpenCL property accessors, getters only.
/// </summary>

//Iters per kernel/block/grid.
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterCountPerKernel() const { return m_IterCountPerKernel; }
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterCountPerBlock() const { return IterCountPerKernel() * IterBlockKernelCount(); }
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterCountPerGrid() const { return IterCountPerKernel() * IterGridKernelCount(); }
//Kernels per block.
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterBlockKernelWidth() const { return m_IterBlockWidth; }
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterBlockKernelHeight() const { return m_IterBlockHeight; }
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterBlockKernelCount() const { return IterBlockKernelWidth() * IterBlockKernelHeight(); }
//Kernels per grid.
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterGridKernelWidth() const { return IterGridBlockWidth() * IterBlockKernelWidth(); }
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterGridKernelHeight() const { return IterGridBlockHeight() * IterBlockKernelHeight(); }
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterGridKernelCount() const { return IterGridKernelWidth() * IterGridKernelHeight(); }
//Blocks per grid.
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterGridBlockWidth() const { return m_IterBlocksWide; }
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterGridBlockHeight() const { return m_IterBlocksHigh; }
template <typename T, typename bucketT> size_t RendererCL<T, bucketT>::IterGridBlockCount() const { return IterGridBlockWidth() * IterGridBlockHeight(); }
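//How these accessors compose, for illustration (the concrete numbers are assumptions, not necessarily
//the class defaults): with 64x2 blocks per grid and 32x8 kernels per block, IterGridKernelCount() =
//(64 * 32) * (2 * 8) = 32,768 kernel threads per launch, and with IterCountPerKernel() = 256,
//IterCountPerGrid() = 32,768 * 256 = 8,388,608 iterations per kernel launch.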
/// <summary>
/// Read the histogram of the specified device into the host side CPU buffer.
/// </summary>
/// <param name="device">The index of the device whose histogram will be read.</param>
/// <returns>True if success, else false.</returns>
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::ReadHist(size_t device)
{
	if (device < m_Devices.size())
		if (Renderer<T, bucketT>::Alloc(true))//Allocate the histogram memory to read into, other buffers not needed.
			return m_Devices[device]->m_Wrapper.ReadBuffer(m_HistBufferName, reinterpret_cast<void*>(HistBuckets()), SuperSize() * sizeof(v4bT));//HistBuckets should have been created as a ClBuffer with HOST_PTR if more than one device is used.

	return false;
}

/// <summary>
/// Read the density filtering buffer into the host side CPU buffer.
/// Used for debugging.
/// </summary>
/// <returns>True if success, else false.</returns>
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::ReadAccum()
{
	if (Renderer<T, bucketT>::Alloc() && !m_Devices.empty())//Allocate the memory to read into.
		return m_Devices[0]->m_Wrapper.ReadBuffer(m_AccumBufferName, reinterpret_cast<void*>(AccumulatorBuckets()), SuperSize() * sizeof(v4bT));

	return false;
}

/// <summary>
/// Read the temporary points buffer from a device into a host side CPU buffer.
/// Used for debugging.
/// </summary>
/// <param name="device">The index of the device whose points will be read.</param>
/// <param name="vec">The host side buffer to read into.</param>
/// <returns>True if success, else false.</returns>
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::ReadPoints(size_t device, vector<PointCL<T>>& vec)
{
	vec.resize(IterGridKernelCount());//Allocate the memory to read into.

	if (vec.size() >= IterGridKernelCount() && device < m_Devices.size())
		return m_Devices[device]->m_Wrapper.ReadBuffer(m_PointsBufferName, reinterpret_cast<void*>(vec.data()), IterGridKernelCount() * sizeof(PointCL<T>));

	return false;
}

/// <summary>
/// Clear the histogram buffer for all devices with all zeroes.
/// </summary>
/// <returns>True if success, else false.</returns>
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::ClearHist()
{
	bool b = !m_Devices.empty();
	static std::string loc = __FUNCTION__;

	for (size_t i = 0; i < m_Devices.size(); i++)
		if (b && !(b = ClearBuffer(i, m_HistBufferName, uint(SuperRasW()), uint(SuperRasH()), sizeof(v4bT))))
		{
			ErrorStr(loc, "Failed to clear histogram", m_Devices[i].get());
			break;
		}

	return b;
}

/// <summary>
/// Clear the histogram buffer for a single device with all zeroes.
/// </summary>
/// <param name="device">The index of the device whose histogram will be cleared.</param>
/// <returns>True if success, else false.</returns>
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::ClearHist(size_t device)
{
	bool b = device < m_Devices.size();
	static std::string loc = __FUNCTION__;

	if (b && !(b = ClearBuffer(device, m_HistBufferName, uint(SuperRasW()), uint(SuperRasH()), sizeof(v4bT))))
	{ ErrorStr(loc, "Failed to clear histogram", m_Devices[device].get()); }

	return b;
}

/// <summary>
/// Clear the density filtering buffer with all zeroes.
/// </summary>
/// <returns>True if success, else false.</returns>
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::ClearAccum()
{
	return ClearBuffer(0, m_AccumBufferName, uint(SuperRasW()), uint(SuperRasH()), sizeof(v4bT));
}

/// <summary>
/// Write values from a host side CPU buffer into the temporary points buffer for the specified device.
/// Used for debugging.
/// </summary>
/// <param name="device">The index of the device whose points will be written to.</param>
/// <param name="vec">The host side buffer whose values to write.</param>
/// <returns>True if success, else false.</returns>
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::WritePoints(size_t device, vector<PointCL<T>>& vec)
{
	bool b = false;
	static std::string loc = __FUNCTION__;

	if (device < m_Devices.size())
		if (!(b = m_Devices[device]->m_Wrapper.WriteBuffer(m_PointsBufferName, reinterpret_cast<void*>(vec.data()), SizeOf(vec))))
		{ ErrorStr(loc, "Failed to write points buffer", m_Devices[device].get()); }

	return b;
}

#ifdef TEST_CL
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::WriteRandomPoints(size_t device)
{
	size_t size = IterGridKernelCount();
	vector<PointCL<T>> vec(size);

	for (int i = 0; i < size; i++)
	{
		vec[i].m_X = m_Rand[0].Frand11<T>();
		vec[i].m_Y = m_Rand[0].Frand11<T>();
		vec[i].m_Z = 0;
		vec[i].m_ColorX = m_Rand[0].Frand01<T>();
		vec[i].m_LastXfUsed = 0;
	}

	return WritePoints(device, vec);
}
#endif

/// <summary>
/// Resize the variation state vector to hold all of the variation state variables across all variations
/// in the ember, aligned to 16, for each thread that will be launched on a device.
/// </summary>
template <typename T, typename bucketT>
void RendererCL<T, bucketT>::InitStateVec()
{
	size_t count = 0, i = 0, j = 0, k = 0;

	while (auto xform = m_Ember.GetTotalXform(i++))
		for (j = 0; j < xform->TotalVariationCount(); j++)
			if (auto var = xform->GetVariation(j))
				count += var->StateParamCount() * sizeof(T);

	//Round to 16 and resize the buffer to be copied to OpenCL buffer here.
	auto igkc = IterGridKernelCount();
	size_t index = 0, count16 = ((count / 16) * 16) + (count % 16 > 0 ? 16 : 0);
	auto elcount = count16 / sizeof(T);
	m_VarStates.resize(igkc * elcount);

	if (count16)
	{
		for (k = 0; k < igkc; k++)
		{
			i = 0;
			index = k * elcount;

			while (auto xform = m_Ember.GetTotalXform(i++))
				for (j = 0; j < xform->TotalVariationCount(); j++)
					if (auto var = xform->GetVariation(j))
						var->InitStateVars(m_VarStates.data(), index);
		}
	}
}
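//For illustration (the variation counts are assumptions): with T = float and a total of three state
//parameters across all variations, count = 3 * sizeof(float) = 12 bytes, which rounds up to
//count16 = 16, so elcount = 16 / sizeof(float) = 4 elements. Each of the IterGridKernelCount()
//threads then owns its own 16-byte aligned slice of m_VarStates starting at k * elcount.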
/// <summary>
/// Set the percentage of a sub batch that should be executed in each thread per kernel call.
/// </summary>
/// <param name="f">The percentage as a value from 0.01 to 1.0.</param>
template <typename T, typename bucketT>
void RendererCL<T, bucketT>::SubBatchPercentPerThread(float f)
{
	m_SubBatchPercentPerThread = Clamp(f, 0.01f, 1.0f);
}

/// <summary>
/// Get the percentage of a sub batch that should be executed in each thread per kernel call.
/// </summary>
/// <returns>The percentage as a value from 0.01 to 1.0.</returns>
template <typename T, typename bucketT>
float RendererCL<T, bucketT>::SubBatchPercentPerThread() const { return m_SubBatchPercentPerThread; }

/// <summary>
/// Get the kernel string for the last built iter program.
/// </summary>
/// <returns>The string representation of the kernel for the last built iter program.</returns>
template <typename T, typename bucketT>
const string& RendererCL<T, bucketT>::IterKernel() const { return m_IterKernel; }

/// <summary>
/// Get the kernel string for the last built density filtering program.
/// </summary>
/// <returns>The string representation of the kernel for the last built density filtering program.</returns>
template <typename T, typename bucketT>
const string& RendererCL<T, bucketT>::DEKernel() const { return m_DEOpenCLKernelCreator.GaussianDEKernel(Supersample(), m_DensityFilterCL.m_FilterWidth); }

/// <summary>
/// Get the kernel string for the last built final accumulation program.
/// </summary>
/// <returns>The string representation of the kernel for the last built final accumulation program.</returns>
template <typename T, typename bucketT>
const string& RendererCL<T, bucketT>::FinalAccumKernel() const { return m_FinalAccumOpenCLKernelCreator.FinalAccumKernel(EarlyClip()); }

/// <summary>
/// Get a const reference to the devices this renderer will use.
/// Use this cautiously and do not use const_cast to manipulate the vector.
/// </summary>
/// <returns>A const reference to a vector of unique_ptr of devices.</returns>
template <typename T, typename bucketT>
const vector<unique_ptr<RendererClDevice>>& RendererCL<T, bucketT>::Devices() const { return m_Devices; }

/// <summary>
/// Virtual functions overridden from RendererCLBase.
/// </summary>

/// <summary>
/// Read the final image buffer from the primary device into the host side CPU buffer.
/// This must be called before saving the final output image to file.
/// </summary>
/// <param name="pixels">The host side buffer to read into.</param>
/// <returns>True if success, else false.</returns>
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::ReadFinal(v4F* pixels)
{
	if (pixels && !m_Devices.empty())
		return m_Devices[0]->m_Wrapper.ReadImage(m_FinalImageName, FinalRasW(), FinalRasH(), 0, m_Devices[0]->m_Wrapper.Shared(), pixels);

	return false;
}

/// <summary>
/// Clear the final image output buffer of the primary device with all zeroes by copying a host side buffer.
/// Slow, but never used because the final output image is always completely overwritten.
/// </summary>
/// <returns>True if success, else false.</returns>
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::ClearFinal()
{
	vector<v4F> v;
	static std::string loc = __FUNCTION__;

	if (!m_Devices.empty())
	{
		auto& wrapper = m_Devices[0]->m_Wrapper;
		uint index = wrapper.FindImageIndex(m_FinalImageName, wrapper.Shared());

		if (this->PrepFinalAccumVector(v))
		{
			if (!wrapper.WriteImage2D(index, wrapper.Shared(), FinalRasW(), FinalRasH(), 0, v.data()))
			{
				ErrorStr(loc, "Failed to clear final buffer", m_Devices[0].get());
				return false;
			}
			else
				return true;
		}
		else
			return false;
	}

	return false;
}
/// <summary>
/// Public virtual functions overridden from Renderer or RendererBase.
/// </summary>

/// <summary>
/// The amount of video RAM available on the first GPU to render with.
/// </summary>
/// <returns>An unsigned 64-bit integer specifying how much video memory is available.</returns>
template <typename T, typename bucketT>
size_t RendererCL<T, bucketT>::MemoryAvailable() { return Ok() ? m_Devices[0]->m_Wrapper.GlobalMemSize() : 0ULL; }

/// <summary>
/// Return whether OpenCL has been properly initialized.
/// </summary>
/// <returns>True if OpenCL has been properly initialized, else false.</returns>
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::Ok() const { return !m_Devices.empty() && m_Init; }

/// <summary>
/// The sub batch size for OpenCL will always be how many
/// iterations are run per kernel call. The caller can't
/// change this.
/// </summary>
/// <returns>The number of iterations run in a single kernel call.</returns>
template <typename T, typename bucketT>
size_t RendererCL<T, bucketT>::SubBatchSize() const { return IterCountPerGrid(); }

/// <summary>
/// The thread count for OpenCL is always considered to be 1, however
/// the kernel internally runs many threads.
/// </summary>
/// <returns>1</returns>
template <typename T, typename bucketT>
size_t RendererCL<T, bucketT>::ThreadCount() const { return 1; }

/// <summary>
/// Create the density filter in the base class and copy the filter values
/// to the corresponding OpenCL buffers on the primary device.
/// </summary>
/// <param name="newAlloc">True if a new filter instance was created, else false.</param>
/// <returns>True if success, else false.</returns>
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::CreateDEFilter(bool& newAlloc)
{
	bool b = true;
	static std::string loc = __FUNCTION__;

	if (!m_Devices.empty() && Renderer<T, bucketT>::CreateDEFilter(newAlloc))
	{
		//Copy coefs and widths here. Convert and copy the other filter params right before calling the filtering kernel.
		if (newAlloc)
		{
			auto& wrapper = m_Devices[0]->m_Wrapper;

			if (b && !(b = wrapper.AddAndWriteBuffer(m_DECoefsBufferName, reinterpret_cast<void*>(const_cast<bucketT*>(m_DensityFilter->Coefs())), m_DensityFilter->CoefsSizeBytes())))
				ErrorStr(loc, "Failed to set DE coefficients buffer", m_Devices[0].get());

			if (b && !(b = wrapper.AddAndWriteBuffer(m_DEWidthsBufferName, reinterpret_cast<void*>(const_cast<bucketT*>(m_DensityFilter->Widths())), m_DensityFilter->WidthsSizeBytes())))
				ErrorStr(loc, "Failed to set DE widths buffer", m_Devices[0].get());

			if (b && !(b = wrapper.AddAndWriteBuffer(m_DECoefIndicesBufferName, reinterpret_cast<void*>(const_cast<uint*>(m_DensityFilter->CoefIndices())), m_DensityFilter->CoefsIndicesSizeBytes())))
				ErrorStr(loc, "Failed to set DE coefficient indices buffer", m_Devices[0].get());
		}
	}
	else
		b = false;

	return b;
}

/// <summary>
/// Create the spatial filter in the base class and copy the filter values
/// to the corresponding OpenCL buffers on the primary device.
/// </summary>
/// <param name="newAlloc">True if a new filter instance was created, else false.</param>
/// <returns>True if success, else false.</returns>
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::CreateSpatialFilter(bool& newAlloc)
{
	bool b = true;
	static std::string loc = __FUNCTION__;

	if (!m_Devices.empty() && Renderer<T, bucketT>::CreateSpatialFilter(newAlloc))
	{
		if (newAlloc)
			if (!(b = m_Devices[0]->m_Wrapper.AddAndWriteBuffer(m_SpatialFilterCoefsBufferName, reinterpret_cast<void*>(m_SpatialFilter->Filter()), m_SpatialFilter->BufferSizeBytes())))
				ErrorStr(loc, "Failed to set spatial filter coefficients buffer", m_Devices[0].get());
	}
	else
		b = false;

	return b;
}

/// <summary>
/// Get the renderer type enum.
/// </summary>
/// <returns>eRendererType::OPENCL_RENDERER</returns>
template <typename T, typename bucketT>
eRendererType RendererCL<T, bucketT>::RendererType() const { return eRendererType::OPENCL_RENDERER; }

/// <summary>
/// Get whether the renderer uses a shared texture with OpenGL.
/// </summary>
/// <returns>True if shared, else false.</returns>
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::Shared() const { return m_Shared; }

/// <summary>
/// Clear the error report for this class as well as the OpenCLWrapper members of each device.
/// </summary>
template <typename T, typename bucketT>
void RendererCL<T, bucketT>::ClearErrorReport()
{
	EmberReport::ClearErrorReport();

	for (auto& device : m_Devices)
		device->m_Wrapper.ClearErrorReport();
}

/// <summary>
/// Concatenate and return the error report for this class and the
/// OpenCLWrapper member of each device as a single string.
/// </summary>
/// <returns>The concatenated error report string.</returns>
template <typename T, typename bucketT>
string RendererCL<T, bucketT>::ErrorReportString()
{
	auto s = EmberReport::ErrorReportString();

	for (auto& device : m_Devices)
		s += device->ErrorReportString();

	return s;
}

/// <summary>
/// Concatenate and return the error report for this class and the
/// OpenCLWrapper member of each device as a vector of strings.
/// </summary>
/// <returns>The concatenated error report vector of strings.</returns>
template <typename T, typename bucketT>
vector<string> RendererCL<T, bucketT>::ErrorReport()
{
	auto ours = EmberReport::ErrorReport();

	for (auto& device : m_Devices)
	{
		auto s = device->ErrorReport();
		ours.insert(ours.end(), s.begin(), s.end());
	}

	return ours;
}

/// <summary>
/// Set the vector of random contexts on every device.
/// Call the base, and reset the seeds vector.
/// Used on the command line when the user wants a specific set of seeds to start with to
/// produce an exact result. Mostly for debugging.
/// </summary>
/// <param name="randVec">The vector of random contexts to assign.</param>
/// <returns>True if the size of the vector matched the number of threads used for rendering and writing seeds to OpenCL succeeded, else false.</returns>
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::RandVec(vector<QTIsaac<ISAAC_SIZE, ISAAC_INT>>& randVec)
{
	bool b = Renderer<T, bucketT>::RandVec(randVec);
	static std::string loc = __FUNCTION__;

	if (!m_Devices.empty())
	{
		FillSeeds();

		for (size_t device = 0; device < m_Devices.size(); device++)
			if (b && !(b = m_Devices[device]->m_Wrapper.AddAndWriteBuffer(m_SeedsBufferName, reinterpret_cast<void*>(m_Seeds[device].data()), SizeOf(m_Seeds[device]))))
			{
				ErrorStr(loc, "Failed to set randoms buffer", m_Devices[device].get());
				break;
			}
	}
	else
		b = false;

	return b;
}

/// <summary>
/// Get whether any devices are from Nvidia.
/// </summary>
/// <returns>True if any devices are from Nvidia, else false.</returns>
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::AnyNvidia() const
{
	for (auto& dev : m_Devices)
		if (dev->Nvidia())
			return true;

	return false;
}

/// <summary>
/// Protected virtual functions overridden from Renderer.
/// </summary>

/// <summary>
/// Allocate all buffers required for running as well as the final
/// 2D image and perform some other initialization.
/// Note that only iteration-related buffers are allocated on secondary devices.
/// </summary>
/// <returns>True if success, else false.</returns>
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::Alloc(bool histOnly)
{
	if (!Ok())
		return false;

	EnterResize();
	m_XformsCL.resize(m_Ember.TotalXformCount());
	bool b = true;
	size_t size = SuperSize() * sizeof(v4bT);//Size of histogram and density filter buffer.
	static std::string loc = __FUNCTION__;
	auto& wrapper = m_Devices[0]->m_Wrapper;
	InitStateVec();
	m_IterCountPerKernel = size_t(m_SubBatchPercentPerThread * m_Ember.m_SubBatchSize);//This isn't the greatest place to put this, but it must be computed before the number of iters to do is computed in the base.

	if (b && !(b = wrapper.AddBuffer(m_DEFilterParamsBufferName, sizeof(m_DensityFilterCL)))) { ErrorStr(loc, "Failed to set DE filter parameters buffer", m_Devices[0].get()); }

	if (b && !(b = wrapper.AddBuffer(m_SpatialFilterParamsBufferName, sizeof(m_SpatialFilterCL)))) { ErrorStr(loc, "Failed to set spatial filter parameters buffer", m_Devices[0].get()); }

	if (b && !(b = wrapper.AddBuffer(m_CurvesCsaName, SizeOf(m_Csa)))) { ErrorStr(loc, "Failed to set curves buffer", m_Devices[0].get()); }

	if (b && !(b = wrapper.AddBuffer(m_AccumBufferName, size))) { ErrorStr(loc, "Failed to set accum buffer", m_Devices[0].get()); }
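	//Summary of the split above/below, for illustration: the filter parameter, curves and accumulator
	//buffers are only used by density filtering and final accumulation, so they live solely on the
	//primary device, while the per-device loop below replicates everything iteration needs (ember,
	//xforms, distributions, points, histogram) so each device can iterate independently.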
	for (auto& device : m_Devices)
	{
		if (b && !(b = device->m_Wrapper.AddBuffer(m_EmberBufferName, sizeof(m_EmberCL)))) { ErrorStr(loc, "Failed to set ember buffer", device.get()); break; }

		if (b && !(b = device->m_Wrapper.AddBuffer(m_XformsBufferName, SizeOf(m_XformsCL)))) { ErrorStr(loc, "Failed to set xforms buffer", device.get()); break; }

		if (b && !(b = device->m_Wrapper.AddBuffer(m_ParVarsBufferName, 128 * sizeof(T)))) { ErrorStr(loc, "Failed to set parametric variations buffer", device.get()); break; }//Will be resized with the needed amount later.

		if (b && !(b = device->m_Wrapper.AddBuffer(m_DistBufferName, CHOOSE_XFORM_GRAIN))) { ErrorStr(loc, "Failed to set xforms distribution buffer", device.get()); break; }//Will be resized for xaos.

		if (b && !(b = device->m_Wrapper.AddBuffer(m_CarToRasBufferName, sizeof(m_CarToRasCL)))) { ErrorStr(loc, "Failed to set cartesian to raster buffer", device.get()); break; }

		if (b && !(b = device->m_Wrapper.AddBuffer(m_HistBufferName, size))) { ErrorStr(loc, "Failed to set histogram buffer", device.get()); break; }//Histogram. Will memset to zero later.

		if (b && !(b = device->m_Wrapper.AddBuffer(m_PointsBufferName, IterGridKernelCount() * sizeof(PointCL<T>)))) { ErrorStr(loc, "Failed to set points buffer", device.get()); break; }//Points between iter calls.

#ifdef KNL_USE_GLOBAL_CONSEC

		if (b && !(b = device->m_Wrapper.AddBuffer(m_ConsecBufferName, IterGridKernelCount() * sizeof(cl_uchar)))) { ErrorStr(loc, "Failed to set consec buffer", device.get()); break; }//Global sequence.

#endif

		if (m_VarStates.size())
			if (b && !(b = device->m_Wrapper.AddBuffer(m_VarStateBufferName, SizeOf(m_VarStates)))) { ErrorStr(loc, "Failed to set variation state buffer", device.get()); break; }//Variation state between iter calls.

		//Global shared is allocated once and written when building the kernel.
	}

	if (m_Devices.size() > 1)
		b = CreateHostBuffer();

	LeaveResize();

	if (b && !(b = SetOutputTexture(m_OutputTexID))) { ErrorStr(loc, "Failed to set output texture", m_Devices[0].get()); }

	return b;
}

/// <summary>
/// Clear OpenCL histogram on all devices and/or density filtering buffer on the primary device to all zeroes.
/// </summary>
/// <param name="resetHist">Clear histogram if true, else don't.</param>
/// <param name="resetAccum">Clear density filtering buffer if true, else don't.</param>
/// <returns>True if success, else false.</returns>
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::ResetBuckets(bool resetHist, bool resetAccum)
{
	bool b = true;

	if (resetHist)
		b &= ClearHist();

	if (resetAccum)
		b &= ClearAccum();

	return b;
}

/// <summary>
/// Perform log scale density filtering on the primary device.
/// </summary>
/// <param name="forceOutput">Whether this output was forced due to an interactive render.</param>
/// <returns>True if success and not aborted, else false.</returns>
template <typename T, typename bucketT>
eRenderStatus RendererCL<T, bucketT>::LogScaleDensityFilter(bool forceOutput)
{
	return RunLogScaleFilter();
}

/// <summary>
/// Run gaussian density estimation filtering on the primary device.
/// </summary>
/// <returns>True if success and not aborted, else false.</returns>
template <typename T, typename bucketT>
eRenderStatus RendererCL<T, bucketT>::GaussianDensityFilter()
{
	//This commented section is for debugging density filtering by making it run on the CPU
	//then copying the results back to the GPU.
	//if (ReadHist())
	//{
	//	uint accumLength = SuperSize() * sizeof(glm::detail::tvec4<bucketT, glm::defaultp>);
	//	const char* loc = __FUNCTION__;
	//
	//	Renderer<T, bucketT>::ResetBuckets(false, true);
	//	Renderer<T, bucketT>::GaussianDensityFilter();
	//
	//	if (!m_Wrapper.WriteBuffer(m_AccumBufferName, AccumulatorBuckets(), accumLength)) { AddToReport(loc); return RENDER_ERROR; }
	//	return RENDER_OK;
	//}
	//else
	//	return RENDER_ERROR;
	//Timing t(4);
	eRenderStatus status = RunDensityFilter();
	//t.Toc(__FUNCTION__ " RunKernel()");
	return status;
}

/// <summary>
/// Run final accumulation on the primary device.
/// If the first device is shared with OpenGL, the output will remain in the OpenCL 2D image and no copying will take place.
/// If it is not shared, then the image will be copied into the pixels vector.
/// </summary>
/// <param name="pixels">The pixel vector to allocate and store the final image in.</param>
/// <param name="finalOffset">Offset in the buffer to store the pixels to.</param>
/// <returns>True if not prematurely aborted, else false.</returns>
template <typename T, typename bucketT>
eRenderStatus RendererCL<T, bucketT>::AccumulatorToFinalImage(vector<v4F>& pixels, size_t finalOffset)
{
	auto status = RunFinalAccum();

	if (status == eRenderStatus::RENDER_OK && !m_Devices.empty() && !m_Devices[0]->m_Wrapper.Shared())
	{
		if (PrepFinalAccumVector(pixels))
		{
			auto p = pixels.data();
			p += finalOffset;

			if (!ReadFinal(p))
				status = eRenderStatus::RENDER_ERROR;
		}
		else
			status = eRenderStatus::RENDER_ERROR;
	}

	return status;
}

/// <summary>
/// Run the iteration algorithm for the specified number of iterations, splitting the work
/// across devices.
/// This is only called after all other setup has been done.
/// This will recompile the OpenCL program on every device if this ember differs significantly
/// from the previous run.
/// Note that the bad value count is not recorded when running with OpenCL. If it's
/// needed, run on the CPU.
/// </summary>
/// <param name="iterCount">The number of iterations to run.</param>
/// <param name="temporalSample">The temporal sample within the current pass this is running for.</param>
/// <returns>Rendering statistics.</returns>
template <typename T, typename bucketT>
EmberStats RendererCL<T, bucketT>::Iterate(size_t iterCount, size_t temporalSample)
{
	EmberStats stats;//Do not record bad vals with the GPU. If the user needs to investigate bad vals, use the CPU.
	static std::string loc = __FUNCTION__;
	bool& b = stats.m_Success;

	//Only need to do this once at the beginning of a new render. Last iter will always be 0 at the beginning of a full render or temporal sample.
	if (m_LastIter == 0)
	{
		ConvertEmber(m_Ember, m_EmberCL, m_XformsCL);
		ConvertCarToRas(CoordMap());

		//Rebuilding is expensive, so only do it if it's required.
		if (IterOpenCLKernelCreator<T>::IsBuildRequired(m_Ember, m_LastBuiltEmber, m_OptAffine))
			b = BuildIterProgramForEmber(true);

		if (b)
		{
			//Setup buffers on all devices.
			for (auto& device : m_Devices)
			{
				auto& wrapper = device->m_Wrapper;

				if (b && !(b = wrapper.WriteBuffer(m_EmberBufferName, reinterpret_cast<void*>(&m_EmberCL), sizeof(m_EmberCL)))) { ErrorStr(loc, "Write ember buffer failed", device.get()); break; }

				if (b && !(b = wrapper.WriteBuffer(m_XformsBufferName, reinterpret_cast<void*>(m_XformsCL.data()), SizeOf(m_XformsCL)))) { ErrorStr(loc, "Write xforms buffer failed", device.get()); break; }

				if (b && !(b = wrapper.AddAndWriteBuffer(m_DistBufferName, reinterpret_cast<void*>(const_cast<byte*>(XformDistributions())), XformDistributionsSize())))//Will be resized for xaos.
				{ ErrorStr(loc, "Write xforms distribution buffer failed", device.get()); break; }

				if (b && !(b = wrapper.WriteBuffer(m_CarToRasBufferName, reinterpret_cast<void*>(&m_CarToRasCL), sizeof(m_CarToRasCL)))) { ErrorStr(loc, "Write cartesian to raster buffer failed", device.get()); break; }

				if (m_VarStates.size())
					if (b && !(b = wrapper.AddAndWriteBuffer(m_VarStateBufferName, reinterpret_cast<void*>(m_VarStates.data()), SizeOf(m_VarStates)))) { ErrorStr(loc, "Write variation state buffer failed", device.get()); break; }

				if (b && !(b = wrapper.AddAndWriteImage("Palette", CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, m_PaletteFormat, m_Dmap.Size(), 1, 0, m_Dmap.m_Entries.data()))) { ErrorStr(loc, "Write palette buffer failed", device.get()); break; }

				if (b)
				{
					IterOpenCLKernelCreator<T>::ParVarIndexDefines(m_Ember, m_Params, true, false);//Always do this to get the values (but no string), regardless of whether a rebuild is necessary.

					//Don't know the size of the parametric variations parameters buffer until the ember is examined.
					//So set it up right before the run.
					if (!m_Params.second.empty())
					{
						if (!wrapper.AddAndWriteBuffer(m_ParVarsBufferName, m_Params.second.data(), SizeOf(m_Params.second))) { ErrorStr(loc, "Write parametric variations buffer failed", device.get()); break; }
					}
				}
				else
					break;
			}
		}
	}

	if (b)
	{
		m_IterTimer.Tic();//Tic() here to avoid including build time in iter time measurement.

		if (m_LastIter == 0 && m_ProcessAction != eProcessAction::KEEP_ITERATING)//Only reset the call count at the beginning of a new render. Do not reset on KEEP_ITERATING.
			for (auto& dev : m_Devices)
				dev->m_Calls = 0;

		b = RunIter(iterCount, temporalSample, stats.m_Iters);
		stats.m_IterMs = m_IterTimer.Toc();
	}
	else
	{
		ErrorStr(loc, "Iteration failed", nullptr);
	}

	return stats;
}
/// <summary>
/// Private functions for making and running OpenCL programs.
/// </summary>

/// <summary>
/// Build the iteration program on every device for the current ember.
/// This is parallelized by placing the build for each device on its own thread.
/// </summary>
/// <param name="doAccum">Whether to build in accumulation, only for debugging. Default: true.</param>
/// <returns>True if successful for all devices, else false.</returns>
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::BuildIterProgramForEmber(bool doAccum)
{
	//Timing t;
	bool b = !m_Devices.empty();
	static std::string loc = __FUNCTION__;
	IterOpenCLKernelCreator<T>::ParVarIndexDefines(m_Ember, m_Params, false, true);//Do with string and no vals.
	IterOpenCLKernelCreator<T>::SharedDataIndexDefines(m_Ember, m_GlobalShared, true, true);//Do with string and vals only once on build since it won't change until another build occurs.

	if (b)
	{
		m_CompileBegun();
		m_IterKernel = m_IterOpenCLKernelCreator.CreateIterKernelString(m_Ember, m_Params.first, m_GlobalShared.first, m_OptAffine, m_LockAccum, doAccum);
		//cout << "Building: " << "\n" << iterProgram << "\n";
		vector<std::thread> threads;
		std::function<void(RendererClDevice*)> func = [&](RendererClDevice* dev)
		{
			if (!dev->m_Wrapper.AddProgram(m_IterOpenCLKernelCreator.IterEntryPoint(), m_IterKernel, m_IterOpenCLKernelCreator.IterEntryPoint(), m_DoublePrecision))
			{
				rlg l(m_ResizeCs);//Just use the resize CS for lack of a better one.
				b = false;
				ErrorStr(loc, "Building the following program failed\n"s + m_IterKernel, dev);
			}
			else if (!m_GlobalShared.second.empty())
			{
				if (!dev->m_Wrapper.AddAndWriteBuffer(m_GlobalSharedBufferName, m_GlobalShared.second.data(), m_GlobalShared.second.size() * sizeof(m_GlobalShared.second[0])))
				{
					rlg l(m_ResizeCs);//Just use the resize CS for lack of a better one.
					b = false;
					ErrorStr(loc, "Adding global shared buffer failed", dev);
				}
			}
		};
		threads.reserve(m_Devices.size() - 1);

		for (size_t device = m_Devices.size() - 1; device >= 0 && device < m_Devices.size(); device--)//Check both extents because size_t will wrap.
		{
			if (device != 0)//Secondary devices on their own threads.
				threads.push_back(std::thread([&](RendererClDevice* dev) { func(dev); }, m_Devices[device].get()));
			else//Primary device on this thread.
				func(m_Devices[device].get());
		}

		Join(threads);

		if (b)
		{
			//t.Toc(__FUNCTION__ " program build");
			//cout << string(loc) << "():\nBuilding the following program succeeded: \n" << iterProgram << "\n";
			m_LastBuiltEmber = m_Ember;
		}
	}

	return b;
}

/// <summary>
/// Run the iteration kernel on all devices.
/// Fusing on the CPU is done once per sub batch, usually 10,000 iters. Here,
/// the same fusing frequency is kept, but it is done per kernel thread.
/// </summary>
/// <param name="iterCount">The number of iterations to run.</param>
/// <param name="temporalSample">The temporal sample this is running for.</param>
/// <param name="itersRan">The storage for the number of iterations ran.</param>
/// <returns>True if success, else false.</returns>
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::RunIter(size_t iterCount, size_t temporalSample, size_t& itersRan)
{
	//Timing t;//, t2(4);
	if (iterCount == 0)//In rare cases this can happen in the interactive renderer, so just assume it's finished iterating to avoid dividing by zero below.
		return true;

	bool success = !m_Devices.empty();
	uint histSuperSize = uint(SuperSize());
	size_t launches = size_t(ceil(double(iterCount) / IterCountPerGrid()));
	static std::string loc = __FUNCTION__;
	vector<std::thread> threadVec;
	std::atomic<size_t> atomLaunchesRan;
	std::atomic<intmax_t> atomItersRan, atomItersRemaining;
	size_t adjustedIterCountPerKernel = m_IterCountPerKernel;
	itersRan = 0;
	atomItersRan.store(0);
	atomItersRemaining.store(iterCount);
	atomLaunchesRan.store(0);
	threadVec.reserve(m_Devices.size());

	//If a very small number of iters is requested, and multiple devices
	//are present, then try to spread the launches over the devices.
	//Otherwise, only one device would get used.
	//This also applies to when running a single device, and the requested iters per thread based on the
	//sub batch size is greater than is required to run all requested iters. This will reduce the iters
	//per thread to the appropriate value.
	//Note that this can lead to doing a few more iterations than requested
	//due to rounding up to ~32k kernel threads per launch.
	if (m_Devices.size() >= launches)
	{
		launches = m_Devices.size();
		adjustedIterCountPerKernel = size_t(std::ceil(std::ceil(double(iterCount) / m_Devices.size()) / IterGridKernelCount()));
	}

	size_t fuseFreq = Renderer<T, bucketT>::SubBatchSize() / adjustedIterCountPerKernel;//Use the base sbs to determine when to fuse.
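	//Fuse frequency worked example (numbers assumed): with a base sub batch size of 10,240 and
	//adjustedIterCountPerKernel = 256, fuseFreq = 40, so every 40th kernel call per device re-fuses,
	//i.e. runs FuseCount() throwaway iterations to settle each thread's point before plotting resumes.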
#ifdef TEST_CL
	m_Abort = false;
#endif
	std::function<void(size_t, int)> iterFunc = [&](size_t dev, int kernelIndex)
	{
		bool b = true;
		auto& wrapper = m_Devices[dev]->m_Wrapper;
		intmax_t itersRemaining = 0;

		//Timing looptimer;
		while (b && (atomLaunchesRan.fetch_add(1) + 1 <= launches) && ((itersRemaining = atomItersRemaining.load()) > 0) && success && !m_Abort)
		{
			//Check if the user wanted to suspend the process.
			while (Paused())
				std::this_thread::sleep_for(500ms);

			//looptimer.Tic();
			cl_uint argIndex = 0;
#ifdef TEST_CL
			uint fuse = 0;
#else
			uint fuse = uint((m_Devices[dev]->m_Calls % fuseFreq) == 0u ? FuseCount() : 0u);
#endif
			//Similar to what's done in the base class.
			//The number of iters per thread must be adjusted if they've requested fewer iters than are normally run in a grid (~32,768 kernel threads).
			uint iterCountPerKernel = std::min(uint(adjustedIterCountPerKernel), uint(ceil(double(itersRemaining) / IterGridKernelCount())));
			size_t iterCountThisLaunch = iterCountPerKernel * IterGridKernelWidth() * IterGridKernelHeight();
			//cout << "itersRemaining " << itersRemaining << ", iterCountPerKernel " << iterCountPerKernel << ", iterCountThisLaunch " << iterCountThisLaunch << "\n";

			if (b && !(b = wrapper.SetArg<uint>(kernelIndex, argIndex++, iterCountPerKernel))) { ErrorStr(loc, "Setting iter count argument failed", m_Devices[dev].get()); }//Number of iters for each thread to run.

			if (b && !(b = wrapper.SetArg<uint>(kernelIndex, argIndex++, fuse))) { ErrorStr(loc, "Setting fuse count argument failed", m_Devices[dev].get()); }//Number of iters to fuse.

			if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_SeedsBufferName))) { ErrorStr(loc, "Setting seeds buffer argument failed", m_Devices[dev].get()); }//Seeds.

			if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_EmberBufferName))) { ErrorStr(loc, "Setting ember buffer argument failed", m_Devices[dev].get()); }//Ember.
			if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_XformsBufferName))) { ErrorStr(loc, "Setting xforms buffer argument failed", m_Devices[dev].get()); }//Xforms.

			if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_ParVarsBufferName))) { ErrorStr(loc, "Setting parametric variations buffer argument failed", m_Devices[dev].get()); }//Parametric variation parameters.

			if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_GlobalSharedBufferName))) { ErrorStr(loc, "Setting global shared buffer argument failed", m_Devices[dev].get()); }//Global shared data.

			if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_DistBufferName))) { ErrorStr(loc, "Setting xforms distribution buffer argument failed", m_Devices[dev].get()); }//Xform distributions.

			if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_CarToRasBufferName))) { ErrorStr(loc, "Setting cartesian to raster buffer argument failed", m_Devices[dev].get()); }//Coordinate converter.

			if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_HistBufferName))) { ErrorStr(loc, "Setting histogram buffer argument failed", m_Devices[dev].get()); }//Histogram.

			if (!m_VarStates.empty())
				if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_VarStateBufferName))) { ErrorStr(loc, "Setting variation state buffer argument failed", m_Devices[dev].get()); }//Variation state.

			if (b && !(b = wrapper.SetArg<uint>(kernelIndex, argIndex++, histSuperSize))) { ErrorStr(loc, "Setting histogram size argument failed", m_Devices[dev].get()); }//Histogram size.

			if (b && !(b = wrapper.SetImageArg(kernelIndex, argIndex++, false, "Palette"))) { ErrorStr(loc, "Setting palette argument failed", m_Devices[dev].get()); }//Palette.

			if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_PointsBufferName))) { ErrorStr(loc, "Setting points buffer argument failed", m_Devices[dev].get()); }//Random start points.

#ifdef KNL_USE_GLOBAL_CONSEC

			if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_ConsecBufferName))) { ErrorStr(loc, "Setting consec buffer argument failed", m_Devices[dev].get()); }//Global sequence.

#endif

			if (b && !(b = wrapper.RunKernel(kernelIndex,
											 IterGridKernelWidth(),//Total grid dims.
											 IterGridKernelHeight(),
											 1,
											 IterBlockKernelWidth(),//Individual block dims.
											 IterBlockKernelHeight(),
											 1)))
			{
				success = false;
				ErrorStr(loc, "Error running iteration program", m_Devices[dev].get());
				atomLaunchesRan.fetch_sub(1);
				break;
			}

			atomItersRan.fetch_add(iterCountThisLaunch);
			atomItersRemaining.store(iterCount - atomItersRan.load());
			m_Devices[dev]->m_Calls++;

			if (m_Callback && !dev)//Will only do callback on the first device, however it will report the progress of all devices.
			{
				double percent = 100.0 *
								 double(
									 double(
										 double(
											 double(m_LastIter + atomItersRan.load()) / double(ItersPerTemporalSample())
										 ) + temporalSample
									 ) / double(TemporalSamples())
								 );
				double percentDiff = percent - m_LastIterPercent;
				double toc = m_ProgressTimer.Toc();

				if (percentDiff >= 10 || (toc > 1000 && percentDiff >= 1))//Call callback function if either 10% has passed, or one second (and 1%).
				{
					auto startingpercent = 100.0 * (m_LastIter / double(ItersPerTemporalSample()));//This is done to support incremental renders, starting from the percentage it left off on.
					auto currentpercent = percent - startingpercent;//Current percent in terms of starting percentage. So starting at 50% and progressing 5% will give a value of 5%, not 55%.
					auto etaMs = currentpercent == 0 ? 0 : (((100.0 - startingpercent) - currentpercent) / currentpercent) * m_RenderTimer.Toc();//Subtract startingpercent from 100% so that it's properly scaled, meaning rendering from 50% - 100% will be treated as 0% - 100%.
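					//ETA worked example (numbers assumed): resuming at startingpercent = 50 and reaching
					//percent = 55 gives currentpercent = 5, so etaMs = ((100 - 50) - 5) / 5 * elapsed,
					//i.e. nine times the elapsed time, since 45 of the remaining 50 points are left.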
					if (!m_Callback->ProgressFunc(m_Ember, m_ProgressParameter, percent, 0, etaMs))
						Abort();

					m_LastIterPercent = percent;
					m_ProgressTimer.Tic();
				}
			}

			//cout << "CL kernel call " << atomLaunchesRan << " took: " << looptimer.Toc() << endl;
		}
	};

	//Iterate backward to run all secondary devices on threads first, then finally the primary device on this thread.
	for (size_t device = m_Devices.size() - 1; device >= 0 && device < m_Devices.size(); device--)//Check both extents because size_t will wrap.
	{
		int index = m_Devices[device]->m_Wrapper.FindKernelIndex(m_IterOpenCLKernelCreator.IterEntryPoint());

		if (index == -1)
		{
			success = false;
			break;
		}

		//If animating, treat each temporal sample as a newly started render for fusing purposes.
		if (temporalSample > 0)
			m_Devices[device]->m_Calls = 0;

		if (device != 0)//Secondary devices on their own threads.
			threadVec.push_back(std::thread([&](size_t dev, int kernelIndex) { iterFunc(dev, kernelIndex); }, device, index));
		else//Primary device on this thread.
			iterFunc(device, index);
	}

	Join(threadVec);
	itersRan = atomItersRan.load();

	if (success && m_Devices.size() > 1)//Determine whether/when to sum histograms of secondary devices with the primary.
	{
		if (((TemporalSamples() == 1) || (temporalSample == TemporalSamples() - 1)) &&//If there are no temporal samples (not animating), or the current one is the last... (probably doesn't matter anymore since we never use multiple renders for a single frame when animating, instead each frame gets its own renderer).
				((m_LastIter + itersRan) >= ItersPerTemporalSample()))//...and the required number of iters for that sample have completed...
			if (success && !(success = SumDeviceHist())) { ErrorStr(loc, "Summing histograms failed", nullptr); }//...read the histograms from the secondary devices and sum them to the primary.
	}

	//t2.Toc(__FUNCTION__);
	return success;
}

/// <summary>
/// Run the log scale filter on the primary device.
/// </summary>
/// <returns>True if success, else false.</returns>
template <typename T, typename bucketT>
eRenderStatus RendererCL<T, bucketT>::RunLogScaleFilter()
{
	//Timing t(4);
	bool b = !m_Devices.empty();
	static std::string loc = __FUNCTION__;

	if (b)
	{
		auto& wrapper = m_Devices[0]->m_Wrapper;
		int kernelIndex = wrapper.FindKernelIndex(m_DEOpenCLKernelCreator.LogScaleAssignDEEntryPoint());

		if (kernelIndex != -1)
		{
			ConvertDensityFilter();
			cl_uint argIndex = 0;
			size_t blockW = m_Devices[0]->WarpSize();
			size_t blockH = 4;//A height of 4 seems to run the fastest.
			size_t gridW = m_DensityFilterCL.m_SuperRasW;
			size_t gridH = m_DensityFilterCL.m_SuperRasH;
			OpenCLWrapper::MakeEvenGridDims(blockW, blockH, gridW, gridH);

			if (b && !(b = wrapper.AddAndWriteBuffer(m_DEFilterParamsBufferName, reinterpret_cast<void*>(&m_DensityFilterCL), sizeof(m_DensityFilterCL)))) { ErrorStr(loc, "Adding DE filter parameters buffer failed", m_Devices[0].get()); }

			if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_HistBufferName))) { ErrorStr(loc, "Setting histogram buffer argument failed", m_Devices[0].get()); }//Histogram.

			if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_AccumBufferName))) { ErrorStr(loc, "Setting accumulator buffer argument failed", m_Devices[0].get()); }//Accumulator.

			if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_DEFilterParamsBufferName))) { ErrorStr(loc, "Setting DE filter parameters buffer argument failed", m_Devices[0].get()); }//DensityFilterCL.

			//t.Tic();
			if (b && !(b = wrapper.RunKernel(kernelIndex, gridW, gridH, 1, blockW, blockH, 1))) { ErrorStr(loc, "Running log scale program failed", m_Devices[0].get()); }

			//t.Toc(loc);

			if (b && m_Callback)
				if (!m_Callback->ProgressFunc(m_Ember, m_ProgressParameter, 100.0, 1, 0.0))
					Abort();
		}
		else
		{
			b = false;
			ErrorStr(loc, "Invalid kernel index for log scale program", m_Devices[0].get());
		}
	}

	return m_Abort ? eRenderStatus::RENDER_ABORT : (b ? eRenderStatus::RENDER_OK : eRenderStatus::RENDER_ERROR);
}
/// <summary>
/// Run the Gaussian density filter on the primary device.
/// Method 7: Each block processes a 16x16(AMD) or 24x24(Nvidia) block and exits. No column or row advancements happen.
/// </summary>
/// <returns>True if success and not aborted, else false.</returns>
template <typename T, typename bucketT>
eRenderStatus RendererCL<T, bucketT>::RunDensityFilter()
{
	bool b = !m_Devices.empty();
	Timing t(4);// , t2(4);
	ConvertDensityFilter();
	int kernelIndex = MakeAndGetDensityFilterProgram(Supersample(), m_DensityFilterCL.m_FilterWidth);
	const char* loc = __FUNCTION__;

	if (kernelIndex != -1)
	{
		uint ssm1 = m_DensityFilterCL.m_Supersample - 1;
		uint leftBound = ssm1;
		uint rightBound = m_DensityFilterCL.m_SuperRasW - ssm1;
		uint topBound = leftBound;
		uint botBound = m_DensityFilterCL.m_SuperRasH - ssm1;
		size_t gridW = rightBound - leftBound;
		size_t gridH = botBound - topBound;
		size_t blockSizeW = m_MaxDEBlockSizeW;
		size_t blockSizeH = m_MaxDEBlockSizeH;
		double fw2 = m_DensityFilterCL.m_FilterWidth * 2.0;
		auto& wrapper = m_Devices[0]->m_Wrapper;
		//Can't just blindly pass dimensions in. Must adjust them first to evenly divide the thread count
		//into the total grid dimensions.
		OpenCLWrapper::MakeEvenGridDims(blockSizeW, blockSizeH, gridW, gridH);
		//t.Tic();
		//The classic problem with performing DE on adjacent pixels is that the filter will overlap.
		//This can be solved in 2 ways. One is to use atomics, which is unacceptably slow.
		//The other is to process the entire image in multiple passes, where each pass processes blocks of pixels
		//that are far enough apart such that their filters do not overlap.
		//Do the latter.
		//Gap is in terms of blocks and specifies how many blocks must separate two blocks running at the same time.
		uint gapW = uint(ceil(fw2 / blockSizeW));
		uint chunkSizeW = gapW + 1;//Chunk size is also in terms of blocks and is one block (the one running) plus the gap to the right of it.
		uint gapH = uint(ceil(fw2 / blockSizeH));
		uint chunkSizeH = gapH + 1;//Chunk size is also in terms of blocks and is one block (the one running) plus the gap below it.
		double totalChunks = chunkSizeW * chunkSizeH;
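		//Worked example (filter width assumed): with m_FilterWidth = 9 and 16x16 blocks, fw2 = 18,
		//gapW = gapH = ceil(18 / 16) = 2, so chunkSizeW = chunkSizeH = 3 and totalChunks = 9.
		//The image is then covered in 9 passes, each running every third block in x and y so that
		//no two concurrently running blocks have overlapping filters.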
		if (b && !(b = wrapper.AddAndWriteBuffer(m_DEFilterParamsBufferName, reinterpret_cast<void*>(&m_DensityFilterCL), sizeof(m_DensityFilterCL)))) { ErrorStr(loc, "Writing DE filter parameters buffer failed", m_Devices[0].get()); }

#ifdef ROW_ONLY_DE
		blockSizeW = 64;//These *must* both be divisible by 16 or else pixels will go missing.
		blockSizeH = 1;
		gapW = (uint)ceil((m_DensityFilterCL.m_FilterWidth * 2.0) / (double)blockSizeW);
		chunkSizeW = gapW + 1;
		gapH = (uint)ceil((m_DensityFilterCL.m_FilterWidth * 2.0) / (double)32);//Block height is 1, but iterates over 32 rows.
		chunkSizeH = gapH + 1;
		totalChunks = chunkSizeW * chunkSizeH;
		OpenCLWrapper::MakeEvenGridDims(blockSizeW, blockSizeH, gridW, gridH);
		gridW /= chunkSizeW;
		gridH /= chunkSizeH;

		for (uint rowChunk = 0; b && !m_Abort && rowChunk < chunkSizeH; rowChunk++)
		{
			for (uint colChunk = 0; b && !m_Abort && colChunk < chunkSizeW; colChunk++)
			{
				//t2.Tic();
				if (b && !(b = RunDensityFilterPrivate(kernelIndex, gridW, gridH, blockSizeW, blockSizeH, chunkSizeW, chunkSizeH, colChunk, rowChunk)))
				{
					m_Abort = true;
					AddToReport(loc);
				}

				//t2.Toc(loc);

				if (b && m_Callback)
				{
					double percent = (double((rowChunk * chunkSizeW) + (colChunk + 1)) / totalChunks) * 100.0;
					double etaMs = ((100.0 - percent) / percent) * t.Toc();

					if (!m_Callback->ProgressFunc(m_Ember, m_ProgressParameter, percent, 1, etaMs))
						Abort();
				}
			}
		}

#else
		gridW /= chunkSizeW;//Grid must be scaled down by the number of chunks.
		gridH /= chunkSizeH;
		OpenCLWrapper::MakeEvenGridDims(blockSizeW, blockSizeH, gridW, gridH);

		for (uint rowChunkPass = 0; b && !m_Abort && rowChunkPass < chunkSizeH; rowChunkPass++)//Number of vertical passes.
		{
			for (uint colChunkPass = 0; b && !m_Abort && colChunkPass < chunkSizeW; colChunkPass++)//Number of horizontal passes.
			{
				//t2.Tic();
				if (b && !(b = RunDensityFilterPrivate(kernelIndex, gridW, gridH, blockSizeW, blockSizeH, chunkSizeW, chunkSizeH, colChunkPass, rowChunkPass)))
				{ ErrorStr(loc, "Running DE filter program for row chunk "s + std::to_string(rowChunkPass) + ", col chunk "s + std::to_string(colChunkPass) + " failed", m_Devices[0].get()); }

				//t2.Toc(loc);

				if (b && m_Callback)
				{
					double percent = (double((rowChunkPass * chunkSizeW) + (colChunkPass + 1)) / totalChunks) * 100.0;
					double etaMs = ((100.0 - percent) / percent) * t.Toc();

					if (!m_Callback->ProgressFunc(m_Ember, m_ProgressParameter, percent, 1, etaMs))
						Abort();
				}
			}
		}

#endif

		if (b && m_Callback)
			if (!m_Callback->ProgressFunc(m_Ember, m_ProgressParameter, 100.0, 1, 0.0))
				Abort();

		//t2.Toc(__FUNCTION__ " all passes");
	}
	else
	{
		b = false;
		ErrorStr(loc, "Invalid kernel index for DE filter program", m_Devices[0].get());
	}

	return m_Abort ? eRenderStatus::RENDER_ABORT : (b ? eRenderStatus::RENDER_OK : eRenderStatus::RENDER_ERROR);
}

/// <summary>
/// Run final accumulation to the 2D output image on the primary device.
/// </summary>
/// <returns>True if success and not aborted, else false.</returns>
template <typename T, typename bucketT>
eRenderStatus RendererCL<T, bucketT>::RunFinalAccum()
{
	//Timing t(4);
	bool b = true;
	int accumKernelIndex = MakeAndGetFinalAccumProgram();
	cl_uint argIndex;
	size_t gridW;
	size_t gridH;
	size_t blockW;
	size_t blockH;
	uint curvesSet = m_CurvesSet ? 1 : 0;
	static std::string loc = __FUNCTION__;

	if (!m_Abort && accumKernelIndex != -1)
	{
		auto& wrapper = m_Devices[0]->m_Wrapper;
		//This is needed with or without early clip.
		ConvertSpatialFilter();

		if (b && !(b = wrapper.AddAndWriteBuffer(m_SpatialFilterParamsBufferName, reinterpret_cast<void*>(&m_SpatialFilterCL), sizeof(m_SpatialFilterCL)))) { ErrorStr(loc, "Adding spatial filter parameters buffer failed", m_Devices[0].get()); }

		if (b && !(b = wrapper.AddAndWriteBuffer(m_CurvesCsaName, m_Csa.data(), SizeOf(m_Csa)))) { ErrorStr(loc, "Adding curves buffer failed", m_Devices[0].get()); }

		//Since early clip requires gamma correcting the entire accumulator first,
		//it can't be done inside of the normal final accumulation kernel, so
		//an additional kernel must be launched first.
		if (b && EarlyClip())
		{
			int gammaCorrectKernelIndex = MakeAndGetGammaCorrectionProgram();

			if (gammaCorrectKernelIndex != -1)
			{
				argIndex = 0;
				blockW = m_Devices[0]->WarpSize();
				blockH = 4;//A height of 4 seems to run the fastest.
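				//WarpSize() typically returns the hardware scheduling width: commonly 32 on Nvidia
				//and 64 on AMD. Using it as the block width with a height of 4 keeps each work group
				//a whole number of warps/wavefronts; the height of 4 is an empirical choice.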
				gridW = m_SpatialFilterCL.m_SuperRasW;//Using super dimensions because this processes the density filtering buffer.
				gridH = m_SpatialFilterCL.m_SuperRasH;
				OpenCLWrapper::MakeEvenGridDims(blockW, blockH, gridW, gridH);

				if (b && !(b = wrapper.SetBufferArg(gammaCorrectKernelIndex, argIndex++, m_AccumBufferName))) { ErrorStr(loc, "Setting early clip accumulator buffer argument failed", m_Devices[0].get()); }//Accumulator.

				if (b && !(b = wrapper.SetBufferArg(gammaCorrectKernelIndex, argIndex++, m_SpatialFilterParamsBufferName))) { ErrorStr(loc, "Setting early clip spatial filter parameters buffer argument failed", m_Devices[0].get()); }//SpatialFilterCL.

				if (b && !(b = wrapper.RunKernel(gammaCorrectKernelIndex, gridW, gridH, 1, blockW, blockH, 1))) { ErrorStr(loc, "Running early clip gamma correction program failed", m_Devices[0].get()); }
			}
			else
			{
				b = false;
				ErrorStr(loc, "Invalid kernel index for early clip gamma correction program", m_Devices[0].get());
			}
		}

		argIndex = 0;
		blockW = m_Devices[0]->WarpSize();
		blockH = 4;//A height of 4 seems to run the fastest.
		gridW = m_SpatialFilterCL.m_FinalRasW;
		gridH = m_SpatialFilterCL.m_FinalRasH;
		OpenCLWrapper::MakeEvenGridDims(blockW, blockH, gridW, gridH);

		if (b && !(b = wrapper.SetBufferArg(accumKernelIndex, argIndex++, m_AccumBufferName))) { ErrorStr(loc, "Setting accumulator buffer argument failed", m_Devices[0].get()); }//Accumulator.

		if (b && !(b = wrapper.SetImageArg(accumKernelIndex, argIndex++, wrapper.Shared(), m_FinalImageName))) { ErrorStr(loc, "Setting accumulator final image buffer argument failed", m_Devices[0].get()); }//Final image.

		if (b && !(b = wrapper.SetBufferArg(accumKernelIndex, argIndex++, m_SpatialFilterParamsBufferName))) { ErrorStr(loc, "Setting spatial filter parameters buffer argument failed", m_Devices[0].get()); }//SpatialFilterCL.

		if (b && !(b = wrapper.SetBufferArg(accumKernelIndex, argIndex++, m_SpatialFilterCoefsBufferName))) { ErrorStr(loc, "Setting spatial filter coefficients buffer argument failed", m_Devices[0].get()); }//Filter coefs.

		if (b && !(b = wrapper.SetBufferArg(accumKernelIndex, argIndex++, m_CurvesCsaName))) { ErrorStr(loc, "Setting curves buffer argument failed", m_Devices[0].get()); }//Curve points.

		if (b && !(b = wrapper.SetArg<uint>(accumKernelIndex, argIndex++, curvesSet))) { ErrorStr(loc, "Setting curves boolean argument failed", m_Devices[0].get()); }//Do curves.

		if (b && wrapper.Shared())
			if (b && !(b = wrapper.EnqueueAcquireGLObjects(m_FinalImageName))) { ErrorStr(loc, "Acquiring OpenGL texture failed", m_Devices[0].get()); }

		if (b && !(b = wrapper.RunKernel(accumKernelIndex, gridW, gridH, 1, blockW, blockH, 1))) { ErrorStr(loc, "Running final accumulation program failed", m_Devices[0].get()); }

		if (b && wrapper.Shared())
			if (b && !(b = wrapper.EnqueueReleaseGLObjects(m_FinalImageName))) { ErrorStr(loc, "Releasing OpenGL texture failed", m_Devices[0].get()); }

		//t.Toc((char*)loc);
	}
	else
	{
		b = false;
		ErrorStr(loc, "Invalid kernel index for final accumulation program", m_Devices[0].get());
	}

	return b ? eRenderStatus::RENDER_OK : eRenderStatus::RENDER_ERROR;
}

/// <summary>
/// Zeroize a buffer of the specified size on the specified device.
/// </summary>
/// <param name="device">The index of the device whose buffer will be cleared.</param>
/// <param name="bufferName">Name of the buffer to clear.</param>
/// <param name="width">Width in elements.</param>
/// <param name="height">Height in elements.</param>
/// <param name="elementSize">Size of each element.</param>
/// <returns>True if success, else false.</returns>
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::ClearBuffer(size_t device, const string& bufferName, uint width, uint height, uint elementSize)
{
	bool b = false;

	if (device < m_Devices.size())
	{
		auto& wrapper = m_Devices[device]->m_Wrapper;
		int kernelIndex = wrapper.FindKernelIndex(m_IterOpenCLKernelCreator.ZeroizeEntryPoint());
		cl_uint argIndex = 0;
		static std::string loc = __FUNCTION__;

		if (kernelIndex != -1)
		{
			size_t blockW = m_Devices[device]->Nvidia() ? 32 : 16;//Max work group size is 256 on AMD, which means 16x16.
			size_t blockH = m_Devices[device]->Nvidia() ? 32 : 16;
			size_t gridW = width * elementSize;
			size_t gridH = height;
			b = true;
			OpenCLWrapper::MakeEvenGridDims(blockW, blockH, gridW, gridH);

			if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, bufferName))) { ErrorStr(loc, "Setting clear buffer argument failed", m_Devices[device].get()); }//Buffer of bytes.

			if (b && !(b = wrapper.SetArg<uint>(kernelIndex, argIndex++, width * elementSize))) { ErrorStr(loc, "Setting clear buffer width argument failed", m_Devices[device].get()); }//Width.

			if (b && !(b = wrapper.SetArg<uint>(kernelIndex, argIndex++, height))) { ErrorStr(loc, "Setting clear buffer height argument failed", m_Devices[device].get()); }//Height.

			if (b && !(b = wrapper.RunKernel(kernelIndex, gridW, gridH, 1, blockW, blockH, 1))) { ErrorStr(loc, "Running clear buffer program failed", m_Devices[device].get()); }
		}
		else
		{
			ErrorStr(loc, "Invalid kernel index for clear buffer program", m_Devices[device].get());
		}
	}

	return b;
}
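//Grid sizing note for ClearBuffer (dimensions assumed for illustration): the zeroize kernel treats
//the buffer as rows of bytes, so clearing a 3840x2160 histogram of 16-byte v4bT buckets launches a
//grid of gridW = 3840 * 16 = 61,440 byte columns by gridH = 2160 rows, before MakeEvenGridDims()
//rounds the dimensions to multiples of the block size.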
/// <summary>
/// Private wrapper around calling the Gaussian density filtering kernel.
/// The parameters are very specific to how the kernel is internally implemented.
/// </summary>
/// <param name="kernelIndex">Index of the kernel to call.</param>
/// <param name="gridW">Grid width.</param>
/// <param name="gridH">Grid height.</param>
/// <param name="blockW">Block width.</param>
/// <param name="blockH">Block height.</param>
/// <param name="chunkSizeW">Chunk size width (gapW + 1).</param>
/// <param name="chunkSizeH">Chunk size height (gapH + 1).</param>
/// <param name="colChunkPass">The current horizontal pass index.</param>
/// <param name="rowChunkPass">The current vertical pass index.</param>
/// <returns>True if success, else false.</returns>
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::RunDensityFilterPrivate(size_t kernelIndex, size_t gridW, size_t gridH, size_t blockW, size_t blockH, uint chunkSizeW, uint chunkSizeH, uint colChunkPass, uint rowChunkPass)
{
	//Timing t(4);
	bool b = true;
	cl_uint argIndex = 0;
	static std::string loc = __FUNCTION__;

	if (!m_Devices.empty())
	{
		auto& wrapper = m_Devices[0]->m_Wrapper;

		if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex, m_HistBufferName))) { ErrorStr(loc, "Setting histogram buffer argument failed", m_Devices[0].get()); }

		argIndex++;//Histogram.

		if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex, m_AccumBufferName))) { ErrorStr(loc, "Setting accumulator buffer argument failed", m_Devices[0].get()); }

		argIndex++;//Accumulator.

		if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex, m_DEFilterParamsBufferName))) { ErrorStr(loc, "Setting DE filter parameters buffer argument failed", m_Devices[0].get()); }

		argIndex++;//FlameDensityFilterCL.

		if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex, m_DECoefsBufferName))) { ErrorStr(loc, "Setting DE coefficients buffer argument failed", m_Devices[0].get()); }

		argIndex++;//Coefs.

		if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex, m_DEWidthsBufferName))) { ErrorStr(loc, "Setting DE widths buffer argument failed", m_Devices[0].get()); }

		argIndex++;//Widths.

		if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex, m_DECoefIndicesBufferName))) { ErrorStr(loc, "Setting DE coefficient indices buffer argument failed", m_Devices[0].get()); }

		argIndex++;//Coef indices.
		if (b && !(b = wrapper.SetArg(kernelIndex, argIndex, chunkSizeW))) { ErrorStr(loc, "Setting chunk size width argument failed", m_Devices[0].get()); } argIndex++;//Chunk size width (gapW + 1).

		if (b && !(b = wrapper.SetArg(kernelIndex, argIndex, chunkSizeH))) { ErrorStr(loc, "Setting chunk size height argument failed", m_Devices[0].get()); } argIndex++;//Chunk size height (gapH + 1).

		if (b && !(b = wrapper.SetArg(kernelIndex, argIndex, colChunkPass))) { ErrorStr(loc, "Setting col chunk pass argument failed", m_Devices[0].get()); } argIndex++;//Column chunk, horizontal pass.

		if (b && !(b = wrapper.SetArg(kernelIndex, argIndex, rowChunkPass))) { ErrorStr(loc, "Setting row chunk pass argument failed", m_Devices[0].get()); } argIndex++;//Row chunk, vertical pass.

		//t.Toc(__FUNCTION__ " set args");
		//t.Tic();
		if (b && !(b = wrapper.RunKernel(kernelIndex, gridW, gridH, 1, blockW, blockH, 1))) { ErrorStr(loc, "Running DE filter program failed", m_Devices[0].get()); }//Method 7, accumulating to temp box area.

		//t.Toc(__FUNCTION__ " RunKernel()");
		return b;
	}

	return false;
}

/// <summary>
/// Make the Gaussian density filter program on the primary device and return its index.
/// </summary>
/// <param name="ss">The supersample being used for the current ember</param>
/// <param name="filterWidth">Width of the gaussian filter</param>
/// <returns>The kernel index if successful, else -1.</returns>
template <typename T, typename bucketT>
int RendererCL<T, bucketT>::MakeAndGetDensityFilterProgram(size_t ss, uint filterWidth)
{
	int kernelIndex = -1;

	if (!m_Devices.empty())
	{
		auto& wrapper = m_Devices[0]->m_Wrapper;
		auto& deEntryPoint = m_DEOpenCLKernelCreator.GaussianDEEntryPoint(ss, filterWidth);
		const char* loc = __FUNCTION__;

		if ((kernelIndex = wrapper.FindKernelIndex(deEntryPoint)) == -1)//Has not been built yet.
		{
			auto& kernel = m_DEOpenCLKernelCreator.GaussianDEKernel(ss, filterWidth);

			if (wrapper.AddProgram(deEntryPoint, kernel, deEntryPoint, m_DoublePrecision))
				kernelIndex = wrapper.FindKernelIndex(deEntryPoint);//Try to find it again, it will be present if successfully built.
			else
				ErrorStr(loc, "Adding the DE filter program at "s + deEntryPoint + " failed to build:\n"s + kernel, m_Devices[0].get());
		}
	}

	return kernelIndex;
}

/// <summary>
/// Make the final accumulation program on the primary device and return its index.
/// </summary>
/// <returns>The kernel index if successful, else -1.</returns>
template <typename T, typename bucketT>
int RendererCL<T, bucketT>::MakeAndGetFinalAccumProgram()
{
	int kernelIndex = -1;

	if (!m_Devices.empty())
	{
		auto& wrapper = m_Devices[0]->m_Wrapper;
		auto& finalAccumEntryPoint = m_FinalAccumOpenCLKernelCreator.FinalAccumEntryPoint(EarlyClip());
		const char* loc = __FUNCTION__;

		if ((kernelIndex = wrapper.FindKernelIndex(finalAccumEntryPoint)) == -1)//Has not been built yet.
		{
			auto& kernel = m_FinalAccumOpenCLKernelCreator.FinalAccumKernel(EarlyClip());

			if (wrapper.AddProgram(finalAccumEntryPoint, kernel, finalAccumEntryPoint, m_DoublePrecision))
				kernelIndex = wrapper.FindKernelIndex(finalAccumEntryPoint);//Try to find it again, it will be present if successfully built.
			else
				ErrorStr(loc, "Adding final accumulation program "s + finalAccumEntryPoint + " failed"s, m_Devices[0].get());
		}
	}

	return kernelIndex;
}

/// <summary>
/// Make the gamma correction program on the primary device for early clipping and return its index.
/// </summary>
/// <returns>The kernel index if successful, else -1.</returns>
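/// <remarks>
/// Presumably used only on the early clip path, where gamma correction runs over the
/// supersampled accumulator (see the early clip branch of final accumulation above)
/// before the result is filtered down to final dimensions.
/// </remarks>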
template <typename T, typename bucketT>
int RendererCL<T, bucketT>::MakeAndGetGammaCorrectionProgram()
{
	if (!m_Devices.empty())
	{
		auto& wrapper = m_Devices[0]->m_Wrapper;
		auto& gammaEntryPoint = m_FinalAccumOpenCLKernelCreator.GammaCorrectionEntryPoint();
		int kernelIndex = wrapper.FindKernelIndex(gammaEntryPoint);
		static std::string loc = __FUNCTION__;

		if (kernelIndex == -1)//Has not been built yet.
		{
			auto& kernel = m_FinalAccumOpenCLKernelCreator.GammaCorrectionKernel();
			bool b = wrapper.AddProgram(gammaEntryPoint, kernel, gammaEntryPoint, m_DoublePrecision);

			if (b)
				kernelIndex = wrapper.FindKernelIndex(gammaEntryPoint);//Try to find it again, it will be present if successfully built.
			else
				ErrorStr(loc, "Adding gamma correction program "s + gammaEntryPoint + " failed"s, m_Devices[0].get());
		}

		return kernelIndex;
	}

	return -1;
}

/// <summary>
/// Create the ClBuffer HOST_PTR wrapper around the CPU histogram buffer.
/// This is only used with multiple devices, and therefore should only be called in such cases.
/// </summary>
/// <returns>True if success, else false.</returns>
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::CreateHostBuffer()
{
	bool b = true;
	size_t size = SuperSize() * sizeof(v4bT);//Size of histogram and density filter buffer.
	static std::string loc = __FUNCTION__;

	if ((b = Renderer<T, bucketT>::Alloc(true)))//Allocate the histogram memory to point this HOST_PTR buffer to, other buffers not needed.
	{
		if (b && !(b = m_Devices[0]->m_Wrapper.AddHostBuffer(m_HostBufferName, size, reinterpret_cast<void*>(HistBuckets()))))//Host side histogram for temporary use with multiple devices.
			ErrorStr(loc, "Creating OpenCL HOST_PTR buffer to point to host side histogram failed", m_Devices[0].get());
	}
	else
		ErrorStr(loc, "Allocating host side histogram failed", m_Devices[0].get());//Allocating histogram failed, something is seriously wrong.

	return b;
}

/// <summary>
/// Sum all histograms from the secondary devices with the histogram on the primary device.
/// This works by reading the histogram from those devices one at a time into the host side buffer, which
/// is just an OpenCL pointer to the CPU histogram to use it as a temp space.
/// Then pass that buffer to a kernel that sums it with the histogram on the primary device.
/// </summary>
/// <returns>True if success, else false.</returns>
template <typename T, typename bucketT>
bool RendererCL<T, bucketT>::SumDeviceHist()
{
	if (m_Devices.size() > 1)
	{
		//Timing t;
		bool b = true;
		auto& wrapper = m_Devices[0]->m_Wrapper;
		static std::string loc = __FUNCTION__;
		size_t blockW = m_Devices[0]->Nvidia() ? 32 : 16;//Max work group size is 256 on AMD, which means 16x16.
		size_t blockH = m_Devices[0]->Nvidia() ? 32 : 16;
		size_t gridW = SuperRasW();
		size_t gridH = SuperRasH();
		OpenCLWrapper::MakeEvenGridDims(blockW, blockH, gridW, gridH);
		int kernelIndex = wrapper.FindKernelIndex(m_IterOpenCLKernelCreator.SumHistEntryPoint());

		if ((b = (kernelIndex != -1)))
		{
			for (size_t device = 1; device < m_Devices.size(); device++)//All secondary devices.
			{
				//m_HostBufferName will have been created as a ClBuffer to wrap the CPU histogram buffer as a temp space.
				//So read into it, then pass to the kernel below to sum to the primary device's histogram.
				if ((b = (ReadHist(device) && ClearHist(device))))//Must clear hist on secondary devices after reading and summing because they'll be reused on a quality increase (KEEP_ITERATING).
				{
					cl_uint argIndex = 0;

					if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_HostBufferName))) { ErrorStr(loc, "Setting host buffer argument failed", m_Devices[device].get()); break; }//Source buffer of v4bT.
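					//The remaining arguments supply the destination histogram and its dimensions,
					//plus a flag telling the kernel to also clear the source (host) buffer once the
					//last device has been summed.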
					if (b && !(b = wrapper.SetBufferArg(kernelIndex, argIndex++, m_HistBufferName))) { ErrorStr(loc, "Setting histogram buffer argument failed", m_Devices[device].get()); break; }//Dest buffer of v4bT.

					if (b && !(b = wrapper.SetArg      (kernelIndex, argIndex++, uint(SuperRasW())))) { ErrorStr(loc, "Setting width argument failed", m_Devices[device].get()); break; }//Width in pixels.

					if (b && !(b = wrapper.SetArg      (kernelIndex, argIndex++, uint(SuperRasH())))) { ErrorStr(loc, "Setting height argument failed", m_Devices[device].get()); break; }//Height in pixels.

					if (b && !(b = wrapper.SetArg      (kernelIndex, argIndex++, (device == m_Devices.size() - 1) ? 1 : 0))) { ErrorStr(loc, "Setting clear argument failed", m_Devices[device].get()); break; }//Clear the source buffer on the last device.

					if (b && !(b = wrapper.RunKernel   (kernelIndex, gridW, gridH, 1, blockW, blockH, 1))) { ErrorStr(loc, "Running histogram sum program failed", m_Devices[device].get()); break; }
				}
				else
				{
					ErrorStr(loc, "Running histogram reading and clearing programs failed", m_Devices[device].get());
					break;
				}
			}
		}

		if (!b)
		{
			ErrorStr(loc, "Summing histograms from the secondary device(s) to the primary device failed", nullptr);
		}

		//t.Toc(loc);
		return b;
	}
	else
	{
		return m_Devices.size() == 1;
	}
}

/// <summary>
/// Private functions passing data to OpenCL programs.
/// </summary>

/// <summary>
/// Convert the currently used host side DensityFilter object into the DensityFilterCL member
/// for passing to OpenCL.
/// Some of the values are not populated when the filter object is null. This will be the case
/// when only log scaling is needed.
/// </summary>
template <typename T, typename bucketT>
void RendererCL<T, bucketT>::ConvertDensityFilter()
{
	m_DensityFilterCL.m_Supersample = uint(Supersample());
	m_DensityFilterCL.m_SuperRasW = uint(SuperRasW());
	m_DensityFilterCL.m_SuperRasH = uint(SuperRasH());
	m_DensityFilterCL.m_K1 = K1();
	m_DensityFilterCL.m_K2 = K2();

	if (m_DensityFilter.get())
	{
		m_DensityFilterCL.m_Curve = m_DensityFilter->Curve();
		m_DensityFilterCL.m_KernelSize = uint(m_DensityFilter->KernelSize());
		m_DensityFilterCL.m_MaxFilterIndex = uint(m_DensityFilter->MaxFilterIndex());
		m_DensityFilterCL.m_MaxFilteredCounts = uint(m_DensityFilter->MaxFilteredCounts());
		m_DensityFilterCL.m_FilterWidth = uint(m_DensityFilter->FilterWidth());
	}
}

/// <summary>
/// Convert the currently used host side SpatialFilter object into the SpatialFilterCL member
/// for passing to OpenCL.
/// </summary>
template <typename T, typename bucketT>
void RendererCL<T, bucketT>::ConvertSpatialFilter()
{
	bucketT g, linRange, vibrancy;
	Color<bucketT> background;

	if (m_SpatialFilter.get())
	{
		this->PrepFinalAccumVals(background, g, linRange, vibrancy);
		m_SpatialFilterCL.m_SuperRasW = uint(SuperRasW());
		m_SpatialFilterCL.m_SuperRasH = uint(SuperRasH());
		m_SpatialFilterCL.m_FinalRasW = uint(FinalRasW());
		m_SpatialFilterCL.m_FinalRasH = uint(FinalRasH());
		m_SpatialFilterCL.m_Supersample = uint(Supersample());
		m_SpatialFilterCL.m_FilterWidth = uint(m_SpatialFilter->FinalFilterWidth());
		m_SpatialFilterCL.m_DensityFilterOffset = uint(DensityFilterOffset());
		m_SpatialFilterCL.m_YAxisUp = uint(m_YAxisUp);
		m_SpatialFilterCL.m_Vibrancy = vibrancy;
		m_SpatialFilterCL.m_HighlightPower = HighlightPower();
		m_SpatialFilterCL.m_Gamma = g;
		m_SpatialFilterCL.m_LinRange = linRange;
		m_SpatialFilterCL.m_Background = background;
	}
}

/// <summary>
/// Convert the host side Ember object into an EmberCL object
/// and a vector of XformCL for passing to OpenCL.
/// </summary>
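/// <remarks>
/// The loop below copies min(TotalXformCount(), xformsCL.size()) xforms, so the caller
/// is presumably responsible for sizing xformsCL to hold every xform, including the
/// final xform if one is present.
/// </remarks>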
/// <param name="ember">The Ember object to convert</param>
/// <param name="emberCL">The converted EmberCL</param>
/// <param name="xformsCL">The converted vector of XformCL</param>
template <typename T, typename bucketT>
void RendererCL<T, bucketT>::ConvertEmber(Ember<T>& ember, EmberCL<T>& emberCL, vector<XformCL<T>>& xformsCL)
{
	memset(&emberCL, 0, sizeof(EmberCL<T>));//Might not really be needed.
	emberCL.m_RandPointRange = ember.m_RandPointRange;
	emberCL.m_CamZPos = ember.m_CamZPos;
	emberCL.m_CamPerspective = ember.m_CamPerspective;
	emberCL.m_CamYaw = ember.m_CamYaw;
	emberCL.m_CamPitch = ember.m_CamPitch;
	emberCL.m_BlurCurve = ember.m_BlurCurve;
	emberCL.m_CamDepthBlur = ember.m_CamDepthBlur;
	emberCL.m_BlurCoef = ember.BlurCoef();
	emberCL.m_CamMat = ember.m_CamMat;
	emberCL.m_CenterX = ember.m_CenterX;
	emberCL.m_CenterY = ember.m_RotCenterY;
	emberCL.m_RotA = m_RotMat.A();
	emberCL.m_RotB = m_RotMat.B();
	emberCL.m_RotD = m_RotMat.D();
	emberCL.m_RotE = m_RotMat.E();
	emberCL.m_Psm1 = T(m_Dmap.Size() - 1);
	emberCL.m_Psm2 = T(m_Dmap.Size() - 2);

	for (size_t i = 0; i < ember.TotalXformCount() && i < xformsCL.size(); i++)
	{
		auto xform = ember.GetTotalXform(i);
		xformsCL[i].m_A = xform->m_Affine.A();
		xformsCL[i].m_B = xform->m_Affine.B();
		xformsCL[i].m_C = xform->m_Affine.C();
		xformsCL[i].m_D = xform->m_Affine.D();
		xformsCL[i].m_E = xform->m_Affine.E();
		xformsCL[i].m_F = xform->m_Affine.F();
		xformsCL[i].m_PostA = xform->m_Post.A();
		xformsCL[i].m_PostB = xform->m_Post.B();
		xformsCL[i].m_PostC = xform->m_Post.C();
		xformsCL[i].m_PostD = xform->m_Post.D();
		xformsCL[i].m_PostE = xform->m_Post.E();
		xformsCL[i].m_PostF = xform->m_Post.F();
		xformsCL[i].m_DirectColor = xform->m_DirectColor;
		xformsCL[i].m_ColorSpeedCache = xform->ColorSpeedCache();
		xformsCL[i].m_OneMinusColorCache = xform->OneMinusColorCache();
		xformsCL[i].m_Opacity = xform->m_Opacity;
	}
}

/// <summary>
/// Convert the host side CarToRas object into the CarToRasCL member
/// for passing to OpenCL.
/// </summary>
/// <param name="carToRas">The CarToRas object to convert</param>
template <typename T, typename bucketT>
void RendererCL<T, bucketT>::ConvertCarToRas(const CarToRas<T>& carToRas)
{
	m_CarToRasCL.m_RasWidth = uint(carToRas.RasWidth());
	m_CarToRasCL.m_PixPerImageUnitW = carToRas.PixPerImageUnitW();
	m_CarToRasCL.m_RasLlX = -carToRas.RasLlX();//Flip here because it's only used by CarToRasConvertPointToSingle(), which only needs the negative of it.
	m_CarToRasCL.m_PixPerImageUnitH = carToRas.PixPerImageUnitH();
	m_CarToRasCL.m_RasLlY = -carToRas.RasLlY();//Ditto here.
	m_CarToRasCL.m_CarLlX = carToRas.CarLlX();
	m_CarToRasCL.m_CarLlY = carToRas.CarLlY();
	m_CarToRasCL.m_CarUrX = carToRas.CarUrX();
	m_CarToRasCL.m_CarUrY = carToRas.CarUrY();
	m_CarToRasCL.m_CarHalfX = carToRas.CachedCarHalfX();
	m_CarToRasCL.m_CarHalfY = carToRas.CachedCarHalfY();
	m_CarToRasCL.m_CarCenterX = carToRas.CarCenterX();
	m_CarToRasCL.m_CarCenterY = carToRas.CarCenterY();
}

/// <summary>
/// Fill a seeds buffer for all devices, each of which gets passed to its
/// respective device when launching the iteration kernel.
/// The range of each seed will be spaced to ensure no duplicates are added.
/// Note, WriteBuffer() must be called after this to actually copy the
/// data from the host to the device.
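/// For example (hypothetical numbers): with 2 devices and IterGridKernelCount() == 4096,
/// delta = floor(4294967295 / (4096 * 2 * 2)) = 262143, so the first seed component is
/// drawn from [262143, 524286), the next from [524286, 786429), and so on.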
/// </summary>
template <typename T, typename bucketT>
void RendererCL<T, bucketT>::FillSeeds()
{
	if (!m_Devices.empty())
	{
		double start, delta = std::floor(double(std::numeric_limits<uint>::max()) / (IterGridKernelCount() * 2 * m_Devices.size()));
		m_Seeds.resize(m_Devices.size());
		start = delta;

		for (size_t device = 0; device < m_Devices.size(); device++)
		{
			m_Seeds[device].resize(IterGridKernelCount());

			for (auto& seed : m_Seeds[device])
			{
				seed.x = uint(m_Rand[0].template Frand<double>(start, start + delta));
				start += delta;
				seed.y = uint(m_Rand[0].template Frand<double>(start, start + delta));
				start += delta;
			}
		}
	}
}

/// <summary>
/// Compose an error string based on the strings and device passed in, add it to the error report and return the string.
/// </summary>
/// <param name="loc">The location where the error occurred</param>
/// <param name="error">The text of the error</param>
/// <param name="dev">The device the error occurred on</param>
/// <returns>The new error string</returns>
template <typename T, typename bucketT>
std::string RendererCL<T, bucketT>::ErrorStr(const std::string& loc, const std::string& error, RendererClDevice* dev)
{
	std::string str = loc + "()"s +
					  (dev ? "\n"s + dev->m_Wrapper.DeviceName() +
							 "\nPlatform: " + std::to_string(dev->PlatformIndex()) +
							 ", device: " + std::to_string(dev->DeviceIndex())
						   : "") +
					  ", error:\n" + error + "\n";
	AddToReport(str);
	return str;
}

template EMBERCL_API class RendererCL<float, float>;

#ifdef DO_DOUBLE
	template EMBERCL_API class RendererCL<double, float>;
#endif
}
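/*
For reference, a minimal sketch of what OpenCLWrapper::MakeEvenGridDims(), used throughout
this file, needs to guarantee. This is an assumption based on its call sites, not the actual
implementation: OpenCL requires the global work size to be a multiple of the local work size
when an explicit local size is passed to clEnqueueNDRangeKernel(), so each grid dimension is
rounded up to the next multiple of the corresponding block dimension.

	static void MakeEvenGridDims(size_t blockW, size_t blockH, size_t& gridW, size_t& gridH)
	{
		if (gridW % blockW != 0)
			gridW += (blockW - (gridW % blockW));//Round the width up to the next multiple of blockW.

		if (gridH % blockH != 0)
			gridH += (blockH - (gridH % blockH));//Ditto for the height.
	}
*/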