#include "EmberCLPch.h"
#include "RendererCL.h"

namespace EmberCLns
{
/// <summary>
/// Constructor that initializes various buffer names, block dimensions, image formats
/// and finally initializes OpenCL using the passed in parameters.
/// </summary>
/// <param name="platform">The index of the platform to use. Default: 0.</param>
/// <param name="device">The index of the device to use. Default: 0.</param>
/// <param name="shared">True if shared with OpenGL, else false. Default: false.</param>
/// <param name="outputTexID">The texture ID of the shared OpenGL texture if shared. Default: 0.</param>
template <typename T>
RendererCL<T>::RendererCL(unsigned int platform, unsigned int device, bool shared, GLuint outputTexID)
{
	m_Init = false;
	m_NVidia = false;
	m_DoublePrecision = typeid(T) == typeid(double);
	m_NumChannels = 4;
	m_Calls = 0;

	//Buffer names.
	m_EmberBufferName = "Ember";
	m_ParVarsBufferName = "ParVars";
	m_DistBufferName = "Dist";
	m_CarToRasBufferName = "CarToRas";
	m_DEFilterParamsBufferName = "DEFilterParams";
	m_SpatialFilterParamsBufferName = "SpatialFilterParams";
	m_DECoefsBufferName = "DECoefs";
	m_DEWidthsBufferName = "DEWidths";
	m_DECoefIndicesBufferName = "DECoefIndices";
	m_SpatialFilterCoefsBufferName = "SpatialFilterCoefs";
	m_HistBufferName = "Hist";
	m_AccumBufferName = "Accum";
	m_FinalImageName = "Final";
	m_PointsBufferName = "Points";

	//It's critical that these numbers never change. They are
	//based on the cuburn model of each kernel launch containing
	//256 threads. 32 wide by 8 high. Everything done in the OpenCL
	//iteration kernel depends on these dimensions.
	m_IterCountPerKernel = 256;
	m_IterBlockWidth = 32;
	m_IterBlockHeight = 8;
	m_IterBlocksWide = 64;
	m_IterBlocksHigh = 2;

	//The palette is uploaded as a float RGBA image.
	m_PaletteFormat.image_channel_order = CL_RGBA;
	m_PaletteFormat.image_channel_data_type = CL_FLOAT;

	//The final output is an 8 bit per channel RGBA image.
	m_FinalFormat.image_channel_order = CL_RGBA;
	m_FinalFormat.image_channel_data_type = CL_UNORM_INT8;//Change if this ever supports 2BPC outputs for PNG.

	//Must come last: Init() reads the members assigned above.
	Init(platform, device, shared, outputTexID);//Init OpenCL upon construction and create programs that will not change.
}

///
/// Virtual destructor.
/// template RendererCL::~RendererCL() { } /// /// Non-virtual member functions for OpenCL specific tasks. /// /// /// Initialize OpenCL. /// In addition to initializing, this function will create the zeroization program, /// as well as the basic log scale filtering programs. This is done to ensure basic /// compilation works. Further compilation will be done later for iteration, density filtering, /// and final accumulation. /// /// The index platform of the platform to use /// The index device of the device to use /// True if shared with OpenGL, else false. /// The texture ID of the shared OpenGL texture if shared /// True if success, else false. template bool RendererCL::Init(unsigned int platform, unsigned int device, bool shared, GLuint outputTexID) { //Timing t; bool b = true; m_OutputTexID = outputTexID; const char* loc = __FUNCTION__; if (!m_Wrapper.Ok() || PlatformIndex() != platform || DeviceIndex() != device) { m_Init = false; b = m_Wrapper.Init(platform, device, shared); } if (b && m_Wrapper.Ok() && !m_Init) { m_NVidia = ToLower(m_Wrapper.DeviceAndPlatformNames()).find_first_of("nvidia") != string::npos && m_Wrapper.LocalMemSize() > (32 * 1024); m_WarpSize = m_NVidia ? 32 : 64; m_IterOpenCLKernelCreator = IterOpenCLKernelCreator(m_NVidia); m_DEOpenCLKernelCreator = DEOpenCLKernelCreator(m_NVidia); string zeroizeProgram = m_IterOpenCLKernelCreator.ZeroizeKernel(); string logAssignProgram = m_DEOpenCLKernelCreator.LogScaleAssignDEKernel(); string logSumProgram = m_DEOpenCLKernelCreator.LogScaleSumDEKernel();//Build a couple of simple programs to ensure OpenCL is working right. 
if (b && !(b = m_Wrapper.AddProgram(m_IterOpenCLKernelCreator.ZeroizeEntryPoint(), zeroizeProgram, m_IterOpenCLKernelCreator.ZeroizeEntryPoint(), m_DoublePrecision))) { m_ErrorReport.push_back(loc); } if (b && !(b = m_Wrapper.AddProgram(m_DEOpenCLKernelCreator.LogScaleAssignDEEntryPoint(), logAssignProgram, m_DEOpenCLKernelCreator.LogScaleAssignDEEntryPoint(), m_DoublePrecision))) { m_ErrorReport.push_back(loc); } if (b && !(b = m_Wrapper.AddProgram(m_DEOpenCLKernelCreator.LogScaleSumDEEntryPoint(), logSumProgram, m_DEOpenCLKernelCreator.LogScaleSumDEEntryPoint(), m_DoublePrecision))) { m_ErrorReport.push_back(loc); } if (b && !(b = m_Wrapper.AddAndWriteImage("Palette", CL_MEM_READ_ONLY, m_PaletteFormat, 256, 1, 0, NULL))) { m_ErrorReport.push_back(loc); } //This is the maximum box dimension for density filtering which consists of (blockSize * blockSize) + (2 * filterWidth). //These blocks must be square, and ideally, 32x32. //Sadly, at the moment, Fermi runs out of resources at that block size because the DE filter function is so complex. //The next best block size seems to be 24x24. //AMD is further limited because of less local memory so these have to be 16 on AMD. m_MaxDEBlockSizeW = m_NVidia ? 32 : 16;//These *must* both be divisible by 16 or else pixels will go missing. m_MaxDEBlockSizeH = m_NVidia ? 32 : 16; m_Init = true; //t.Toc(loc); } return b; } template bool RendererCL::SetOutputTexture(GLuint outputTexID) { bool success = true; const char* loc = __FUNCTION__; if (!m_Wrapper.Ok()) return false; m_OutputTexID = outputTexID; EnterResize(); if (!m_Wrapper.AddAndWriteImage(m_FinalImageName, CL_MEM_WRITE_ONLY, m_FinalFormat, FinalRasW(), FinalRasH(), 0, NULL, m_Wrapper.Shared(), m_OutputTexID)) { m_ErrorReport.push_back(loc); success = false; } LeaveResize(); return success; } /// /// OpenCL property accessors, getters only. 
/// template unsigned int RendererCL::IterCountPerKernel() { return m_IterCountPerKernel; } template unsigned int RendererCL::IterBlocksWide() { return m_IterBlocksWide; } template unsigned int RendererCL::IterBlocksHigh() { return m_IterBlocksHigh; } template unsigned int RendererCL::IterBlockWidth() { return m_IterBlockWidth; } template unsigned int RendererCL::IterBlockHeight() { return m_IterBlockHeight; } template unsigned int RendererCL::IterGridWidth() { return IterBlocksWide() * IterBlockWidth(); } template unsigned int RendererCL::IterGridHeight() { return IterBlocksHigh() * IterBlockHeight(); } template unsigned int RendererCL::TotalIterKernelCount() { return IterGridWidth() * IterGridHeight(); } template unsigned int RendererCL::PlatformIndex() { return m_Wrapper.PlatformIndex(); } template unsigned int RendererCL::DeviceIndex() { return m_Wrapper.DeviceIndex(); } /// /// Read the histogram into the host side CPU buffer. /// Used for debugging. /// /// True if success, else false. template bool RendererCL::ReadHist() { if (Renderer::Alloc())//Allocate the memory to read into. return m_Wrapper.ReadBuffer(m_HistBufferName, (void*)HistBuckets(), SuperSize() * sizeof(v4T)); return false; } /// /// Read the density filtering buffer into the host side CPU buffer. /// Used for debugging. /// /// True if success, else false. template bool RendererCL::ReadAccum() { if (Renderer::Alloc())//Allocate the memory to read into. return m_Wrapper.ReadBuffer(m_AccumBufferName, (void*)AccumulatorBuckets(), SuperSize() * sizeof(v4T)); return false; } /// /// Read the temporary points buffer into a host side CPU buffer. /// Used for debugging. /// /// The host side buffer to read into /// True if success, else false. template bool RendererCL::ReadPoints(vector>& vec) { vec.resize(TotalIterKernelCount());//Allocate the memory to read into. 
if (vec.size() >= TotalIterKernelCount()) return m_Wrapper.ReadBuffer(m_PointsBufferName, (void*)vec.data(), TotalIterKernelCount() * sizeof(PointCL)); return false; } /// /// Clear the histogram buffer with all zeroes. /// /// True if success, else false. template bool RendererCL::ClearHist() { return ClearBuffer(m_HistBufferName, (unsigned int)SuperRasW(), (unsigned int)SuperRasH(), sizeof(v4T)); } /// /// Clear the desnity filtering buffer with all zeroes. /// /// True if success, else false. template bool RendererCL::ClearAccum() { return ClearBuffer(m_AccumBufferName, (unsigned int)SuperRasW(), (unsigned int)SuperRasH(), sizeof(v4T)); } /// /// Write values from a host side CPU buffer into the temporary points buffer. /// Used for debugging. /// /// The host side buffer whose values to write /// True if success, else false. template bool RendererCL::WritePoints(vector>& vec) { return m_Wrapper.WriteBuffer(m_PointsBufferName, (void*)vec.data(), vec.size() * sizeof(vec[0])); } /// /// Get the kernel string for the last built iter program. /// /// The string representation of the kernel for the last built iter program. template string RendererCL::IterKernel() { return m_IterKernel; } /// /// Virtual functions overridden from RendererCLBase. /// /// /// Read the final image buffer buffer into the host side CPU buffer. /// This must be called before saving the final output image to file. /// /// The host side buffer to read into /// True if success, else false. template bool RendererCL::ReadFinal(unsigned char* pixels) { if (pixels) return m_Wrapper.ReadImage(m_FinalImageName, FinalRasW(), FinalRasH(), 0, m_Wrapper.Shared(), pixels); return false; } /// /// Clear the final image output buffer with all zeroes by copying a host side buffer. /// Slow, but never used because the final output image is always completely overwritten. /// /// True if success, else false. 
template bool RendererCL::ClearFinal() { vector v; unsigned int index = m_Wrapper.FindImageIndex(m_FinalImageName, m_Wrapper.Shared()); if (PrepFinalAccumVector(v)) { bool b = m_Wrapper.WriteImage2D(index, m_Wrapper.Shared(), FinalRasW(), FinalRasH(), 0, v.data()); if (!b) m_ErrorReport.push_back(__FUNCTION__); return b; } else return false; } /// /// Public virtual functions overridden from Renderer or RendererBase. /// /// /// The amount of video RAM available on the GPU to render with. /// /// An unsigned 64-bit integer specifying how much video memory is available template size_t RendererCL::MemoryAvailable() { return Ok() ? m_Wrapper.GetInfo(PlatformIndex(), DeviceIndex(), CL_DEVICE_GLOBAL_MEM_SIZE) : 0ULL; } /// /// Return whether OpenCL has been properly initialized. /// /// True if OpenCL has been properly initialized, else false. template bool RendererCL::Ok() const { return m_Init; } /// /// Override to force num channels to be 4 because RGBA is always used for OpenCL /// since the output is actually an image rather than just a buffer. /// /// The number of channels, ignored. template void RendererCL::NumChannels(size_t numChannels) { m_NumChannels = 4; } /// /// Dump the error report for this class as well as the OpenCLWrapper member. /// template void RendererCL::DumpErrorReport() { EmberReport::DumpErrorReport(); m_Wrapper.DumpErrorReport(); } /// /// Clear the error report for this class as well as the OpenCLWrapper member. /// template void RendererCL::ClearErrorReport() { EmberReport::ClearErrorReport(); m_Wrapper.ClearErrorReport(); } /// /// The sub batch size for OpenCL will always be how many /// iterations are ran per kernel call. The caller can't /// change this. 
/// /// The number of iterations ran in a single kernel call template size_t RendererCL::SubBatchSize() const { return m_IterBlocksWide * m_IterBlocksHigh * SQR(m_IterCountPerKernel); } /// /// The thread count for OpenCL is always considered to be 1, however /// the kernel internally runs many threads. /// /// 1 template size_t RendererCL::ThreadCount() const { return 1; } /// /// Create the density filter in the base class and copy the filter values /// to the corresponding OpenCL buffers. /// /// True if a new filter instance was created, else false. /// True if success, else false. template bool RendererCL::CreateDEFilter(bool& newAlloc) { bool b = true; if (Renderer::CreateDEFilter(newAlloc)) { //Copy coefs and widths here. Convert and copy the other filter params right before calling the filtering kernel. if (newAlloc) { const char* loc = __FUNCTION__; DensityFilter* filter = dynamic_cast*>(GetDensityFilter()); if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_DECoefsBufferName, (void*)filter->Coefs(), filter->CoefsSizeBytes()))) { m_ErrorReport.push_back(loc); } if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_DEWidthsBufferName, (void*)filter->Widths(), filter->WidthsSizeBytes()))) { m_ErrorReport.push_back(loc); } if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_DECoefIndicesBufferName, (void*)filter->CoefIndices(), filter->CoefsIndicesSizeBytes()))) { m_ErrorReport.push_back(loc); } } } else b = false; return b; } /// /// Create the spatial filter in the base class and copy the filter values /// to the corresponding OpenCL buffers. /// /// True if a new filter instance was created, else false. /// True if success, else false. 
template bool RendererCL::CreateSpatialFilter(bool& newAlloc) { bool b = true; if (Renderer::CreateSpatialFilter(newAlloc)) { if (newAlloc) if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_SpatialFilterCoefsBufferName, (void*)GetSpatialFilter()->Filter(), GetSpatialFilter()->BufferSizeBytes()))) { m_ErrorReport.push_back(__FUNCTION__); } } else b = false; return b; } /// /// Get the renderer type enum. /// /// OPENCL_RENDERER template eRendererType RendererCL::RendererType() const { return OPENCL_RENDERER; } /// /// Concatenate and return the error report for this class and the /// OpenCLWrapper member as a single string. /// /// The concatenated error report string template string RendererCL::ErrorReportString() { return EmberReport::ErrorReportString() + m_Wrapper.ErrorReportString(); } /// /// Concatenate and return the error report for this class and the /// OpenCLWrapper member as a vector of strings. /// /// The concatenated error report vector of strings template vector RendererCL::ErrorReport() { vector ours = EmberReport::ErrorReport(); vector wrappers = m_Wrapper.ErrorReport(); ours.insert(ours.end(), wrappers.begin(), wrappers.end()); return ours; } /// /// Protected virtual functions overridden from Renderer. /// /// /// Make the final palette used for iteration. /// This override differs from the base in that it does not use /// bucketT as the output palette type. This is because OpenCL /// only supports floats for texture images. /// /// The color scalar to multiply the ember's palette by template void RendererCL::MakeDmap(T colorScalar) { m_Ember.m_Palette.MakeDmap(m_Dmap, colorScalar); } /// /// Allocate all buffers required for running as well as the final /// 2D image. /// /// True if success, else false. 
template bool RendererCL::Alloc() { if (!m_Wrapper.Ok()) return false; EnterResize(); bool b = true; size_t histLength = SuperSize() * sizeof(v4T); size_t accumLength = SuperSize() * sizeof(v4T); const char* loc = __FUNCTION__; if (b && !(b = m_Wrapper.AddBuffer(m_EmberBufferName, sizeof(m_EmberCL)))) { m_ErrorReport.push_back(loc); } if (b && !(b = m_Wrapper.AddBuffer(m_ParVarsBufferName, 128 * sizeof(T)))) { m_ErrorReport.push_back(loc); } if (b && !(b = m_Wrapper.AddBuffer(m_DistBufferName, CHOOSE_XFORM_GRAIN))) { m_ErrorReport.push_back(loc); }//Will be resized for xaos. if (b && !(b = m_Wrapper.AddBuffer(m_CarToRasBufferName, sizeof(m_CarToRasCL)))) { m_ErrorReport.push_back(loc); } if (b && !(b = m_Wrapper.AddBuffer(m_DEFilterParamsBufferName, sizeof(m_DensityFilterCL)))) { m_ErrorReport.push_back(loc); } if (b && !(b = m_Wrapper.AddBuffer(m_SpatialFilterParamsBufferName, sizeof(m_SpatialFilterCL)))) { m_ErrorReport.push_back(loc); } if (b && !(b = m_Wrapper.AddBuffer(m_HistBufferName, histLength))) { m_ErrorReport.push_back(loc); }//Histogram. Will memset to zero later. if (b && !(b = m_Wrapper.AddBuffer(m_AccumBufferName, accumLength))) { m_ErrorReport.push_back(loc); }//Accum buffer. if (b && !(b = m_Wrapper.AddBuffer(m_PointsBufferName, TotalIterKernelCount() * sizeof(PointCL)))) { m_ErrorReport.push_back(loc); }//Points between iter calls. if (b && !(b = SetOutputTexture(m_OutputTexID))) { m_ErrorReport.push_back(loc); } LeaveResize(); return b; } /// /// Clear OpenCL histogram and/or density filtering buffers to all zeroes. /// /// Clear histogram if true, else don't. /// Clear density filtering buffer if true, else don't. /// True if success, else false. template bool RendererCL::ResetBuckets(bool resetHist, bool resetAccum) { bool b = true; if (resetHist) b &= ClearHist(); if (resetAccum) b &= ClearAccum(); return b; } /// /// Perform log scale density filtering. /// /// True if success and not aborted, else false. 
template eRenderStatus RendererCL::LogScaleDensityFilter() { return RunLogScaleFilter(); } /// /// Run gaussian density estimation filtering. /// /// True if success and not aborted, else false. template eRenderStatus RendererCL::GaussianDensityFilter() { //This commented section is for debugging density filtering by making it run on the CPU //then copying the results back to the GPU. //if (ReadHist()) //{ // unsigned int accumLength = SuperSize() * sizeof(glm::detail::tvec4); // const char* loc = __FUNCTION__; // // Renderer::ResetBuckets(false, true); // Renderer::GaussianDensityFilter(); // // if (!m_Wrapper.WriteBuffer(m_AccumBufferName, AccumulatorBuckets(), accumLength)) { m_ErrorReport.push_back(loc); return RENDER_ERROR; } // return RENDER_OK; //} //else // return RENDER_ERROR; //Timing t(4); eRenderStatus status = RunDensityFilter(); //t.Toc(__FUNCTION__ " RunKernel()"); return status; } /// /// Run final accumulation. /// If pixels is NULL, the output will remain in the OpenCL 2D image. /// However, if pixels is not NULL, the output will be copied. This is /// useful when rendering in OpenCL, but saving the output to a file. /// /// The pixels to copy the final image to if not NULL /// Offset in the buffer to store the pixels to /// True if success and not aborted, else false. template eRenderStatus RendererCL::AccumulatorToFinalImage(unsigned char* pixels, size_t finalOffset) { eRenderStatus status = RunFinalAccum(); if (status == RENDER_OK && pixels != NULL && !m_Wrapper.Shared()) { pixels += finalOffset; if (!ReadFinal(pixels)) status = RENDER_ERROR; } return status; } /// /// Run the iteration algorithm for the specified number of iterations. /// This is only called after all other setup has been done. /// This will recompile the OpenCL program if this ember differs significantly /// from the previous run. /// Note that the bad value count is not recorded when running with OpenCL. If it's /// needed, run on the CPU. 
/// /// The number of iterations to run /// The pass this is running for /// The temporal sample within the current pass this is running for /// Rendering statistics template EmberStats RendererCL::Iterate(size_t iterCount, size_t pass, size_t temporalSample) { bool b = true; EmberStats stats;//Do not record bad vals with with GPU. If the user needs to investigate bad vals, use the CPU. const char* loc = __FUNCTION__; IterOpenCLKernelCreator::ParVarIndexDefines(m_Ember, m_Params, true, false);//Always do this to get the values (but no string), regardless of whether a rebuild is necessary. //Don't know the size of the parametric varations parameters buffer until the ember is examined. //So set it up right before the run. if (!m_Params.second.empty()) { if (!m_Wrapper.AddAndWriteBuffer(m_ParVarsBufferName, m_Params.second.data(), m_Params.second.size() * sizeof(m_Params.second[0]))) { m_Abort = true; m_ErrorReport.push_back(loc); return stats; } } //Rebuilding is expensive, so only do it if it's required. if (IterOpenCLKernelCreator::IsBuildRequired(m_Ember, m_LastBuiltEmber)) b = BuildIterProgramForEmber(true); if (b) { m_IterTimer.Tic();//Tic() here to avoid including build time in iter time measurement. if (m_Stats.m_Iters == 0)//Only reset the call count on the beginning of a new render. Do not reset on KEEP_ITERATING. m_Calls = 0; b = RunIter(iterCount, pass, temporalSample, stats.m_Iters); if (!b || stats.m_Iters == 0)//If no iters were executed, something went catastrophically wrong. m_Abort = true; stats.m_IterMs = m_IterTimer.Toc(); } else { m_Abort = true; m_ErrorReport.push_back(loc); } return stats; } /// /// Private functions for making and running OpenCL programs. /// /// /// Build the iteration program for the current ember. /// /// Whether to build in accumulation, only for debugging. Default: true. /// True if success, else false. 
template bool RendererCL::BuildIterProgramForEmber(bool doAccum) { //Timing t; const char* loc = __FUNCTION__; IterOpenCLKernelCreator::ParVarIndexDefines(m_Ember, m_Params, false, true);//Do with string and no vals. m_IterKernel = m_IterOpenCLKernelCreator.CreateIterKernelString(m_Ember, m_Params.first, m_LockAccum, doAccum); //cout << "Building: " << endl << iterProgram << endl; //A program build is roughly .66s which will detract from the user experience. //Need to experiment with launching this in a thread/task and returning once it's done.//TODO if (m_Wrapper.AddProgram(m_IterOpenCLKernelCreator.IterEntryPoint(), m_IterKernel, m_IterOpenCLKernelCreator.IterEntryPoint(), m_DoublePrecision)) { //t.Toc(__FUNCTION__ " program build"); //cout << string(loc) << "():\nBuilding the following program succeeded: \n" << iterProgram << endl; m_LastBuiltEmber = m_Ember; } else { m_ErrorReport.push_back(string(loc) + "():\nBuilding the following program failed: \n" + m_IterKernel + "\n"); return false; } return true; } /// /// Run the iteration kernel. /// Fusing on the CPU is done once per sub batch, usually 10,000 iters, however /// determining when to do it in OpenCL is much more difficult. /// Currently it's done once every 4 kernel calls which seems to be a good balance /// between quality of the final image and performance. /// /// The number of iterations to run /// The pass this is running for /// The temporal sample within the current pass this is running for /// The storage for the number of iterations ran /// True if success, else false. 
template bool RendererCL::RunIter(size_t iterCount, size_t pass, size_t temporalSample, size_t& itersRan) { Timing t;//, t2(4); bool b = true; unsigned int seed, fuse, argIndex; unsigned int iterCountPerKernel = m_IterCountPerKernel; unsigned int iterCountPerBlock = iterCountPerKernel * m_IterBlockWidth * m_IterBlockHeight; unsigned int supersize = (unsigned int)SuperSize(); int kernelIndex = m_Wrapper.FindKernelIndex(m_IterOpenCLKernelCreator.IterEntryPoint()); size_t fuseFreq = m_SubBatchSize / m_IterCountPerKernel; size_t itersRemaining, localIterCount = 0; double percent, etaMs; const char* loc = __FUNCTION__; itersRan = 0; #ifdef TEST_CL m_Abort = false; #endif if (kernelIndex != -1) { m_EmberCL = ConvertEmber(m_Ember); m_CarToRasCL = ConvertCarToRas(*CoordMap()); if (b && !(b = m_Wrapper.WriteBuffer (m_EmberBufferName, (void*)&m_EmberCL, sizeof(m_EmberCL)))) { m_ErrorReport.push_back(loc); } if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_DistBufferName, (void*)XformDistributions(), XformDistributionsSize()))) { m_ErrorReport.push_back(loc); }//Will be resized for xaos. if (b && !(b = m_Wrapper.WriteBuffer (m_CarToRasBufferName, (void*)&m_CarToRasCL, sizeof(m_CarToRasCL)))) { m_ErrorReport.push_back(loc); } if (b && !(b = m_Wrapper.AddAndWriteImage("Palette", CL_MEM_READ_ONLY, m_PaletteFormat, m_Dmap.m_Entries.size(), 1, 0, m_Dmap.m_Entries.data()))) { m_ErrorReport.push_back(loc); } //If animating, treat each temporal sample as a newly started render for fusing purposes. if (temporalSample > 0) m_Calls = 0; while (b && itersRan < iterCount && !m_Abort) { argIndex = 0; seed = m_Rand[0].Rand(); #ifdef TEST_CL fuse = 0; #else //fuse = 100; fuse = ((m_Calls % fuseFreq) == 0 ? (EarlyClip() ? 100u : 15u) : 0u); //fuse = ((m_Calls % 4) == 0 ? 
100u : 0u); #endif itersRemaining = iterCount - itersRan; unsigned int gridW = (unsigned int)min(ceil((double)itersRemaining / (double)iterCountPerBlock), (double)IterBlocksWide()); unsigned int gridH = (unsigned int)min(ceil((double)itersRemaining / ((double)gridW * iterCountPerBlock)), (double)IterBlocksHigh()); unsigned int iterCountThisLaunch = iterCountPerBlock * gridW * gridH; //Similar to what's done in the base class. //The number of iters per thread must be adjusted if they've requested less iters than is normally ran in a block (256 * 256). if (iterCountThisLaunch > iterCount) { iterCountPerKernel = (unsigned int)ceil((double)iterCount / (double)(gridW * gridH * m_IterBlockWidth * m_IterBlockHeight)); iterCountThisLaunch = iterCountPerKernel * (gridW * gridH * m_IterBlockWidth * m_IterBlockHeight); } if (b && !(b = m_Wrapper.SetArg (kernelIndex, argIndex++, iterCountPerKernel))) { m_ErrorReport.push_back(loc); }//Number of iters for each thread to run. if (b && !(b = m_Wrapper.SetArg (kernelIndex, argIndex++, fuse))) { m_ErrorReport.push_back(loc); }//Number of iters to fuse. if (b && !(b = m_Wrapper.SetArg (kernelIndex, argIndex++, seed))) { m_ErrorReport.push_back(loc); }//Seed. if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex++, m_EmberBufferName))) { m_ErrorReport.push_back(loc); }//Flame. if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex++, m_ParVarsBufferName))) { m_ErrorReport.push_back(loc); }//Parametric variation parameters. if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex++, m_DistBufferName))) { m_ErrorReport.push_back(loc); }//Xform distributions. if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex++, m_CarToRasBufferName))) { m_ErrorReport.push_back(loc); }//Coordinate converter. if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex++, m_HistBufferName))) { m_ErrorReport.push_back(loc); }//Histogram. 
if (b && !(b = m_Wrapper.SetArg (kernelIndex, argIndex++, supersize))) { m_ErrorReport.push_back(loc); }//Histogram size. if (b && !(b = m_Wrapper.SetImageArg (kernelIndex, argIndex++, false, "Palette"))) { m_ErrorReport.push_back(loc); }//Palette. if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex++, m_PointsBufferName))) { m_ErrorReport.push_back(loc); }//Random start points. if (b && !(b = m_Wrapper.RunKernel(kernelIndex, gridW * IterBlockWidth(),//Total grid dims. gridH * IterBlockHeight(), 1, IterBlockWidth(),//Individual block dims. IterBlockHeight(), 1))) { m_Abort = true; m_ErrorReport.push_back(loc); break; } itersRan += iterCountThisLaunch; m_Calls++; if (m_Callback) { percent = 100.0 * double ( double ( double ( double ( double(m_LastIter + itersRan) / double(ItersPerTemporalSample()) ) + temporalSample ) / (double)TemporalSamples() ) + (double)pass ) / (double)Passes(); double percentDiff = percent - m_LastIterPercent; double toc = m_ProgressTimer.Toc(); if (percentDiff >= 10 || (toc > 1000 && percentDiff >= 1))//Call callback function if either 10% has passed, or one second (and 1%). { etaMs = ((100.0 - percent) / percent) * m_RenderTimer.Toc(); if (!m_Callback->ProgressFunc(m_Ember, m_ProgressParameter, percent, 0, etaMs)) Abort(); m_LastIterPercent = percent; m_ProgressTimer.Tic(); } } } } else { b = false; m_ErrorReport.push_back(loc); } //t2.Toc(__FUNCTION__); return b; } /// /// Run the log scale filter. /// /// True if success, else false. 
template eRenderStatus RendererCL::RunLogScaleFilter() { //Timing t(4); bool b = true; int kernelIndex; const char* loc = __FUNCTION__; eRenderStatus status = RENDER_OK; if (Passes() == 1) kernelIndex = m_Wrapper.FindKernelIndex(m_DEOpenCLKernelCreator.LogScaleAssignDEEntryPoint()); else kernelIndex = m_Wrapper.FindKernelIndex(m_DEOpenCLKernelCreator.LogScaleSumDEEntryPoint()); if (kernelIndex != -1) { m_DensityFilterCL = ConvertDensityFilter(); unsigned int argIndex = 0; unsigned int blockW = m_WarpSize; unsigned int blockH = 4;//A height of 4 seems to run the fastest. unsigned int gridW = m_DensityFilterCL.m_SuperRasW; unsigned int gridH = m_DensityFilterCL.m_SuperRasH; OpenCLWrapper::MakeEvenGridDims(blockW, blockH, gridW, gridH); if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_DEFilterParamsBufferName, (void*)&m_DensityFilterCL, sizeof(m_DensityFilterCL)))) { m_ErrorReport.push_back(loc); } if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex++, m_HistBufferName))) { m_ErrorReport.push_back(loc); }//Histogram. if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex++, m_AccumBufferName))) { m_ErrorReport.push_back(loc); }//Accumulator. if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex++, m_DEFilterParamsBufferName))) { m_ErrorReport.push_back(loc); }//DensityFilterCL. //t.Tic(); if (b && !(b = m_Wrapper.RunKernel(kernelIndex, gridW, gridH, 1, blockW, blockH, 1))) { m_ErrorReport.push_back(loc); } //t.Toc(loc); } else { b = false; m_ErrorReport.push_back(loc); } return b ? RENDER_OK : RENDER_ERROR; } /// /// Run the Gaussian density filter. /// Method 7: Each block processes a 32x32 block and exits. No column or row advancements happen. /// /// True if success and not aborted, else false. 
template eRenderStatus RendererCL::RunDensityFilter() { bool b = true; Timing t(4);//, t2(4); m_DensityFilterCL = ConvertDensityFilter(); int kernelIndex = MakeAndGetDensityFilterProgram(Supersample(), m_DensityFilterCL.m_FilterWidth); const char* loc = __FUNCTION__; if (kernelIndex != -1) { unsigned int leftBound = m_DensityFilterCL.m_Supersample - 1; unsigned int rightBound = m_DensityFilterCL.m_SuperRasW - (m_DensityFilterCL.m_Supersample - 1); unsigned int topBound = leftBound; unsigned int botBound = m_DensityFilterCL.m_SuperRasH - (m_DensityFilterCL.m_Supersample - 1); unsigned int gridW = rightBound - leftBound; unsigned int gridH = botBound - topBound; unsigned int blockSizeW = m_MaxDEBlockSizeW;//These *must* both be divisible by 16 or else pixels will go missing. unsigned int blockSizeH = m_MaxDEBlockSizeH; //OpenCL runs out of resources when using double or a supersample of 2. //Remedy this by reducing the height of the block by 2. if (m_DoublePrecision || m_DensityFilterCL.m_Supersample > 1) blockSizeH -= 2; //Can't just blindly pass in vals. Must adjust them first to evenly divide the block count //into the total grid dimensions. OpenCLWrapper::MakeEvenGridDims(blockSizeW, blockSizeH, gridW, gridH); //t.Tic(); //The classic problem with performing DE on adjacent pixels is that the filter will overlap. //This can be solved in 2 ways. One is to use atomics, which is unacceptably slow. //The other is to proces the entire image in multiple passes, and each pass processes blocks of pixels //that are far enough apart such that their filters do not overlap. //Do the latter. 
unsigned int gapW = (unsigned int)ceil((m_DensityFilterCL.m_FilterWidth * 2.0) / (double)blockSizeW); unsigned int chunkSizeW = gapW + 1; unsigned int gapH = (unsigned int)ceil((m_DensityFilterCL.m_FilterWidth * 2.0) / (double)blockSizeH); unsigned int chunkSizeH = gapH + 1; double totalChunks = chunkSizeW * chunkSizeH; if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_DEFilterParamsBufferName, (void*)&m_DensityFilterCL, sizeof(m_DensityFilterCL)))) { m_ErrorReport.push_back(loc); } for (unsigned int row = 0; b && !m_Abort && row < chunkSizeH; row++) { for (unsigned int col = 0; b && !m_Abort && col < chunkSizeW; col++) { //t2.Tic(); if (b && !(b = RunDensityFilterPrivate(kernelIndex, gridW, gridH, blockSizeW, blockSizeH, chunkSizeW, chunkSizeH, row, col))) { m_Abort = true; m_ErrorReport.push_back(loc); } //t2.Toc(loc); if (b && m_Callback) { double percent = (double((row * chunkSizeW) + (col + 1)) / totalChunks) * 100.0; double etaMs = ((100.0 - percent) / percent) * t.Toc(); if (!m_Callback->ProgressFunc(m_Ember, m_ProgressParameter, percent, 1, etaMs)) Abort(); } } } if (b && m_Callback) m_Callback->ProgressFunc(m_Ember, m_ProgressParameter, 100.0, 1, 0.0); //t2.Toc(__FUNCTION__ " all passes"); } else { b = false; m_ErrorReport.push_back(loc); } return m_Abort ? RENDER_ABORT : (b ? RENDER_OK : RENDER_ERROR); } /// /// Run final accumulation to the 2D output image. /// /// True if success and not aborted, else false. template eRenderStatus RendererCL::RunFinalAccum() { //Timing t(4); bool b = true; T alphaBase; T alphaScale; int accumKernelIndex = MakeAndGetFinalAccumProgram(alphaBase, alphaScale); unsigned int argIndex; unsigned int gridW; unsigned int gridH; unsigned int blockW; unsigned int blockH; const char* loc = __FUNCTION__; if (!m_Abort && accumKernelIndex != -1) { //This is needed with or without early clip. 
m_SpatialFilterCL = ConvertSpatialFilter(); if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_SpatialFilterParamsBufferName, (void*)&m_SpatialFilterCL, sizeof(m_SpatialFilterCL)))) { m_ErrorReport.push_back(loc); } //Since early clip requires gamma correcting the entire accumulator first, //it can't be done inside of the normal final accumulation kernel, so //an additional kernel must be launched first. if (b && EarlyClip()) { int gammaCorrectKernelIndex = MakeAndGetGammaCorrectionProgram(); if (gammaCorrectKernelIndex != -1) { argIndex = 0; blockW = m_WarpSize; blockH = 4;//A height of 4 seems to run the fastest. gridW = m_SpatialFilterCL.m_SuperRasW;//Using super dimensions because this processes the density filtering bufer. gridH = m_SpatialFilterCL.m_SuperRasH; OpenCLWrapper::MakeEvenGridDims(blockW, blockH, gridW, gridH); if (b && !(b = m_Wrapper.SetBufferArg(gammaCorrectKernelIndex, argIndex++, m_AccumBufferName))) { m_ErrorReport.push_back(loc); }//Accumulator. if (b && !(b = m_Wrapper.SetBufferArg(gammaCorrectKernelIndex, argIndex++, m_SpatialFilterParamsBufferName))) { m_ErrorReport.push_back(loc); }//SpatialFilterCL. if (b && !(b = m_Wrapper.RunKernel(gammaCorrectKernelIndex, gridW, gridH, 1, blockW, blockH, 1))) { m_ErrorReport.push_back(loc); } } else { b = false; m_ErrorReport.push_back(loc); } } argIndex = 0; blockW = m_WarpSize; blockH = 4;//A height of 4 seems to run the fastest. gridW = m_SpatialFilterCL.m_FinalRasW; gridH = m_SpatialFilterCL.m_FinalRasH; OpenCLWrapper::MakeEvenGridDims(blockW, blockH, gridW, gridH); if (b && !(b = m_Wrapper.SetBufferArg(accumKernelIndex, argIndex++, m_AccumBufferName))) { m_ErrorReport.push_back(loc); }//Accumulator. if (b && !(b = m_Wrapper.SetImageArg (accumKernelIndex, argIndex++, m_Wrapper.Shared(), m_FinalImageName))) { m_ErrorReport.push_back(loc); }//Final image. 
if (b && !(b = m_Wrapper.SetBufferArg(accumKernelIndex, argIndex++, m_SpatialFilterParamsBufferName))) { m_ErrorReport.push_back(loc); }//SpatialFilterCL. if (b && !(b = m_Wrapper.SetBufferArg(accumKernelIndex, argIndex++, m_SpatialFilterCoefsBufferName))) { m_ErrorReport.push_back(loc); }//Filter coefs. if (b && !(b = m_Wrapper.SetArg (accumKernelIndex, argIndex++, alphaBase))) { m_ErrorReport.push_back(loc); }//Alpha base. if (b && !(b = m_Wrapper.SetArg (accumKernelIndex, argIndex++, alphaScale))) { m_ErrorReport.push_back(loc); }//Alpha scale. if (b && m_Wrapper.Shared()) if (b && !(b = m_Wrapper.EnqueueAcquireGLObjects(m_FinalImageName))) { m_ErrorReport.push_back(loc); } if (b && !(b = m_Wrapper.RunKernel(accumKernelIndex, gridW, gridH, 1, blockW, blockH, 1))) { m_ErrorReport.push_back(loc); } if (b && m_Wrapper.Shared()) if (b && !(b = m_Wrapper.EnqueueReleaseGLObjects(m_FinalImageName))) { m_ErrorReport.push_back(loc); } //t.Toc((char*)loc); } else { b = false; m_ErrorReport.push_back(loc); } return b ? RENDER_OK : RENDER_ERROR; } /// /// Zeroize a buffer of the specified size. /// /// Name of the buffer to clear /// Width in elements /// Height in elements /// Size of each element /// True if success, else false. template bool RendererCL::ClearBuffer(const string& bufferName, unsigned int width, unsigned int height, unsigned int elementSize) { bool b = true; int kernelIndex = m_Wrapper.FindKernelIndex(m_IterOpenCLKernelCreator.ZeroizeEntryPoint()); unsigned int argIndex = 0; const char* loc = __FUNCTION__; if (kernelIndex != -1) { unsigned int blockW = m_NVidia ? 32 : 16;//Max work group size is 256 on AMD, which means 16x16. unsigned int blockH = m_NVidia ? 32 : 16; unsigned int gridW = width * elementSize; unsigned int gridH = height; OpenCLWrapper::MakeEvenGridDims(blockW, blockH, gridW, gridH); if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex++, bufferName))) { m_ErrorReport.push_back(loc); }//Buffer of unsigned char. 
if (b && !(b = m_Wrapper.SetArg (kernelIndex, argIndex++, width * elementSize))) { m_ErrorReport.push_back(loc); }//Width. if (b && !(b = m_Wrapper.SetArg (kernelIndex, argIndex++, height))) { m_ErrorReport.push_back(loc); }//Height. if (b && !(b = m_Wrapper.RunKernel(kernelIndex, gridW, gridH, 1, blockW, blockH, 1))) { m_ErrorReport.push_back(loc); } } else { b = false; m_ErrorReport.push_back(loc); } return b; } /// /// Private wrapper around calling Gaussian density filtering kernel. /// The parameters are very specific to how the kernel is internally implemented. /// /// Index of the kernel to call /// Grid width /// Grid height /// Block width /// Block height /// Chunk size (gap + 1) /// Row parity /// Column parity /// True if success, else false. template bool RendererCL::RunDensityFilterPrivate(unsigned int kernelIndex, unsigned int gridW, unsigned int gridH, unsigned int blockW, unsigned int blockH, unsigned int chunkSizeW, unsigned int chunkSizeH, unsigned int rowParity, unsigned int colParity) { //Timing t; bool b = true; unsigned int argIndex = 0; const char* loc = __FUNCTION__; if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex, m_HistBufferName))) { m_ErrorReport.push_back(loc); } argIndex++;//Histogram. if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex, m_AccumBufferName))) { m_ErrorReport.push_back(loc); } argIndex++;//Accumulator. if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex, m_DEFilterParamsBufferName))) { m_ErrorReport.push_back(loc); } argIndex++;//FlameDensityFilterCL. if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex, m_DECoefsBufferName))) { m_ErrorReport.push_back(loc); } argIndex++;//Coefs. if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex, m_DEWidthsBufferName))) { m_ErrorReport.push_back(loc); } argIndex++;//Widths. if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex, m_DECoefIndicesBufferName))) { m_ErrorReport.push_back(loc); } argIndex++;//Coef indices. 
if (b && !(b = m_Wrapper.SetArg( kernelIndex, argIndex, chunkSizeW))) { m_ErrorReport.push_back(loc); } argIndex++;//Chunk size width (gapW + 1). if (b && !(b = m_Wrapper.SetArg( kernelIndex, argIndex, chunkSizeH))) { m_ErrorReport.push_back(loc); } argIndex++;//Chunk size height (gapH + 1). if (b && !(b = m_Wrapper.SetArg( kernelIndex, argIndex, rowParity))) { m_ErrorReport.push_back(loc); } argIndex++;//Row parity. if (b && !(b = m_Wrapper.SetArg( kernelIndex, argIndex, colParity))) { m_ErrorReport.push_back(loc); } argIndex++;//Col parity. //t.Toc(__FUNCTION__ " set args"); //t.Tic(); if (b && !(b = m_Wrapper.RunKernel(kernelIndex, gridW, gridH, 1, blockW, blockH, 1))) { m_ErrorReport.push_back(loc); }//Method 7, accumulating to temp box area. //t.Toc(__FUNCTION__ " RunKernel()"); return b; } /// /// Make the Gaussian density filter program and return its index. /// /// The supersample being used for the current ember /// Width of the gaussian filter /// The kernel index if successful, else -1. template int RendererCL::MakeAndGetDensityFilterProgram(size_t ss, unsigned int filterWidth) { string deEntryPoint = m_DEOpenCLKernelCreator.GaussianDEEntryPoint(ss, filterWidth); int kernelIndex = m_Wrapper.FindKernelIndex(deEntryPoint); const char* loc = __FUNCTION__; if (kernelIndex == -1)//Has not been built yet. { string kernel = m_DEOpenCLKernelCreator.GaussianDEKernel(ss, filterWidth); bool b = m_Wrapper.AddProgram(deEntryPoint, kernel, deEntryPoint, m_DoublePrecision); if (b) { kernelIndex = m_Wrapper.FindKernelIndex(deEntryPoint);//Try to find it again, it will be present if successfully built. } else { m_ErrorReport.push_back(string(loc) + "():\nBuilding the following program failed: \n" + kernel + "\n"); //cout << m_ErrorReport.back(); } } return kernelIndex; } /// /// Make the final accumulation program and return its index. /// There are many different kernels for final accum, depending on early clip, alpha channel, and transparency. 
/// Loading all of these in the beginning is too much, so only load the one for the current case being worked with. /// /// Storage for the alpha base value used in the kernel. 0 if transparency is true, else 255. /// Storage for the alpha scale value used in the kernel. 255 if transparency is true, else 0. /// The kernel index if successful, else -1. template int RendererCL::MakeAndGetFinalAccumProgram(T& alphaBase, T& alphaScale) { string finalAccumEntryPoint = m_FinalAccumOpenCLKernelCreator.FinalAccumEntryPoint(EarlyClip(), Renderer::NumChannels(), Transparency(), alphaBase, alphaScale); int kernelIndex = m_Wrapper.FindKernelIndex(finalAccumEntryPoint); const char* loc = __FUNCTION__; if (kernelIndex == -1)//Has not been built yet. { string kernel = m_FinalAccumOpenCLKernelCreator.FinalAccumKernel(EarlyClip(), Renderer::NumChannels(), Transparency()); bool b = m_Wrapper.AddProgram(finalAccumEntryPoint, kernel, finalAccumEntryPoint, m_DoublePrecision); if (b) kernelIndex = m_Wrapper.FindKernelIndex(finalAccumEntryPoint);//Try to find it again, it will be present if successfully built. else m_ErrorReport.push_back(loc); } return kernelIndex; } /// /// Make the gamma correction program for early clipping and return its index. /// /// The kernel index if successful, else -1. template int RendererCL::MakeAndGetGammaCorrectionProgram() { string gammaEntryPoint = m_FinalAccumOpenCLKernelCreator.GammaCorrectionEntryPoint(Renderer::NumChannels(), Transparency()); int kernelIndex = m_Wrapper.FindKernelIndex(gammaEntryPoint); const char* loc = __FUNCTION__; if (kernelIndex == -1)//Has not been built yet. { string kernel = m_FinalAccumOpenCLKernelCreator.GammaCorrectionKernel(Renderer::NumChannels(), Transparency()); bool b = m_Wrapper.AddProgram(gammaEntryPoint, kernel, gammaEntryPoint, m_DoublePrecision); if (b) kernelIndex = m_Wrapper.FindKernelIndex(gammaEntryPoint);//Try to find it again, it will be present if successfully built. 
else m_ErrorReport.push_back(loc); } return kernelIndex; } /// /// Private functions passing data to OpenCL programs. /// /// /// Convert the currently used host side DensityFilter object into a DensityFilterCL object /// for passing to OpenCL. /// /// The DensityFilterCL object template DensityFilterCL RendererCL::ConvertDensityFilter() { DensityFilterCL filterCL; DensityFilter* densityFilter = dynamic_cast*>(GetDensityFilter()); filterCL.m_Supersample = (unsigned int)Supersample(); filterCL.m_SuperRasW = (unsigned int)SuperRasW(); filterCL.m_SuperRasH = (unsigned int)SuperRasH(); filterCL.m_K1 = K1(); filterCL.m_K2 = K2(); if (densityFilter) { filterCL.m_Curve = densityFilter->Curve(); filterCL.m_KernelSize = (unsigned int)densityFilter->KernelSize(); filterCL.m_MaxFilterIndex = (unsigned int)densityFilter->MaxFilterIndex(); filterCL.m_MaxFilteredCounts = (unsigned int)densityFilter->MaxFilteredCounts(); filterCL.m_FilterWidth = (unsigned int)densityFilter->FilterWidth(); } return filterCL; } /// /// Convert the currently used host side SpatialFilter object into a SpatialFilterCL object /// for passing to OpenCL. 
/// /// The SpatialFilterCL object template SpatialFilterCL RendererCL::ConvertSpatialFilter() { T g, linRange, vibrancy; Color background; SpatialFilterCL filterCL; PrepFinalAccumVals(background, g, linRange, vibrancy); filterCL.m_SuperRasW = (unsigned int)SuperRasW(); filterCL.m_SuperRasH = (unsigned int)SuperRasH(); filterCL.m_FinalRasW = (unsigned int)FinalRasW(); filterCL.m_FinalRasH = (unsigned int)FinalRasH(); filterCL.m_Supersample = (unsigned int)Supersample(); filterCL.m_FilterWidth = (unsigned int)GetSpatialFilter()->FinalFilterWidth(); filterCL.m_NumChannels = (unsigned int)Renderer::NumChannels(); filterCL.m_BytesPerChannel = (unsigned int)BytesPerChannel(); filterCL.m_DensityFilterOffset = (unsigned int)DensityFilterOffset(); filterCL.m_Transparency = Transparency(); filterCL.m_YAxisUp = (unsigned int)m_YAxisUp; filterCL.m_Vibrancy = vibrancy; filterCL.m_HighlightPower = HighlightPower(); filterCL.m_Gamma = g; filterCL.m_LinRange = linRange; filterCL.m_Background = background; return filterCL; } /// /// Convert the host side Ember object into an EmberCL object /// for passing to OpenCL. /// /// The Ember object to convert /// The EmberCL object template EmberCL RendererCL::ConvertEmber(Ember& ember) { EmberCL emberCL; memset(&emberCL, 0, sizeof(EmberCL));//Might not really be needed. emberCL.m_RotA = m_RotMat.A(); emberCL.m_RotB = m_RotMat.B(); emberCL.m_RotD = m_RotMat.D(); emberCL.m_RotE = m_RotMat.E(); emberCL.m_CamMat = ember.m_CamMat; emberCL.m_CenterX = CenterX(); emberCL.m_CenterY = CenterY(); emberCL.m_CamZPos = ember.m_CamZPos; emberCL.m_CamPerspective = ember.m_CamPerspective; emberCL.m_CamYaw = ember.m_CamYaw; emberCL.m_CamPitch = ember.m_CamPitch; emberCL.m_CamDepthBlur = ember.m_CamDepthBlur; emberCL.m_BlurCoef = ember.BlurCoef(); for (unsigned int i = 0; i < ember.TotalXformCount() && i < MAX_CL_XFORM; i++)//Copy the relevant values for each xform, capped at the max. 
{ Xform* xform = ember.GetTotalXform(i); emberCL.m_Xforms[i].m_A = xform->m_Affine.A(); emberCL.m_Xforms[i].m_B = xform->m_Affine.B(); emberCL.m_Xforms[i].m_C = xform->m_Affine.C(); emberCL.m_Xforms[i].m_D = xform->m_Affine.D(); emberCL.m_Xforms[i].m_E = xform->m_Affine.E(); emberCL.m_Xforms[i].m_F = xform->m_Affine.F(); emberCL.m_Xforms[i].m_PostA = xform->m_Post.A(); emberCL.m_Xforms[i].m_PostB = xform->m_Post.B(); emberCL.m_Xforms[i].m_PostC = xform->m_Post.C(); emberCL.m_Xforms[i].m_PostD = xform->m_Post.D(); emberCL.m_Xforms[i].m_PostE = xform->m_Post.E(); emberCL.m_Xforms[i].m_PostF = xform->m_Post.F(); emberCL.m_Xforms[i].m_DirectColor = xform->m_DirectColor; emberCL.m_Xforms[i].m_ColorSpeedCache = xform->ColorSpeedCache(); emberCL.m_Xforms[i].m_OneMinusColorCache = xform->OneMinusColorCache(); emberCL.m_Xforms[i].m_Opacity = xform->m_Opacity; emberCL.m_Xforms[i].m_VizAdjusted = xform->VizAdjusted(); for (unsigned int varIndex = 0; varIndex < xform->TotalVariationCount() && varIndex < MAX_CL_VARS; varIndex++)//Assign all variation weights for this xform, with a max of MAX_CL_VARS. emberCL.m_Xforms[i].m_VariationWeights[varIndex] = xform->GetVariation(varIndex)->m_Weight; } return emberCL; } /// /// Convert the host side CarToRas object into a CarToRasCL object /// for passing to OpenCL. /// /// The CarToRas object to convert /// The CarToRasCL object template CarToRasCL RendererCL::ConvertCarToRas(const CarToRas& carToRas) { CarToRasCL carToRasCL; carToRasCL.m_RasWidth = (unsigned int)carToRas.RasWidth(); carToRasCL.m_PixPerImageUnitW = carToRas.PixPerImageUnitW(); carToRasCL.m_RasLlX = carToRas.RasLlX(); carToRasCL.m_PixPerImageUnitH = carToRas.PixPerImageUnitH(); carToRasCL.m_RasLlY = carToRas.RasLlY(); carToRasCL.m_CarLlX = carToRas.CarLlX(); carToRasCL.m_CarLlY = carToRas.CarLlY(); carToRasCL.m_CarUrX = carToRas.CarUrX(); carToRasCL.m_CarUrY = carToRas.CarUrY(); return carToRasCL; } }