#include "EmberPch.h" #include "Renderer.h" namespace EmberNs { ///

/// Constructor that allocates various pieces of memory. ///

template Renderer::Renderer() { //Use a very large number regardless of the size of the output pixels. This should be sufficient granularity, even though //it's technically less than the number of distinct values representable by a 32-bit float. m_Csa.resize(static_cast(CURVES_LENGTH)); //Ensure the renderer at least has sane values for the camera upon startup. //This is needed because due to timing/threading disconnects, the GUI can use the camera //values before the render has started, which will lead to corrupt values. Ember ember; SetEmber(ember, eProcessAction::NOTHING, false); //Manually call these instead of passing true to SetEmber() because it would have created the spatial filter //which we don't want to do until rendering starts (this is so the derived RendererCL can properly create the needed buffers). ComputeBounds(); ComputeQuality(); ComputeCamera(); } ///

/// Non-virtual processing functions. ///

///

/// Add an ember to the end of the embers vector and reset the rendering process. /// Reset the rendering process. ///

/// The ember to add template void Renderer::AddEmber(Ember& ember) { ChangeVal([&] { m_Embers.push_back(ember); if (m_Embers.size() == 1) m_Ember = m_Embers[0]; }, eProcessAction::FULL_RENDER); Prepare(); } ///

/// Set the m_Iterator member to point to the appropriate /// iterator based on whether the ember currently being rendered /// contains xaos. /// After assigning, initialize the xform selection buffer. ///

/// True if assignment and distribution initialization succeeded, else false. template bool Renderer::AssignIterator() { //Setup iterator and distributions. //Both iterator types were setup in the constructor (add more in the future if needed). //So simply assign the pointer to the correct type and re-initialize its distributions //based on the current ember. if (XaosPresent()) m_Iterator = m_XaosIterator.get(); else m_Iterator = m_StandardIterator.get(); //Timing t; return m_Iterator->InitDistributions(m_Ember); //t.Toc("Distrib creation"); } ///

/// Virtual processing functions overridden from RendererBase. ///

///

/// Prepare values for the filters, bounds, quality and camera. ///

template void Renderer::Prepare() { bool b = false; CreateSpatialFilter(b); CreateTemporalFilter(b); ComputeBounds(); ComputeQuality(); ComputeCamera(); m_CarToRas.UpdateCachedHalf(m_CarToRas.CarHalfX(), m_CarToRas.CarHalfY()); } ///

/// Compute the bounds of the histogram and density filtering buffers. /// These are affected by the final requested dimensions, spatial and density /// filter sizes and supersampling. ///

template void Renderer::ComputeBounds() { //Original did a lot of work to compute a gutter that changes size based on various parameters, which seems to be of no benefit. //It also prevents the renderer from only performing filtering or final accum based on a filter parameter change, since that //change may have changed the gutter. //By using a fixed gutter, a filter change can be applied without fully restarting iteration. m_GutterWidth = 10 * Supersample();//Should be enough to fully accommodate most spatial and density filter widths. m_SuperRasW = (Supersample() * FinalRasW()) + (2 * m_GutterWidth); m_SuperRasH = (Supersample() * FinalRasH()) + (2 * m_GutterWidth); m_SuperSize = m_SuperRasW * m_SuperRasH; } ///

/// Compute the scale based on the zoom, then the quality based on the computed scale. /// This must be called before ComputeCamera() which will use scale. ///

template void Renderer::ComputeQuality() { m_Scale = std::pow(static_cast(2), Zoom()); m_ScaledQuality = Quality() * SQR(m_Scale); } ///

/// Compute the camera. /// This sets up the bounds of the cartesian plane that the raster bounds correspond to. /// This must be called after ComputeBounds() which sets up the raster bounds. ///

template void Renderer::ComputeCamera() { m_PixelsPerUnitX = PixelsPerUnit() * m_Scale; m_PixelsPerUnitY = m_PixelsPerUnitX; m_PixelsPerUnitX /= PixelAspectRatio(); T shift = 0; T t0 = static_cast(m_GutterWidth) / (Supersample() * m_PixelsPerUnitX); T t1 = static_cast(m_GutterWidth) / (Supersample() * m_PixelsPerUnitY); //These go from ll to ur, moving from negative to positive. m_LowerLeftX = CenterX() - FinalRasW() / m_PixelsPerUnitX / static_cast(2); m_LowerLeftY = CenterY() - FinalRasH() / m_PixelsPerUnitY / static_cast(2); m_UpperRightX = m_LowerLeftX + FinalRasW() / m_PixelsPerUnitX; m_UpperRightY = m_LowerLeftY + FinalRasH() / m_PixelsPerUnitY; T carLlX = m_LowerLeftX - t0; T carLlY = m_LowerLeftY - t1 + shift; T carUrX = m_UpperRightX + t0; T carUrY = m_UpperRightY + t1 + shift; m_RotMat.MakeID(); m_RotMat.Rotate(-Rotate() * DEG_2_RAD_T); m_CarToRas.Init(carLlX, carLlY, carUrX, carUrY, m_SuperRasW, m_SuperRasH, PixelAspectRatio()); } ///

/// Set the current ember. /// This will also populate the vector of embers with a single element copy /// of the ember passed in. /// Temporal samples will be set to 1 since there's only a single ember. ///

/// The ember to assign /// The requested process action. Note that it's critical the user supply the proper value here. /// For example: Changing dimensions without setting action to eProcessAction::FULL_RENDER will crash the program. /// However, changing only the brightness and setting action to ACCUM_ONLY is perfectly fine. /// Whether to also compute bounds, camera, filters etc. This is useful when other code outside of this needs these values /// before the render actually starts. Default: false. /// template void Renderer::SetEmber(const Ember& ember, eProcessAction action, bool prep) { ChangeVal([&] { m_Embers.clear(); m_Embers.push_back(ember); m_Embers[0].m_TemporalSamples = 1;//Set temporal samples here to 1 because using the real value only makes sense when using a vector of Embers for animation. m_Ember = m_Embers[0]; m_EmbersP = &m_Embers; }, action); if (prep) Prepare(); } ///

/// Copy the embers in the passed in container to the internal vector of embers /// and set the m_Ember member to a copy of the first element. /// Reset the rendering process. ///

/// The container of embers to be copied template template void Renderer::SetEmber(const C& embers) { ChangeVal([&] { CopyCont(m_Embers, embers); m_EmbersP = &m_Embers; if (!m_Embers.empty()) m_Ember = m_Embers[0]; }, eProcessAction::FULL_RENDER); Prepare();//Always prepare with a collection. } ///

/// Move the embers in the passed in vector to the internal vector of embers /// and set the m_Ember member to a copy of the first element. /// Reset the rendering process. /// This is preferred over SetEmber when the size of embers is large and/or /// the caller no longer needs to use the argument after this function returns. ///

/// The vector of embers to be moved template void Renderer::MoveEmbers(vector>& embers) { ChangeVal([&] { m_Embers = std::move(embers); m_EmbersP = &m_Embers; if (!m_Embers.empty()) m_Ember = m_Embers[0]; }, eProcessAction::FULL_RENDER); Prepare(); } template void Renderer::SetExternalEmbersPointer(vector>* embers) { ChangeVal([&] { m_Embers.clear(); m_EmbersP = embers; if (!m_EmbersP->empty()) m_Ember = (*m_EmbersP)[0]; }, eProcessAction::FULL_RENDER); Prepare(); } ///

/// Create the density filter if the current filter parameters differ /// from the last density filter created. /// The filter will be deleted if the max DE radius is 0, in which case regular /// log scale filtering will be used. ///

/// True if a new filter instance was created, else false. /// True if the filter is not nullptr (whether a new one was created or not) or if max rad is 0, else false. template bool Renderer::CreateDEFilter(bool& newAlloc) { //If they wanted DE, create it if needed, else clear the last DE filter which means we'll do regular log filtering after iters are done. newAlloc = false; if (m_Ember.m_MaxRadDE > 0) { //Use intelligent testing so it isn't created every time a new ember is passed in. if ((!m_DensityFilter.get()) || (m_Ember.m_MinRadDE != m_DensityFilter->MinRad()) || (m_Ember.m_MaxRadDE != m_DensityFilter->MaxRad()) || (m_Ember.m_CurveDE != m_DensityFilter->Curve()) || (m_Ember.m_Supersample != m_DensityFilter->Supersample())) { m_DensityFilter = make_unique>(static_cast(m_Ember.m_MinRadDE), static_cast(m_Ember.m_MaxRadDE), static_cast(m_Ember.m_CurveDE), m_Ember.m_Supersample); newAlloc = true; } if (newAlloc) { if (!m_DensityFilter.get()) { return false; }//Did object creation succeed? if (!m_DensityFilter->Create()) { return false; }//Object creation succeeded, did filter creation succeed? } else if (!m_DensityFilter->Valid()) { return false; } //Previously created, are values ok? } else { m_DensityFilter.reset();//They want to do log filtering. Return true because even though the filter is being deleted, nothing went wrong. } return true; } ///

/// Create the spatial filter if the current filter parameters differ /// from the last spatial filter created. ///

/// True if a new filter instance was created, else false. /// True if the filter is not nullptr (whether a new one was created or not), else false. template bool Renderer::CreateSpatialFilter(bool& newAlloc) { newAlloc = false; //Use intelligent testing so it isn't created every time a new ember is passed in. if ((!m_SpatialFilter.get()) || (m_Ember.m_SpatialFilterType != m_SpatialFilter->FilterType()) || (m_Ember.m_SpatialFilterRadius != m_SpatialFilter->FilterRadius()) || (m_Ember.m_Supersample != m_SpatialFilter->Supersample()) || (m_PixelAspectRatio != m_SpatialFilter->PixelAspectRatio())) { m_SpatialFilter = unique_ptr>( SpatialFilterCreator::Create(m_Ember.m_SpatialFilterType, static_cast(m_Ember.m_SpatialFilterRadius), m_Ember.m_Supersample, static_cast(m_PixelAspectRatio))); m_Ember.m_SpatialFilterRadius = m_SpatialFilter->FilterRadius();//It may have been changed internally if it was too small, so ensure they're synced. newAlloc = true; } return m_SpatialFilter.get() != nullptr; } ///

/// Create the temporal filter if the current filter parameters differ /// from the last temporal filter created. ///

/// True if a new filter instance was created, else false. /// True if the filter is not nullptr (whether a new one was created or not), else false. template bool Renderer::CreateTemporalFilter(bool& newAlloc) { newAlloc = false; //static int i = 0; //Use intelligent testing so it isn't created every time a new ember is passed in. if ((!m_TemporalFilter.get()) || (m_Ember.m_TemporalSamples != m_TemporalFilter->TemporalSamples()) || (m_Ember.m_TemporalFilterType != m_TemporalFilter->FilterType()) || (m_Ember.m_TemporalFilterWidth != m_TemporalFilter->FilterWidth()) || (m_Ember.m_TemporalFilterExp != m_TemporalFilter->FilterExp())) { m_TemporalFilter = unique_ptr>( TemporalFilterCreator::Create(m_Ember.m_TemporalFilterType, m_Ember.m_TemporalSamples, m_Ember.m_TemporalFilterWidth, m_Ember.m_TemporalFilterExp)); newAlloc = true; //auto name = TemporalFilterCreator::ToString(m_TemporalFilter->FilterType()); //ostringstream os; //os << "./" << ++i << "_" << name << "_filter.txt"; //ofstream of (os.str()); //auto str = m_TemporalFilter->ToString(); // //if (of.is_open()) // of << str; } return m_TemporalFilter.get() != nullptr; } ///

/// <summary>
/// The main render loop. This is the core of the algorithm.
/// The processing steps are: Iterating, density filtering, final accumulation.
/// Various functions in it are virtual so they will resolve
/// to whatever overrides are provided in derived classes. This
/// future-proofs the algorithm for GPU-based renderers.
/// If the caller calls Abort() at any time, or the progress function returns 0,
/// the entire rendering process will exit as soon as it can.
/// The concept of passes from flam3 has been removed as it was never used.
/// The loop structure is:
/// {
///     Temporal Samples (Default 1 for single image)
///     {
///         Iterate (Either to completion or to a specified number of iterations)
///         {
///         }
///     }
///
///     Density filtering (Basic log, or full density estimation)
///     Final accumulation (Color correction and spatial filtering)
/// }
/// This loop structure has admittedly been severely butchered from what
/// flam3 did. The reason is that it was made to support interactive rendering
/// that can exit the process and pick up where it left off in response to the
/// user changing values in a fractal flame GUI editor.
/// To achieve this, each step in the rendering process is given an enumeration state
/// as well as a goto label. This allows the renderer to pick up in the state it left
/// off in if no changes prohibiting that have been made.
/// It also allows for the bare minimum amount of processing needed to complete the requested
/// action. For example, if the process has completed and the user only adjusts the brightness
/// of the last rendered ember then there is no need to perform the entire iteration process
/// over again. Rather, only final accumulation is needed.
/// NOTE(review): the template argument lists in this function (after template, Renderer, vector, static_cast,
/// ClampGteRef, std::min and Clamp) appear to have been stripped from the file - restore them before compiling.
/// </summary>
/// <param name="finalImage">Storage for the final image. It will be allocated if needed.</param>
/// <param name="time">The time if animating, else ignored.</param>
/// <param name="subBatchCountOverride">Run a specified number of sub batches. Default: 0, meaning run to completion.</param>
/// <param name="forceOutput">True to force rendering a complete image even if iterating is not complete, else don't. Default: false.</param>
/// <param name="finalOffset">Offset in finalImage to store the pixels to. Default: 0.</param>
/// <returns>eRenderStatus::RENDER_OK if nothing went wrong, RENDER_ABORT if the render was aborted, else RENDER_ERROR.</returns>
template eRenderStatus Renderer::Run(vector& finalImage, double time, size_t subBatchCountOverride, bool forceOutput, size_t finalOffset)
{
    m_InRender = true;
    EnterRender();
    m_Abort = false;
    //Snapshot the requested action and whether a previous call left work to resume.
    bool filterAndAccumOnly = m_ProcessAction == eProcessAction::FILTER_AND_ACCUM;
    bool accumOnly = m_ProcessAction == eProcessAction::ACCUM_ONLY;
    bool resume = m_ProcessState != eProcessState::NONE;
    bool newFilterAlloc;//Out parameter for the filter creators, written before any use.
    size_t temporalSample = 0;
    T deTime;
    auto success = eRenderStatus::RENDER_OK;

    //Reset timers and progress percent if: Beginning anew or only filtering and/or accumulating.
    if (!resume || accumOnly || filterAndAccumOnly)
    {
        if (!resume)//Only set this if it's the first run through.
            m_ProcessState = eProcessState::ITER_STARTED;

        m_ProgressTimer.Tic();
    }

    if (!resume)//Beginning, reset everything.
    {
        m_RenderTimer.Tic();
        m_LastTemporalSample = 0;
        m_LastIter = 0;
        m_LastIterPercent = 0;
        m_Stats.Clear();
        m_Gamma = 0;
        m_Vibrancy = 0;//Accumulate these after each temporal sample.
        m_VibGamCount = 0;
        m_CurvesSet = false;
        m_Background.Clear();
    }
    //User requested an increase in quality after finishing.
    else if (m_ProcessState == eProcessState::ITER_STARTED && m_ProcessAction == eProcessAction::KEEP_ITERATING && TemporalSamples() == 1)
    {
        m_RenderTimer.Tic();
        m_LastTemporalSample = 0;
        m_LastIter = m_Stats.m_Iters;//Continue counting from the iterations already done.
        m_LastIterPercent = 0;//Might skip a progress update, but shouldn't matter.
        m_Gamma = 0;
        m_Vibrancy = 0;
        m_VibGamCount = 0;
        m_Background.Clear();
        ComputeQuality();//Must recompute quality when doing a quality increase.
    }

    //Make sure values are within valid range.
    ClampGteRef(m_Ember.m_Supersample, static_cast(1));

    //Make sure to get most recent update since loop won't be entered to call Interp().
    //Vib, gam and background are normally summed for each temporal sample. However if iteration is skipped, make sure to get the latest.
    if ((filterAndAccumOnly || accumOnly) && TemporalSamples() == 1)//Disallow jumping when temporal samples > 1.
    {
        m_Ember = (*m_EmbersP)[0];
        m_Vibrancy = Vibrancy();
        m_Gamma = Gamma();
        m_Background = m_Ember.m_Background;

        //Skip iterating entirely and jump straight to the requested stage.
        if (filterAndAccumOnly)
            goto FilterAndAccum;

        if (accumOnly)
            goto AccumOnly;
    }

    //it.Tic();
    //Interpolate.
    if (m_EmbersP->size() > 1)
        m_Interpolater.Interpolate(*m_EmbersP, static_cast(time), 0, m_Ember);

    //it.Toc("Interp 1");

    //Save only for palette insertion.
    if (m_InsertPalette)
        m_TempEmber = m_Ember;

    if (!resume)//Only need to create this when starting a new render.
    {
        CreateSpatialFilter(newFilterAlloc);//Will be checked and recreated again if necessary right before final output.
        CreateTemporalFilter(newFilterAlloc);//But create here just to ensure allocation succeeded.
        ComputeBounds();
    }

    //Both filters are dereferenced below, so neither may be missing.
    if (m_SpatialFilter.get() == nullptr || m_TemporalFilter.get() == nullptr)
    {
        AddToReport("Spatial and temporal filter allocations failed, aborting.\n");
        success = eRenderStatus::RENDER_ERROR;
        goto Finish;
    }

    if (!resume && !Alloc())
    {
        AddToReport("Histogram, accumulator and samples buffer allocations failed, aborting.\n");
        success = eRenderStatus::RENDER_ERROR;
        goto Finish;
    }

    if (!resume)
    {
        if (!ResetBuckets(true, false))//Only reset hist here and do accum when needed later on.
        {
            success = eRenderStatus::RENDER_ERROR;
            goto Finish;
        }
    }

    //Time of the first temporal filter delta.
    deTime = static_cast(time) + *m_TemporalFilter->Deltas();

    //Interpolate and get an ember for DE purposes.
    //Additional interpolation will be done in the temporal samples loop.
    //it.Tic();
    if (m_EmbersP->size() > 1)
        m_Interpolater.Interpolate(*m_EmbersP, deTime, 0, m_Ember);

    //it.Toc("Interp 2");
    //Keep both DE radii non-negative and the max no smaller than the min.
    ClampGteRef(m_Ember.m_MinRadDE, 0);
    ClampGteRef(m_Ember.m_MaxRadDE, 0);
    ClampGteRef(m_Ember.m_MaxRadDE, m_Ember.m_MinRadDE);

    if (!CreateDEFilter(newFilterAlloc))//Will be checked and recreated again if necessary right before density filtering.
    {
        AddToReport("Density filter creation failed, aborting.\n");
        success = eRenderStatus::RENDER_ERROR;
        goto Finish;
    }

    //Temporal samples, loop 1.
    //When resuming, pick up at the temporal sample the previous call stopped on.
    temporalSample = resume ? m_LastTemporalSample : 0;

    for (; (temporalSample < TemporalSamples()) && !m_Abort;)
    {
        T colorScalar = m_TemporalFilter->Filter()[temporalSample];
        T temporalTime = static_cast(time) + m_TemporalFilter->Deltas()[temporalSample];

        //Interpolate again.
        //it.Tic();
        if (TemporalSamples() > 1 && m_EmbersP->size() > 1)
            m_Interpolater.Interpolate(*m_EmbersP, temporalTime, 0, m_Ember);//This will perform all necessary precalcs via the ember/xform/variation assignment operators.

        //it.Toc("Interp 3");

        if (!resume && !AssignIterator())
        {
            AddToReport("Iterator assignment failed, aborting.\n");
            success = eRenderStatus::RENDER_ERROR;
            goto Finish;
        }

        //Do this every iteration for an animation, or else do it once for a single image.
        if (TemporalSamples() > 1 || !resume)
        {
            ComputeQuality();
            ComputeCamera();
            //m_CarToRas.UpdateCachedHalf(m_CarToRas.CarHalfX(), m_CarToRas.CarHalfY());
            MakeDmap(colorScalar);//For each temporal sample, the palette m_Dmap needs to be re-created with color scalar. 1 if no temporal samples.
        }

        //The actual number of times to iterate. Each thread will get (totalIters / ThreadCount) iters to do.
        //This is based on zoom and scale calculated in ComputeQuality().
        //Note that the iter count is based on the final image dimensions, and not the super sampled dimensions.
        size_t itersPerTemporalSample = ItersPerTemporalSample();//The total number of iterations for this temporal sample without overrides.
        size_t sampleItersToDo;//The number of iterations to actually do in this sample, considering overrides.

        if (subBatchCountOverride > 0)
            sampleItersToDo = subBatchCountOverride * SubBatchSize() * ThreadCount();//Run a specific number of sub batches.
        else
            sampleItersToDo = itersPerTemporalSample;//Run as many iters as specified to complete this temporal sample.

        //Never do more than what remains for this temporal sample.
        //NOTE(review): the subtraction is on size_t, so it presumably relies on m_LastIter <= itersPerTemporalSample - confirm.
        sampleItersToDo = std::min(sampleItersToDo, itersPerTemporalSample - m_LastIter);
        EmberStats stats = Iterate(sampleItersToDo, temporalSample);//The heavy work is done here.

        //Abort does not indicate an error, it just means the process was interrupted, most likely by the user on the GUI.
        if (m_Abort)
        {
            success = eRenderStatus::RENDER_ABORT;
            goto Finish;
        }

        //If no iters were executed, something went catastrophically wrong.
        if (!stats.m_Success && stats.m_Iters == 0)
        {
            AddToReport("Zero iterations ran, rendering failed, aborting.\n");
            success = eRenderStatus::RENDER_ERROR;
            Abort();
            goto Finish;
        }

        //Accumulate stats whether this batch ran to completion or exited prematurely.
        m_LastIter += stats.m_Iters;//Sum of iter count of all threads, reset each temporal sample.
        m_Stats.m_Iters += stats.m_Iters;//Sum of iter count of all threads, cumulative from beginning to end.
        m_Stats.m_Badvals += stats.m_Badvals;
        m_Stats.m_IterMs += stats.m_IterMs;

        //After each temporal sample, accumulate these.
        //Allow for incremental rendering by only taking action if the iter loop for this temporal sample is completely done.
        if (m_LastIter >= itersPerTemporalSample)
        {
            m_Vibrancy += Vibrancy();
            m_Gamma += Gamma();
            m_Background.r += static_cast(m_Ember.m_Background.r);
            m_Background.g += static_cast(m_Ember.m_Background.g);
            m_Background.b += static_cast(m_Ember.m_Background.b);
            m_VibGamCount++;
            m_LastIter = 0;
            temporalSample++;
        }

        //Remember where to pick up on the next call.
        m_LastTemporalSample = temporalSample;

        if (subBatchCountOverride > 0)//Don't keep going through this loop if only doing an incremental render.
            break;
    }//Temporal samples.

    //If we've completed all temporal samples, then it was a complete render, so report progress.
    if (temporalSample >= TemporalSamples())
    {
        m_ProcessState = eProcessState::ITER_DONE;

        if (m_Callback && !m_Callback->ProgressFunc(m_Ember, m_ProgressParameter, 100.0, 0, 0))
        {
            Abort();
            success = eRenderStatus::RENDER_ABORT;
            goto Finish;
        }
    }

FilterAndAccum:

    //Density filtering stage. Entered when requested directly, when iterating is complete, or when output is forced.
    if (filterAndAccumOnly || temporalSample >= TemporalSamples() || forceOutput)
    {
        //t.Toc("Iterating and accumulating");
        //Compute k1 and k2.
        auto fullRun = eRenderStatus::RENDER_OK;//Whether density filtering was run to completion without aborting prematurely or triggering an error.
        T area = FinalRasW() * FinalRasH() / (m_PixelsPerUnitX * m_PixelsPerUnitY);//Need to use temps from field if ever implemented.
        m_K1 = Brightness();

        //Compute k2 unless the ember supplies a non-zero one. A forced output always computes it.
        if (!m_Ember.m_K2 || forceOutput)
        {
            //When doing an interactive render, force output early on in the render process, before all iterations are done.
            //This presents a problem with the normal calculation of K2 since it relies on the quality value; it will scale the colors
            //to be very dark. Correct it by pretending the number of iters done is the exact quality desired and then scale according to that.
            if (forceOutput)
            {
                T quality = (static_cast(m_Stats.m_Iters) / static_cast(FinalDimensions())) * (m_Scale * m_Scale);
                m_K2 = static_cast((Supersample() * Supersample()) / (area * quality * m_TemporalFilter->SumFilt()));
            }
            else
                m_K2 = static_cast((Supersample() * Supersample()) / (area * m_ScaledQuality * m_TemporalFilter->SumFilt()));
        }
        else
            m_K2 = static_cast(m_Ember.m_K2);

        if (!ResetBuckets(false, true))//Only the histogram was reset above, now reset the density filtering buffer.
        {
            success = eRenderStatus::RENDER_ERROR;
            goto Finish;
        }

        //t.Tic();
        //Make sure a density filter was created with the latest values.
        ClampGteRef(m_Ember.m_MinRadDE, 0);
        ClampGteRef(m_Ember.m_MaxRadDE, 0);
        ClampGteRef(m_Ember.m_MaxRadDE, m_Ember.m_MinRadDE);
        CreateDEFilter(newFilterAlloc);

        //Apply appropriate filter if iterating is complete.
        if (filterAndAccumOnly || temporalSample >= TemporalSamples())
        {
            //A density filter only exists when DE was requested, otherwise fall back to log scaling.
            fullRun = m_DensityFilter.get() ? GaussianDensityFilter() : LogScaleDensityFilter(forceOutput);
        }
        else
        {
            //Apply requested filter for a forced output during interactive rendering.
            if (m_DensityFilter.get() && m_InteractiveFilter == eInteractiveFilter::FILTER_DE)
                fullRun = GaussianDensityFilter();
            else if (!m_DensityFilter.get() || m_InteractiveFilter == eInteractiveFilter::FILTER_LOG)
                fullRun = LogScaleDensityFilter(forceOutput);
        }

        //Only update state if iterating and filtering finished completely (didn't arrive here via forceOutput).
        if (fullRun == eRenderStatus::RENDER_OK && m_ProcessState == eProcessState::ITER_DONE)
            m_ProcessState = eProcessState::FILTER_DONE;

        //Take special action if filtering exited prematurely.
        if (fullRun != eRenderStatus::RENDER_OK)
        {
            if (!ResetBuckets(false, true))//Reset the accumulator, come back and try again on the next call.
                success = eRenderStatus::RENDER_ERROR;
            else
                success = fullRun;

            goto Finish;
        }

        if (m_Abort)
        {
            success = eRenderStatus::RENDER_ABORT;
            goto Finish;
        }

        //t.Toc("Density estimation filtering time: ", true);
    }

AccumOnly:

    //Final accumulation stage. Entered when filtering finished completely, or when output is forced.
    if (m_ProcessState == eProcessState::FILTER_DONE || forceOutput)
    {
        //Original only allowed stages 0 and 1. Add 2 to mean final accum.
        //Do not update state/progress on forced output because it will be immediately overwritten.
        if (m_Callback && !forceOutput && !m_Callback->ProgressFunc(m_Ember, m_ProgressParameter, 0, 2, 0))
        {
            Abort();
            success = eRenderStatus::RENDER_ABORT;
            goto Finish;
        }

        //Make sure a filter has been created.
        CreateSpatialFilter(newFilterAlloc);
        //Gutter width minus half of the amount the final filter width exceeds the supersample, clamped to [0, gutter width].
        m_DensityFilterOffset = m_GutterWidth - static_cast(Clamp((static_cast(m_SpatialFilter->FinalFilterWidth()) - static_cast(Supersample())) / 2, 0, static_cast(m_GutterWidth)));
        m_CurvesSet = m_Ember.m_Curves.CurvesSet();
        ComputeCurves();//Color curves must be re-calculated as well.

        if (AccumulatorToFinalImage(finalImage, finalOffset) == eRenderStatus::RENDER_OK)
        {
            m_Stats.m_RenderMs += m_RenderTimer.Toc();//Record total time from the very beginning to the very end, including all intermediate calls.
            //Even though the ember changes throughout the inner loops because of interpolation, it's probably ok to assign here.
            //This will hold the last interpolated value (even though spatial and temporal filters were created based off of one of the first interpolated values).
            m_LastEmber = m_Ember;

            if (m_ProcessState == eProcessState::FILTER_DONE)//Only update state if gotten here legitimately, and not via forceOutput.
            {
                m_ProcessState = eProcessState::ACCUM_DONE;

                if (m_Callback && !m_Callback->ProgressFunc(m_Ember, m_ProgressParameter, 100.0, 2, 0))//Finished.
                {
                    Abort();
                    success = eRenderStatus::RENDER_ABORT;
                    goto Finish;
                }
            }
        }
        else
        {
            success = eRenderStatus::RENDER_ERROR;
        }
    }

Finish:

    //Every exit path comes through here so that LeaveRender() is always paired with the EnterRender() above.
    if (success == eRenderStatus::RENDER_OK && m_Abort)//If everything ran ok, but they've aborted, record abort as the status.
        success = eRenderStatus::RENDER_ABORT;
    else if (success != eRenderStatus::RENDER_OK)//Regardless of abort status, if there was an error, leave that as the return status.
        Abort();

    LeaveRender();
    m_InRender = false;
    return success;
}
///

/// Return EmberImageComments object with image comments filled out. /// Run() should have completed before calling this. ///

/// The depth of the edit tags /// If true, embed a hexadecimal palette instead of Xml Color tags, else use Xml color tags. /// The EmberImageComments object with image comments filled out template EmberImageComments Renderer::ImageComments(const EmberStats& stats, size_t printEditDepth, bool hexPalette) { ostringstream ss; EmberImageComments comments; ss.imbue(std::locale("")); comments.m_Genome = m_EmberToXml.ToString(m_Ember, "", printEditDepth, false, hexPalette); ss << (static_cast(stats.m_Badvals) / static_cast(stats.m_Iters));//Percentage of bad values to iters. comments.m_Badvals = ss.str(); ss.str(""); ss << stats.m_Iters; comments.m_NumIters = ss.str(); ss.str("");//Total iters. ss << (stats.m_RenderMs / 1000.0); comments.m_Runtime = ss.str();//Number of seconds for iterating, accumulating and filtering. return comments; } ///

/// New virtual functions to be overridden in derived renderers that use the GPU, but not accessed outside. ///

///

/// Make the final palette used for iteration. ///

/// The color scalar to multiply the ember's palette by template void Renderer::MakeDmap(T colorScalar) { m_Ember.m_Palette.template MakeDmap(m_Dmap, static_cast(colorScalar)); } ///

/// Allocate various buffers if the image dimensions, thread count, or sub batch size /// has changed. ///

/// True if success, else false template bool Renderer::Alloc(bool histOnly) { auto b = true; const auto lock = (m_SuperSize != m_HistBuckets.size()) || (m_SuperSize != m_AccumulatorBuckets.size()) || (m_ThreadsToUse != m_Samples.size()) || (m_Samples[0].size() != SubBatchSize()); if (lock) EnterResize(); if (m_SuperSize != m_HistBuckets.size()) { m_HistBuckets.resize(m_SuperSize); if (m_ReclaimOnResize) m_HistBuckets.shrink_to_fit(); b &= (m_HistBuckets.size() == m_SuperSize); } if (histOnly) { if (lock) LeaveResize(); return b; } if (m_SuperSize != m_AccumulatorBuckets.size()) { m_AccumulatorBuckets.resize(m_SuperSize); if (m_ReclaimOnResize) m_AccumulatorBuckets.shrink_to_fit(); b &= (m_AccumulatorBuckets.size() == m_SuperSize); } if (m_ThreadsToUse != m_Samples.size()) { m_Samples.resize(m_ThreadsToUse); if (m_ReclaimOnResize) m_Samples.shrink_to_fit(); b &= (m_Samples.size() == m_ThreadsToUse); } for (auto& sample : m_Samples) { if (sample.size() != SubBatchSize()) { sample.resize(SubBatchSize()); if (m_ReclaimOnResize) sample.shrink_to_fit(); b &= (sample.size() == SubBatchSize()); } } if (!m_StandardIterator.get()) m_StandardIterator = make_unique>(); if (!m_XaosIterator.get()) m_XaosIterator = make_unique>(); if (lock) LeaveResize(); return b; } ///

/// Clear histogram and/or density filtering buffers to all zeroes. ///

/// Clear histogram if true, else don't. /// Clear density filtering buffer if true, else don't. /// True if anything was cleared, else false. template bool Renderer::ResetBuckets(bool resetHist, bool resetAccum) { //parallel_invoke( //[&] //{ if (resetHist && !m_HistBuckets.empty()) Memset(m_HistBuckets); //}, //[&] //{ if (resetAccum && !m_AccumulatorBuckets.empty()) Memset(m_AccumulatorBuckets); //}); return resetHist || resetAccum; } ///

/// THIS IS UNUSED. /// Log scales a single row with a specially structured loop that will be vectorized by the compiler. /// Note this adds an epsilon to the denomiator used to compute the logScale /// value because the conditional check for zero would have prevented the loop from /// being vectorized. ///

/// The absolute element index in the histogram this row starts on /// The absolute element index in the histogram this row ends on template void Renderer::VectorizedLogScale(size_t row, size_t rowEnd) { const auto k1 = static_cast(m_K1);//All types must be float. const auto k2 = static_cast(m_K2); auto* __restrict hist = m_HistBuckets.data();//Vectorizer can't tell these point to different locations. auto* __restrict acc = m_AccumulatorBuckets.data(); for (size_t i = row; i < rowEnd; i++) { const float logScale = (k1 * std::log(1.0f + hist[i].a * k2)) / (hist[i].a + std::numeric_limits::epsilon()); acc[i].r = hist[i].r * logScale;//Must break these out individually. Vectorizer can't reason about vec4's overloaded * operator. acc[i].g = hist[i].g * logScale; acc[i].b = hist[i].b * logScale; acc[i].a = hist[i].a * logScale; } } ///

/// Perform log scale density filtering. /// Base case for simple log scale density estimation as discussed (mostly) in the paper /// in section 4, p. 6-9. ///

/// Whether this output was forced due to an interactive render /// True if not prematurely aborted, else false. template eRenderStatus Renderer::LogScaleDensityFilter(bool forceOutput) { size_t startRow = 0; size_t endRow = m_SuperRasH; size_t endCol = m_SuperRasW; //Timing t(4); //Original didn't parallelize this, doing so gives a 50-75% speedup. //The value can be directly assigned, which is quicker than summing. parallel_for(startRow, endRow, static_cast(1), [&](size_t j) { size_t row = j * m_SuperRasW; size_t rowEnd = row + endCol; if (!m_Abort) { for (size_t i = row; i < rowEnd; i++) { //Check for visibility first before doing anything else to avoid all possible unnecessary calculations. if (m_HistBuckets[i].a != 0) { const bucketT logScale = (m_K1 * std::log(1 + m_HistBuckets[i].a * m_K2)) / m_HistBuckets[i].a; //Original did a temporary assignment, then *= logScale, then passed the result to bump_no_overflow(). //Combine here into one operation for a slight speedup. //Vectorized version: bucketT* __restrict hist = glm::value_ptr(m_HistBuckets[i]);//Vectorizer can't tell these point to different locations. bucketT* __restrict acc = glm::value_ptr(m_AccumulatorBuckets[i]); for (size_t v = 0; v < 4; v++)//Vectorized by compiler. acc[v] = hist[v] * logScale; } } } } #if defined(_WIN32) || defined(__APPLE__) , tbb::static_partitioner() #endif ); if (m_Callback && !m_Abort) if (!m_Callback->ProgressFunc(m_Ember, m_ProgressParameter, 100.0, 1, 0)) Abort(); //t.Toc(__FUNCTION__); return m_Abort ? eRenderStatus::RENDER_ABORT : eRenderStatus::RENDER_OK; } ///

/// <summary>
/// Perform the more advanced Gaussian density filter.
/// More advanced density estimation filtering given less mention in the paper, but used
/// much more in practice as it gives the best results.
/// Section 8, p. 11-13.
/// The rows are split into one contiguous chunk per thread. For every histogram bucket with hits,
/// a filter kernel is selected from the local density and the log scaled bucket is added, weighted
/// by the kernel coefficients, to the surrounding accumulator buckets.
/// </summary>
/// <returns>RENDER_OK if not prematurely aborted, else RENDER_ABORT.</returns>
template eRenderStatus Renderer::GaussianDensityFilter()
{
	Timing totalTime, localTime;
	bool scf = !(Supersample() & 1);//True when the supersample value is even.
	intmax_t ss = Floor(Supersample() / static_cast(2));//Half width of the box used to measure local density.
	T scfact = std::pow(Supersample() / (Supersample() + static_cast(1)), static_cast(2));
	size_t threads = m_ThreadsToUse;
	size_t startRow = Supersample() - 1;
	size_t endRow = m_SuperRasH - (Supersample() - 1);//Original did + which is most likely wrong.
	intmax_t startCol = Supersample() - 1;
	intmax_t endCol = m_SuperRasW - (Supersample() - 1);
	size_t chunkSize = static_cast(std::ceil(static_cast(endRow - startRow) / static_cast(threads)));//Rows per thread, rounded up.
	//parallel_for scales very well, dividing the work almost perfectly among all processors.
	//NOTE(review): kernels centered near a chunk boundary add to accumulator rows that belong to a
	//neighboring thread's chunk, and AddToAccum() takes no lock - confirm this is acceptable.
	parallel_for(static_cast(0), threads, static_cast(1), [&] (size_t threadIndex)
	{
		size_t pixelNumber = 0;
		const auto localStartRow = static_cast(std::min(startRow + (threadIndex * chunkSize), endRow - 1));
		const auto localEndRow = static_cast(std::min(localStartRow + chunkSize, endRow));
		const size_t pixelsThisThread = static_cast(localEndRow - localStartRow) * m_SuperRasW;
		double lastPercent = 0;
		tvec4 logScaleBucket;

		for (intmax_t j = localStartRow; (j < localEndRow) && !m_Abort; j++)
		{
			const auto buckets = m_HistBuckets.data();
			const auto bucketRowStart = buckets + (j * m_SuperRasW);//Pull out of inner loop for optimization.
			const auto filterCoefs = m_DensityFilter->Coefs();
			const auto filterWidths = m_DensityFilter->Widths();

			for (intmax_t i = startCol; i < endCol; i++)
			{
				intmax_t ii, jj, arrFilterWidth;
				size_t filterSelectInt, filterCoefIndex;
				T filterSelect = 0;
				auto bucket = bucketRowStart + i;

				//Don't do anything if there's no hits here. Must also put this first to avoid dividing by zero below.
				if (bucket->a == 0)
					continue;

				const bucketT cacheLog = (m_K1 * std::log(1 + bucket->a * m_K2)) / bucket->a;//Caching this calculation gives a 30% speedup.

				if (ss == 0)
				{
					filterSelect = bucket->a;
				}
				else
				{
					//The original contained a glaring flaw as it would run past the boundaries of the buffers
					//when calculating the density for a box centered on the last row or column.
					//Clamp here to not run over the edge.
					const intmax_t densityBoxLeftX = (i - std::min(i, ss));
					const intmax_t densityBoxRightX = (i + std::min(ss, static_cast(m_SuperRasW) - i - 1));
					const intmax_t densityBoxTopY = (j - std::min(j, ss));
					const intmax_t densityBoxBottomY = (j + std::min(ss, static_cast(m_SuperRasH) - j - 1));

					//Count density in ssxss area.
					//Original went one col at a time, which is cache inefficient. Go one row at at time here for a slight speedup.
					for (jj = densityBoxTopY; jj <= densityBoxBottomY; jj++)
						for (ii = densityBoxLeftX; ii <= densityBoxRightX; ii++)
							filterSelect += buckets[ii + (jj * m_SuperRasW)].a;//Original divided by 255 in every iteration. Omit here because colors are already in the range of [0..1].
				}

				//Scale if supersample > 1 for equal iters.
				if (scf)
					filterSelect *= scfact;

				//Map the measured density to a kernel index.
				if (filterSelect > m_DensityFilter->MaxFilteredCounts())
					filterSelectInt = m_DensityFilter->MaxFilterIndex();
				else if (filterSelect <= DE_THRESH)
					filterSelectInt = static_cast(std::ceil(filterSelect)) - 1;
				else
					filterSelectInt = DE_THRESH + static_cast(Floor(std::pow(filterSelect - DE_THRESH, m_DensityFilter->Curve())));

				//If the selected index is above the maximum filter index, clamp it to the maximum.
				//NOTE(review): filterSelectInt is unsigned, so this presumably also catches a wrap-around from
				//the "ceil - 1" branch above when ceil(filterSelect) is 0 - confirm against the cast's target type.
				if (filterSelectInt > m_DensityFilter->MaxFilterIndex())
					filterSelectInt = m_DensityFilter->MaxFilterIndex();

				//Only have to calculate the values for ~1/8 of the square.
				filterCoefIndex = filterSelectInt * m_DensityFilter->KernelSize();
				arrFilterWidth = static_cast(std::ceil(filterWidths[filterSelectInt])) - 1;

				//Walk one octant of the kernel (0 <= ii <= jj) and mirror each coefficient to the symmetric positions.
				for (jj = 0; jj <= arrFilterWidth; jj++)
				{
					for (ii = 0; ii <= jj; ii++, filterCoefIndex++)
					{
						//Skip if coef is 0.
						if (filterCoefs[filterCoefIndex] == 0)
							continue;

						bucketT logScale = filterCoefs[filterCoefIndex] * cacheLog;
						//Original first assigned the fields, then scaled them. Combine into a single step for a 1% optimization.
						logScaleBucket = (*bucket * logScale);

						if (jj == 0 && ii == 0)//Center: one position.
						{
							AddToAccum(logScaleBucket, i, ii, j, jj);
						}
						else if (ii == 0)//On an axis: four positions.
						{
							AddToAccum(logScaleBucket, i, 0, j, -jj);
							AddToAccum(logScaleBucket, i, -jj, j, 0);
							AddToAccum(logScaleBucket, i, jj, j, 0);
							AddToAccum(logScaleBucket, i, 0, j, jj);
						}
						else if (jj == ii)//On a diagonal: four positions.
						{
							AddToAccum(logScaleBucket, i, -ii, j, -jj);
							AddToAccum(logScaleBucket, i, ii, j, -jj);
							AddToAccum(logScaleBucket, i, -ii, j, jj);
							AddToAccum(logScaleBucket, i, ii, j, jj);
						}
						else//Everywhere else: eight positions.
						{
							//Attempting to optimize cache access by putting these in order makes no difference, even on large images, but do it anyway.
							AddToAccum(logScaleBucket, i, -ii, j, -jj);
							AddToAccum(logScaleBucket, i, ii, j, -jj);
							AddToAccum(logScaleBucket, i, -jj, j, -ii);
							AddToAccum(logScaleBucket, i, jj, j, -ii);
							AddToAccum(logScaleBucket, i, -jj, j, ii);
							AddToAccum(logScaleBucket, i, jj, j, ii);
							AddToAccum(logScaleBucket, i, -ii, j, jj);
							AddToAccum(logScaleBucket, i, ii, j, jj);
						}
					}
				}
			}

			//Only the first thread reports progress, once per finished row, using its own chunk as the estimate.
			if (m_Callback && threadIndex == 0)
			{
				pixelNumber += m_SuperRasW;
				const auto percent = (static_cast(pixelNumber) / static_cast(pixelsThisThread)) * 100.0;
				const auto percentDiff = percent - lastPercent;
				const auto toc = localTime.Toc();

				//Call the callback when either 10% has passed, or the timer reads more than 1000 and at least 1% has passed.
				if (percentDiff >= 10 || (toc > 1000 && percentDiff >= 1))
				{
					const auto etaMs = ((100.0 - percent) / percent) * totalTime.Toc();

					if (!m_Callback->ProgressFunc(m_Ember, m_ProgressParameter, percent, 1, etaMs))
						Abort();

					lastPercent = percent;
					localTime.Tic();
				}
			}
		}
	}
#if defined(_WIN32) || defined(__APPLE__)
	, tbb::static_partitioner()
#endif
	);

	//Final 100% report; the return value of the callback is ignored here.
	if (m_Callback && !m_Abort)
		m_Callback->ProgressFunc(m_Ember, m_ProgressParameter, 100.0, 1, 0);

	//totalTime.Toc(__FUNCTION__);
	return m_Abort ? eRenderStatus::RENDER_ABORT : eRenderStatus::RENDER_OK;
}

/// <summary>
/// Produce a final, visible image by clipping, gamma correcting and spatial filtering the color values
/// in the density filtering buffer and save to the passed in buffer.
/// The final accumulation section is entered on entry and left on every return path.
/// </summary>
/// <param name="pixels">The pixel vector to allocate and store the final image in</param>
/// <param name="finalOffset">Offset in the buffer to store the pixels to</param>
/// <returns>RENDER_OK if not prematurely aborted, RENDER_ERROR if the pixel vector could not be prepared, else RENDER_ABORT.</returns>
template eRenderStatus Renderer::AccumulatorToFinalImage(vector& pixels, size_t finalOffset)
{
	EnterFinalAccum();

	if (!PrepFinalAccumVector(pixels))
	{
		LeaveFinalAccum();
		return eRenderStatus::RENDER_ERROR;
	}

	//Timing t(4);
	const size_t filterWidth = m_SpatialFilter->FinalFilterWidth();
	bucketT g, linRange, vibrancy;
	Color background;
	auto p = pixels.data();
	p += finalOffset;
	PrepFinalAccumVals(background, g, linRange, vibrancy);//After this, background, g, linRange and vibrancy hold the values computed by PrepFinalAccumVals().

	//If early clip, go through the entire accumulator and perform gamma correction first.
	//The original does it this way as well and it's roughly 11 times faster to do it this way than inline below with each pixel.
	if (EarlyClip())
	{
		parallel_for(static_cast(0), m_SuperRasH, static_cast(1), [&](size_t j)
		{
			auto rowStart = m_AccumulatorBuckets.data() + (j * m_SuperRasW);//Pull out of inner loop for optimization.
			const auto rowEnd = rowStart + m_SuperRasW;

			while (rowStart < rowEnd && !m_Abort)//Use the pointer itself as the offset to save an extra addition per iter.
			{
				GammaCorrection(*rowStart, background, g, linRange, vibrancy, false, glm::value_ptr(*rowStart));//Write back in place.
				rowStart++;
			}
		}
#if defined(_WIN32) || defined(__APPLE__)
		, tbb::static_partitioner()
#endif
		);
	}

	if (m_Abort)
	{
		LeaveFinalAccum();
		return eRenderStatus::RENDER_ABORT;
	}

	//Note that abort is not checked here. The final accumulation must run to completion
	//otherwise artifacts that resemble page tearing will occur in an interactive run. It's
	//critical to never exit this loop prematurely.
	//for (size_t j = 0; j < FinalRasH(); j++)//Keep around for debugging.
	parallel_for(static_cast(0), FinalRasH(), static_cast(1), [&](size_t j)
	{
		Color newBucket;
		size_t pixelsRowStart = (m_YAxisUp ? ((FinalRasH() - j) - 1) : j) * FinalRasW();//Pull out of inner loop for optimization. Output rows are written bottom-up when m_YAxisUp is set.
		size_t y = m_DensityFilterOffset + (j * Supersample());//Start at the beginning row of each super sample block.
		size_t clampedFilterH = std::min(filterWidth, m_SuperRasH - y);//Make sure the filter doesn't go past the bottom of the gutter.
		auto pv4T = p + pixelsRowStart;

		for (size_t i = 0; i < FinalRasW(); i++, pv4T++)
		{
			size_t ii, jj;
			const size_t x = m_DensityFilterOffset + (i * Supersample());//Start at the beginning column of each super sample block.
			const size_t clampedFilterW = std::min(filterWidth, m_SuperRasW - x);//Make sure the filter doesn't go past the right of the gutter.
			newBucket.Clear();

			//Original was iterating column-wise, which is slow.
			//Here, iterate one row at a time, giving a 10% speed increase.
			for (jj = 0; jj < clampedFilterH; jj++)
			{
				size_t filterKRowIndex = jj * filterWidth;//Use the full, non-clamped width to get the filter value.
				size_t accumRowIndex = (y + jj) * m_SuperRasW;//Pull out of inner loop for optimization.

				for (ii = 0; ii < clampedFilterW; ii++)
				{
					//Need to dereference the spatial filter pointer object to use the [] operator. Makes no speed difference.
					bucketT k = ((*m_SpatialFilter)[filterKRowIndex + ii]);
					newBucket += (m_AccumulatorBuckets[accumRowIndex + (x + ii)] * k);
				}
			}

			//Gamma correct the filtered sum and write the four channels straight into the output pixel.
			auto pf = reinterpret_cast(pv4T);
			GammaCorrection(*(reinterpret_cast*>(&newBucket)), background, g, linRange, vibrancy, true, pf);
		}
	}
#if defined(_WIN32) || defined(__APPLE__)
	, tbb::static_partitioner()
#endif
	);

	//Insert the palette into the image for debugging purposes. Not implemented on the GPU.
	if (m_InsertPalette)
	{
		size_t i, j, ph = 100;//Height of the palette strip, limited to the image height below.

		if (ph >= FinalRasH())
			ph = FinalRasH();

		for (j = 0; j < ph; j++)
		{
			for (i = 0; i < FinalRasW(); i++)
			{
				const auto pp = p + (i + j * FinalRasW());
				//Stretch palette indices 0-255 across the width of the image.
				pp->r = m_TempEmber.m_Palette[i * 256 / FinalRasW()][0];
				pp->g = m_TempEmber.m_Palette[i * 256 / FinalRasW()][1];
				pp->b = m_TempEmber.m_Palette[i * 256 / FinalRasW()][2];
				pp->a = 1;
			}
		}
	}

	//t.Toc(__FUNCTION__);
	LeaveFinalAccum();
	return m_Abort ? eRenderStatus::RENDER_ABORT : eRenderStatus::RENDER_OK;
}

//#define TG 1
//#define NEWSUBBATCH 1

/// <summary>
/// Run the iteration algorithm for the specified number of iterations.
/// This is only called after all other setup has been done.
/// This function will be called multiple times for an interactive rendering, and
/// once for a straight through render.
/// The iteration is reset and fused in each thread after each sub batch is done
/// which by default is 10,240 iterations.
/// </summary>
/// <param name="iterCount">The number of iterations to run, divided evenly (rounded up) among the threads</param>
/// <param name="temporalSample">The temporal sample this is running for</param>
/// <returns>Rendering statistics: summed iteration count, summed bad value count and elapsed time</returns>
template EmberStats Renderer::Iterate(size_t iterCount, size_t temporalSample)
{
	//Timing t2(4);
	m_IterTimer.Tic();
	const size_t totalItersPerThread = static_cast(std::ceil(static_cast(iterCount) / static_cast(m_ThreadsToUse)));
	EmberStats stats;
	//vector accumTimes(4);

	//Do this every iteration for an animation, or else do it once for a single image. CPU only.
	//Each thread gets its own copy of the ember.
	if (!m_LastIter)
	{
		m_ThreadEmbers.clear();
		m_ThreadEmbers.insert(m_ThreadEmbers.begin(), m_ThreadsToUse, m_Ember);
	}

	parallel_for(static_cast(0), m_ThreadsToUse, static_cast(1), [&] (size_t threadIndex)
	{
		//Apply the requested priority to the worker thread using the platform specific call.
#if defined(_WIN32)
		SetThreadPriority(GetCurrentThread(), static_cast(m_Priority));
#elif defined(__APPLE__)
		sched_param sp = {0};
		sp.sched_priority = static_cast(m_Priority);
		pthread_setschedparam(pthread_self(), SCHED_RR, &sp);
#else
		pthread_setschedprio(pthread_self(), static_cast(m_Priority));
#endif
		//Timing t;
		IterParams params;
		m_BadVals[threadIndex] = 0;
		params.m_Count = std::min(totalItersPerThread, SubBatchSize());
		params.m_Skip = FuseCount();
		//params.m_OneColDiv2 = m_CarToRas.OneCol() / 2;
		//params.m_OneRowDiv2 = m_CarToRas.OneRow() / 2;

		//Sub batch iterations, loop 2.
		//m_SubBatch[threadIndex] counts the iterations this thread has completed so far.
		for (m_SubBatch[threadIndex] = 0; (m_SubBatch[threadIndex] < totalItersPerThread) && !m_Abort; m_SubBatch[threadIndex] += params.m_Count)
		{
			//Must recalculate the number of iters to run on each sub batch because the last batch will most likely have less than SubBatchSize iters.
			//For example, if 51,000 are requested, and the sbs is 10,000, it should run 5 sub batches of 10,000 iters, and one final sub batch of 1,000 iters.
			params.m_Count = std::min(params.m_Count, totalItersPerThread - m_SubBatch[threadIndex]);
			//Use first as random point, the rest are iterated points.
			//Note that this gets reset with a new random point for each SubBatchSize iterations.
			//This helps correct if iteration happens to be on a bad trajectory.
			m_Samples[threadIndex][0].m_X = m_Rand[threadIndex].template Frand(-m_ThreadEmbers[threadIndex].m_RandPointRange, m_ThreadEmbers[threadIndex].m_RandPointRange);
			m_Samples[threadIndex][0].m_Y = m_Rand[threadIndex].template Frand(-m_ThreadEmbers[threadIndex].m_RandPointRange, m_ThreadEmbers[threadIndex].m_RandPointRange);
			m_Samples[threadIndex][0].m_Z = 0;//m_Ember.m_CamZPos;//Apo set this to 0, then made the user use special variations to kick it. It seems easier to just set it to zpos.
			m_Samples[threadIndex][0].m_ColorX = m_Rand[threadIndex].template Frand01();

			//Check if the user wanted to suspend the process.
			//NOTE(review): this wait does not look at m_Abort, so an abort requested while paused is only
			//seen after the pause ends - confirm that is intended.
			while (Paused())
				std::this_thread::sleep_for(500ms);

			//Finally, iterate.
			//t.Tic();
			//Iterating, loop 3.
			m_BadVals[threadIndex] += m_Iterator->Iterate(m_ThreadEmbers[threadIndex], params, m_CarToRas, m_Samples[threadIndex].data(), m_Rand[threadIndex]);
			//iterationTime += t.Toc();

			//Optionally serialize the histogram update among the threads.
			if (m_LockAccum)
				m_AccumCs.lock();

			//t.Tic();
			//Map temp buffer samples into the histogram using the palette for color.
			Accumulate(m_Rand[threadIndex], m_Samples[threadIndex].data(), params.m_Count, &m_Dmap);

			//accumTimes[threadIndex] += t.Toc();
			if (m_LockAccum)
				m_AccumCs.unlock();

			//Only the first thread reports progress.
			if (m_Callback && threadIndex == 0)
			{
				auto percent = 100.0 *
							   static_cast
							   (
								   static_cast
								   (
									   static_cast
									   (
										   //Takes progress of current thread and multiplies by thread count.
										   //This assumes the threads progress at roughly the same speed.
										   //Adding m_LastIter is done so that an incremental render still gives an accurate percentage.
										   static_cast(m_LastIter + (m_SubBatch[threadIndex] * m_ThreadsToUse)) / static_cast(ItersPerTemporalSample())
									   ) + temporalSample
								   ) / static_cast(TemporalSamples())
							   );
				const auto percentDiff = percent - m_LastIterPercent;
				const auto toc = m_ProgressTimer.Toc();

				if (percentDiff >= 10 || (toc > 1000 && percentDiff >= 1))//Call callback function if either 10% has passed, or one second (and 1%).
				{
					const auto startingpercent = 100.0 * (m_LastIter / static_cast(ItersPerTemporalSample()));//This is done to support incremental renders, starting from the percentage it left off on.
					const auto currentpercent = percent - startingpercent;//Current percent in terms of starting percentage. So starting at 50% and progressing 5% will give a value of 5%, not 55%.
					const auto etaMs = currentpercent == 0 ? 0 : (((100.0 - startingpercent) - currentpercent) / currentpercent) * m_RenderTimer.Toc();//Subtract startingpercent from 100% so that it's properly scaled, meaning rendering from 50% - 100% will be treated as 0% - 100%.

					if (!m_Callback->ProgressFunc(m_Ember, m_ProgressParameter, percent, 0, etaMs))
						Abort();

					m_LastIterPercent = percent;
					m_ProgressTimer.Tic();
				}
			}
		}
	}
#if defined(_WIN32) || defined(__APPLE__)
	, tbb::static_partitioner()
#endif
	);
	stats.m_Iters = std::accumulate(m_SubBatch.begin(), m_SubBatch.end(), 0ULL);//Sum of iter count of all threads.
	stats.m_Badvals = std::accumulate(m_BadVals.begin(), m_BadVals.end(), 0ULL);
	stats.m_IterMs = m_IterTimer.Toc();
	//cout << "Accum time: " << std::accumulate(accumTimes.begin(), accumTimes.end(), 0.0) << endl;
	//t2.Toc(__FUNCTION__);
	return stats;
}

/// Non-virtual render properties, getters and setters. ///

///

/// Get the pixel aspect ratio of the output image. /// Default: 1. ///

/// The pixel aspect ratio. template T Renderer::PixelAspectRatio() const { return m_PixelAspectRatio; } ///

/// Set the pixel aspect ratio of the output image. /// Reset the rendering process. ///

/// The pixel aspect ratio. template void Renderer::PixelAspectRatio(T pixelAspectRatio) { ChangeVal([&] { m_PixelAspectRatio = pixelAspectRatio; }, eProcessAction::FULL_RENDER); } ///

/// <summary>
/// Non-virtual renderer properties, getters only.
/// Each one returns the corresponding member directly.
/// </summary>

//Scalar values held by the renderer.
template T Renderer::Scale() const { return m_Scale; }
template T Renderer::PixelsPerUnitX() const { return m_PixelsPerUnitX; }
template T Renderer::PixelsPerUnitY() const { return m_PixelsPerUnitY; }
//The K1 and K2 constants used by the log scale calculation in the density filters.
template bucketT Renderer::K1() const { return m_K1; }
template bucketT Renderer::K2() const { return m_K2; }
//The cartesian to raster coordinate mapping object.
template const CarToRas& Renderer::CoordMap() const { return m_CarToRas; }
//Raw pointers to the start of the histogram and density filtering buffers.
template tvec4* Renderer::HistBuckets() { return m_HistBuckets.data(); }
template tvec4* Renderer::AccumulatorBuckets() { return m_AccumulatorBuckets.data(); }
//Raw, non-owning pointers to the filter objects held by the renderer's smart pointers.
template SpatialFilter* Renderer::GetSpatialFilter() { return m_SpatialFilter.get(); }
template TemporalFilter* Renderer::GetTemporalFilter() { return m_TemporalFilter.get(); }

/// <summary>
/// Virtual renderer properties overridden from RendererBase, getters only.
/// </summary>

template double Renderer::ScaledQuality() const { return static_cast(m_ScaledQuality); }
//Bounds of the render area. When gutter is true the value comes from the coordinate mapping object,
//otherwise from the renderer's own bounds members.
template double Renderer::LowerLeftX(bool gutter) const { return static_cast(gutter ? m_CarToRas.CarLlX() : m_LowerLeftX); }
template double Renderer::LowerLeftY(bool gutter) const { return static_cast(gutter ? m_CarToRas.CarLlY() : m_LowerLeftY); }
template double Renderer::UpperRightX(bool gutter) const { return static_cast(gutter ? m_CarToRas.CarUrX() : m_UpperRightX); }
template double Renderer::UpperRightY(bool gutter) const { return static_cast(gutter ? m_CarToRas.CarUrY() : m_UpperRightY); }
//Raw, non-owning pointer to the density filter held by the renderer's smart pointer.
template DensityFilterBase* Renderer::GetDensityFilter() { return m_DensityFilter.get(); }

/// <summary>
/// Non-virtual ember wrappers, getters only.
/// Each one forwards to the member or member function of the same name on m_Ember.
/// </summary>

template bool Renderer::XaosPresent() const { return m_Ember.XaosPresent(); }
template size_t Renderer::Supersample() const { return m_Ember.m_Supersample; }
template size_t Renderer::PaletteIndex() const { return m_Ember.PaletteIndex(); }
template T Renderer::Time() const { return m_Ember.m_Time; }
template T Renderer::Quality() const { return m_Ember.m_Quality; }
template T Renderer::SpatialFilterRadius() const { return m_Ember.m_SpatialFilterRadius; }
template T Renderer::PixelsPerUnit() const { return m_Ember.m_PixelsPerUnit; }
template T Renderer::Zoom() const { return m_Ember.m_Zoom; }
template T Renderer::CenterX() const { return m_Ember.m_CenterX; }
template T Renderer::CenterY() const { return m_Ember.m_CenterY; }
template T Renderer::Rotate() const { return m_Ember.m_Rotate; }
//The color related values are cast on the way out.
template bucketT Renderer::Brightness() const { return static_cast(m_Ember.m_Brightness); }
template bucketT Renderer::Gamma() const { return static_cast(m_Ember.m_Gamma); }
template bucketT Renderer::Vibrancy() const { return static_cast(m_Ember.m_Vibrancy); }
template bucketT Renderer::GammaThresh() const { return static_cast(m_Ember.m_GammaThresh); }
template bucketT Renderer::HighlightPower() const { return static_cast(m_Ember.m_HighlightPower); }
template Color Renderer::Background() const { return m_Ember.m_Background; }
//Xform access, const and non-const.
template const Xform* Renderer::Xforms() const { return m_Ember.Xforms(); }
template Xform* Renderer::NonConstXforms() { return m_Ember.NonConstXforms(); }
template size_t Renderer::XformCount() const { return m_Ember.XformCount(); }
template const Xform* Renderer::FinalXform() const { return m_Ember.FinalXform(); }
template Xform* Renderer::NonConstFinalXform() { return m_Ember.NonConstFinalXform(); }
template bool Renderer::UseFinalXform() const { return m_Ember.UseFinalXform(); }
//Palette access. The returned pointer refers to the palette stored inside m_Ember.
template const Palette* Renderer::GetPalette() const { return &m_Ember.m_Palette; }
template ePaletteMode Renderer::PaletteMode() const { return m_Ember.m_PaletteMode; }

/// <summary>
/// Virtual ember wrappers overridden from RendererBase, getters only.
/// Each one returns the member of the same name on m_Ember.
/// </summary>

template size_t Renderer::TemporalSamples() const { return m_Ember.m_TemporalSamples; }
template size_t Renderer::FinalRasW() const { return m_Ember.m_FinalRasW; }
template size_t Renderer::FinalRasH() const { return m_Ember.m_FinalRasH; }
template size_t Renderer::SubBatchSize() const { return m_Ember.m_SubBatchSize; }
template size_t Renderer::FuseCount() const { return m_Ember.m_FuseCount; }

/// <summary>
/// Non-virtual iterator wrappers.
/// </summary>

//Forward to the current iterator, or return nullptr/0 when no iterator has been assigned yet.
template const byte* Renderer::XformDistributions() const { return m_Iterator ? m_Iterator->XformDistributions() : nullptr; }
template size_t Renderer::XformDistributionsSize() const { return m_Iterator ? m_Iterator->XformDistributionsSize() : 0; }
//Pointer to the sample buffer of the given thread, or nullptr when the index is out of range.
//The function is const but hands back a non-const pointer, hence the const_cast.
template Point* Renderer::Samples(size_t threadIndex) const { return threadIndex < m_Samples.size() ? const_cast*>(m_Samples[threadIndex].data()) : nullptr; }

/// Non-virtual functions that might be needed by a derived class. ///

///

/// Prepare various values needed for producing a final output image. ///

/// The computed background value, which may differ from the background member /// The computed gamma /// The computed linear range /// The computed vibrancy template void Renderer::PrepFinalAccumVals(Color& background, bucketT& g, bucketT& linRange, bucketT& vibrancy) { //If they are doing incremental rendering, they can get here without doing a full temporal //sample, which means the values will be zero. vibrancy = m_Vibrancy == 0 ? Vibrancy() : m_Vibrancy; size_t vibGamCount = m_VibGamCount == 0 ? 1 : m_VibGamCount; const bucketT gamma = m_Gamma == 0 ? Gamma() : m_Gamma; g = 1 / ClampGte(gamma / vibGamCount, static_cast(0.01));//Ensure a divide by zero doesn't occur. linRange = GammaThresh(); vibrancy /= vibGamCount; background.x = (IsNearZero(m_Background.r) ? static_cast(m_Ember.m_Background.r) : m_Background.r) / vibGamCount; background.y = (IsNearZero(m_Background.g) ? static_cast(m_Ember.m_Background.g) : m_Background.g) / vibGamCount; background.z = (IsNearZero(m_Background.b) ? static_cast(m_Ember.m_Background.b) : m_Background.b) / vibGamCount; } ///

/// Miscellaneous non-virtual functions used only in this class. ///

///

/// Accumulate the samples to the histogram. /// To be called after a sub batch is finished iterating. ///

/// The samples to accumulate /// The number of samples /// The palette to use template void Renderer::Accumulate(QTIsaac& rand, Point* samples, size_t sampleCount, const Palette* palette) { size_t histIndex, intColorIndex, histSize = m_HistBuckets.size(); bucketT colorIndex, colorIndexFrac; const auto psm1 = m_Ember.m_Palette.Size() - 1; //Linear is a linear scale for when the color index is not a whole number, which is most of the time. //It uses a portion of the value of the index, and the remainder of the next index. //Example: index = 25.7 //Fraction = 0.7 //Color = (dmap[25] * 0.3) + (dmap[26] * 0.7) //Use overloaded addition and multiplication operators in vec4 to perform the accumulation. if (PaletteMode() == ePaletteMode::PALETTE_LINEAR) { const auto psm2 = psm1 - 1; //It's critical to understand what's going on here as it's one of the most important parts of the algorithm. //A color value gets retrieved from the palette and //its RGB values are added to the existing RGB values in the histogram bucket. //Alpha is always 1 in the palettes, so that serves as the hit count. //This differs from the original since redundantly adding both an alpha component and a hit count is omitted. //This will eventually leave us with large values for pixels with many hits, which will be log scaled down later. //Original used a function called bump_no_overflow(). Just do a straight add because the type will always be float or double. //Doing so gives a 25% speed increase. //Splitting these conditionals into separate loops makes no speed difference. for (size_t i = 0; i < sampleCount && !m_Abort; i++) { Point p(samples[i]);//Slightly faster to cache this. 
if (p.m_Opacity != 0) { if (Rotate() != 0) { T p00 = p.m_X - m_Ember.m_CenterX; T p11 = p.m_Y - m_Ember.m_RotCenterY; p.m_X = (p00 * m_RotMat.A()) + (p11 * m_RotMat.B()) + m_Ember.m_CenterX; p.m_Y = (p00 * m_RotMat.D()) + (p11 * m_RotMat.E()) + m_Ember.m_RotCenterY; } //Checking this first before converting gives better performance than converting and checking a single value, which the original did. //Second, an interesting optimization observation is that when keeping the bounds vars within m_CarToRas and calling its InBounds() member function, //rather than here as members, about a 7% speedup is achieved. This is possibly due to the fact that data from m_CarToRas is accessed //right after the call to Convert(), so some caching efficiencies get realized. if (m_CarToRas.InBounds(p)) { m_CarToRas.Convert(p, histIndex); //There is a very slim chance that a point will be right on the border and will technically be in bounds, passing the InBounds() test, //but ends up being mapped to a histogram bucket that is out of bounds due to roundoff error. Perform one final check before proceeding. //This will result in a few points at the very edges getting discarded, but prevents a crash and doesn't seem to make a speed difference. if (histIndex < histSize) { colorIndex = static_cast(p.m_ColorX) * psm1; intColorIndex = static_cast(colorIndex); if (intColorIndex < 0) { intColorIndex = 0; colorIndexFrac = 0; } else if (intColorIndex >= psm1) { intColorIndex = psm2; colorIndexFrac = 1; } else { colorIndexFrac = colorIndex - static_cast(intColorIndex);//Interpolate between intColorIndex and intColorIndex + 1. } bucketT* __restrict hist = glm::value_ptr(m_HistBuckets[histIndex]);//Vectorizer can't tell these point to different locations. 
const bucketT* __restrict pal = glm::value_ptr(palette->m_Entries[intColorIndex]); const bucketT* __restrict pal2 = glm::value_ptr(palette->m_Entries[intColorIndex + 1]); const auto cifm1 = static_cast(1) - colorIndexFrac; //Loops are unrolled to allow auto vectorization. if (p.m_Opacity == 1) { hist[0] += (pal[0] * cifm1) + (pal2[0] * colorIndexFrac); hist[1] += (pal[1] * cifm1) + (pal2[1] * colorIndexFrac); hist[2] += (pal[2] * cifm1) + (pal2[2] * colorIndexFrac); hist[3] += (pal[3] * cifm1) + (pal2[3] * colorIndexFrac); } else { const auto va = static_cast(p.m_Opacity); hist[0] += ((pal[0] * cifm1) + (pal2[0] * colorIndexFrac)) * va; hist[1] += ((pal[1] * cifm1) + (pal2[1] * colorIndexFrac)) * va; hist[2] += ((pal[2] * cifm1) + (pal2[2] * colorIndexFrac)) * va; hist[3] += ((pal[3] * cifm1) + (pal2[3] * colorIndexFrac)) * va; } } } } } } else if (PaletteMode() == ePaletteMode::PALETTE_STEP)//Duplicate of above, but for step mode. { for (size_t i = 0; i < sampleCount && !m_Abort; i++) { Point p(samples[i]);//Slightly faster to cache this. if (p.m_Opacity != 0) { if (Rotate() != 0) { const T p00 = p.m_X - m_Ember.m_CenterX; const T p11 = p.m_Y - m_Ember.m_RotCenterY; p.m_X = (p00 * m_RotMat.A()) + (p11 * m_RotMat.B()) + m_Ember.m_CenterX; p.m_Y = (p00 * m_RotMat.D()) + (p11 * m_RotMat.E()) + m_Ember.m_RotCenterY; } if (m_CarToRas.InBounds(p)) { m_CarToRas.Convert(p, histIndex); if (histIndex < histSize) { intColorIndex = Clamp(static_cast(p.m_ColorX * psm1), 0, psm1); bucketT* __restrict hist = glm::value_ptr(m_HistBuckets[histIndex]);//Vectorizer can't tell these point to different locations. const bucketT* __restrict pal = glm::value_ptr(palette->m_Entries[intColorIndex]); if (p.m_Opacity == 1) { hist[0] += pal[0]; hist[1] += pal[1]; hist[2] += pal[2]; hist[3] += pal[3]; } else { auto va = static_cast(p.m_Opacity); hist[0] += pal[0] * va; hist[1] += pal[1] * va; hist[2] += pal[2] * va; hist[3] += pal[3] * va; } } } } } } } ///

/// Add a value to the density filtering buffer with a bounds check. ///

/// The bucket being filtered /// The column of the bucket /// The offset to add to the column /// The row of the bucket /// The offset to add to the row template void Renderer::AddToAccum(const tvec4& bucket, intmax_t i, intmax_t ii, intmax_t j, intmax_t jj) { if (j + jj >= 0 && j + jj < static_cast(m_SuperRasH) && i + ii >= 0 && i + ii < static_cast(m_SuperRasW)) { auto* __restrict accum = m_AccumulatorBuckets.data() + ((i + ii) + ((j + jj) * m_SuperRasW));//For vectorizer, results in a 33% speedup. accum->r += bucket.r; accum->g += bucket.g; accum->b += bucket.b; accum->a += bucket.a; } } ///

/// Clip and gamma correct a pixel. /// Because this code is used in both early and late clipping, a few extra arguments are passed /// to specify what actions to take. Coupled with an additional template argument, this allows /// using one function to perform all color clipping, gamma correction and final accumulation. /// Template argument accumT is expected to always be float4. ///

/// The pixel to correct /// The background color /// The gamma to use /// The linear range to use /// The vibrancy to use /// True if late clip, else false. /// The storage space for the corrected values to be written to template template void Renderer::GammaCorrection(tvec4& bucket, Color& background, bucketT g, bucketT linRange, bucketT vibrancy, bool scale, accumT* correctedChannels) { auto bt1 = static_cast(1); if (scale && EarlyClip()) { if (m_CurvesSet) { CurveAdjust(bucket.r, 1); CurveAdjust(bucket.g, 2); CurveAdjust(bucket.b, 3); } correctedChannels[0] = static_cast(Clamp(bucket.r, 0, bt1)); correctedChannels[1] = static_cast(Clamp(bucket.g, 0, bt1)); correctedChannels[2] = static_cast(Clamp(bucket.b, 0, bt1)); correctedChannels[3] = static_cast(Clamp(bucket.a, 0, bt1)); } else { bucketT alpha, ls, a, newRgb[3];//Would normally use a Color, but don't want to call a needless constructor every time this function is called, which is once per pixel. if (bucket.a <= 0) { alpha = 0; ls = 0; } else { alpha = Palette::CalcAlpha(bucket.a, g, linRange); ls = vibrancy * alpha / bucket.a; ClampRef(alpha, 0, 1); } Palette::template CalcNewRgb(glm::value_ptr(bucket), ls, HighlightPower(), newRgb); for (glm::length_t rgbi = 0; rgbi < 3; rgbi++) { a = newRgb[rgbi] + ((1 - vibrancy) * std::pow(std::abs(bucket[rgbi]), g));//Must use abs(), else it it could be a negative value and return NAN. a += (1 - alpha) * background[rgbi]; if (scale && m_CurvesSet) CurveAdjust(a, rgbi + 1); correctedChannels[rgbi] = static_cast(Clamp(a, 0, bt1));//Early clip, just assign directly. } correctedChannels[3] = static_cast(alpha); } } ///

/// <summary>
/// Setup the curve values when they are being used.
/// Does nothing unless m_CurvesSet is true. For every curve of the ember that has points, a spline is built
/// and sampled once per entry of m_Csa, with the result stored in the component matching the curve's index.
/// </summary>
template void Renderer::ComputeCurves()
{
	if (m_CurvesSet)
	{
		auto st = m_Csa.size();

		for (glm::length_t i = 0; i < m_Ember.m_Curves.m_Points.size(); i++)//Overall, r, g, b.
		{
			if (!m_Ember.m_Curves.m_Points[i].empty())//Curves without points leave their component of m_Csa untouched.
			{
				Spline spline(m_Ember.m_Curves.m_Points[i]);//Will internally sort.

				for (glm::length_t j = 0; j < st; j++)
					m_Csa[j][i] = spline.Interpolate(j * ONE_OVER_CURVES_LENGTH_M1);
			}
		}
	}
}

/// Apply the curve adjustment to a single channel. ///

/// The value of the channel to apply curve adjustment to. /// The index of the channel to apply curve adjustment to template void Renderer::CurveAdjust(bucketT& a, const glm::length_t& index) { size_t tempIndex = static_cast(Clamp(a * CURVES_LENGTH_M1, 0, CURVES_LENGTH_M1)); size_t tempIndex2 = static_cast(Clamp(m_Csa[tempIndex].x * CURVES_LENGTH_M1, 0, CURVES_LENGTH_M1)); a = m_Csa[tempIndex2][index]; } //This class had to be implemented in a cpp file because the compiler was breaking. //So the explicit instantiation must be declared here rather than in Ember.cpp where //all of the other classes are done. template EMBER_API class Renderer; template EMBER_API void Renderer::SetEmber(const vector>& embers); template EMBER_API void Renderer::SetEmber(const list>& embers); #ifdef DO_DOUBLE template EMBER_API class Renderer; template EMBER_API void Renderer::SetEmber(const vector>& embers); template EMBER_API void Renderer::SetEmber(const list>& embers); #endif }