diff --git a/Builds/MSVC/Installer/FractoriumInstaller.wixproj b/Builds/MSVC/Installer/FractoriumInstaller.wixproj index 4951f58..5704658 100644 --- a/Builds/MSVC/Installer/FractoriumInstaller.wixproj +++ b/Builds/MSVC/Installer/FractoriumInstaller.wixproj @@ -6,7 +6,7 @@ 3.7 {c8096c47-e358-438c-a520-146d46b0637d} 2.0 - Fractorium_Beta_0.4.1.4 + Fractorium_Beta_0.4.1.5 Package $(MSBuildExtensionsPath32)\Microsoft\WiX\v3.x\Wix.targets $(MSBuildExtensionsPath)\Microsoft\WiX\v3.x\Wix.targets diff --git a/Builds/MSVC/Installer/Product.wxs b/Builds/MSVC/Installer/Product.wxs index 0eb3367..9f1b883 100644 --- a/Builds/MSVC/Installer/Product.wxs +++ b/Builds/MSVC/Installer/Product.wxs @@ -1,6 +1,6 @@ - + @@ -13,7 +13,7 @@ - + ::m_Brightness>(embers, coefs, size); - InterpT<&Ember::m_HighlightPower>(embers, coefs, size); - InterpT<&Ember::m_Gamma>(embers, coefs, size); - InterpT<&Ember::m_Vibrancy>(embers, coefs, size); - InterpT<&Ember::m_Hue>(embers, coefs, size); + //Interpolate ember parameters, these should be in the same order the members are declared. InterpI<&Ember::m_FinalRasW>(embers, coefs, size); InterpI<&Ember::m_FinalRasH>(embers, coefs, size); + InterpI<&Ember::m_SubBatchSize>(embers, coefs, size); + InterpI<&Ember::m_FuseCount>(embers, coefs, size); InterpI<&Ember::m_Supersample>(embers, coefs, size); - InterpT<&Ember::m_CenterX>(embers, coefs, size); - InterpT<&Ember::m_CenterY>(embers, coefs, size); - InterpT<&Ember::m_RotCenterY>(embers, coefs, size); - InterpX, &Ember::m_Background>(embers, coefs, size); m_Background.a = bgAlphaSave;//Don't interp alpha. - InterpT<&Ember::m_PixelsPerUnit>(embers, coefs, size); - InterpT<&Ember::m_SpatialFilterRadius>(embers, coefs, size); - InterpT<&Ember::m_TemporalFilterExp>(embers, coefs, size); - InterpT<&Ember::m_TemporalFilterWidth>(embers, coefs, size); + InterpI<&Ember::m_TemporalSamples>(embers, coefs, size); InterpT<&Ember::m_Quality>(embers, coefs, size); + InterpT<&Ember::m_PixelsPerUnit>(embers, coefs, size); InterpT<&Ember::m_Zoom>(embers, coefs, size); InterpT<&Ember::m_CamZPos>(embers, coefs, size); InterpT<&Ember::m_CamPerspective>(embers, coefs, size); @@ -775,12 +770,23 @@ public: InterpT<&Ember::m_CamPitch>(embers, coefs, size); InterpT<&Ember::m_CamDepthBlur>(embers, coefs, size); InterpX::m_CamMat>(embers, coefs, size); + InterpT<&Ember::m_CenterX>(embers, coefs, size); + InterpT<&Ember::m_CenterY>(embers, coefs, size); + InterpT<&Ember::m_RotCenterY>(embers, coefs, size); InterpT<&Ember::m_Rotate>(embers, coefs, size); - InterpI<&Ember::m_TemporalSamples>(embers, coefs, size); + InterpT<&Ember::m_Hue>(embers, coefs, size); + InterpT<&Ember::m_Brightness>(embers, coefs, size); + InterpT<&Ember::m_Gamma>(embers, coefs, size); + InterpT<&Ember::m_Vibrancy>(embers, coefs, size); + InterpT<&Ember::m_GammaThresh>(embers, coefs, size); + InterpT<&Ember::m_HighlightPower>(embers, coefs, size); + InterpX, &Ember::m_Background>(embers, coefs, size); m_Background.a = bgAlphaSave;//Don't interp alpha. + InterpT<&Ember::m_TemporalFilterExp>(embers, coefs, size); + InterpT<&Ember::m_TemporalFilterWidth>(embers, coefs, size); InterpT<&Ember::m_MaxRadDE>(embers, coefs, size); InterpT<&Ember::m_MinRadDE>(embers, coefs, size); InterpT<&Ember::m_CurveDE>(embers, coefs, size); - InterpT<&Ember::m_GammaThresh>(embers, coefs, size); + InterpT<&Ember::m_SpatialFilterRadius>(embers, coefs, size); //An extra step needed here due to the OOD that was not needed in the original. //A small price to pay for the conveniences it affords us elsewhere. @@ -1382,6 +1388,8 @@ public: << "Quality: " << m_Quality << endl << "Pixels Per Unit: " << m_PixelsPerUnit << endl << "Original Pixels Per Unit: " << m_OrigPixPerUnit << endl + << "Sub Batch Size: " << m_SubBatchSize << endl + << "Fuse Count: " << m_FuseCount << endl << "Zoom: " << m_Zoom << endl << "ZPos: " << m_CamZPos << endl << "Perspective: " << m_CamPerspective << endl @@ -1459,6 +1467,14 @@ public: size_t m_OrigFinalRasH;//the dimension may change in an editor and the originals are needed for the aspect ratio. T m_OrigPixPerUnit; + //The iteration depth. This was a rendering parameter in flam3 but has been made a member here + //so that it can be adjusted more easily. + size_t m_SubBatchSize; + + //The number of iterations to disregard for each sub batch. This was a rendering parameter in flam3 but has been made a member here + //so that it can be adjusted more easily. + size_t m_FuseCount; + //The multiplier in size of the histogram and DE filtering buffers. Must be at least one, preferrably never larger than 4, only useful at 2. //Xml field: "supersample" or "overample (deprecated)". size_t m_Supersample; diff --git a/Source/Ember/EmberDefines.h b/Source/Ember/EmberDefines.h index 6adb8eb..85a831a 100644 --- a/Source/Ember/EmberDefines.h +++ b/Source/Ember/EmberDefines.h @@ -36,7 +36,7 @@ namespace EmberNs extern void sincos(float x, float *s, float *c); #endif -#define EMBER_VERSION "0.4.1.4" +#define EMBER_VERSION "0.4.1.5" #define EPS6 T(1e-6) #define EPS std::numeric_limits::epsilon()//Apoplugin.h uses -20, but it's more mathematically correct to do it this way. #define ISAAC_SIZE 4 @@ -54,6 +54,7 @@ namespace EmberNs #define COLORMAP_LENGTH 256//These will need to change if 2D palette support is ever added, or variable sized palettes. #define COLORMAP_LENGTH_MINUS_1 255 #define WHITE 255 +#define DEFAULT_SBS (1024 * 10) #define XC (const xmlChar*) #define BadVal(x) (((x) != (x)) || ((x) > 1e10) || ((x) < -1e10)) #define Rint(A) floor((A) + (((A) < 0) ? T(-0.5) : T(0.5))) diff --git a/Source/Ember/EmberToXml.h b/Source/Ember/EmberToXml.h index f345279..79f5eba 100644 --- a/Source/Ember/EmberToXml.h +++ b/Source/Ember/EmberToXml.h @@ -156,6 +156,8 @@ public: os << " temporal_filter_width=\"" << ember.m_TemporalFilterWidth << "\""; os << " quality=\"" << ember.m_Quality << "\""; os << " temporal_samples=\"" << ember.m_TemporalSamples << "\""; + os << " sub_batch_size=\"" << ember.m_SubBatchSize << "\""; + os << " fuse=\"" << ember.m_FuseCount << "\""; os << " background=\"" << ember.m_Background.r << " " << ember.m_Background.g << " " << ember.m_Background.b << "\""; os << " brightness=\"" << ember.m_Brightness << "\""; os << " gamma=\"" << ember.m_Gamma << "\""; diff --git a/Source/Ember/Iterator.h b/Source/Ember/Iterator.h index c1f4db5..29bd4e7 100644 --- a/Source/Ember/Iterator.h +++ b/Source/Ember/Iterator.h @@ -16,6 +16,17 @@ namespace EmberNs using Iterator::DoFinalXform; \ using Iterator::DoBadVals; +template class Renderer; + +template +struct IterParams +{ + size_t m_Count; + size_t m_Skip; + //T m_OneColDiv2; + //T m_OneRowDiv2; +}; + /// /// Iterator base class. /// Iterating is one loop level outside of the inner xform application loop so it's still very important @@ -69,7 +80,7 @@ public: /// The buffer to store the output points /// The random context to use /// The number of bad values - virtual size_t Iterate(Ember& ember, size_t count, size_t skip, Point* samples, QTIsaac& rand) { return 0; } + virtual size_t Iterate(Ember& ember, IterParams& params, Point* samples, QTIsaac& rand) { return 0; } /// /// Initialize the xform selection vector by normalizing the weights of all xforms and @@ -278,7 +289,7 @@ public: /// The buffer to store the output points /// The random context to use /// The number of bad values - virtual size_t Iterate(Ember& ember, size_t count, size_t skip, Point* samples, QTIsaac& rand) + virtual size_t Iterate(Ember& ember, IterParams& params, Point* samples, QTIsaac& rand) override { size_t i, badVals = 0; Point tempPoint, p1; @@ -290,7 +301,7 @@ public: { p1 = samples[0]; - for (i = 0; i < skip; i++)//Fuse. + for (i = 0; i < params.m_Skip; i++)//Fuse. { if (xforms[NextXformFromIndex(rand.Rand())].Apply(&p1, &p1, rand)) DoBadVals(xforms, badVals, &p1, rand); @@ -299,7 +310,7 @@ public: DoFinalXform(ember, p1, samples, rand);//Apply to last fuse point and store as the first element in samples. ember.Proj(samples[0], rand); - for (i = 1; i < count; i++)//Real loop. + for (i = 1; i < params.m_Count; i++)//Real loop. { if (xforms[NextXformFromIndex(rand.Rand())].Apply(&p1, &p1, rand)) DoBadVals(xforms, badVals, &p1, rand); @@ -312,7 +323,7 @@ public: { p1 = samples[0]; - for (i = 0; i < skip; i++)//Fuse. + for (i = 0; i < params.m_Skip; i++)//Fuse. { if (xforms[NextXformFromIndex(rand.Rand())].Apply(&p1, &p1, rand)) DoBadVals(xforms, badVals, &p1, rand); @@ -321,7 +332,7 @@ public: samples[0] = p1; ember.Proj(samples[0], rand); - for (i = 1; i < count; i++)//Real loop. + for (i = 1; i < params.m_Count; i++)//Real loop. { if (xforms[NextXformFromIndex(rand.Rand())].Apply(&p1, &samples[i], rand)) DoBadVals(xforms, badVals, samples + i, rand); @@ -337,7 +348,7 @@ public: { p1 = samples[0]; - for (i = 0; i < skip; i++)//Fuse. + for (i = 0; i < params.m_Skip; i++)//Fuse. { if (xforms[NextXformFromIndex(rand.Rand())].Apply(&p1, &p1, rand)) DoBadVals(xforms, badVals, &p1, rand); @@ -345,7 +356,7 @@ public: DoFinalXform(ember, p1, samples, rand);//Apply to last fuse point and store as the first element in samples. - for (i = 1; i < count; i++)//Real loop. + for (i = 1; i < params.m_Count; i++)//Real loop. { if (xforms[NextXformFromIndex(rand.Rand())].Apply(&p1, &p1, rand))//Feed the resulting value of applying the randomly selected xform back into the next iter, and not the result of applying the final xform. DoBadVals(xforms, badVals, &p1, rand); @@ -357,7 +368,7 @@ public: { p1 = samples[0]; - for (i = 0; i < skip; i++)//Fuse. + for (i = 0; i < params.m_Skip; i++)//Fuse. { if (xforms[NextXformFromIndex(rand.Rand())].Apply(&p1, &p1, rand)) DoBadVals(xforms, badVals, &p1, rand); @@ -365,9 +376,11 @@ public: samples[0] = p1; - for (i = 0; i < count - 1; i++)//Real loop. + for (i = 0; i < params.m_Count - 1; i++)//Real loop. + { if (xforms[NextXformFromIndex(rand.Rand())].Apply(samples + i, samples + i + 1, rand)) DoBadVals(xforms, badVals, samples + i + 1, rand); + } } } @@ -442,7 +455,7 @@ public: /// The buffer to store the output points /// The random context to use /// The number of bad values - virtual size_t Iterate(Ember& ember, size_t count, size_t skip, Point* samples, QTIsaac& rand) + virtual size_t Iterate(Ember& ember, IterParams& params, Point* samples, QTIsaac& rand) override { size_t i, xformIndex; size_t lastXformUsed = 0; @@ -456,7 +469,7 @@ public: { p1 = samples[0]; - for (i = 0; i < skip; i++)//Fuse. + for (i = 0; i < params.m_Skip; i++)//Fuse. { xformIndex = NextXformFromIndex(rand.Rand(), lastXformUsed); @@ -469,7 +482,7 @@ public: DoFinalXform(ember, p1, samples, rand);//Apply to last fuse point and store as the first element in samples. ember.Proj(samples[0], rand); - for (i = 1; i < count; i++)//Real loop. + for (i = 1; i < params.m_Count; i++)//Real loop. { xformIndex = NextXformFromIndex(rand.Rand(), lastXformUsed); @@ -485,7 +498,7 @@ public: { p1 = samples[0]; - for (i = 0; i < skip; i++)//Fuse. + for (i = 0; i < params.m_Skip; i++)//Fuse. { xformIndex = NextXformFromIndex(rand.Rand(), lastXformUsed); @@ -498,7 +511,7 @@ public: samples[0] = p1; ember.Proj(samples[0], rand); - for (i = 1; i < count; i++)//Real loop. + for (i = 1; i < params.m_Count; i++)//Real loop. { xformIndex = NextXformFromIndex(rand.Rand(), lastXformUsed); @@ -517,7 +530,7 @@ public: { p1 = samples[0]; - for (i = 0; i < skip; i++)//Fuse. + for (i = 0; i < params.m_Skip; i++)//Fuse. { xformIndex = NextXformFromIndex(rand.Rand(), lastXformUsed); @@ -529,7 +542,7 @@ public: DoFinalXform(ember, p1, samples, rand);//Apply to last fuse point and store as the first element in samples. - for (i = 1; i < count; i++)//Real loop. + for (i = 1; i < params.m_Count; i++)//Real loop. { xformIndex = NextXformFromIndex(rand.Rand(), lastXformUsed); @@ -544,7 +557,7 @@ public: { p1 = samples[0]; - for (i = 0; i < skip; i++)//Fuse. + for (i = 0; i < params.m_Skip; i++)//Fuse. { xformIndex = NextXformFromIndex(rand.Rand(), lastXformUsed); @@ -556,7 +569,7 @@ public: samples[0] = p1; - for (i = 0; i < count - 1; i++)//Real loop. + for (i = 0; i < params.m_Count - 1; i++)//Real loop. { xformIndex = NextXformFromIndex(rand.Rand(), lastXformUsed); diff --git a/Source/Ember/Renderer.cpp b/Source/Ember/Renderer.cpp index afe8619..9462e88 100644 --- a/Source/Ember/Renderer.cpp +++ b/Source/Ember/Renderer.cpp @@ -691,7 +691,7 @@ bool Renderer::Alloc() (m_SuperSize != m_HistBuckets.size()) || (m_SuperSize != m_AccumulatorBuckets.size()) || (m_ThreadsToUse != m_Samples.size()) || - (m_Samples[0].size() != m_SubBatchSize); + (m_Samples[0].size() != SubBatchSize()); if (lock) EnterResize(); @@ -728,14 +728,14 @@ bool Renderer::Alloc() for (size_t i = 0; i < m_Samples.size(); i++) { - if (m_Samples[i].size() != m_SubBatchSize) + if (m_Samples[i].size() != SubBatchSize()) { - m_Samples[i].resize(m_SubBatchSize); + m_Samples[i].resize(SubBatchSize()); if (m_ReclaimOnResize) m_Samples[i].shrink_to_fit(); - b &= (m_Samples[i].size() == m_SubBatchSize); + b &= (m_Samples[i].size() == SubBatchSize()); } } @@ -1154,7 +1154,7 @@ eRenderStatus Renderer::AccumulatorToFinalImage(unsigned char* pixel /// This function will be called multiple times for an interactive rendering, and /// once for a straight through render. /// The iteration is reset and fused in each thread after each sub batch is done -/// which by default is 10,000 iterations. +/// which by default is 10,240 iterations. /// /// The number of iterations to run /// The temporal sample this is running for @@ -1164,7 +1164,6 @@ EmberStats Renderer::Iterate(size_t iterCount, size_t temporalSample { //Timing t2(4); m_IterTimer.Tic(); - size_t fuse = EarlyClip() ? 100 : 15;//EarlyClip was one way of detecting a later version of flam3, so it used 100 which is a better value. size_t totalItersPerThread = (size_t)ceil((double)iterCount / (double)m_ThreadsToUse); double percent, etaMs; EmberStats stats; @@ -1180,17 +1179,21 @@ EmberStats Renderer::Iterate(size_t iterCount, size_t temporalSample parallel_for(size_t(0), m_ThreadsToUse, [&] (size_t threadIndex) { #endif - Timing t; - size_t subBatchSize = (size_t)min(totalItersPerThread, (size_t)m_SubBatchSize); + //Timing t; + IterParams params; m_BadVals[threadIndex] = 0; + params.m_Count = min(totalItersPerThread, SubBatchSize()); + params.m_Skip = FuseCount(); + //params.m_OneColDiv2 = m_CarToRas.OneCol() / 2; + //params.m_OneRowDiv2 = m_CarToRas.OneRow() / 2; //Sub batch iterations, loop 2. - for (m_SubBatch[threadIndex] = 0; (m_SubBatch[threadIndex] < totalItersPerThread) && !m_Abort; m_SubBatch[threadIndex] += subBatchSize) + for (m_SubBatch[threadIndex] = 0; (m_SubBatch[threadIndex] < totalItersPerThread) && !m_Abort; m_SubBatch[threadIndex] += params.m_Count) { - //Must recalculate the number of iters to run on each sub batch because the last batch will most likely have less than m_SubBatchSize iters. + //Must recalculate the number of iters to run on each sub batch because the last batch will most likely have less than SubBatchSize iters. //For example, if 51,000 are requested, and the sbs is 10,000, it should run 5 sub batches of 10,000 iters, and one final sub batch of 1,000 iters. - subBatchSize = min(subBatchSize, totalItersPerThread - m_SubBatch[threadIndex]); + params.m_Count = min(params.m_Count, totalItersPerThread - m_SubBatch[threadIndex]); //Use first as random point, the rest are iterated points. //Note that this gets reset with a new random point for each subBatchSize iterations. @@ -1203,14 +1206,14 @@ EmberStats Renderer::Iterate(size_t iterCount, size_t temporalSample //Finally, iterate. //t.Tic(); //Iterating, loop 3. - m_BadVals[threadIndex] += m_Iterator->Iterate(m_Ember, subBatchSize, fuse, m_Samples[threadIndex].data(), m_Rand[threadIndex]); + m_BadVals[threadIndex] += m_Iterator->Iterate(m_Ember, params, m_Samples[threadIndex].data(), m_Rand[threadIndex]); //iterationTime += t.Toc(); if (m_LockAccum) m_AccumCs.Enter(); //t.Tic(); //Map temp buffer samples into the histogram using the palette for color. - Accumulate(m_Samples[threadIndex].data(), subBatchSize, &m_Dmap); + Accumulate(m_Rand[threadIndex], m_Samples[threadIndex].data(), params.m_Count, &m_Dmap); //accumulationTime += t.Toc(); if (m_LockAccum) m_AccumCs.Leave(); @@ -1347,6 +1350,8 @@ template ePaletteMode Renderer:: template size_t Renderer::TemporalSamples() const { return m_Ember.m_TemporalSamples; } template size_t Renderer::FinalRasW() const { return m_Ember.m_FinalRasW; } template size_t Renderer::FinalRasH() const { return m_Ember.m_FinalRasH; } +template size_t Renderer::SubBatchSize() const { return m_Ember.m_SubBatchSize; } +template size_t Renderer::FuseCount() const { return m_Ember.m_FuseCount; } /// /// Non-virtual iterator wrappers. @@ -1396,11 +1401,13 @@ void Renderer::PrepFinalAccumVals(Color& background, T& g, T& lin /// The number of samples /// The palette to use template -void Renderer::Accumulate(Point* samples, size_t sampleCount, const Palette* palette) +void Renderer::Accumulate(QTIsaac& rand, Point* samples, size_t sampleCount, const Palette* palette) { size_t histIndex, intColorIndex, histSize = m_HistBuckets.size(); bucketT colorIndex, colorIndexFrac; const glm::detail::tvec4* dmap = &(palette->m_Entries[0]); + //T oneColDiv2 = m_CarToRas.OneCol() / 2; + //T oneRowDiv2 = m_CarToRas.OneRow() / 2; //It's critical to understand what's going on here as it's one of the most important parts of the algorithm. //A color value gets retrieved from the palette and @@ -1413,24 +1420,37 @@ void Renderer::Accumulate(Point* samples, size_t sampleCount, con //Splitting these conditionals into separate loops makes no speed difference. for (size_t i = 0; i < sampleCount && !m_Abort; i++) { + Point p(samples[i]);//Slightly faster to cache this. + if (Rotate() != 0) { - T p00 = samples[i].m_X - CenterX(); - T p11 = samples[i].m_Y - m_Ember.m_RotCenterY; + T p00 = p.m_X - CenterX(); + T p11 = p.m_Y - m_Ember.m_RotCenterY; - samples[i].m_X = (p00 * m_RotMat.A()) + (p11 * m_RotMat.B()) + CenterX(); - samples[i].m_Y = (p00 * m_RotMat.D()) + (p11 * m_RotMat.E()) + m_Ember.m_RotCenterY; + p.m_X = (p00 * m_RotMat.A()) + (p11 * m_RotMat.B()) + CenterX(); + p.m_Y = (p00 * m_RotMat.D()) + (p11 * m_RotMat.E()) + m_Ember.m_RotCenterY; } + //T angle = rand.Frand01() * M_2PI; + //T r = exp(T(0.5) * sqrt(-log(rand.Frand01()))) - 1; + + //T r = (rand.Frand01() + rand.Frand01() - 1); + //T r = (rand.Frand01() + rand.Frand01() + rand.Frand01() + rand.Frand01() - 2); + + //p.m_X += (r * oneColDiv2) * cos(angle); + //p.m_Y += (r * oneRowDiv2) * sin(angle); + //p.m_X += r * cos(angle); + //p.m_Y += r * sin(angle); + //Checking this first before converting gives better performance than converting and checking a single value, which the original did. //Second, an interesting optimization observation is that when keeping the bounds vars within m_CarToRas and calling its InBounds() member function, //rather than here as members, about a 7% speedup is achieved. This is possibly due to the fact that data from m_CarToRas is accessed //right after the call to Convert(), so some caching efficiencies get realized. - if (m_CarToRas.InBounds(samples[i])) + if (m_CarToRas.InBounds(p)) { - if (samples[i].m_VizAdjusted != 0) + if (p.m_VizAdjusted != 0) { - m_CarToRas.Convert(samples[i], histIndex); + m_CarToRas.Convert(p, histIndex); //There is a very slim chance that a point will be right on the border and will technically be in bounds, passing the InBounds() test, //but ends up being mapped to a histogram bucket that is out of bounds due to roundoff error. Perform one final check before proceeding. @@ -1445,7 +1465,7 @@ void Renderer::Accumulate(Point* samples, size_t sampleCount, con //Use overloaded addition and multiplication operators in vec4 to perform the accumulation. if (PaletteMode() == PALETTE_LINEAR) { - colorIndex = (bucketT)samples[i].m_ColorX * COLORMAP_LENGTH; + colorIndex = (bucketT)p.m_ColorX * COLORMAP_LENGTH; intColorIndex = (size_t)colorIndex; if (intColorIndex < 0) @@ -1463,19 +1483,19 @@ void Renderer::Accumulate(Point* samples, size_t sampleCount, con colorIndexFrac = colorIndex - (bucketT)intColorIndex;//Interpolate between intColorIndex and intColorIndex + 1. } - if (samples[i].m_VizAdjusted == 1) + if (p.m_VizAdjusted == 1) m_HistBuckets[histIndex] += ((dmap[intColorIndex] * (1 - colorIndexFrac)) + (dmap[intColorIndex + 1] * colorIndexFrac)); else - m_HistBuckets[histIndex] += (((dmap[intColorIndex] * (1 - colorIndexFrac)) + (dmap[intColorIndex + 1] * colorIndexFrac)) * (bucketT)samples[i].m_VizAdjusted); + m_HistBuckets[histIndex] += (((dmap[intColorIndex] * (1 - colorIndexFrac)) + (dmap[intColorIndex + 1] * colorIndexFrac)) * (bucketT)p.m_VizAdjusted); } else if (PaletteMode() == PALETTE_STEP) { - intColorIndex = Clamp((size_t)(samples[i].m_ColorX * COLORMAP_LENGTH), 0, COLORMAP_LENGTH_MINUS_1); + intColorIndex = Clamp((size_t)(p.m_ColorX * COLORMAP_LENGTH), 0, COLORMAP_LENGTH_MINUS_1); - if (samples[i].m_VizAdjusted == 1) + if (p.m_VizAdjusted == 1) m_HistBuckets[histIndex] += dmap[intColorIndex]; else - m_HistBuckets[histIndex] += (dmap[intColorIndex] * (bucketT)samples[i].m_VizAdjusted); + m_HistBuckets[histIndex] += (dmap[intColorIndex] * (bucketT)p.m_VizAdjusted); } } } diff --git a/Source/Ember/Renderer.h b/Source/Ember/Renderer.h index 8772eb1..7942c18 100644 --- a/Source/Ember/Renderer.h +++ b/Source/Ember/Renderer.h @@ -134,6 +134,8 @@ public: virtual size_t TemporalSamples() const override; virtual size_t FinalRasW() const override; virtual size_t FinalRasH() const override; + virtual size_t SubBatchSize() const override; + virtual size_t FuseCount() const override; //Non-virtual iterator wrappers. const unsigned char* XformDistributions() const; @@ -144,9 +146,9 @@ protected: //Non-virtual functions that might be needed by a derived class. void PrepFinalAccumVals(Color& background, T& g, T& linRange, T& vibrancy); -private: + private: //Miscellaneous non-virtual functions used only in this class. - void Accumulate(Point* samples, size_t sampleCount, const Palette* palette); + void Accumulate(QTIsaac& rand, Point* samples, size_t sampleCount, const Palette* palette); /*inline*/ void AddToAccum(const glm::detail::tvec4& bucket, intmax_t i, intmax_t ii, intmax_t j, intmax_t jj); template void GammaCorrection(glm::detail::tvec4& bucket, Color& background, T g, T linRange, T vibrancy, bool doAlpha, bool scale, accumT* correctedChannels); diff --git a/Source/Ember/RendererBase.cpp b/Source/Ember/RendererBase.cpp index 2a4c9df..0110d3f 100644 --- a/Source/Ember/RendererBase.cpp +++ b/Source/Ember/RendererBase.cpp @@ -15,7 +15,6 @@ RendererBase::RendererBase() m_YAxisUp = false; m_InsertPalette = false; m_ReclaimOnResize = false; - m_SubBatchSize = 1024 * 10; m_NumChannels = 3; m_BytesPerChannel = 1; m_SuperSize = 0; @@ -412,17 +411,6 @@ void RendererBase::Transparency(bool transparency) ChangeVal([&] { m_Transparency = transparency; }, ACCUM_ONLY); } -/// -/// Set the sub batch size. This is the size of of the chunks that the iteration -/// trajectory will be broken up into. -/// Reset the rendering process. -/// -/// The sub batch size to set -void RendererBase::SubBatchSize(size_t sbs) -{ - ChangeVal([&] { m_SubBatchSize = sbs; }, FULL_RENDER); -} - /// /// Set the callback object. /// @@ -583,14 +571,6 @@ void RendererBase::NumChannels(size_t numChannels) /// The number of threads used when rendering size_t RendererBase::ThreadCount() const { return m_ThreadsToUse; } -/// -/// Get the sub batch size. This is the size of of the chunks that the iteration -/// trajectory will be broken up into. -/// Default: 10k. -/// -/// The sub batch size -size_t RendererBase::SubBatchSize() const { return m_SubBatchSize; } - /// /// Get the renderer type enum. /// CPU_RENDERER for this class, other values for derived classes. diff --git a/Source/Ember/RendererBase.h b/Source/Ember/RendererBase.h index 01c390f..ae235ad 100644 --- a/Source/Ember/RendererBase.h +++ b/Source/Ember/RendererBase.h @@ -149,7 +149,6 @@ public: void ReclaimOnResize(bool reclaimOnResize); bool Transparency() const; void Transparency(bool transparency); - void SubBatchSize(size_t subBatchSize); void Callback(RenderCallback* callback); void ThreadCount(size_t threads, const char* seedString = nullptr); size_t BytesPerChannel() const; @@ -161,7 +160,6 @@ public: //Virtual render properties, getters and setters. virtual void NumChannels(size_t numChannels); virtual size_t ThreadCount() const; - virtual size_t SubBatchSize() const; virtual eRendererType RendererType() const; //Abstract render properties, getters only. @@ -169,6 +167,8 @@ public: virtual size_t HistBucketSize() const = 0; virtual size_t FinalRasW() const = 0; virtual size_t FinalRasH() const = 0; + virtual size_t SubBatchSize() const = 0; + virtual size_t FuseCount() const = 0; virtual double ScaledQuality() const = 0; virtual double LowerLeftX(bool gutter = true) const = 0; virtual double LowerLeftY(bool gutter = true) const = 0; @@ -207,10 +207,8 @@ protected: size_t m_DensityFilterOffset; size_t m_NumChannels; size_t m_BytesPerChannel; - size_t m_SubBatchSize; size_t m_ThreadsToUse; size_t m_VibGamCount; - size_t m_LastPass; size_t m_LastTemporalSample; double m_LastIterPercent; size_t m_LastIter; diff --git a/Source/Ember/SheepTools.h b/Source/Ember/SheepTools.h index a7b0ba4..cdc83ab 100644 --- a/Source/Ember/SheepTools.h +++ b/Source/Ember/SheepTools.h @@ -879,7 +879,6 @@ public: m_Renderer->EarlyClip(true); m_Renderer->PixelAspectRatio(1); m_Renderer->ThreadCount(Timing::ProcessorCount()); - m_Renderer->SubBatchSize(10000); m_Renderer->Callback(nullptr); if (m_Renderer->Run(m_FinalImage) != RENDER_OK) @@ -1280,8 +1279,16 @@ public: /// The number of iterations ran size_t EstimateBoundingBox(Ember& ember, T eps, size_t samples, T* bmin, T* bmax) { + bool newAlloc = false; size_t i, lowTarget, highTarget; T min[2], max[2]; + IterParams params; + + m_Renderer->SetEmber(ember); + m_Renderer->CreateSpatialFilter(newAlloc); + m_Renderer->CreateDEFilter(newAlloc); + m_Renderer->ComputeBounds(); + m_Renderer->ComputeCamera(); if (ember.XaosPresent()) m_Iterator = m_XaosIterator.get(); @@ -1290,8 +1297,12 @@ public: m_Iterator->InitDistributions(ember); m_Samples.resize(samples); + params.m_Count = samples; + params.m_Skip = 20; + //params.m_OneColDiv2 = m_Renderer->CoordMap()->OneCol() / 2; + //params.m_OneRowDiv2 = m_Renderer->CoordMap()->OneRow() / 2; - size_t bv = m_Iterator->Iterate(ember, samples, 20, m_Samples.data(), m_Rand);//Use a special fuse of 20, all other calls to this will use 15, or 100. + size_t bv = m_Iterator->Iterate(ember, params, m_Samples.data(), m_Rand);//Use a special fuse of 20, all other calls to this will use 15, or 100. if (bv / T(samples) > eps) eps = 3 * bv / T(samples); diff --git a/Source/Ember/Utils.h b/Source/Ember/Utils.h index b3d5734..3d01023 100644 --- a/Source/Ember/Utils.h +++ b/Source/Ember/Utils.h @@ -33,6 +33,17 @@ static inline void ForEach(c& container, fn func) std::for_each(container.begin(), container.end(), func); } +/// +/// Thin wrapper around computing the total size of a vector. +/// +/// The vector to compute the size of +/// The size of one element times the length. +template +static inline size_t SizeOf(vector& vec) +{ + return sizeof(vec[0]) * vec.size(); +} + /// /// After a run completes, information about what was run can be saved as strings to the comments /// section of a jpg or png file. This class is just a container for those values. @@ -276,7 +287,7 @@ static void ClearVec(vector& vec, bool arrayDelete = false) template static inline void Memset(vector& vec, int val = 0) { - memset((void*)vec.data(), val, vec.size() * sizeof(vec[0])); + memset((void*)vec.data(), val, SizeOf(vec)); } /// diff --git a/Source/Ember/XmlToEmber.h b/Source/Ember/XmlToEmber.h index 64de8b8..b80a14e 100644 --- a/Source/Ember/XmlToEmber.h +++ b/Source/Ember/XmlToEmber.h @@ -593,6 +593,8 @@ private: else if (ParseAndAssignInt(curAtt->name, attStr, "oversample", currentEmber.m_Supersample , ret)) { } else if (ParseAndAssignInt(curAtt->name, attStr, "supersample", currentEmber.m_Supersample , ret)) { } else if (ParseAndAssignInt(curAtt->name, attStr, "temporal_samples", currentEmber.m_TemporalSamples, ret)) { } + else if (ParseAndAssignInt(curAtt->name, attStr, "sub_batch_size", currentEmber.m_SubBatchSize , ret)) { } + else if (ParseAndAssignInt(curAtt->name, attStr, "fuse", currentEmber.m_FuseCount , ret)) { } else if (ParseAndAssignInt(curAtt->name, attStr, "soloxform", soloXform , ret)) { } else if (ParseAndAssignInt(curAtt->name, attStr, "new_linear", newLinear , ret)) { } diff --git a/Source/EmberAnimate/EmberAnimate.cpp b/Source/EmberAnimate/EmberAnimate.cpp index c216715..6466385 100644 --- a/Source/EmberAnimate/EmberAnimate.cpp +++ b/Source/EmberAnimate/EmberAnimate.cpp @@ -186,6 +186,9 @@ bool EmberAnimate(EmberOptions& opt) if (opt.Supersample() > 0) embers[i].m_Supersample = opt.Supersample(); + if (opt.SubBatchSize() != DEFAULT_SBS) + embers[i].m_SubBatchSize = opt.SubBatchSize(); + embers[i].m_Quality *= T(opt.QualityScale()); embers[i].m_FinalRasW = (unsigned int)((T)embers[i].m_FinalRasW * opt.SizeScale()); embers[i].m_FinalRasH = (unsigned int)((T)embers[i].m_FinalRasH * opt.SizeScale()); @@ -250,13 +253,12 @@ bool EmberAnimate(EmberOptions& opt) renderer->YAxisUp(opt.YAxisUp()); renderer->LockAccum(opt.LockAccum()); renderer->InsertPalette(opt.InsertPalette()); - renderer->SubBatchSize(opt.SubBatchSize()); renderer->PixelAspectRatio(T(opt.AspectRatio())); renderer->Transparency(opt.Transparency()); renderer->NumChannels(channels); renderer->BytesPerChannel(opt.BitsPerChannel() / 8); renderer->Callback(opt.DoProgress() ? progress.get() : NULL); - + //Begin run. for (ftime = opt.FirstFrame(); ftime <= opt.LastFrame(); ftime += opt.Dtime()) { diff --git a/Source/EmberAnimate/EmberAnimate.rc b/Source/EmberAnimate/EmberAnimate.rc index 09e4dc1..7ec4eec 100644 --- a/Source/EmberAnimate/EmberAnimate.rc +++ b/Source/EmberAnimate/EmberAnimate.rc @@ -49,8 +49,8 @@ END // VS_VERSION_INFO VERSIONINFO - FILEVERSION 0,4,1,4 - PRODUCTVERSION 0,4,1,4 + FILEVERSION 0,4,1,5 + PRODUCTVERSION 0,4,1,5 FILEFLAGSMASK 0x3fL #ifdef _DEBUG FILEFLAGS 0x1L @@ -67,12 +67,12 @@ BEGIN BEGIN VALUE "CompanyName", "Open Source" VALUE "FileDescription", "Renders fractal flames as animations with motion blur" - VALUE "FileVersion", "0.4.1.4" + VALUE "FileVersion", "0.4.1.5" VALUE "InternalName", "EmberAnimate.rc" VALUE "LegalCopyright", "Copyright (C) Matt Feemster 2013, GPL v3" VALUE "OriginalFilename", "EmberAnimate.rc" VALUE "ProductName", "Ember Animate" - VALUE "ProductVersion", "0.4.1.4" + VALUE "ProductVersion", "0.4.1.5" END END BLOCK "VarFileInfo" diff --git a/Source/EmberCL/DEOpenCLKernelCreator.cpp b/Source/EmberCL/DEOpenCLKernelCreator.cpp index fb5b677..f1da81a 100644 --- a/Source/EmberCL/DEOpenCLKernelCreator.cpp +++ b/Source/EmberCL/DEOpenCLKernelCreator.cpp @@ -25,7 +25,6 @@ template <> DEOpenCLKernelCreator::DEOpenCLKernelCreator(bool nVidia) { m_NVidia = nVidia; - m_LogScaleSumDEEntryPoint = "LogScaleSumDensityFilterKernel"; m_LogScaleAssignDEEntryPoint = "LogScaleAssignDensityFilterKernel"; m_GaussianDEWithoutSsEntryPoint = "GaussianDEWithoutSsKernel"; m_GaussianDESsWithScfEntryPoint = "GaussianDESsWithScfKernel"; @@ -33,7 +32,6 @@ DEOpenCLKernelCreator::DEOpenCLKernelCreator(bool nVidia) m_GaussianDEWithoutSsNoCacheEntryPoint = "GaussianDEWithoutSsNoCacheKernel"; m_GaussianDESsWithScfNoCacheEntryPoint = "GaussianDESsWithScfNoCacheKernel"; m_GaussianDESsWithoutScfNoCacheEntryPoint = "GaussianDESsWithoutScfNoCacheKernel"; - m_LogScaleSumDEKernel = CreateLogScaleSumDEKernelString(); m_LogScaleAssignDEKernel = CreateLogScaleAssignDEKernelString(); m_GaussianDEWithoutSsKernel = CreateGaussianDEKernel(1); m_GaussianDESsWithScfKernel = CreateGaussianDEKernel(2); @@ -56,25 +54,39 @@ DEOpenCLKernelCreator::DEOpenCLKernelCreator(bool nVidia) template <> DEOpenCLKernelCreator::DEOpenCLKernelCreator(bool nVidia) { +#ifdef ROW_ONLY_DE + m_NVidia = nVidia; + m_LogScaleAssignDEEntryPoint = "LogScaleAssignDensityFilterKernel"; + m_GaussianDEWithoutSsEntryPoint = "GaussianDEWithoutSsKernel"; + m_GaussianDESsWithScfEntryPoint = "GaussianDESsWithScfKernel"; + m_GaussianDESsWithoutScfEntryPoint = "GaussianDESsWithoutScfKernel"; + m_GaussianDEWithoutSsNoCacheEntryPoint = "GaussianDEWithoutSsNoCacheKernel"; + m_GaussianDESsWithScfNoCacheEntryPoint = "GaussianDESsWithScfNoCacheKernel"; + m_GaussianDESsWithoutScfNoCacheEntryPoint = "GaussianDESsWithoutScfNoCacheKernel"; + m_LogScaleAssignDEKernel = CreateLogScaleAssignDEKernelString(); + m_GaussianDEWithoutSsKernel = CreateGaussianDEKernel(1); + m_GaussianDESsWithScfKernel = CreateGaussianDEKernel(2); + m_GaussianDESsWithoutScfKernel = CreateGaussianDEKernel(3); + m_GaussianDEWithoutSsNoCacheKernel = CreateGaussianDEKernelNoLocalCache(1); + m_GaussianDESsWithScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache(2); + m_GaussianDESsWithoutScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache(3); +#else m_NVidia = nVidia; - m_LogScaleSumDEEntryPoint = "LogScaleSumDensityFilterKernel"; m_LogScaleAssignDEEntryPoint = "LogScaleAssignDensityFilterKernel"; m_GaussianDEWithoutSsNoCacheEntryPoint = "GaussianDEWithoutSsNoCacheKernel"; m_GaussianDESsWithScfNoCacheEntryPoint = "GaussianDESsWithScfNoCacheKernel"; m_GaussianDESsWithoutScfNoCacheEntryPoint = "GaussianDESsWithoutScfNoCacheKernel"; - m_LogScaleSumDEKernel = CreateLogScaleSumDEKernelString(); m_LogScaleAssignDEKernel = CreateLogScaleAssignDEKernelString(); m_GaussianDEWithoutSsNoCacheKernel = CreateGaussianDEKernelNoLocalCache(1); m_GaussianDESsWithScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache(2); m_GaussianDESsWithoutScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache(3); +#endif } /// /// Kernel source and entry point properties, getters only. /// -template string DEOpenCLKernelCreator::LogScaleSumDEKernel() { return m_LogScaleSumDEKernel; } -template string DEOpenCLKernelCreator::LogScaleSumDEEntryPoint() { return m_LogScaleSumDEEntryPoint; } template string DEOpenCLKernelCreator::LogScaleAssignDEKernel() { return m_LogScaleAssignDEKernel; } template string DEOpenCLKernelCreator::LogScaleAssignDEEntryPoint() { return m_LogScaleAssignDEEntryPoint; } @@ -87,6 +99,7 @@ template string DEOpenCLKernelCreator::LogScaleAssignDEEntryPoin template string DEOpenCLKernelCreator::GaussianDEKernel(size_t ss, unsigned int filterWidth) { +#ifndef ROW_ONLY_DE if ((typeid(T) == typeid(double)) || (filterWidth > MaxDEFilterSize()))//Type double does not use cache. { if (ss > 1) @@ -100,6 +113,7 @@ string DEOpenCLKernelCreator::GaussianDEKernel(size_t ss, unsigned int filter return m_GaussianDEWithoutSsNoCacheKernel; } else +#endif { if (ss > 1) { @@ -122,6 +136,7 @@ string DEOpenCLKernelCreator::GaussianDEKernel(size_t ss, unsigned int filter template string DEOpenCLKernelCreator::GaussianDEEntryPoint(size_t ss, unsigned int filterWidth) { +#ifndef ROW_ONLY_DE if ((typeid(T) == typeid(double)) || (filterWidth > MaxDEFilterSize()))//Type double does not use cache. { if (ss > 1) @@ -135,6 +150,7 @@ string DEOpenCLKernelCreator::GaussianDEEntryPoint(size_t ss, unsigned int fi return m_GaussianDEWithoutSsNoCacheEntryPoint; } else +#endif { if (ss > 1) { @@ -194,45 +210,6 @@ unsigned int DEOpenCLKernelCreator::SolveMaxBoxSize(unsigned int localMem) return (unsigned int)floor(sqrt(floor((T)localMem / 16.0)));//Divide by 16 because each element is float4. } -/// -/// Create the log scale kernel string, using summation. -/// This means each cell will be added to, rather than just assigned. -/// Since adding is slower than assigning, this should only be used when Passes > 1, -/// otherwise use the kernel created from CreateLogScaleAssignDEKernelString(). -/// -/// The kernel string -template -string DEOpenCLKernelCreator::CreateLogScaleSumDEKernelString() -{ - ostringstream os; - - os << - ConstantDefinesString(typeid(T) == typeid(double)) << - DensityFilterCLStructString << - "__kernel void " << m_LogScaleSumDEEntryPoint << "(\n" - " const __global real4* histogram,\n" - " __global real4* accumulator,\n" - " __constant DensityFilterCL* logFilter\n" - "\t)\n" - "{\n" - " if ((GLOBAL_ID_X < logFilter->m_SuperRasW) && (GLOBAL_ID_Y < logFilter->m_SuperRasH))\n" - " {\n" - " uint index = (GLOBAL_ID_Y * logFilter->m_SuperRasW) + GLOBAL_ID_X;\n" - "\n" - " if (histogram[index].w != 0)\n" - " {\n" - " real_t logScale = (logFilter->m_K1 * log(1.0 + histogram[index].w * logFilter->m_K2)) / histogram[index].w;\n" - "\n" - " accumulator[index] += histogram[index] * logScale;\n"//Using a single real4 vector operation doubles the speed from doing each component individually. - " }\n" - "\n" - " barrier(CLK_GLOBAL_MEM_FENCE);\n"//Just to be safe. Makes no speed difference to do all of the time or only when there's a hit. - " }\n" - "}\n"; - - return os.str(); -} - /// /// Create the log scale kernel string, using assignment. /// Use this when Passes == 1. @@ -270,6 +247,215 @@ string DEOpenCLKernelCreator::CreateLogScaleAssignDEKernelString() return os.str(); } +#ifdef ROW_ONLY_DE +template +string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss) +{ + bool doSS = ss > 1; + bool doScf = !(ss & 1); + ostringstream os; + + os << + ConstantDefinesString(typeid(T) == typeid(double)) << + DensityFilterCLStructString << + UnionCLStructString << + "__kernel void " << GaussianDEEntryPoint(ss, MaxDEFilterSize()) << "(\n" << + " const __global real4* histogram,\n" + " __global real4reals* accumulator,\n" + " __constant DensityFilterCL* densityFilter,\n" + " const __global real_t* filterCoefs,\n" + " const __global real_t* filterWidths,\n" + " const __global uint* coefIndices,\n" + " const uint chunkSizeW,\n" + " const uint chunkSizeH,\n" + " const uint chunkW,\n" + " const uint chunkH\n" + "\t)\n" + "{\n" + " uint rowsToProcess = 32;\n"//Rows to process. + "\n" + " if (((((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) ||\n" + " ((((BLOCK_ID_Y * chunkSizeH) + chunkH) * rowsToProcess) + THREAD_ID_Y >= densityFilter->m_SuperRasH))\n" + " return;\n" + "\n"; + + if (doSS) + { + os << + " uint ss = (uint)floor((real_t)densityFilter->m_Supersample / 2.0);\n" + " int densityBoxLeftX;\n" + " int densityBoxRightX;\n" + " int densityBoxTopY;\n" + " int densityBoxBottomY;\n" + "\n"; + + if (doScf) + os << + " real_t scfact = pow(densityFilter->m_Supersample / (densityFilter->m_Supersample + 1.0), 2.0);\n"; + } + + os << + " uint fullTempBoxWidth;\n" + " uint leftBound, rightBound, topBound, botBound;\n" + " uint blockHistStartRow, blockHistEndRow, histCol;\n" + " uint blockHistStartCol, boxReadStartCol, boxReadEndCol;\n" + " uint accumWriteStartCol, colsToWrite, colOffset, colsToWriteOffset;\n" + " int histRow, filterRow, accumWriteOffset;\n" + "\n" + " fullTempBoxWidth = BLOCK_SIZE_X + (densityFilter->m_FilterWidth * 2);\n" + //Compute the bounds of the area to be sampled, which is just the ends minus the super sample minus 1. + " leftBound = densityFilter->m_Supersample - 1;\n" + " rightBound = densityFilter->m_SuperRasW - (densityFilter->m_Supersample - 1);\n" + " topBound = densityFilter->m_Supersample - 1;\n" + " botBound = densityFilter->m_SuperRasH - (densityFilter->m_Supersample - 1);\n" + "\n" + //Start and end values are the indices in the histogram read from + //and written to in the accumulator. They are not the indices for the local block of data. + //Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds. + " blockHistStartRow = min(botBound, topBound + (((BLOCK_ID_Y * chunkSizeH) + chunkH) * rowsToProcess));\n"//The first histogram row this block will process. + " blockHistEndRow = min(botBound, blockHistStartRow + rowsToProcess);\n"//The last histogram row this block will process, clamped to the last row. + " blockHistStartCol = min(rightBound, leftBound + (((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X));\n"//The first histogram column this block will process. + " boxReadStartCol = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first box col this block will read from when copying to the accumulator. + " boxReadEndCol = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + BLOCK_SIZE_X, densityFilter->m_SuperRasW - blockHistStartCol);\n"//The last box col this block will read from when copying to the accumulator. + "\n" + //Last, the indices in the global accumulator that the local bounds will be writing to. + " accumWriteStartCol = blockHistStartCol - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first column in the accumulator this block will write to. + " colsToWrite = ceil((real_t)(boxReadEndCol - boxReadStartCol) / (real_t)BLOCK_SIZE_X);\n"//Elements per thread to be written to the accumulator. + " histCol = blockHistStartCol + THREAD_ID_X;\n"//The histogram column this individual thread will be reading from. + "\n" + " if (histCol >= rightBound)\n" + " return;\n" + "\n" + //Compute the col position in this local box to serve as the center position + //from which filter application offsets are computed. + //These are the local indices for the local data that are temporarily accumulated to before + //writing out to the global accumulator. + " uint boxCol = densityFilter->m_FilterWidth + THREAD_ID_X;\n" + " uint colsToZeroOffset, colsToZero = ceil((real_t)fullTempBoxWidth / (real_t)(BLOCK_SIZE_X));\n"//Usually is 2. + " int i, j, k, jmin, jmax;\n" + " uint filterSelectInt, filterCoefIndex;\n" + " real_t cacheLog;\n" + " real_t filterSelect;\n" + " real4 bucket;\n" + ; + + os << " __local real4reals filterBox[192];\n";//Must be >= fullTempBoxWidth. + + os << + "\n" + " colsToZeroOffset = colsToZero * THREAD_ID_X;\n" + " colsToWriteOffset = colsToWrite * THREAD_ID_X;\n" + " k = (int)densityFilter->m_FilterWidth;\n"//Need a signed int to use below, really is filter width, but reusing a variable to save space. + "\n" + " for (histRow = blockHistStartRow; histRow < blockHistEndRow; histRow++)\n"//Process pixels by row, for 32 rows. + " {\n" + " bucket = histogram[(histRow * densityFilter->m_SuperRasW) + histCol];\n" + "\n" + " if (bucket.w != 0)\n" + " cacheLog = (densityFilter->m_K1 * log(1.0 + bucket.w * densityFilter->m_K2)) / bucket.w;\n" + "\n"; + + if (doSS) + { + os << + " filterSelect = 0;\n" + " densityBoxLeftX = histCol - min(histCol, ss);\n" + " densityBoxRightX = histCol + min(ss, (densityFilter->m_SuperRasW - histCol) - 1);\n" + " densityBoxTopY = histRow - min((uint)histRow, ss);\n" + " densityBoxBottomY = histRow + min(ss, (densityFilter->m_SuperRasH - histRow) - 1);\n" + "\n" + " for (j = densityBoxTopY; j <= densityBoxBottomY; j++)\n" + " {\n" + " for (i = densityBoxLeftX; i <= densityBoxRightX; i++)\n" + " {\n" + " filterSelect += histogram[(j * densityFilter->m_SuperRasW) + i].w;\n" + " }\n" + " }\n" + "\n"; + + if (doScf) + os << " filterSelect *= scfact;\n"; + } + else + { + os + << " filterSelect = bucket.w;\n"; + } + + os << + "\n" + " if (filterSelect > densityFilter->m_MaxFilteredCounts)\n" + " filterSelectInt = densityFilter->m_MaxFilterIndex;\n" + " else if (filterSelect <= DE_THRESH)\n" + " filterSelectInt = (int)ceil(filterSelect) - 1;\n" + " else if (filterSelect != 0)\n" + " filterSelectInt = (int)DE_THRESH + (int)floor(pow((real_t)(filterSelect - DE_THRESH), densityFilter->m_Curve));\n" + " else\n" + " filterSelectInt = 0;\n" + "\n" + " if (filterSelectInt > densityFilter->m_MaxFilterIndex)\n" + " filterSelectInt = densityFilter->m_MaxFilterIndex;\n" + "\n" + " filterCoefIndex = filterSelectInt * densityFilter->m_KernelSize;\n" + "\n" + //With this new method, only accumulate to the temp local buffer first. Write to the final accumulator last. + //For each loop through, note that there is a local memory barrier call inside of each call to AddToAccumNoCheck(). + //If this isn't done, pixel errors occurr and even an out of resources error occurrs because too many writes are done to the same place in memory at once. + " jmin = min(k, histRow);\n" + " jmax = (int)min((densityFilter->m_SuperRasH - 1) - histRow, densityFilter->m_FilterWidth);\n" + "\n" + " for (j = -jmin; j <= jmax; j++)\n" + " {\n" + " for (i = 0; i < colsToZero && (colsToZeroOffset + i) < fullTempBoxWidth; i++)\n"//Each thread zeroizes a few columns. + " {\n" + " filterBox[colsToZeroOffset + i].m_Real4 = 0;\n" + " }\n" + "\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + "\n" + " if (bucket.w != 0)\n" + " {\n" + " filterRow = abs(j) * (densityFilter->m_FilterWidth + 1);\n" + "\n" + " for (i = -k; i <= k; i++)\n" + " {\n" + " filterSelectInt = filterCoefIndex + coefIndices[filterRow + abs(i)];\n"//Really is filterCoeffIndexPlusOffset, but reusing a variable to save space. + " filterBox[i + boxCol].m_Real4 += (bucket * (filterCoefs[filterSelectInt] * cacheLog));\n" + " }\n" + " }\n" + "\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" + "\n" + //At this point, all threads in this block have applied the filter to their surrounding pixels and stored the results in the temp local box. + //Add the cells of it that are in bounds to the global accumulator. + //Compute offsets in local box to read from, and offsets into global accumulator to write to. + //Use a method here that is similar to the zeroization above: Each thread (column) in the first row iterates through all of the + //rows and adds a few columns to the accumulator. + //" if (THREAD_ID_X == 0)\n" + //" {\n" + //" for (int kk = boxReadStartCol, i = 0; kk < boxReadEndCol; kk++, i++)\n"//Each thread writes a few columns.//Could do away with kk//TODO//OPT + //" {\n" + //" accumulator[((histRow + j) * densityFilter->m_SuperRasW) + (accumWriteStartCol + i)].m_Real4 += filterBox[kk].m_Real4;\n" + //" }\n" + //" }\n" + " accumWriteOffset = ((histRow + j) * densityFilter->m_SuperRasW) + accumWriteStartCol;\n" + "\n" + " for (i = 0; i < colsToWrite; i++)\n"//Each thread writes a few columns. + " {\n" + " colOffset = colsToWriteOffset + i;\n" + "\n" + " if (boxReadStartCol + colOffset < boxReadEndCol)\n" + " accumulator[accumWriteOffset + colOffset].m_Real4 += filterBox[boxReadStartCol + colOffset].m_Real4;\n" + " }\n" + " }\n"//for() filter rows. + " barrier(CLK_GLOBAL_MEM_FENCE);\n" + " }\n"//for() histogram rows. + "}\n"; + + return os.str(); +} + +#else /// /// Create the gaussian density filtering kernel string. /// 6 different methods of processing were tried before settling on this final and fastest 7th one. @@ -281,7 +467,7 @@ string DEOpenCLKernelCreator::CreateLogScaleAssignDEKernelString() /// This allows writing to the global buffer without ever overlapping or using atomics. /// The supersample parameter will produce three different kernels. /// SS = 1, SS > 1 && SS even, SS > 1 && SS odd. -/// The width of the kernl this runs in must be evenly divisible by 16 or else artifacts will occur. +/// The width of the kernel this runs in must be evenly divisible by 16 or else artifacts will occur. /// Note that because this function uses so many variables and is so complex, OpenCL can easily run /// out of resources in some cases. Certain variables had to be reused to condense the kernel footprint /// down enough to be able to run a block size of 32x32. @@ -311,18 +497,15 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss) " const __global uint* coefIndices,\n" " const uint chunkSizeW,\n" " const uint chunkSizeH,\n" - " const uint rowParity,\n" - " const uint colParity\n" + " const uint chunkW,\n" + " const uint chunkH\n" "\t)\n" "{\n" - //Parity determines if this function should execute. - " if ((GLOBAL_ID_X >= densityFilter->m_SuperRasW) ||\n" - " (GLOBAL_ID_Y >= densityFilter->m_SuperRasH) ||\n" - " ((BLOCK_ID_X % chunkSizeW) != colParity) ||\n" - " ((BLOCK_ID_Y % chunkSizeH) != rowParity)) \n" + " if (((((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) ||\n" + " ((((BLOCK_ID_Y * chunkSizeH) + chunkH) * BLOCK_SIZE_Y) + THREAD_ID_Y >= densityFilter->m_SuperRasH))\n" " return;\n" "\n"; - + if (doSS) { os << @@ -367,13 +550,13 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss) //Start and end values are the indices in the histogram read from //and written to in the accumulator. They are not the indices for the local block of data. //Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds. - " blockHistStartRow = min(botBound, topBound + (BLOCK_ID_Y * BLOCK_SIZE_Y));\n"//The first histogram row this block will process. + " blockHistStartRow = min(botBound, topBound + (((BLOCK_ID_Y * chunkSizeH) + chunkH) * BLOCK_SIZE_Y));\n"//The first histogram row this block will process. " blockHistEndRow = min(botBound, blockHistStartRow + BLOCK_SIZE_Y);\n"//The last histogram row this block will process, clamped to the last row. " boxReadStartRow = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartRow);\n"//The first row in the local box to read from when writing back to the final accumulator for this block. " boxReadEndRow = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + BLOCK_SIZE_Y, densityFilter->m_SuperRasH - blockHistStartRow);\n"//The last row in the local box to read from when writing back to the final accumulator for this block. - " blockHistStartCol = min(rightBound, leftBound + (BLOCK_ID_X * BLOCK_SIZE_X));\n"//The first histogram column this block will process. - " boxReadStartCol = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first box row this block will read from when copying to the accumulator. - " boxReadEndCol = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + BLOCK_SIZE_X, densityFilter->m_SuperRasW - blockHistStartCol);\n"//The last box row this block will read from when copying to the accumulator. + " blockHistStartCol = min(rightBound, leftBound + (((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X));\n"//The first histogram column this block will process. + " boxReadStartCol = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first box col this block will read from when copying to the accumulator. + " boxReadEndCol = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + BLOCK_SIZE_X, densityFilter->m_SuperRasW - blockHistStartCol);\n"//The last box col this block will read from when copying to the accumulator. "\n" //Last, the indices in the global accumulator that the local bounds will be writing to. " accumWriteStartRow = blockHistStartRow - min(densityFilter->m_FilterWidth, blockHistStartRow);\n"//Will be fw - 0 except for boundary columns, it will be less. @@ -496,7 +679,7 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss) " {\n" " filterSelectInt = filterCoefIndex + coefIndices[(abs(j) * (densityFilter->m_FilterWidth + 1)) + abs(i)];\n"//Really is filterCoeffIndexPlusOffset, but reusing a variable to save space. "\n" - " if (filterCoefs[filterSelectInt] != 0)\n" + " if (filterCoefs[filterSelectInt] != 0)\n"//This conditional actually improves speed, despite SIMT being bad at conditionals. " {\n" " filterBox[(i + boxCol) + ((j + boxRow) * fullTempBoxWidth)].m_Real4 += (bucket * (filterCoefs[filterSelectInt] * cacheLog));\n" " }\n" @@ -511,14 +694,14 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss) "\n" " if (THREAD_ID_Y == 0)\n" " {\n" - //At this point, all threads in this block have applied the filter to their surrounding pixel and stored the results in the temp local box. + //At this point, all threads in this block have applied the filter to their surrounding pixels and stored the results in the temp local box. //Add the cells of it that are in bounds to the global accumulator. //Compute offsets in local box to read from, and offsets into global accumulator to write to. //Use a method here that is similar to the zeroization above: Each thread (column) in the first row iterates through all of the //rows and adds a few columns to the accumulator. " for (i = boxReadStartRow, j = accumWriteStartRow; i < boxReadEndRow; i++, j++)\n" " {\n" - " for (k = 0; k < colsToWrite; k++)\n"//Write a few columns. + " for (k = 0; k < colsToWrite; k++)\n"//Each thread writes a few columns. " {\n" " boxCol = (colsToWrite * THREAD_ID_X) + k;\n"//Really is colOffset, but reusing a variable to save space. "\n" @@ -532,6 +715,7 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss) return os.str(); } +#endif /// /// Create the gaussian density filtering kernel string, but use no local cache and perform @@ -543,7 +727,7 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss) /// on the CPU because the frequent global memory access brings performance to a crawl. /// The supersample parameter will produce three different kernels. /// SS = 1, SS > 1 && SS even, SS > 1 && SS odd. -/// The width of the kernl this runs in must be evenly divisible by 16 or else artifacts will occur. +/// The width of the kernel this runs in must be evenly divisible by 16 or else artifacts will occur. /// Note that because this function uses so many variables and is so complex, OpenCL can easily run /// out of resources in some cases. Certain variables had to be reused to condense the kernel footprint /// down enough to be able to run a block size of 32x32. @@ -572,15 +756,12 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernelNoLocalCache(size_t ss) " const __global uint* coefIndices,\n" " const uint chunkSizeW,\n" " const uint chunkSizeH,\n" - " const uint rowParity,\n" - " const uint colParity\n" + " const uint chunkW,\n" + " const uint chunkH\n" "\t)\n" "{\n" - //Parity determines if this function should execute. - " if ((GLOBAL_ID_X >= densityFilter->m_SuperRasW) ||\n" - " (GLOBAL_ID_Y >= densityFilter->m_SuperRasH) ||\n" - " ((BLOCK_ID_X % chunkSizeW) != colParity) ||\n" - " ((BLOCK_ID_Y % chunkSizeH) != rowParity)) \n" + " if (((((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) ||\n" + " ((((BLOCK_ID_Y * chunkSizeH) + chunkH) * BLOCK_SIZE_Y) + THREAD_ID_Y >= densityFilter->m_SuperRasH))\n" " return;\n" "\n"; @@ -606,10 +787,10 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernelNoLocalCache(size_t ss) "\n" //Start and end values are the indices in the histogram read from and written to in the accumulator. //Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds. - " uint blockHistStartRow = min(botBound, topBound + (BLOCK_ID_Y * BLOCK_SIZE_Y));\n"//The first histogram row this block will process. + " uint blockHistStartRow = min(botBound, topBound + (((BLOCK_ID_Y * chunkSizeH) + chunkH) * BLOCK_SIZE_Y));\n"//The first histogram row this block will process. " uint threadHistRow = blockHistStartRow + THREAD_ID_Y;\n"//The histogram row this individual thread will be reading from. "\n" - " uint blockHistStartCol = min(rightBound, leftBound + (BLOCK_ID_X * BLOCK_SIZE_X));\n"//The first histogram column this block will process. + " uint blockHistStartCol = min(rightBound, leftBound + (((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X));\n"//The first histogram column this block will process. " uint threadHistCol = blockHistStartCol + THREAD_ID_X;\n"//The histogram column this individual thread will be reading from. "\n" " int i, j;\n" diff --git a/Source/EmberCL/DEOpenCLKernelCreator.h b/Source/EmberCL/DEOpenCLKernelCreator.h index 802ddd6..a56cef3 100644 --- a/Source/EmberCL/DEOpenCLKernelCreator.h +++ b/Source/EmberCL/DEOpenCLKernelCreator.h @@ -8,6 +8,8 @@ /// DEOpenCLKernelCreator class. /// +//#define ROW_ONLY_DE 1 + namespace EmberCLns { /// @@ -35,8 +37,6 @@ public: DEOpenCLKernelCreator(bool nVidia); //Accessors. - string LogScaleSumDEKernel(); - string LogScaleSumDEEntryPoint(); string LogScaleAssignDEKernel(); string LogScaleAssignDEEntryPoint(); string GaussianDEKernel(size_t ss, unsigned int filterWidth); @@ -49,14 +49,10 @@ public: private: //Kernel creators. - string CreateLogScaleSumDEKernelString(); string CreateLogScaleAssignDEKernelString(); string CreateGaussianDEKernel(size_t ss); string CreateGaussianDEKernelNoLocalCache(size_t ss); - - string m_LogScaleSumDEKernel; - string m_LogScaleSumDEEntryPoint; - + string m_LogScaleAssignDEKernel; string m_LogScaleAssignDEEntryPoint; diff --git a/Source/EmberCL/EmberCLStructs.h b/Source/EmberCL/EmberCLStructs.h index 26328d4..6aa4c34 100644 --- a/Source/EmberCL/EmberCLStructs.h +++ b/Source/EmberCL/EmberCLStructs.h @@ -181,9 +181,6 @@ static const char* XformCLStructString = "} XformCL;\n" "\n"; -#define MAX_CL_XFORM 21//These must always match. -#define MAX_CL_XFORM_STRING "21" - /// /// A structure on the host used to hold all of the needed information for an ember used on the device to iterate in OpenCL. /// Template argument expected to be float or double. @@ -191,7 +188,6 @@ static const char* XformCLStructString = template struct ALIGN EmberCL { - XformCL m_Xforms[MAX_CL_XFORM]; T m_CamZPos; T m_CamPerspective; T m_CamYaw; @@ -209,7 +205,6 @@ struct ALIGN EmberCL static const char* EmberCLStructString = "typedef struct __attribute__ " ALIGN_CL " _EmberCL\n" "{\n" -" XformCL m_Xforms[" MAX_CL_XFORM_STRING "];\n" " real_t m_CamZPos;\n" " real_t m_CamPerspective;\n" " real_t m_CamYaw;\n" diff --git a/Source/EmberCL/IterOpenCLKernelCreator.cpp b/Source/EmberCL/IterOpenCLKernelCreator.cpp index 6c944e6..220938b 100644 --- a/Source/EmberCL/IterOpenCLKernelCreator.cpp +++ b/Source/EmberCL/IterOpenCLKernelCreator.cpp @@ -1,6 +1,9 @@ #include "EmberCLPch.h" #include "IterOpenCLKernelCreator.h" +//#define STRAIGHT_RAND 1 +#define USE_CASE 1 + namespace EmberCLns { /// @@ -233,8 +236,9 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin "__kernel void " << m_IterEntryPoint << "(\n" << " uint iterCount,\n" " uint fuseCount,\n" - " uint seed,\n" + " __global uint2* seeds,\n" " __constant EmberCL* ember,\n" + " __constant XformCL* xforms,\n" " __constant real_t* parVars,\n" " __global uchar* xformDistributions,\n"//Using uchar is quicker than uint. Can't be constant because the size can be too large to fit when using xaos.//FINALOPT " __constant CarToRasCL* carToRas,\n" @@ -246,13 +250,14 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin "{\n" " bool fuse, ok;\n" " uint threadIndex = INDEX_IN_BLOCK_2D;\n" + " uint pointsIndex = INDEX_IN_GRID_2D;\n" " uint i, itersToDo;\n" " uint consec = 0;\n" //" int badvals = 0;\n" " uint histIndex;\n" " real_t p00, p01;\n" " Point firstPoint, secondPoint, tempPoint;\n" - " uint2 mwc;\n" + " uint2 mwc = seeds[pointsIndex];\n" " float4 palColor1;\n" " int2 iPaletteCoord;\n" " const sampler_t paletteSampler = CLK_NORMALIZED_COORDS_FALSE |\n"//Coords from 0 to 255. @@ -265,12 +270,11 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin os << "\n" +#ifndef STRAIGHT_RAND " __local Point swap[NTHREADS];\n" " __local uint xfsel[NWARPS];\n" +#endif "\n" - " uint pointsIndex = INDEX_IN_GRID_2D;\n" - " mwc.x = (pointsIndex + 1 * seed);\n" - " mwc.y = ((BLOCK_ID_X + 1) * (pointsIndex + 1) * seed);\n" " iPaletteCoord.y = 0;\n" "\n" " if (fuseCount > 0)\n" @@ -295,9 +299,11 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin //This along with the randomness that the point shuffle provides gives sufficient randomness //to produce results identical to those produced on the CPU. os << +#ifndef STRAIGHT_RAND " if (THREAD_ID_Y == 0 && THREAD_ID_X < NWARPS)\n" " xfsel[THREAD_ID_X] = MwcNext(&mwc) % " << CHOOSE_XFORM_GRAIN << ";\n"//It's faster to do the % here ahead of time than every time an xform is looked up to use inside the loop. "\n" +#endif " barrier(CLK_LOCAL_MEM_FENCE);\n" "\n" " for (i = 0; i < itersToDo; i++)\n" @@ -309,22 +315,51 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin " do\n" " {\n"; - //If xaos is present, the cuburn method is effectively ceased. Every thread will be picking a random xform. + //If xaos is present, the a hybrid of the cuburn method is used. + //This makes each thread in a row pick the same offset into a distribution, using xfsel. + //However, the distribution the offset is in, is determined by firstPoint.m_LastXfUsed. if (ember.XaosPresent()) { os << +#ifdef STRAIGHT_RAND " secondPoint.m_LastXfUsed = xformDistributions[MwcNext(&mwc) % " << CHOOSE_XFORM_GRAIN << " + (" << CHOOSE_XFORM_GRAIN << " * (firstPoint.m_LastXfUsed + 1u))];\n\n"; - //" secondPoint.m_LastXfUsed = xformDistributions[xfsel[THREAD_ID_Y] + (" << CHOOSE_XFORM_GRAIN << " * (firstPoint.m_LastXfUsed + 1u))];\n\n";//Partial cuburn hybrid. +#else + " secondPoint.m_LastXfUsed = xformDistributions[xfsel[THREAD_ID_Y] + (" << CHOOSE_XFORM_GRAIN << " * (firstPoint.m_LastXfUsed + 1u))];\n\n";//Partial cuburn hybrid. +#endif } else { os << - //" secondPoint.m_LastXfUsed = xformDistributions[MwcNext(&mwc) % " << CHOOSE_XFORM_GRAIN << "];\n\n";//For testing, using straight rand flam4/fractron style instead of cuburn. +#ifdef STRAIGHT_RAND + " secondPoint.m_LastXfUsed = xformDistributions[MwcNext(&mwc) % " << CHOOSE_XFORM_GRAIN << "];\n\n";//For testing, using straight rand flam4/fractron style instead of cuburn. +#else " secondPoint.m_LastXfUsed = xformDistributions[xfsel[THREAD_ID_Y]];\n\n"; +#endif } for (i = 0; i < ember.XformCount(); i++) { +#ifdef USE_CASE + if (i == 0) + { + os << + " switch (secondPoint.m_LastXfUsed)\n" + " {\n"; + } + + os << + " case " << i << ":\n" + " {\n" << + " Xform" << i << "(&(xforms[" << i << "]), parVars, &firstPoint, &secondPoint, &mwc);\n" << + " break;\n" + " }\n"; + + if (i == ember.XformCount() - 1) + { + os << + " }\n"; + } +#else if (i == 0) os << " if (secondPoint.m_LastXfUsed == " << i << ")\n"; @@ -334,9 +369,11 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin os << " {\n" << - " Xform" << i << "(&(ember->m_Xforms[" << i << "]), parVars, &firstPoint, &secondPoint, &mwc);\n" << + " Xform" << i << "(&(xforms[" << i << "]), parVars, &firstPoint, &secondPoint, &mwc);\n" << " }\n"; +#endif } + os << "\n" " ok = !BadVal(secondPoint.m_X) && !BadVal(secondPoint.m_Y);\n" @@ -360,6 +397,7 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin " secondPoint.m_Y = MwcNextNeg1Pos1(&mwc);\n" " secondPoint.m_Z = 0.0;\n" " }\n" +#ifndef STRAIGHT_RAND "\n"//Rotate points between threads. This is how randomization is achieved. " uint swr = threadXY + ((i & 1u) * threadXDivRows);\n" " uint sw = (swr * THREADS_PER_WARP + THREAD_ID_X) & threadsMinus1;\n" @@ -368,16 +406,16 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin //Write to another thread's location. " swap[sw] = secondPoint;\n" "\n" - //Populate randomized xform index buffer with new random values. " if (THREAD_ID_Y == 0 && THREAD_ID_X < NWARPS)\n" " xfsel[THREAD_ID_X] = MwcNext(&mwc) % " << CHOOSE_XFORM_GRAIN << ";\n" "\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" - "\n" - //Another thread will have written to this thread's location, so read the new value and use it for accumulation below. " firstPoint = swap[threadIndex];\n" +#else + " firstPoint = secondPoint;\n"//For testing, using straight rand flam4/fractron style instead of cuburn. +#endif "\n" " if (fuse)\n" " {\n" @@ -399,14 +437,14 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin //CPU takes an extra step here to preserve the opacity of the randomly selected xform, rather than the final xform's opacity. //The same thing takes place here automatically because secondPoint.m_LastXfUsed is used below to retrieve the opacity when accumulating. - os << - " if ((ember->m_Xforms[" << finalIndex << "].m_Opacity == 1) || (MwcNext01(&mwc) < ember->m_Xforms[" << finalIndex << "].m_Opacity))\n" - " {\n" - " tempPoint.m_LastXfUsed = secondPoint.m_LastXfUsed;\n" - " Xform" << finalIndex << "(&(ember->m_Xforms[" << finalIndex << "]), parVars, &secondPoint, &tempPoint, &mwc);\n" - " secondPoint = tempPoint;\n" - " }\n" - "\n"; + os << + " if ((xforms[" << finalIndex << "].m_Opacity == 1) || (MwcNext01(&mwc) < xforms[" << finalIndex << "].m_Opacity))\n" + " {\n" + " tempPoint.m_LastXfUsed = secondPoint.m_LastXfUsed;\n" + " Xform" << finalIndex << "(&(xforms[" << finalIndex << "]), parVars, &secondPoint, &tempPoint, &mwc);\n" + " secondPoint = tempPoint;\n" + " }\n" + "\n"; } os << CreateProjectionString(ember); @@ -471,18 +509,18 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin if (typeid(T) == typeid(double)) { os << - " AtomicAdd(&(histogram[histIndex].m_Reals[0]), (real_t)palColor1.x * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"//Always apply opacity, even though it's usually 1. - " AtomicAdd(&(histogram[histIndex].m_Reals[1]), (real_t)palColor1.y * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n" - " AtomicAdd(&(histogram[histIndex].m_Reals[2]), (real_t)palColor1.z * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n" - " AtomicAdd(&(histogram[histIndex].m_Reals[3]), (real_t)palColor1.w * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"; + " AtomicAdd(&(histogram[histIndex].m_Reals[0]), (real_t)palColor1.x * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"//Always apply opacity, even though it's usually 1. + " AtomicAdd(&(histogram[histIndex].m_Reals[1]), (real_t)palColor1.y * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n" + " AtomicAdd(&(histogram[histIndex].m_Reals[2]), (real_t)palColor1.z * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n" + " AtomicAdd(&(histogram[histIndex].m_Reals[3]), (real_t)palColor1.w * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"; } else { - os << - " AtomicAdd(&(histogram[histIndex].m_Reals[0]), palColor1.x * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"//Always apply opacity, even though it's usually 1. - " AtomicAdd(&(histogram[histIndex].m_Reals[1]), palColor1.y * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n" - " AtomicAdd(&(histogram[histIndex].m_Reals[2]), palColor1.z * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n" - " AtomicAdd(&(histogram[histIndex].m_Reals[3]), palColor1.w * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"; + os << + " AtomicAdd(&(histogram[histIndex].m_Reals[0]), palColor1.x * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"//Always apply opacity, even though it's usually 1. + " AtomicAdd(&(histogram[histIndex].m_Reals[1]), palColor1.y * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n" + " AtomicAdd(&(histogram[histIndex].m_Reals[2]), palColor1.z * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n" + " AtomicAdd(&(histogram[histIndex].m_Reals[3]), palColor1.w * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"; } } else @@ -496,12 +534,12 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin " realColor.y = (real_t)palColor1.y;\n" " realColor.z = (real_t)palColor1.z;\n" " realColor.w = (real_t)palColor1.w;\n" - " histogram[histIndex].m_Real4 += (realColor * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"; + " histogram[histIndex].m_Real4 += (realColor * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"; } else { - os << - " histogram[histIndex].m_Real4 += (palColor1 * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"; + os << + " histogram[histIndex].m_Real4 += (palColor1 * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"; } } @@ -525,6 +563,7 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin " points[pointsIndex].m_ColorX = MwcNextNeg1Pos1(&mwc);\n" #else " points[pointsIndex] = firstPoint;\n" + " seeds[pointsIndex] = mwc;\n" #endif " barrier(CLK_GLOBAL_MEM_FENCE);\n" "}\n"; diff --git a/Source/EmberCL/OpenCLWrapper.cpp b/Source/EmberCL/OpenCLWrapper.cpp index 0a043c5..57621a8 100644 --- a/Source/EmberCL/OpenCLWrapper.cpp +++ b/Source/EmberCL/OpenCLWrapper.cpp @@ -1121,12 +1121,12 @@ string OpenCLWrapper::DumpInfo() /// /// OpenCL properties, getters only. /// -bool OpenCLWrapper::Ok() { return m_Init; } -bool OpenCLWrapper::Shared() { return m_Shared; } -cl::Context OpenCLWrapper::Context() { return m_Context; } -unsigned int OpenCLWrapper::PlatformIndex() { return m_PlatformIndex; } -unsigned int OpenCLWrapper::DeviceIndex() { return m_DeviceIndex; } -unsigned int OpenCLWrapper::LocalMemSize() { return m_LocalMemSize; } +bool OpenCLWrapper::Ok() const { return m_Init; } +bool OpenCLWrapper::Shared() const { return m_Shared; } +cl::Context OpenCLWrapper::Context() const { return m_Context; } +unsigned int OpenCLWrapper::PlatformIndex() const { return m_PlatformIndex; } +unsigned int OpenCLWrapper::DeviceIndex() const { return m_DeviceIndex; } +unsigned int OpenCLWrapper::LocalMemSize() const { return m_LocalMemSize; } /// /// Makes the even grid dims. diff --git a/Source/EmberCL/OpenCLWrapper.h b/Source/EmberCL/OpenCLWrapper.h index e060d01..0369c18 100644 --- a/Source/EmberCL/OpenCLWrapper.h +++ b/Source/EmberCL/OpenCLWrapper.h @@ -184,12 +184,12 @@ public: string DumpInfo(); //Accessors. - bool Ok(); - bool Shared(); - cl::Context Context(); - unsigned int PlatformIndex(); - unsigned int DeviceIndex(); - unsigned int LocalMemSize(); + bool Ok() const; + bool Shared() const; + cl::Context Context() const; + unsigned int PlatformIndex() const; + unsigned int DeviceIndex() const; + unsigned int LocalMemSize() const; static void MakeEvenGridDims(unsigned int blockW, unsigned int blockH, unsigned int& gridW, unsigned int& gridH); diff --git a/Source/EmberCL/RendererCL.cpp b/Source/EmberCL/RendererCL.cpp index eebb711..bd43bb6 100644 --- a/Source/EmberCL/RendererCL.cpp +++ b/Source/EmberCL/RendererCL.cpp @@ -22,7 +22,9 @@ RendererCL::RendererCL(unsigned int platform, unsigned int device, bool share //Buffer names. m_EmberBufferName = "Ember"; + m_XformsBufferName = "Xforms"; m_ParVarsBufferName = "ParVars"; + m_SeedsBufferName = "Seeds"; m_DistBufferName = "Dist"; m_CarToRasBufferName = "CarToRas"; m_DEFilterParamsBufferName = "DEFilterParams"; @@ -50,6 +52,13 @@ RendererCL::RendererCL(unsigned int platform, unsigned int device, bool share m_PaletteFormat.image_channel_data_type = CL_FLOAT; m_FinalFormat.image_channel_order = CL_RGBA; m_FinalFormat.image_channel_data_type = CL_UNORM_INT8;//Change if this ever supports 2BPC outputs for PNG. + m_Seeds.resize(IterGridKernelCount()); + + for (size_t i = 0; i < m_Seeds.size(); i++) + { + m_Seeds[i].x = m_Rand[0].Rand(); + m_Seeds[i].y = m_Rand[0].Rand(); + } Init(platform, device, shared, outputTexID);//Init OpenCL upon construction and create programs that will not change. } @@ -100,14 +109,12 @@ bool RendererCL::Init(unsigned int platform, unsigned int device, bool shared m_DEOpenCLKernelCreator = DEOpenCLKernelCreator(m_NVidia); string zeroizeProgram = m_IterOpenCLKernelCreator.ZeroizeKernel(); - string logAssignProgram = m_DEOpenCLKernelCreator.LogScaleAssignDEKernel(); - string logSumProgram = m_DEOpenCLKernelCreator.LogScaleSumDEKernel();//Build a couple of simple programs to ensure OpenCL is working right. + string logAssignProgram = m_DEOpenCLKernelCreator.LogScaleAssignDEKernel();//Build a couple of simple programs to ensure OpenCL is working right. if (b && !(b = m_Wrapper.AddProgram(m_IterOpenCLKernelCreator.ZeroizeEntryPoint(), zeroizeProgram, m_IterOpenCLKernelCreator.ZeroizeEntryPoint(), m_DoublePrecision))) { m_ErrorReport.push_back(loc); } if (b && !(b = m_Wrapper.AddProgram(m_DEOpenCLKernelCreator.LogScaleAssignDEEntryPoint(), logAssignProgram, m_DEOpenCLKernelCreator.LogScaleAssignDEEntryPoint(), m_DoublePrecision))) { m_ErrorReport.push_back(loc); } - if (b && !(b = m_Wrapper.AddProgram(m_DEOpenCLKernelCreator.LogScaleSumDEEntryPoint(), logSumProgram, m_DEOpenCLKernelCreator.LogScaleSumDEEntryPoint(), m_DoublePrecision))) { m_ErrorReport.push_back(loc); } - if (b && !(b = m_Wrapper.AddAndWriteImage("Palette", CL_MEM_READ_ONLY, m_PaletteFormat, 256, 1, 0, NULL))) { m_ErrorReport.push_back(loc); } + if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_SeedsBufferName, (void*)m_Seeds.data(), SizeOf(m_Seeds)))) { m_ErrorReport.push_back(loc); } //This is the maximum box dimension for density filtering which consists of (blockSize * blockSize) + (2 * filterWidth). //These blocks must be square, and ideally, 32x32. @@ -123,6 +130,11 @@ bool RendererCL::Init(unsigned int platform, unsigned int device, bool shared return b; } +/// +/// Set the shared output texture where final accumulation will be written to. +/// +/// The texture ID of the shared OpenGL texture if shared +/// True if success, else false. template bool RendererCL::SetOutputTexture(GLuint outputTexID) { @@ -149,16 +161,28 @@ bool RendererCL::SetOutputTexture(GLuint outputTexID) /// OpenCL property accessors, getters only. /// -template unsigned int RendererCL::IterCountPerKernel() { return m_IterCountPerKernel; } -template unsigned int RendererCL::IterBlocksWide() { return m_IterBlocksWide; } -template unsigned int RendererCL::IterBlocksHigh() { return m_IterBlocksHigh; } -template unsigned int RendererCL::IterBlockWidth() { return m_IterBlockWidth; } -template unsigned int RendererCL::IterBlockHeight() { return m_IterBlockHeight; } -template unsigned int RendererCL::IterGridWidth() { return IterBlocksWide() * IterBlockWidth(); } -template unsigned int RendererCL::IterGridHeight() { return IterBlocksHigh() * IterBlockHeight(); } -template unsigned int RendererCL::TotalIterKernelCount() { return IterGridWidth() * IterGridHeight(); } -template unsigned int RendererCL::PlatformIndex() { return m_Wrapper.PlatformIndex(); } -template unsigned int RendererCL::DeviceIndex() { return m_Wrapper.DeviceIndex(); } +//Iters per kernel/block/grid. +template unsigned int RendererCL::IterCountPerKernel() const { return m_IterCountPerKernel; } +template unsigned int RendererCL::IterCountPerBlock() const { return IterCountPerKernel() * IterBlockKernelCount(); } +template unsigned int RendererCL::IterCountPerGrid() const { return IterCountPerKernel() * IterGridKernelCount(); } + +//Kernels per block. +template unsigned int RendererCL::IterBlockKernelWidth() const { return m_IterBlockWidth; } +template unsigned int RendererCL::IterBlockKernelHeight() const { return m_IterBlockHeight; } +template unsigned int RendererCL::IterBlockKernelCount() const { return IterBlockKernelWidth() * IterBlockKernelHeight(); } + +//Kernels per grid. +template unsigned int RendererCL::IterGridKernelWidth() const { return IterGridBlockWidth() * IterBlockKernelWidth(); } +template unsigned int RendererCL::IterGridKernelHeight() const { return IterGridBlockHeight() * IterBlockKernelHeight(); } +template unsigned int RendererCL::IterGridKernelCount() const { return IterGridKernelWidth() * IterGridKernelHeight(); } + +//Blocks per grid. +template unsigned int RendererCL::IterGridBlockWidth() const { return m_IterBlocksWide; } +template unsigned int RendererCL::IterGridBlockHeight() const { return m_IterBlocksHigh; } +template unsigned int RendererCL::IterGridBlockCount() const { return IterGridBlockWidth() * IterGridBlockHeight(); } + +template unsigned int RendererCL::PlatformIndex() { return m_Wrapper.PlatformIndex(); } +template unsigned int RendererCL::DeviceIndex() { return m_Wrapper.DeviceIndex(); } /// /// Read the histogram into the host side CPU buffer. @@ -197,10 +221,10 @@ bool RendererCL::ReadAccum() template bool RendererCL::ReadPoints(vector>& vec) { - vec.resize(TotalIterKernelCount());//Allocate the memory to read into. + vec.resize(IterGridKernelCount());//Allocate the memory to read into. - if (vec.size() >= TotalIterKernelCount()) - return m_Wrapper.ReadBuffer(m_PointsBufferName, (void*)vec.data(), TotalIterKernelCount() * sizeof(PointCL)); + if (vec.size() >= IterGridKernelCount()) + return m_Wrapper.ReadBuffer(m_PointsBufferName, (void*)vec.data(), IterGridKernelCount() * sizeof(PointCL)); return false; } @@ -237,6 +261,26 @@ bool RendererCL::WritePoints(vector>& vec) return m_Wrapper.WriteBuffer(m_PointsBufferName, (void*)vec.data(), vec.size() * sizeof(vec[0])); } +#ifdef TEST_CL +template +bool RendererCL::WriteRandomPoints() +{ + size_t size = IterGridKernelCount(); + vector> vec(size); + + for (int i = 0; i < size; i++) + { + vec[i].m_X = m_Rand[0].Frand11(); + vec[i].m_Y = m_Rand[0].Frand11(); + vec[i].m_Z = 0; + vec[i].m_ColorX = m_Rand[0].Frand01(); + vec[i].m_LastXfUsed = 0; + } + + return WritePoints(vec); +} +#endif + /// /// Get the kernel string for the last built iter program. /// @@ -351,7 +395,7 @@ void RendererCL::ClearErrorReport() template size_t RendererCL::SubBatchSize() const { - return m_IterBlocksWide * m_IterBlocksHigh * SQR(m_IterCountPerKernel); + return IterCountPerGrid(); } /// @@ -483,6 +527,7 @@ bool RendererCL::Alloc() return false; EnterResize(); + m_XformsCL.resize(m_Ember.TotalXformCount()); bool b = true; size_t histLength = SuperSize() * sizeof(v4T); @@ -490,6 +535,7 @@ bool RendererCL::Alloc() const char* loc = __FUNCTION__; if (b && !(b = m_Wrapper.AddBuffer(m_EmberBufferName, sizeof(m_EmberCL)))) { m_ErrorReport.push_back(loc); } + if (b && !(b = m_Wrapper.AddBuffer(m_XformsBufferName, SizeOf(m_XformsCL)))) { m_ErrorReport.push_back(loc); } if (b && !(b = m_Wrapper.AddBuffer(m_ParVarsBufferName, 128 * sizeof(T)))) { m_ErrorReport.push_back(loc); } if (b && !(b = m_Wrapper.AddBuffer(m_DistBufferName, CHOOSE_XFORM_GRAIN))) { m_ErrorReport.push_back(loc); }//Will be resized for xaos. if (b && !(b = m_Wrapper.AddBuffer(m_CarToRasBufferName, sizeof(m_CarToRasCL)))) { m_ErrorReport.push_back(loc); } @@ -498,7 +544,7 @@ bool RendererCL::Alloc() if (b && !(b = m_Wrapper.AddBuffer(m_HistBufferName, histLength))) { m_ErrorReport.push_back(loc); }//Histogram. Will memset to zero later. if (b && !(b = m_Wrapper.AddBuffer(m_AccumBufferName, accumLength))) { m_ErrorReport.push_back(loc); }//Accum buffer. - if (b && !(b = m_Wrapper.AddBuffer(m_PointsBufferName, TotalIterKernelCount() * sizeof(PointCL)))) { m_ErrorReport.push_back(loc); }//Points between iter calls. + if (b && !(b = m_Wrapper.AddBuffer(m_PointsBufferName, IterGridKernelCount() * sizeof(PointCL)))) { m_ErrorReport.push_back(loc); }//Points between iter calls. if (b && !(b = SetOutputTexture(m_OutputTexID))) { m_ErrorReport.push_back(loc); } @@ -702,12 +748,12 @@ bool RendererCL::RunIter(size_t iterCount, size_t temporalSample, size_t& ite { Timing t;//, t2(4); bool b = true; - unsigned int seed, fuse, argIndex; - unsigned int iterCountPerKernel = m_IterCountPerKernel; - unsigned int iterCountPerBlock = iterCountPerKernel * m_IterBlockWidth * m_IterBlockHeight; + unsigned int fuse, argIndex; + unsigned int iterCountPerKernel = IterCountPerKernel(); + unsigned int iterCountPerBlock = IterCountPerBlock(); unsigned int supersize = (unsigned int)SuperSize(); int kernelIndex = m_Wrapper.FindKernelIndex(m_IterOpenCLKernelCreator.IterEntryPoint()); - size_t fuseFreq = m_SubBatchSize / m_IterCountPerKernel; + size_t fuseFreq = Renderer::SubBatchSize() / m_IterCountPerKernel;//Use the base sbs to determine when to fuse. size_t itersRemaining, localIterCount = 0; double percent, etaMs; const char* loc = __FUNCTION__; @@ -719,12 +765,13 @@ bool RendererCL::RunIter(size_t iterCount, size_t temporalSample, size_t& ite if (kernelIndex != -1) { - m_EmberCL = ConvertEmber(m_Ember); + ConvertEmber(m_Ember, m_EmberCL, m_XformsCL); m_CarToRasCL = ConvertCarToRas(*CoordMap()); - if (b && !(b = m_Wrapper.WriteBuffer (m_EmberBufferName, (void*)&m_EmberCL, sizeof(m_EmberCL)))) { m_ErrorReport.push_back(loc); } - if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_DistBufferName, (void*)XformDistributions(), XformDistributionsSize()))) { m_ErrorReport.push_back(loc); }//Will be resized for xaos. - if (b && !(b = m_Wrapper.WriteBuffer (m_CarToRasBufferName, (void*)&m_CarToRasCL, sizeof(m_CarToRasCL)))) { m_ErrorReport.push_back(loc); } + if (b && !(b = m_Wrapper.WriteBuffer (m_EmberBufferName, (void*)&m_EmberCL, sizeof(m_EmberCL)))) { m_ErrorReport.push_back(loc); } + if (b && !(b = m_Wrapper.WriteBuffer (m_XformsBufferName, (void*)m_XformsCL.data(), sizeof(m_XformsCL[0]) * m_XformsCL.size()))) { m_ErrorReport.push_back(loc); } + if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_DistBufferName, (void*)XformDistributions(), XformDistributionsSize()))) { m_ErrorReport.push_back(loc); }//Will be resized for xaos. + if (b && !(b = m_Wrapper.WriteBuffer (m_CarToRasBufferName, (void*)&m_CarToRasCL, sizeof(m_CarToRasCL)))) { m_ErrorReport.push_back(loc); } if (b && !(b = m_Wrapper.AddAndWriteImage("Palette", CL_MEM_READ_ONLY, m_PaletteFormat, m_Dmap.m_Entries.size(), 1, 0, m_Dmap.m_Entries.data()))) { m_ErrorReport.push_back(loc); } @@ -735,31 +782,32 @@ bool RendererCL::RunIter(size_t iterCount, size_t temporalSample, size_t& ite while (b && itersRan < iterCount && !m_Abort) { argIndex = 0; - seed = m_Rand[0].Rand(); #ifdef TEST_CL fuse = 0; #else //fuse = 100; - fuse = ((m_Calls % fuseFreq) == 0 ? (EarlyClip() ? 100u : 15u) : 0u); + //fuse = ((m_Calls % fuseFreq) == 0 ? (EarlyClip() ? 100u : 15u) : 0u); + fuse = (unsigned int)((m_Calls % fuseFreq) == 0u ? FuseCount() : 0u); //fuse = ((m_Calls % 4) == 0 ? 100u : 0u); #endif itersRemaining = iterCount - itersRan; - unsigned int gridW = (unsigned int)min(ceil((double)itersRemaining / (double)iterCountPerBlock), (double)IterBlocksWide()); - unsigned int gridH = (unsigned int)min(ceil((double)itersRemaining / ((double)gridW * iterCountPerBlock)), (double)IterBlocksHigh()); + unsigned int gridW = (unsigned int)min(ceil((double)itersRemaining / (double)iterCountPerBlock), (double)IterGridBlockWidth()); + unsigned int gridH = (unsigned int)min(ceil((double)itersRemaining / ((double)gridW * iterCountPerBlock)), (double)IterGridBlockHeight()); unsigned int iterCountThisLaunch = iterCountPerBlock * gridW * gridH; //Similar to what's done in the base class. //The number of iters per thread must be adjusted if they've requested less iters than is normally ran in a block (256 * 256). if (iterCountThisLaunch > iterCount) { - iterCountPerKernel = (unsigned int)ceil((double)iterCount / (double)(gridW * gridH * m_IterBlockWidth * m_IterBlockHeight)); - iterCountThisLaunch = iterCountPerKernel * (gridW * gridH * m_IterBlockWidth * m_IterBlockHeight); + iterCountPerKernel = (unsigned int)ceil((double)iterCount / (double)(gridW * gridH * IterBlockKernelCount())); + iterCountThisLaunch = iterCountPerKernel * (gridW * gridH * IterBlockKernelCount()); } if (b && !(b = m_Wrapper.SetArg (kernelIndex, argIndex++, iterCountPerKernel))) { m_ErrorReport.push_back(loc); }//Number of iters for each thread to run. if (b && !(b = m_Wrapper.SetArg (kernelIndex, argIndex++, fuse))) { m_ErrorReport.push_back(loc); }//Number of iters to fuse. - if (b && !(b = m_Wrapper.SetArg (kernelIndex, argIndex++, seed))) { m_ErrorReport.push_back(loc); }//Seed. - if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex++, m_EmberBufferName))) { m_ErrorReport.push_back(loc); }//Flame. + if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex++, m_SeedsBufferName))) { m_ErrorReport.push_back(loc); }//Seeds. + if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex++, m_EmberBufferName))) { m_ErrorReport.push_back(loc); }//Ember. + if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex++, m_XformsBufferName))) { m_ErrorReport.push_back(loc); }//Xforms. if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex++, m_ParVarsBufferName))) { m_ErrorReport.push_back(loc); }//Parametric variation parameters. if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex++, m_DistBufferName))) { m_ErrorReport.push_back(loc); }//Xform distributions. if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex++, m_CarToRasBufferName))) { m_ErrorReport.push_back(loc); }//Coordinate converter. @@ -769,11 +817,11 @@ bool RendererCL::RunIter(size_t iterCount, size_t temporalSample, size_t& ite if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex++, m_PointsBufferName))) { m_ErrorReport.push_back(loc); }//Random start points. if (b && !(b = m_Wrapper.RunKernel(kernelIndex, - gridW * IterBlockWidth(),//Total grid dims. - gridH * IterBlockHeight(), + gridW * IterBlockKernelWidth(),//Total grid dims. + gridH * IterBlockKernelHeight(), 1, - IterBlockWidth(),//Individual block dims. - IterBlockHeight(), + IterBlockKernelWidth(),//Individual block dims. + IterBlockKernelHeight(), 1))) { m_Abort = true; @@ -876,7 +924,7 @@ template eRenderStatus RendererCL::RunDensityFilter() { bool b = true; - Timing t(4);//, t2(4); + Timing t(4);// , t2(4); m_DensityFilterCL = ConvertDensityFilter(); int kernelIndex = MakeAndGetDensityFilterProgram(Supersample(), m_DensityFilterCL.m_FilterWidth); const char* loc = __FUNCTION__; @@ -907,26 +955,62 @@ eRenderStatus RendererCL::RunDensityFilter() //The other is to proces the entire image in multiple passes, and each pass processes blocks of pixels //that are far enough apart such that their filters do not overlap. //Do the latter. + //Gap is in terms of blocks. How many blocks must separate two blocks running at the same time. unsigned int gapW = (unsigned int)ceil((m_DensityFilterCL.m_FilterWidth * 2.0) / (double)blockSizeW); unsigned int chunkSizeW = gapW + 1; unsigned int gapH = (unsigned int)ceil((m_DensityFilterCL.m_FilterWidth * 2.0) / (double)blockSizeH); unsigned int chunkSizeH = gapH + 1; - double totalChunks = chunkSizeW * chunkSizeH; if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_DEFilterParamsBufferName, (void*)&m_DensityFilterCL, sizeof(m_DensityFilterCL)))) { m_ErrorReport.push_back(loc); } - for (unsigned int row = 0; b && !m_Abort && row < chunkSizeH; row++) +#ifdef ROW_ONLY_DE + blockSizeW = 64;//These *must* both be divisible by 16 or else pixels will go missing. + blockSizeH = 1; + gapW = (unsigned int)ceil((m_DensityFilterCL.m_FilterWidth * 2.0) / (double)blockSizeW); + chunkSizeW = gapW + 1; + gapH = (unsigned int)ceil((m_DensityFilterCL.m_FilterWidth * 2.0) / (double)32);//Block height is 1, but iterates over 32 rows. + chunkSizeH = gapH + 1; + totalChunks = chunkSizeW * chunkSizeH; + + OpenCLWrapper::MakeEvenGridDims(blockSizeW, blockSizeH, gridW, gridH); + gridW /= chunkSizeW; + gridH /= chunkSizeH; + + for (unsigned int rowChunk = 0; b && !m_Abort && rowChunk < chunkSizeH; rowChunk++) { - for (unsigned int col = 0; b && !m_Abort && col < chunkSizeW; col++) + for (unsigned int colChunk = 0; b && !m_Abort && colChunk < chunkSizeW; colChunk++) { //t2.Tic(); - if (b && !(b = RunDensityFilterPrivate(kernelIndex, gridW, gridH, blockSizeW, blockSizeH, chunkSizeW, chunkSizeH, row, col))) { m_Abort = true; m_ErrorReport.push_back(loc); } + if (b && !(b = RunDensityFilterPrivate(kernelIndex, gridW, gridH, blockSizeW, blockSizeH, chunkSizeW, chunkSizeH, colChunk, rowChunk))) { m_Abort = true; m_ErrorReport.push_back(loc); } //t2.Toc(loc); if (b && m_Callback) { - double percent = (double((row * chunkSizeW) + (col + 1)) / totalChunks) * 100.0; + double percent = (double((rowChunk * chunkSizeW) + (colChunk + 1)) / totalChunks) * 100.0; + double etaMs = ((100.0 - percent) / percent) * t.Toc(); + + if (!m_Callback->ProgressFunc(m_Ember, m_ProgressParameter, percent, 1, etaMs)) + Abort(); + } + } + } +#else + OpenCLWrapper::MakeEvenGridDims(blockSizeW, blockSizeH, gridW, gridH); + gridW /= chunkSizeW; + gridH /= chunkSizeH; + + for (unsigned int rowChunk = 0; b && !m_Abort && rowChunk < chunkSizeH; rowChunk++) + { + for (unsigned int colChunk = 0; b && !m_Abort && colChunk < chunkSizeW; colChunk++) + { + //t2.Tic(); + if (b && !(b = RunDensityFilterPrivate(kernelIndex, gridW, gridH, blockSizeW, blockSizeH, chunkSizeW, chunkSizeH, colChunk, rowChunk))) { m_Abort = true; m_ErrorReport.push_back(loc); } + //t2.Toc(loc); + + if (b && m_Callback) + { + double percent = (double((rowChunk * chunkSizeW) + (colChunk + 1)) / totalChunks) * 100.0; double etaMs = ((100.0 - percent) / percent) * t.Toc(); if (!m_Callback->ProgressFunc(m_Ember, m_ProgressParameter, percent, 1, etaMs)) @@ -934,6 +1018,7 @@ eRenderStatus RendererCL::RunDensityFilter() } } } +#endif if (b && m_Callback) m_Callback->ProgressFunc(m_Ember, m_ProgressParameter, 100.0, 1, 0.0); @@ -1084,14 +1169,15 @@ bool RendererCL::ClearBuffer(const string& bufferName, unsigned int width, un /// Grid height /// Block width /// Block height -/// Chunk size (gap + 1) +/// Chunk size width (gapW + 1) +/// Chunk size height (gapH + 1) /// Row parity /// Column parity /// True if success, else false. template -bool RendererCL::RunDensityFilterPrivate(unsigned int kernelIndex, unsigned int gridW, unsigned int gridH, unsigned int blockW, unsigned int blockH, unsigned int chunkSizeW, unsigned int chunkSizeH, unsigned int rowParity, unsigned int colParity) +bool RendererCL::RunDensityFilterPrivate(unsigned int kernelIndex, unsigned int gridW, unsigned int gridH, unsigned int blockW, unsigned int blockH, unsigned int chunkSizeW, unsigned int chunkSizeH, unsigned int chunkW, unsigned int chunkH) { - //Timing t; + //Timing t(4); bool b = true; unsigned int argIndex = 0; const char* loc = __FUNCTION__; @@ -1104,8 +1190,8 @@ bool RendererCL::RunDensityFilterPrivate(unsigned int kernelIndex, unsigned i if (b && !(b = m_Wrapper.SetBufferArg(kernelIndex, argIndex, m_DECoefIndicesBufferName))) { m_ErrorReport.push_back(loc); } argIndex++;//Coef indices. if (b && !(b = m_Wrapper.SetArg( kernelIndex, argIndex, chunkSizeW))) { m_ErrorReport.push_back(loc); } argIndex++;//Chunk size width (gapW + 1). if (b && !(b = m_Wrapper.SetArg( kernelIndex, argIndex, chunkSizeH))) { m_ErrorReport.push_back(loc); } argIndex++;//Chunk size height (gapH + 1). - if (b && !(b = m_Wrapper.SetArg( kernelIndex, argIndex, rowParity))) { m_ErrorReport.push_back(loc); } argIndex++;//Row parity. - if (b && !(b = m_Wrapper.SetArg( kernelIndex, argIndex, colParity))) { m_ErrorReport.push_back(loc); } argIndex++;//Col parity. + if (b && !(b = m_Wrapper.SetArg( kernelIndex, argIndex, chunkW))) { m_ErrorReport.push_back(loc); } argIndex++;//Column chunk. + if (b && !(b = m_Wrapper.SetArg( kernelIndex, argIndex, chunkH))) { m_ErrorReport.push_back(loc); } argIndex++;//Row chunk. //t.Toc(__FUNCTION__ " set args"); //t.Tic(); @@ -1270,60 +1356,57 @@ SpatialFilterCL RendererCL::ConvertSpatialFilter() /// /// Convert the host side Ember object into an EmberCL object -/// for passing to OpenCL. +/// and a vector of XformCL for passing to OpenCL. /// /// The Ember object to convert -/// The EmberCL object +/// The converted EmberCL +/// The converted vector of XformCL template -EmberCL RendererCL::ConvertEmber(Ember& ember) +void RendererCL::ConvertEmber(Ember& ember, EmberCL& emberCL, vector>& xformsCL) { - EmberCL emberCL; - memset(&emberCL, 0, sizeof(EmberCL));//Might not really be needed. - emberCL.m_RotA = m_RotMat.A(); - emberCL.m_RotB = m_RotMat.B(); - emberCL.m_RotD = m_RotMat.D(); - emberCL.m_RotE = m_RotMat.E(); - emberCL.m_CamMat = ember.m_CamMat; - emberCL.m_CenterX = CenterX(); - emberCL.m_CenterY = ember.m_RotCenterY; - emberCL.m_CamZPos = ember.m_CamZPos; - emberCL.m_CamPerspective = ember.m_CamPerspective; - emberCL.m_CamYaw = ember.m_CamYaw; - emberCL.m_CamPitch = ember.m_CamPitch; - emberCL.m_CamDepthBlur = ember.m_CamDepthBlur; - emberCL.m_BlurCoef = ember.BlurCoef(); + emberCL.m_RotA = m_RotMat.A(); + emberCL.m_RotB = m_RotMat.B(); + emberCL.m_RotD = m_RotMat.D(); + emberCL.m_RotE = m_RotMat.E(); + emberCL.m_CamMat = ember.m_CamMat; + emberCL.m_CenterX = CenterX(); + emberCL.m_CenterY = ember.m_RotCenterY; + emberCL.m_CamZPos = ember.m_CamZPos; + emberCL.m_CamPerspective = ember.m_CamPerspective; + emberCL.m_CamYaw = ember.m_CamYaw; + emberCL.m_CamPitch = ember.m_CamPitch; + emberCL.m_CamDepthBlur = ember.m_CamDepthBlur; + emberCL.m_BlurCoef = ember.BlurCoef(); - for (unsigned int i = 0; i < ember.TotalXformCount() && i < MAX_CL_XFORM; i++)//Copy the relevant values for each xform, capped at the max. + for (unsigned int i = 0; i < ember.TotalXformCount() && i < xformsCL.size(); i++) { Xform* xform = ember.GetTotalXform(i); - emberCL.m_Xforms[i].m_A = xform->m_Affine.A(); - emberCL.m_Xforms[i].m_B = xform->m_Affine.B(); - emberCL.m_Xforms[i].m_C = xform->m_Affine.C(); - emberCL.m_Xforms[i].m_D = xform->m_Affine.D(); - emberCL.m_Xforms[i].m_E = xform->m_Affine.E(); - emberCL.m_Xforms[i].m_F = xform->m_Affine.F(); + xformsCL[i].m_A = xform->m_Affine.A(); + xformsCL[i].m_B = xform->m_Affine.B(); + xformsCL[i].m_C = xform->m_Affine.C(); + xformsCL[i].m_D = xform->m_Affine.D(); + xformsCL[i].m_E = xform->m_Affine.E(); + xformsCL[i].m_F = xform->m_Affine.F(); - emberCL.m_Xforms[i].m_PostA = xform->m_Post.A(); - emberCL.m_Xforms[i].m_PostB = xform->m_Post.B(); - emberCL.m_Xforms[i].m_PostC = xform->m_Post.C(); - emberCL.m_Xforms[i].m_PostD = xform->m_Post.D(); - emberCL.m_Xforms[i].m_PostE = xform->m_Post.E(); - emberCL.m_Xforms[i].m_PostF = xform->m_Post.F(); + xformsCL[i].m_PostA = xform->m_Post.A(); + xformsCL[i].m_PostB = xform->m_Post.B(); + xformsCL[i].m_PostC = xform->m_Post.C(); + xformsCL[i].m_PostD = xform->m_Post.D(); + xformsCL[i].m_PostE = xform->m_Post.E(); + xformsCL[i].m_PostF = xform->m_Post.F(); - emberCL.m_Xforms[i].m_DirectColor = xform->m_DirectColor; - emberCL.m_Xforms[i].m_ColorSpeedCache = xform->ColorSpeedCache(); - emberCL.m_Xforms[i].m_OneMinusColorCache = xform->OneMinusColorCache(); - emberCL.m_Xforms[i].m_Opacity = xform->m_Opacity; - emberCL.m_Xforms[i].m_VizAdjusted = xform->VizAdjusted(); + xformsCL[i].m_DirectColor = xform->m_DirectColor; + xformsCL[i].m_ColorSpeedCache = xform->ColorSpeedCache(); + xformsCL[i].m_OneMinusColorCache = xform->OneMinusColorCache(); + xformsCL[i].m_Opacity = xform->m_Opacity; + xformsCL[i].m_VizAdjusted = xform->VizAdjusted(); for (unsigned int varIndex = 0; varIndex < xform->TotalVariationCount() && varIndex < MAX_CL_VARS; varIndex++)//Assign all variation weights for this xform, with a max of MAX_CL_VARS. - emberCL.m_Xforms[i].m_VariationWeights[varIndex] = xform->GetVariation(varIndex)->m_Weight; + xformsCL[i].m_VariationWeights[varIndex] = xform->GetVariation(varIndex)->m_Weight; } - - return emberCL; } /// diff --git a/Source/EmberCL/RendererCL.h b/Source/EmberCL/RendererCL.h index 1ef9760..677afbd 100644 --- a/Source/EmberCL/RendererCL.h +++ b/Source/EmberCL/RendererCL.h @@ -42,14 +42,27 @@ public: //Non-virtual member functions for OpenCL specific tasks. bool Init(unsigned int platform, unsigned int device, bool shared, GLuint outputTexID); bool SetOutputTexture(GLuint outputTexID); - inline unsigned int IterCountPerKernel(); - inline unsigned int IterBlocksWide(); - inline unsigned int IterBlocksHigh(); - inline unsigned int IterBlockWidth(); - inline unsigned int IterBlockHeight(); - inline unsigned int IterGridWidth(); - inline unsigned int IterGridHeight(); - inline unsigned int TotalIterKernelCount(); + + //Iters per kernel/block/grid. + inline unsigned int IterCountPerKernel() const; + inline unsigned int IterCountPerBlock() const; + inline unsigned int IterCountPerGrid() const; + + //Kernels per block. + inline unsigned int IterBlockKernelWidth() const; + inline unsigned int IterBlockKernelHeight() const; + inline unsigned int IterBlockKernelCount() const; + + //Kernels per grid. + inline unsigned int IterGridKernelWidth() const; + inline unsigned int IterGridKernelHeight() const; + inline unsigned int IterGridKernelCount() const; + + //Blocks per grid. + inline unsigned int IterGridBlockWidth() const; + inline unsigned int IterGridBlockHeight() const; + inline unsigned int IterGridBlockCount() const; + unsigned int PlatformIndex(); unsigned int DeviceIndex(); bool ReadHist(); @@ -58,6 +71,9 @@ public: bool ClearHist(); bool ClearAccum(); bool WritePoints(vector>& vec); +#ifdef TEST_CL + bool WriteRandomPoints(); +#endif string IterKernel(); //Virtual functions overridden from RendererCLBase. @@ -98,7 +114,7 @@ private: eRenderStatus RunDensityFilter(); eRenderStatus RunFinalAccum(); bool ClearBuffer(const string& bufferName, unsigned int width, unsigned int height, unsigned int elementSize); - bool RunDensityFilterPrivate(unsigned int kernelIndex, unsigned int gridW, unsigned int gridH, unsigned int blockW, unsigned int blockH, unsigned int chunkSizeW, unsigned int chunkSizeH, unsigned int rowParity, unsigned int colParity); + bool RunDensityFilterPrivate(unsigned int kernelIndex, unsigned int gridW, unsigned int gridH, unsigned int blockW, unsigned int blockH, unsigned int chunkSizeW, unsigned int chunkSizeH, unsigned int chunkW, unsigned int chunkH); int MakeAndGetDensityFilterProgram(size_t ss, unsigned int filterWidth); int MakeAndGetFinalAccumProgram(T& alphaBase, T& alphaScale); int MakeAndGetGammaCorrectionProgram(); @@ -106,7 +122,7 @@ private: //Private functions passing data to OpenCL programs. DensityFilterCL ConvertDensityFilter(); SpatialFilterCL ConvertSpatialFilter(); - EmberCL ConvertEmber(Ember& ember); + void ConvertEmber(Ember& ember, EmberCL& emberCL, vector>& xformsCL); static CarToRasCL ConvertCarToRas(const CarToRas& carToRas); bool m_Init; @@ -122,7 +138,9 @@ private: //Buffer names. string m_EmberBufferName; + string m_XformsBufferName; string m_ParVarsBufferName; + string m_SeedsBufferName; string m_DistBufferName; string m_CarToRasBufferName; string m_DEFilterParamsBufferName; @@ -146,6 +164,8 @@ private: IMAGEGL2D m_AccumImage; GLuint m_OutputTexID; EmberCL m_EmberCL; + vector> m_XformsCL; + vector m_Seeds; Palette m_Dmap;//Used instead of the base class' m_Dmap because OpenCL only supports float textures. CarToRasCL m_CarToRasCL; DensityFilterCL m_DensityFilterCL; diff --git a/Source/EmberCommon/EmberOptions.h b/Source/EmberCommon/EmberOptions.h index b06bb98..81aa9d9 100644 --- a/Source/EmberCommon/EmberOptions.h +++ b/Source/EmberCommon/EmberOptions.h @@ -306,7 +306,7 @@ public: INITUINTOPTION(Strips, Eou(OPT_USE_RENDER, OPT_STRIPS, _T("--nstrips"), 1, SO_REQ_SEP, "\t--nstrips= The number of fractions to split a single render frame into. Useful for print size renders or low memory systems [default: 1].\n")); INITUINTOPTION(Supersample, Eou(OPT_RENDER_ANIM, OPT_SUPERSAMPLE, _T("--supersample"), 0, SO_REQ_SEP, "\t--supersample= The supersample value used to override the one specified in the file [default: 0 (use value from file)].\n")); INITUINTOPTION(BitsPerChannel, Eou(OPT_RENDER_ANIM, OPT_BPC, _T("--bpc"), 8, SO_REQ_SEP, "\t--bpc= Bits per channel. 8 or 16 for PNG, 8 for all others [default: 8].\n")); - INITUINTOPTION(SubBatchSize, Eou(OPT_USE_ALL, OPT_SBS, _T("--sub_batch_size"), 10240, SO_REQ_SEP, "\t--sub_batch_size= The chunk size that iterating will be broken into [default: 10k].\n")); + INITUINTOPTION(SubBatchSize, Eou(OPT_USE_ALL, OPT_SBS, _T("--sub_batch_size"), DEFAULT_SBS, SO_REQ_SEP, "\t--sub_batch_size= The chunk size that iterating will be broken into [default: 10k].\n")); INITUINTOPTION(Bits, Eou(OPT_USE_ALL, OPT_BITS, _T("--bits"), 33, SO_REQ_SEP, "\t--bits= Determines the types used for the histogram and accumulator [default: 33].\n" "\t\t\t\t\t32: Histogram: float, Accumulator: float.\n" "\t\t\t\t\t33: Histogram: float, Accumulator: float.\n"//This differs from the original which used an int hist for bits 33. diff --git a/Source/EmberGenome/EmberGenome.cpp b/Source/EmberGenome/EmberGenome.cpp index 2d76ef3..6c153da 100644 --- a/Source/EmberGenome/EmberGenome.cpp +++ b/Source/EmberGenome/EmberGenome.cpp @@ -264,10 +264,22 @@ bool EmberGenome(EmberOptions& opt) else if (opt.Clone() != "") filename = opt.Clone(); else if (opt.Mutate() != "") filename = opt.Mutate(); - if (!ParseEmberFile(parser, filename, embers)) + if (ParseEmberFile(parser, filename, embers)) + { + if (opt.SubBatchSize() != DEFAULT_SBS) + for (i = 0; i < embers.size(); i++) + embers[i].m_SubBatchSize = opt.SubBatchSize(); + } + else return false; - if (doCross1 && !ParseEmberFile(parser, opt.Cross1(), embers2)) + if (doCross1 && ParseEmberFile(parser, opt.Cross1(), embers2)) + { + if (opt.SubBatchSize() != DEFAULT_SBS) + for (i = 0; i < embers2.size(); i++) + embers2[i].m_SubBatchSize = opt.SubBatchSize(); + } + else return false; if (opt.CloneAll() != "") @@ -493,7 +505,6 @@ bool EmberGenome(EmberOptions& opt) renderer->EarlyClip(opt.EarlyClip()); renderer->YAxisUp(opt.YAxisUp()); renderer->LockAccum(opt.LockAccum()); - renderer->SubBatchSize(opt.SubBatchSize()); renderer->PixelAspectRatio(T(opt.AspectRatio())); renderer->Transparency(opt.Transparency()); diff --git a/Source/EmberGenome/EmberGenome.rc b/Source/EmberGenome/EmberGenome.rc index 6e8d76d..b246fdc 100644 --- a/Source/EmberGenome/EmberGenome.rc +++ b/Source/EmberGenome/EmberGenome.rc @@ -49,8 +49,8 @@ END // VS_VERSION_INFO VERSIONINFO - FILEVERSION 0,4,1,4 - PRODUCTVERSION 0,4,1,4 + FILEVERSION 0,4,1,5 + PRODUCTVERSION 0,4,1,5 FILEFLAGSMASK 0x3fL #ifdef _DEBUG FILEFLAGS 0x1L @@ -67,12 +67,12 @@ BEGIN BEGIN VALUE "CompanyName", "Open Source" VALUE "FileDescription", "Manipulates fractal flames parameter files" - VALUE "FileVersion", "0.4.1.4" + VALUE "FileVersion", "0.4.1.5" VALUE "InternalName", "EmberGenome.rc" VALUE "LegalCopyright", "Copyright (C) Matt Feemster 2013, GPL v3" VALUE "OriginalFilename", "EmberGenome.rc" VALUE "ProductName", "Ember Genome" - VALUE "ProductVersion", "0.4.1.4" + VALUE "ProductVersion", "0.4.1.5" END END BLOCK "VarFileInfo" diff --git a/Source/EmberRender/EmberRender.cpp b/Source/EmberRender/EmberRender.cpp index 1d9a758..f45f712 100644 --- a/Source/EmberRender/EmberRender.cpp +++ b/Source/EmberRender/EmberRender.cpp @@ -147,7 +147,6 @@ bool EmberRender(EmberOptions& opt) renderer->YAxisUp(opt.YAxisUp()); renderer->LockAccum(opt.LockAccum()); renderer->InsertPalette(opt.InsertPalette()); - renderer->SubBatchSize(opt.SubBatchSize()); renderer->PixelAspectRatio(T(opt.AspectRatio())); renderer->Transparency(opt.Transparency()); renderer->NumChannels(channels); @@ -164,6 +163,9 @@ bool EmberRender(EmberOptions& opt) if (opt.Supersample() > 0) embers[i].m_Supersample = opt.Supersample(); + if (opt.SubBatchSize() != DEFAULT_SBS) + embers[i].m_SubBatchSize = opt.SubBatchSize(); + embers[i].m_TemporalSamples = 1;//Force temporal samples to 1 for render. embers[i].m_Quality *= T(opt.QualityScale()); embers[i].m_FinalRasW = (unsigned int)((T)embers[i].m_FinalRasW * opt.SizeScale()); diff --git a/Source/EmberRender/EmberRender.rc b/Source/EmberRender/EmberRender.rc index 786c4b5..1fa4a0e 100644 --- a/Source/EmberRender/EmberRender.rc +++ b/Source/EmberRender/EmberRender.rc @@ -49,8 +49,8 @@ END // VS_VERSION_INFO VERSIONINFO - FILEVERSION 0,4,1,4 - PRODUCTVERSION 0,4,1,4 + FILEVERSION 0,4,1,5 + PRODUCTVERSION 0,4,1,5 FILEFLAGSMASK 0x3fL #ifdef _DEBUG FILEFLAGS 0x1L @@ -67,12 +67,12 @@ BEGIN BEGIN VALUE "CompanyName", "Open Source" VALUE "FileDescription", "Renders fractal flames as single images" - VALUE "FileVersion", "0.4.1.4" + VALUE "FileVersion", "0.4.1.5" VALUE "InternalName", "EmberRender.rc" VALUE "LegalCopyright", "Copyright (C) Matt Feemster 2013, GPL v3" VALUE "OriginalFilename", "EmberRender.rc" VALUE "ProductName", "Ember Render" - VALUE "ProductVersion", "0.4.1.4" + VALUE "ProductVersion", "0.4.1.5" END END BLOCK "VarFileInfo" diff --git a/Source/Fractorium/AboutDialog.ui b/Source/Fractorium/AboutDialog.ui index 9202d7c..0020c90 100644 --- a/Source/Fractorium/AboutDialog.ui +++ b/Source/Fractorium/AboutDialog.ui @@ -52,7 +52,7 @@ - <html><head/><body><p align="center"><br/><span style=" font-size:12pt;">Fractorium 0.4.1.4 Beta</span></p><p align="center"><span style=" font-size:10pt;"><br/>A Qt-based fractal flame editor which uses a C++ re-write of the flam3 algorithm named Ember and a GPU capable version named EmberCL which implements a portion of the cuburn algorithm in OpenCL.</span></p><p align="center"><span style=" font-size:10pt;">Matt Feemster</span></p></body></html> + <html><head/><body><p align="center"><br/><span style=" font-size:12pt;">Fractorium 0.4.1.5 Beta</span></p><p align="center"><span style=" font-size:10pt;"><br/>A Qt-based fractal flame editor which uses a C++ re-write of the flam3 algorithm named Ember and a GPU capable version named EmberCL which implements a portion of the cuburn algorithm in OpenCL.</span></p><p align="center"><span style=" font-size:10pt;">Matt Feemster</span></p></body></html> Qt::RichText diff --git a/Source/Fractorium/FinalRenderEmberController.cpp b/Source/Fractorium/FinalRenderEmberController.cpp index 47bf421..806e484 100644 --- a/Source/Fractorium/FinalRenderEmberController.cpp +++ b/Source/Fractorium/FinalRenderEmberController.cpp @@ -578,6 +578,7 @@ pair FinalRenderEmberController::SyncAndComputeMemory() [&](const string& s) { }, [&](const string& s) { }, [&](const string& s) { }); SyncGuiToEmbers(); + m_FinalRenderDialog->m_StripsSpin->setSuffix(" (" + ToString(strips) + ")"); m_Renderer->SetEmber(*m_Ember); m_Renderer->CreateSpatialFilter(b); m_Renderer->CreateTemporalFilter(b); diff --git a/Source/Fractorium/Fractorium.h b/Source/Fractorium/Fractorium.h index b3f3e19..bbdfe18 100644 --- a/Source/Fractorium/Fractorium.h +++ b/Source/Fractorium/Fractorium.h @@ -152,9 +152,11 @@ public slots: void OnDEFilterMinRadiusWidthChanged(double d); void OnDEFilterMaxRadiusWidthChanged(double d); void OnDEFilterCurveWidthChanged(double d); - void OnTemporalSamplesChanged(int d);//Iteration. + void OnSbsChanged(int d);//Iteration. + void OnFuseChanged(int d); void OnQualityChanged(double d); void OnSupersampleChanged(int d); + void OnTemporalSamplesChanged(int d); void OnAffineInterpTypeComboCurrentIndexChanged(int index); void OnInterpTypeComboCurrentIndexChanged(int index); @@ -331,9 +333,11 @@ private: DoubleSpinBox* m_DEFilterMinRadiusSpin; DoubleSpinBox* m_DEFilterMaxRadiusSpin; DoubleSpinBox* m_DECurveSpin; - SpinBox* m_TemporalSamplesSpin;//Iteration. + SpinBox* m_SbsSpin;//Iteration. + SpinBox* m_FuseSpin; DoubleSpinBox* m_QualitySpin; SpinBox* m_SupersampleSpin; + SpinBox* m_TemporalSamplesSpin; StealthComboBox* m_AffineInterpTypeCombo; StealthComboBox* m_InterpTypeCombo; diff --git a/Source/Fractorium/Fractorium.rc b/Source/Fractorium/Fractorium.rc index d6dabf4..0ab71fc 100644 Binary files a/Source/Fractorium/Fractorium.rc and b/Source/Fractorium/Fractorium.rc differ diff --git a/Source/Fractorium/Fractorium.ui b/Source/Fractorium/Fractorium.ui index 7d955b2..88677b6 100644 --- a/Source/Fractorium/Fractorium.ui +++ b/Source/Fractorium/Fractorium.ui @@ -720,13 +720,13 @@ 0 - 163 + 156 16777215 - 163 + 156 @@ -805,13 +805,13 @@ false - 23 + 22 false - 23 + 22 false @@ -944,13 +944,13 @@ 0 - 117 + 156 16777215 - 117 + 156 @@ -1020,20 +1020,25 @@ false - 23 + 22 false - 23 + 22 false - Temporal Samples + Sub Batch Size + + + + + Fuse Count @@ -1046,6 +1051,11 @@ Supersample + + + Temporal Samples + + Affine Interpolation @@ -1068,7 +1078,7 @@ - Temporal Samples + Sub Batch Size @@ -1078,7 +1088,7 @@ - Quality + Fuse Count @@ -1088,7 +1098,7 @@ - Supersample + Quality @@ -1098,7 +1108,7 @@ - Affine Interpolation + Supersample @@ -1108,7 +1118,7 @@ - Interpolation + Temporal Samples @@ -1116,6 +1126,26 @@ 0 + + + Affine Interpolation + + + + + 0 + + + + + Interpolation + + + + + 0 + + @@ -1291,13 +1321,13 @@ 0 - 163 + 156 16777215 - 163 + 156 @@ -1367,13 +1397,13 @@ false - 23 + 22 false - 23 + 22 false @@ -1538,13 +1568,13 @@ 0 - 278 + 266 16777215 - 278 + 266 @@ -1614,13 +1644,13 @@ false - 23 + 22 false - 23 + 22 false @@ -2820,8 +2850,8 @@ SpinBox 0 0 - 238 - 747 + 118 + 597 diff --git a/Source/Fractorium/FractoriumEmberController.h b/Source/Fractorium/FractoriumEmberController.h index 0fa2e8a..3614769 100644 --- a/Source/Fractorium/FractoriumEmberController.h +++ b/Source/Fractorium/FractoriumEmberController.h @@ -126,9 +126,11 @@ public: virtual void DEFilterMinRadiusWidthChanged(double d) { } virtual void DEFilterMaxRadiusWidthChanged(double d) { } virtual void DEFilterCurveWidthChanged(double d) { } - virtual void TemporalSamplesChanged(int d) { } + virtual void SbsChanged(int d) { } + virtual void FuseChanged(int d) { } virtual void QualityChanged(double d) { } virtual void SupersampleChanged(int d) { } + virtual void TemporalSamplesChanged(int d) { } virtual void AffineInterpTypeChanged(int i) { } virtual void InterpTypeChanged(int i) { } virtual void BackgroundChanged(const QColor& color) { } @@ -340,9 +342,11 @@ public: virtual void DEFilterMinRadiusWidthChanged(double d) override; virtual void DEFilterMaxRadiusWidthChanged(double d) override; virtual void DEFilterCurveWidthChanged(double d) override; - virtual void TemporalSamplesChanged(int d) override; + virtual void SbsChanged(int d) override; + virtual void FuseChanged(int d) override; virtual void QualityChanged(double d) override; virtual void SupersampleChanged(int d) override; + virtual void TemporalSamplesChanged(int d) override; virtual void AffineInterpTypeChanged(int index) override; virtual void InterpTypeChanged(int index) override; virtual void BackgroundChanged(const QColor& col) override; diff --git a/Source/Fractorium/FractoriumParams.cpp b/Source/Fractorium/FractoriumParams.cpp index b1266dc..dec7e37 100644 --- a/Source/Fractorium/FractoriumParams.cpp +++ b/Source/Fractorium/FractoriumParams.cpp @@ -85,9 +85,11 @@ void Fractorium::InitParamsUI() //Iteration. row = 0; table = ui.IterationTable; - SetupSpinner (table, this, row, 1, m_TemporalSamplesSpin, spinHeight, 1, 5000, 50, SIGNAL(valueChanged(int)), SLOT(OnTemporalSamplesChanged(int)), true, 1000); - SetupSpinner(table, this, row, 1, m_QualitySpin, spinHeight, 1, dmax, 50, SIGNAL(valueChanged(double)), SLOT(OnQualityChanged(double)), true, 10, 10, 10); - SetupSpinner (table, this, row, 1, m_SupersampleSpin, spinHeight, 1, 4, 1, SIGNAL(valueChanged(int)), SLOT(OnSupersampleChanged(int)), true, 1, 1, 1); + SetupSpinner( table, this, row, 1, m_SbsSpin, spinHeight, 1000, 100000, 100, SIGNAL(valueChanged(int)), SLOT(OnSbsChanged(int)), true, DEFAULT_SBS, DEFAULT_SBS, DEFAULT_SBS); + SetupSpinner( table, this, row, 1, m_FuseSpin, spinHeight, 1, 1000, 5, SIGNAL(valueChanged(int)), SLOT(OnFuseChanged(int)), true, 15, 15, 15); + SetupSpinner(table, this, row, 1, m_QualitySpin, spinHeight, 1, dmax, 50, SIGNAL(valueChanged(double)), SLOT(OnQualityChanged(double)), true, 10, 10, 10); + SetupSpinner( table, this, row, 1, m_SupersampleSpin, spinHeight, 1, 4, 1, SIGNAL(valueChanged(int)), SLOT(OnSupersampleChanged(int)), true, 1, 1, 1); + SetupSpinner( table, this, row, 1, m_TemporalSamplesSpin, spinHeight, 1, 5000, 50, SIGNAL(valueChanged(int)), SLOT(OnTemporalSamplesChanged(int)), true, 1000); comboVals.clear(); comboVals.push_back("Step"); @@ -389,14 +391,22 @@ void Fractorium::OnDEFilterCurveWidthChanged(double d) { m_Controller->DEFilterC /// /// -/// Set the temporal samples to be used with animation. -/// Called when the temporal samples spinner is changed. -/// Does not reset anything because this is only used for animation. -/// In the future, when animation is implemented, this will have an effect. +/// Set the iteration depth. +/// Called when the sub batch size spinner is changed. +/// Resets the rendering process. /// -/// The temporal samples value -template void FractoriumEmberController::TemporalSamplesChanged(int i) { Update([&] { m_Ember.m_TemporalSamples = i; }, true, NOTHING); }//Don't do anything until animation is implemented. -void Fractorium::OnTemporalSamplesChanged(int d) { m_Controller->TemporalSamplesChanged(d); } +/// The sub batch size value to set +template void FractoriumEmberController::SbsChanged(int d) { Update([&] { m_Ember.m_SubBatchSize = d; }); } +void Fractorium::OnSbsChanged(int d) { m_Controller->SbsChanged(d); } + +/// +/// Set the number of samples to disregard for each sub batch. +/// Called when the fuse count spinner is changed. +/// Resets the rendering process. +/// +/// The fuse count value to set +template void FractoriumEmberController::FuseChanged(int d) { Update([&] { m_Ember.m_FuseCount = d; }); } +void Fractorium::OnFuseChanged(int d) { m_Controller->FuseChanged(d); } /// /// Set the quality. @@ -425,6 +435,16 @@ void Fractorium::OnQualityChanged(double d) { m_Controller->QualityChanged(d); } template void FractoriumEmberController::SupersampleChanged(int d) { Update([&] { m_Ember.m_Supersample = d; }); } void Fractorium::OnSupersampleChanged(int d) { m_Controller->SupersampleChanged(d); } +/// +/// Set the temporal samples to be used with animation. +/// Called when the temporal samples spinner is changed. +/// Does not reset anything because this is only used for animation. +/// In the future, when animation is implemented, this will have an effect. +/// +/// The temporal samples value +template void FractoriumEmberController::TemporalSamplesChanged(int i) { Update([&] { m_Ember.m_TemporalSamples = i; }, true, NOTHING); }//Don't do anything until animation is implemented. +void Fractorium::OnTemporalSamplesChanged(int d) { m_Controller->TemporalSamplesChanged(d); } + /// /// Set the affine interpolation type. /// Does not reset anything because this is only used for animation. @@ -523,9 +543,11 @@ void FractoriumEmberController::FillParamTablesAndPalette() m_Fractorium->m_DEFilterMinRadiusSpin->SetValueStealth(m_Ember.m_MinRadDE); m_Fractorium->m_DEFilterMaxRadiusSpin->SetValueStealth(m_Ember.m_MaxRadDE); m_Fractorium->m_DECurveSpin->SetValueStealth(m_Ember.m_CurveDE); - m_Fractorium->m_TemporalSamplesSpin->SetValueStealth(m_Ember.m_TemporalSamples);//Iteration. + m_Fractorium->m_SbsSpin->SetValueStealth(m_Ember.m_SubBatchSize);//Iteration. + m_Fractorium->m_FuseSpin->SetValueStealth(m_Ember.m_FuseCount); m_Fractorium->m_QualitySpin->SetValueStealth(m_Ember.m_Quality); m_Fractorium->m_SupersampleSpin->SetValueStealth(m_Ember.m_Supersample); + m_Fractorium->m_TemporalSamplesSpin->SetValueStealth(m_Ember.m_TemporalSamples); m_Fractorium->m_AffineInterpTypeCombo->SetCurrentIndexStealth(m_Ember.m_AffineInterp); m_Fractorium->m_InterpTypeCombo->SetCurrentIndexStealth(m_Ember.m_Interp); @@ -585,9 +607,11 @@ void FractoriumEmberController::ParamsToEmber(Ember& ember) ember.m_MinRadDE = m_Fractorium->m_DEFilterMinRadiusSpin->value(); ember.m_MaxRadDE = m_Fractorium->m_DEFilterMaxRadiusSpin->value(); ember.m_CurveDE = m_Fractorium->m_DECurveSpin->value(); - ember.m_TemporalSamples = m_Fractorium->m_TemporalSamplesSpin->value(); + ember.m_SubBatchSize = m_Fractorium->m_SbsSpin->value(); + ember.m_FuseCount = m_Fractorium->m_FuseSpin->value(); ember.m_Quality = m_Fractorium->m_QualitySpin->value(); ember.m_Supersample = m_Fractorium->m_SupersampleSpin->value(); + ember.m_TemporalSamples = m_Fractorium->m_TemporalSamplesSpin->value(); ember.m_AffineInterp = (eAffineInterp)m_Fractorium->m_AffineInterpTypeCombo->currentIndex(); ember.m_Interp = (eInterp)m_Fractorium->m_InterpTypeCombo->currentIndex(); } diff --git a/Source/Fractorium/FractoriumRender.cpp b/Source/Fractorium/FractoriumRender.cpp index a22908e..2bc0cb5 100644 --- a/Source/Fractorium/FractoriumRender.cpp +++ b/Source/Fractorium/FractoriumRender.cpp @@ -140,7 +140,7 @@ void FractoriumEmberControllerBase::SaveCurrentRender(const QString& filename, b return; } - data = m_FinalImage.data();//Png and channels = 4. + data = m_FinalImage.data();//Png and channels == 4. if ((suffix == "jpg" || suffix == "bmp") && m_Renderer->NumChannels() == 4) { @@ -296,7 +296,7 @@ bool FractoriumEmberController::Render() bool success = true; GLWidget* gl = m_Fractorium->ui.GLDisplay; - RendererCL* rendererCL; + RendererCL* rendererCL = nullptr; eProcessAction action = CondenseAndClearProcessActions(); if (m_Renderer->RendererType() == OPENCL_RENDERER) @@ -412,10 +412,16 @@ bool FractoriumEmberController::Render() if (m_UndoList.size() >= UNDO_SIZE) m_UndoList.pop_front(); } - else if (!m_LastEditWasUndoRedo && m_UndoIndex != m_UndoList.size() - 1)//They were in the middle of the undo list, then did a manual edit, so clear the undo list. + else if (!m_LastEditWasUndoRedo && m_UndoIndex < m_UndoList.size() - 1)//They were anywhere but the end of the undo list, then did a manual edit, so clear the undo list. { + Ember ember(m_UndoList[m_UndoIndex]); + ClearUndo(); + m_UndoList.push_back(ember); m_UndoList.push_back(m_Ember); + m_UndoIndex = m_UndoList.size() - 1; + m_Fractorium->ui.ActionUndo->setEnabled(true); + m_Fractorium->ui.ActionRedo->setEnabled(false); } m_LastEditWasUndoRedo = false; @@ -515,8 +521,22 @@ bool FractoriumEmberController::CreateRenderer(eRendererType renderType, unsi { m_RenderType = m_Renderer->RendererType(); - if (m_RenderType == OPENCL_RENDERER && m_Fractorium->m_QualitySpin->value() < 30) - m_Fractorium->m_QualitySpin->setValue(30); + if (m_RenderType == OPENCL_RENDERER) + { + m_Fractorium->m_QualitySpin->DoubleClickZero(30); + m_Fractorium->m_QualitySpin->DoubleClickNonZero(30); + + if (m_Fractorium->m_QualitySpin->value() < 30) + m_Fractorium->m_QualitySpin->setValue(30); + } + else + { + m_Fractorium->m_QualitySpin->DoubleClickZero(10); + m_Fractorium->m_QualitySpin->DoubleClickNonZero(10); + + if (m_Fractorium->m_QualitySpin->value() > 10) + m_Fractorium->m_QualitySpin->setValue(10); + } m_Renderer->Callback(this); m_Renderer->NumChannels(4);//Always using 4 since the GL texture is RGBA. diff --git a/Source/Fractorium/FractoriumXformsVariations.cpp b/Source/Fractorium/FractoriumXformsVariations.cpp index 7a8890e..c36c10d 100644 --- a/Source/Fractorium/FractoriumXformsVariations.cpp +++ b/Source/Fractorium/FractoriumXformsVariations.cpp @@ -86,11 +86,8 @@ void FractoriumEmberController::SetupVariationTree() varSpinBox->Step(1); varSpinBox->SmallStep(1); } - else - { - varSpinBox->setDecimals(4); - } - + + varSpinBox->setDecimals(4); tree->setItemWidget(paramWidget, 1, varSpinBox); m_Fractorium->connect(varSpinBox, SIGNAL(valueChanged(double)), SLOT(OnVariationSpinBoxValueChanged(double)), Qt::QueuedConnection); } diff --git a/Source/Fractorium/GLWidget.cpp b/Source/Fractorium/GLWidget.cpp index 8cbdf06..79a08fb 100644 --- a/Source/Fractorium/GLWidget.cpp +++ b/Source/Fractorium/GLWidget.cpp @@ -294,7 +294,7 @@ void GLEmberController::DrawAffines(bool pre, bool post) bool dragging = m_DragState == DragDragging; //Draw grid if control key is pressed. - if ((m_DragModifier & DragModControl) == DragModControl) + if (m_GL->hasFocus() && ((m_DragModifier & DragModControl) == DragModControl)) { m_GL->glLineWidth(1.0f); m_GL->DrawGrid();