diff --git a/Builds/MSVC/Installer/FractoriumInstaller.wixproj b/Builds/MSVC/Installer/FractoriumInstaller.wixproj
index 4951f58..5704658 100644
--- a/Builds/MSVC/Installer/FractoriumInstaller.wixproj
+++ b/Builds/MSVC/Installer/FractoriumInstaller.wixproj
@@ -6,7 +6,7 @@
3.7
{c8096c47-e358-438c-a520-146d46b0637d}
2.0
- Fractorium_Beta_0.4.1.4
+ Fractorium_Beta_0.4.1.5
Package
$(MSBuildExtensionsPath32)\Microsoft\WiX\v3.x\Wix.targets
$(MSBuildExtensionsPath)\Microsoft\WiX\v3.x\Wix.targets
diff --git a/Builds/MSVC/Installer/Product.wxs b/Builds/MSVC/Installer/Product.wxs
index 0eb3367..9f1b883 100644
--- a/Builds/MSVC/Installer/Product.wxs
+++ b/Builds/MSVC/Installer/Product.wxs
@@ -1,6 +1,6 @@
-
+
@@ -13,7 +13,7 @@
-
+
::m_Brightness>(embers, coefs, size);
- InterpT<&Ember::m_HighlightPower>(embers, coefs, size);
- InterpT<&Ember::m_Gamma>(embers, coefs, size);
- InterpT<&Ember::m_Vibrancy>(embers, coefs, size);
- InterpT<&Ember::m_Hue>(embers, coefs, size);
+ //Interpolate ember parameters, these should be in the same order the members are declared.
InterpI<&Ember::m_FinalRasW>(embers, coefs, size);
InterpI<&Ember::m_FinalRasH>(embers, coefs, size);
+ InterpI<&Ember::m_SubBatchSize>(embers, coefs, size);
+ InterpI<&Ember::m_FuseCount>(embers, coefs, size);
InterpI<&Ember::m_Supersample>(embers, coefs, size);
- InterpT<&Ember::m_CenterX>(embers, coefs, size);
- InterpT<&Ember::m_CenterY>(embers, coefs, size);
- InterpT<&Ember::m_RotCenterY>(embers, coefs, size);
- InterpX, &Ember::m_Background>(embers, coefs, size); m_Background.a = bgAlphaSave;//Don't interp alpha.
- InterpT<&Ember::m_PixelsPerUnit>(embers, coefs, size);
- InterpT<&Ember::m_SpatialFilterRadius>(embers, coefs, size);
- InterpT<&Ember::m_TemporalFilterExp>(embers, coefs, size);
- InterpT<&Ember::m_TemporalFilterWidth>(embers, coefs, size);
+ InterpI<&Ember::m_TemporalSamples>(embers, coefs, size);
InterpT<&Ember::m_Quality>(embers, coefs, size);
+ InterpT<&Ember::m_PixelsPerUnit>(embers, coefs, size);
InterpT<&Ember::m_Zoom>(embers, coefs, size);
InterpT<&Ember::m_CamZPos>(embers, coefs, size);
InterpT<&Ember::m_CamPerspective>(embers, coefs, size);
@@ -775,12 +770,23 @@ public:
InterpT<&Ember::m_CamPitch>(embers, coefs, size);
InterpT<&Ember::m_CamDepthBlur>(embers, coefs, size);
InterpX::m_CamMat>(embers, coefs, size);
+ InterpT<&Ember::m_CenterX>(embers, coefs, size);
+ InterpT<&Ember::m_CenterY>(embers, coefs, size);
+ InterpT<&Ember::m_RotCenterY>(embers, coefs, size);
InterpT<&Ember::m_Rotate>(embers, coefs, size);
- InterpI<&Ember::m_TemporalSamples>(embers, coefs, size);
+ InterpT<&Ember::m_Hue>(embers, coefs, size);
+ InterpT<&Ember::m_Brightness>(embers, coefs, size);
+ InterpT<&Ember::m_Gamma>(embers, coefs, size);
+ InterpT<&Ember::m_Vibrancy>(embers, coefs, size);
+ InterpT<&Ember::m_GammaThresh>(embers, coefs, size);
+ InterpT<&Ember::m_HighlightPower>(embers, coefs, size);
+ InterpX, &Ember::m_Background>(embers, coefs, size); m_Background.a = bgAlphaSave;//Don't interp alpha.
+ InterpT<&Ember::m_TemporalFilterExp>(embers, coefs, size);
+ InterpT<&Ember::m_TemporalFilterWidth>(embers, coefs, size);
InterpT<&Ember::m_MaxRadDE>(embers, coefs, size);
InterpT<&Ember::m_MinRadDE>(embers, coefs, size);
InterpT<&Ember::m_CurveDE>(embers, coefs, size);
- InterpT<&Ember::m_GammaThresh>(embers, coefs, size);
+ InterpT<&Ember::m_SpatialFilterRadius>(embers, coefs, size);
//An extra step needed here due to the OOD that was not needed in the original.
//A small price to pay for the conveniences it affords us elsewhere.
@@ -1382,6 +1388,8 @@ public:
<< "Quality: " << m_Quality << endl
<< "Pixels Per Unit: " << m_PixelsPerUnit << endl
<< "Original Pixels Per Unit: " << m_OrigPixPerUnit << endl
+ << "Sub Batch Size: " << m_SubBatchSize << endl
+ << "Fuse Count: " << m_FuseCount << endl
<< "Zoom: " << m_Zoom << endl
<< "ZPos: " << m_CamZPos << endl
<< "Perspective: " << m_CamPerspective << endl
@@ -1459,6 +1467,14 @@ public:
size_t m_OrigFinalRasH;//the dimension may change in an editor and the originals are needed for the aspect ratio.
T m_OrigPixPerUnit;
+ //The number of iterations run per sub batch, i.e. the size of the chunks the iteration
+ //trajectory is broken into before accumulating to the histogram. This was a rendering
+ //parameter in flam3 but has been made a member here so that it can be adjusted more easily.
+ size_t m_SubBatchSize;
+
+ //The number of iterations to disregard for each sub batch. This was a rendering parameter in flam3 but has been made a member here
+ //so that it can be adjusted more easily.
+ size_t m_FuseCount;
+
//The multiplier in size of the histogram and DE filtering buffers. Must be at least one, preferrably never larger than 4, only useful at 2.
//Xml field: "supersample" or "overample (deprecated)".
size_t m_Supersample;
diff --git a/Source/Ember/EmberDefines.h b/Source/Ember/EmberDefines.h
index 6adb8eb..85a831a 100644
--- a/Source/Ember/EmberDefines.h
+++ b/Source/Ember/EmberDefines.h
@@ -36,7 +36,7 @@ namespace EmberNs
extern void sincos(float x, float *s, float *c);
#endif
-#define EMBER_VERSION "0.4.1.4"
+#define EMBER_VERSION "0.4.1.5"
#define EPS6 T(1e-6)
#define EPS std::numeric_limits::epsilon()//Apoplugin.h uses -20, but it's more mathematically correct to do it this way.
#define ISAAC_SIZE 4
@@ -54,6 +54,7 @@ namespace EmberNs
#define COLORMAP_LENGTH 256//These will need to change if 2D palette support is ever added, or variable sized palettes.
#define COLORMAP_LENGTH_MINUS_1 255
#define WHITE 255
+#define DEFAULT_SBS (1024 * 10)
#define XC (const xmlChar*)
#define BadVal(x) (((x) != (x)) || ((x) > 1e10) || ((x) < -1e10))
#define Rint(A) floor((A) + (((A) < 0) ? T(-0.5) : T(0.5)))
diff --git a/Source/Ember/EmberToXml.h b/Source/Ember/EmberToXml.h
index f345279..79f5eba 100644
--- a/Source/Ember/EmberToXml.h
+++ b/Source/Ember/EmberToXml.h
@@ -156,6 +156,8 @@ public:
os << " temporal_filter_width=\"" << ember.m_TemporalFilterWidth << "\"";
os << " quality=\"" << ember.m_Quality << "\"";
os << " temporal_samples=\"" << ember.m_TemporalSamples << "\"";
+ os << " sub_batch_size=\"" << ember.m_SubBatchSize << "\"";
+ os << " fuse=\"" << ember.m_FuseCount << "\"";
os << " background=\"" << ember.m_Background.r << " " << ember.m_Background.g << " " << ember.m_Background.b << "\"";
os << " brightness=\"" << ember.m_Brightness << "\"";
os << " gamma=\"" << ember.m_Gamma << "\"";
diff --git a/Source/Ember/Iterator.h b/Source/Ember/Iterator.h
index c1f4db5..29bd4e7 100644
--- a/Source/Ember/Iterator.h
+++ b/Source/Ember/Iterator.h
@@ -16,6 +16,17 @@ namespace EmberNs
using Iterator::DoFinalXform; \
using Iterator::DoBadVals;
+template <typename T, typename bucketT> class Renderer;
+
+template <typename T>
+struct IterParams
+{
+ size_t m_Count;
+ size_t m_Skip;
+ //T m_OneColDiv2;
+ //T m_OneRowDiv2;
+};
+
///
/// Iterator base class.
/// Iterating is one loop level outside of the inner xform application loop so it's still very important
@@ -69,7 +80,7 @@ public:
/// The buffer to store the output points
/// The random context to use
/// The number of bad values
- virtual size_t Iterate(Ember& ember, size_t count, size_t skip, Point* samples, QTIsaac& rand) { return 0; }
+ virtual size_t Iterate(Ember<T>& ember, IterParams<T>& params, Point<T>* samples, QTIsaac<ISAAC_SIZE, ISAAC_INT>& rand) { return 0; }
///
/// Initialize the xform selection vector by normalizing the weights of all xforms and
@@ -278,7 +289,7 @@ public:
/// The buffer to store the output points
/// The random context to use
/// The number of bad values
- virtual size_t Iterate(Ember& ember, size_t count, size_t skip, Point* samples, QTIsaac& rand)
+ virtual size_t Iterate(Ember<T>& ember, IterParams<T>& params, Point<T>* samples, QTIsaac<ISAAC_SIZE, ISAAC_INT>& rand) override
{
size_t i, badVals = 0;
Point tempPoint, p1;
@@ -290,7 +301,7 @@ public:
{
p1 = samples[0];
- for (i = 0; i < skip; i++)//Fuse.
+ for (i = 0; i < params.m_Skip; i++)//Fuse.
{
if (xforms[NextXformFromIndex(rand.Rand())].Apply(&p1, &p1, rand))
DoBadVals(xforms, badVals, &p1, rand);
@@ -299,7 +310,7 @@ public:
DoFinalXform(ember, p1, samples, rand);//Apply to last fuse point and store as the first element in samples.
ember.Proj(samples[0], rand);
- for (i = 1; i < count; i++)//Real loop.
+ for (i = 1; i < params.m_Count; i++)//Real loop.
{
if (xforms[NextXformFromIndex(rand.Rand())].Apply(&p1, &p1, rand))
DoBadVals(xforms, badVals, &p1, rand);
@@ -312,7 +323,7 @@ public:
{
p1 = samples[0];
- for (i = 0; i < skip; i++)//Fuse.
+ for (i = 0; i < params.m_Skip; i++)//Fuse.
{
if (xforms[NextXformFromIndex(rand.Rand())].Apply(&p1, &p1, rand))
DoBadVals(xforms, badVals, &p1, rand);
@@ -321,7 +332,7 @@ public:
samples[0] = p1;
ember.Proj(samples[0], rand);
- for (i = 1; i < count; i++)//Real loop.
+ for (i = 1; i < params.m_Count; i++)//Real loop.
{
if (xforms[NextXformFromIndex(rand.Rand())].Apply(&p1, &samples[i], rand))
DoBadVals(xforms, badVals, samples + i, rand);
@@ -337,7 +348,7 @@ public:
{
p1 = samples[0];
- for (i = 0; i < skip; i++)//Fuse.
+ for (i = 0; i < params.m_Skip; i++)//Fuse.
{
if (xforms[NextXformFromIndex(rand.Rand())].Apply(&p1, &p1, rand))
DoBadVals(xforms, badVals, &p1, rand);
@@ -345,7 +356,7 @@ public:
DoFinalXform(ember, p1, samples, rand);//Apply to last fuse point and store as the first element in samples.
- for (i = 1; i < count; i++)//Real loop.
+ for (i = 1; i < params.m_Count; i++)//Real loop.
{
if (xforms[NextXformFromIndex(rand.Rand())].Apply(&p1, &p1, rand))//Feed the resulting value of applying the randomly selected xform back into the next iter, and not the result of applying the final xform.
DoBadVals(xforms, badVals, &p1, rand);
@@ -357,7 +368,7 @@ public:
{
p1 = samples[0];
- for (i = 0; i < skip; i++)//Fuse.
+ for (i = 0; i < params.m_Skip; i++)//Fuse.
{
if (xforms[NextXformFromIndex(rand.Rand())].Apply(&p1, &p1, rand))
DoBadVals(xforms, badVals, &p1, rand);
@@ -365,9 +376,11 @@ public:
samples[0] = p1;
- for (i = 0; i < count - 1; i++)//Real loop.
+ for (i = 0; i < params.m_Count - 1; i++)//Real loop.
+ {
if (xforms[NextXformFromIndex(rand.Rand())].Apply(samples + i, samples + i + 1, rand))
DoBadVals(xforms, badVals, samples + i + 1, rand);
+ }
}
}
@@ -442,7 +455,7 @@ public:
/// The buffer to store the output points
/// The random context to use
/// The number of bad values
- virtual size_t Iterate(Ember& ember, size_t count, size_t skip, Point* samples, QTIsaac& rand)
+ virtual size_t Iterate(Ember<T>& ember, IterParams<T>& params, Point<T>* samples, QTIsaac<ISAAC_SIZE, ISAAC_INT>& rand) override
{
size_t i, xformIndex;
size_t lastXformUsed = 0;
@@ -456,7 +469,7 @@ public:
{
p1 = samples[0];
- for (i = 0; i < skip; i++)//Fuse.
+ for (i = 0; i < params.m_Skip; i++)//Fuse.
{
xformIndex = NextXformFromIndex(rand.Rand(), lastXformUsed);
@@ -469,7 +482,7 @@ public:
DoFinalXform(ember, p1, samples, rand);//Apply to last fuse point and store as the first element in samples.
ember.Proj(samples[0], rand);
- for (i = 1; i < count; i++)//Real loop.
+ for (i = 1; i < params.m_Count; i++)//Real loop.
{
xformIndex = NextXformFromIndex(rand.Rand(), lastXformUsed);
@@ -485,7 +498,7 @@ public:
{
p1 = samples[0];
- for (i = 0; i < skip; i++)//Fuse.
+ for (i = 0; i < params.m_Skip; i++)//Fuse.
{
xformIndex = NextXformFromIndex(rand.Rand(), lastXformUsed);
@@ -498,7 +511,7 @@ public:
samples[0] = p1;
ember.Proj(samples[0], rand);
- for (i = 1; i < count; i++)//Real loop.
+ for (i = 1; i < params.m_Count; i++)//Real loop.
{
xformIndex = NextXformFromIndex(rand.Rand(), lastXformUsed);
@@ -517,7 +530,7 @@ public:
{
p1 = samples[0];
- for (i = 0; i < skip; i++)//Fuse.
+ for (i = 0; i < params.m_Skip; i++)//Fuse.
{
xformIndex = NextXformFromIndex(rand.Rand(), lastXformUsed);
@@ -529,7 +542,7 @@ public:
DoFinalXform(ember, p1, samples, rand);//Apply to last fuse point and store as the first element in samples.
- for (i = 1; i < count; i++)//Real loop.
+ for (i = 1; i < params.m_Count; i++)//Real loop.
{
xformIndex = NextXformFromIndex(rand.Rand(), lastXformUsed);
@@ -544,7 +557,7 @@ public:
{
p1 = samples[0];
- for (i = 0; i < skip; i++)//Fuse.
+ for (i = 0; i < params.m_Skip; i++)//Fuse.
{
xformIndex = NextXformFromIndex(rand.Rand(), lastXformUsed);
@@ -556,7 +569,7 @@ public:
samples[0] = p1;
- for (i = 0; i < count - 1; i++)//Real loop.
+ for (i = 0; i < params.m_Count - 1; i++)//Real loop.
{
xformIndex = NextXformFromIndex(rand.Rand(), lastXformUsed);
diff --git a/Source/Ember/Renderer.cpp b/Source/Ember/Renderer.cpp
index afe8619..9462e88 100644
--- a/Source/Ember/Renderer.cpp
+++ b/Source/Ember/Renderer.cpp
@@ -691,7 +691,7 @@ bool Renderer::Alloc()
(m_SuperSize != m_HistBuckets.size()) ||
(m_SuperSize != m_AccumulatorBuckets.size()) ||
(m_ThreadsToUse != m_Samples.size()) ||
- (m_Samples[0].size() != m_SubBatchSize);
+ (m_Samples[0].size() != SubBatchSize());
if (lock)
EnterResize();
@@ -728,14 +728,14 @@ bool Renderer::Alloc()
for (size_t i = 0; i < m_Samples.size(); i++)
{
- if (m_Samples[i].size() != m_SubBatchSize)
+ if (m_Samples[i].size() != SubBatchSize())
{
- m_Samples[i].resize(m_SubBatchSize);
+ m_Samples[i].resize(SubBatchSize());
if (m_ReclaimOnResize)
m_Samples[i].shrink_to_fit();
- b &= (m_Samples[i].size() == m_SubBatchSize);
+ b &= (m_Samples[i].size() == SubBatchSize());
}
}
@@ -1154,7 +1154,7 @@ eRenderStatus Renderer::AccumulatorToFinalImage(unsigned char* pixel
/// This function will be called multiple times for an interactive rendering, and
/// once for a straight through render.
/// The iteration is reset and fused in each thread after each sub batch is done
-/// which by default is 10,000 iterations.
+/// which by default is 10,240 iterations.
///
/// The number of iterations to run
/// The temporal sample this is running for
@@ -1164,7 +1164,6 @@ EmberStats Renderer::Iterate(size_t iterCount, size_t temporalSample
{
//Timing t2(4);
m_IterTimer.Tic();
- size_t fuse = EarlyClip() ? 100 : 15;//EarlyClip was one way of detecting a later version of flam3, so it used 100 which is a better value.
size_t totalItersPerThread = (size_t)ceil((double)iterCount / (double)m_ThreadsToUse);
double percent, etaMs;
EmberStats stats;
@@ -1180,17 +1179,21 @@ EmberStats Renderer::Iterate(size_t iterCount, size_t temporalSample
parallel_for(size_t(0), m_ThreadsToUse, [&] (size_t threadIndex)
{
#endif
- Timing t;
- size_t subBatchSize = (size_t)min(totalItersPerThread, (size_t)m_SubBatchSize);
+ //Timing t;
+ IterParams<T> params;
m_BadVals[threadIndex] = 0;
+ params.m_Count = min(totalItersPerThread, SubBatchSize());
+ params.m_Skip = FuseCount();
+ //params.m_OneColDiv2 = m_CarToRas.OneCol() / 2;
+ //params.m_OneRowDiv2 = m_CarToRas.OneRow() / 2;
//Sub batch iterations, loop 2.
- for (m_SubBatch[threadIndex] = 0; (m_SubBatch[threadIndex] < totalItersPerThread) && !m_Abort; m_SubBatch[threadIndex] += subBatchSize)
+ for (m_SubBatch[threadIndex] = 0; (m_SubBatch[threadIndex] < totalItersPerThread) && !m_Abort; m_SubBatch[threadIndex] += params.m_Count)
{
- //Must recalculate the number of iters to run on each sub batch because the last batch will most likely have less than m_SubBatchSize iters.
+ //Must recalculate the number of iters to run on each sub batch because the last batch will most likely have less than SubBatchSize iters.
//For example, if 51,000 are requested, and the sbs is 10,000, it should run 5 sub batches of 10,000 iters, and one final sub batch of 1,000 iters.
- subBatchSize = min(subBatchSize, totalItersPerThread - m_SubBatch[threadIndex]);
+ params.m_Count = min(params.m_Count, totalItersPerThread - m_SubBatch[threadIndex]);
//Use first as random point, the rest are iterated points.
//Note that this gets reset with a new random point for each subBatchSize iterations.
@@ -1203,14 +1206,14 @@ EmberStats Renderer::Iterate(size_t iterCount, size_t temporalSample
//Finally, iterate.
//t.Tic();
//Iterating, loop 3.
- m_BadVals[threadIndex] += m_Iterator->Iterate(m_Ember, subBatchSize, fuse, m_Samples[threadIndex].data(), m_Rand[threadIndex]);
+ m_BadVals[threadIndex] += m_Iterator->Iterate(m_Ember, params, m_Samples[threadIndex].data(), m_Rand[threadIndex]);
//iterationTime += t.Toc();
if (m_LockAccum)
m_AccumCs.Enter();
//t.Tic();
//Map temp buffer samples into the histogram using the palette for color.
- Accumulate(m_Samples[threadIndex].data(), subBatchSize, &m_Dmap);
+ Accumulate(m_Rand[threadIndex], m_Samples[threadIndex].data(), params.m_Count, &m_Dmap);
//accumulationTime += t.Toc();
if (m_LockAccum)
m_AccumCs.Leave();
@@ -1347,6 +1350,8 @@ template ePaletteMode Renderer::
template size_t Renderer::TemporalSamples() const { return m_Ember.m_TemporalSamples; }
template size_t Renderer::FinalRasW() const { return m_Ember.m_FinalRasW; }
template size_t Renderer::FinalRasH() const { return m_Ember.m_FinalRasH; }
+template <typename T, typename bucketT> size_t Renderer<T, bucketT>::SubBatchSize() const { return m_Ember.m_SubBatchSize; }
+template <typename T, typename bucketT> size_t Renderer<T, bucketT>::FuseCount() const { return m_Ember.m_FuseCount; }
///
/// Non-virtual iterator wrappers.
@@ -1396,11 +1401,13 @@ void Renderer::PrepFinalAccumVals(Color& background, T& g, T& lin
/// The number of samples
/// The palette to use
template
-void Renderer::Accumulate(Point* samples, size_t sampleCount, const Palette* palette)
+void Renderer<T, bucketT>::Accumulate(QTIsaac<ISAAC_SIZE, ISAAC_INT>& rand, Point<T>* samples, size_t sampleCount, const Palette<bucketT>* palette)
{
size_t histIndex, intColorIndex, histSize = m_HistBuckets.size();
bucketT colorIndex, colorIndexFrac;
const glm::detail::tvec4* dmap = &(palette->m_Entries[0]);
+ //T oneColDiv2 = m_CarToRas.OneCol() / 2;
+ //T oneRowDiv2 = m_CarToRas.OneRow() / 2;
//It's critical to understand what's going on here as it's one of the most important parts of the algorithm.
//A color value gets retrieved from the palette and
@@ -1413,24 +1420,37 @@ void Renderer::Accumulate(Point* samples, size_t sampleCount, con
//Splitting these conditionals into separate loops makes no speed difference.
for (size_t i = 0; i < sampleCount && !m_Abort; i++)
{
+ Point<T> p(samples[i]);//Slightly faster to cache this.
+
if (Rotate() != 0)
{
- T p00 = samples[i].m_X - CenterX();
- T p11 = samples[i].m_Y - m_Ember.m_RotCenterY;
+ T p00 = p.m_X - CenterX();
+ T p11 = p.m_Y - m_Ember.m_RotCenterY;
- samples[i].m_X = (p00 * m_RotMat.A()) + (p11 * m_RotMat.B()) + CenterX();
- samples[i].m_Y = (p00 * m_RotMat.D()) + (p11 * m_RotMat.E()) + m_Ember.m_RotCenterY;
+ p.m_X = (p00 * m_RotMat.A()) + (p11 * m_RotMat.B()) + CenterX();
+ p.m_Y = (p00 * m_RotMat.D()) + (p11 * m_RotMat.E()) + m_Ember.m_RotCenterY;
}
+ //T angle = rand.Frand01() * M_2PI;
+ //T r = exp(T(0.5) * sqrt(-log(rand.Frand01()))) - 1;
+
+ //T r = (rand.Frand01() + rand.Frand01() - 1);
+ //T r = (rand.Frand01() + rand.Frand01() + rand.Frand01() + rand.Frand01() - 2);
+
+ //p.m_X += (r * oneColDiv2) * cos(angle);
+ //p.m_Y += (r * oneRowDiv2) * sin(angle);
+ //p.m_X += r * cos(angle);
+ //p.m_Y += r * sin(angle);
+
//Checking this first before converting gives better performance than converting and checking a single value, which the original did.
//Second, an interesting optimization observation is that when keeping the bounds vars within m_CarToRas and calling its InBounds() member function,
//rather than here as members, about a 7% speedup is achieved. This is possibly due to the fact that data from m_CarToRas is accessed
//right after the call to Convert(), so some caching efficiencies get realized.
- if (m_CarToRas.InBounds(samples[i]))
+ if (m_CarToRas.InBounds(p))
{
- if (samples[i].m_VizAdjusted != 0)
+ if (p.m_VizAdjusted != 0)
{
- m_CarToRas.Convert(samples[i], histIndex);
+ m_CarToRas.Convert(p, histIndex);
//There is a very slim chance that a point will be right on the border and will technically be in bounds, passing the InBounds() test,
//but ends up being mapped to a histogram bucket that is out of bounds due to roundoff error. Perform one final check before proceeding.
@@ -1445,7 +1465,7 @@ void Renderer::Accumulate(Point* samples, size_t sampleCount, con
//Use overloaded addition and multiplication operators in vec4 to perform the accumulation.
if (PaletteMode() == PALETTE_LINEAR)
{
- colorIndex = (bucketT)samples[i].m_ColorX * COLORMAP_LENGTH;
+ colorIndex = (bucketT)p.m_ColorX * COLORMAP_LENGTH;
intColorIndex = (size_t)colorIndex;
if (intColorIndex < 0)
@@ -1463,19 +1483,19 @@ void Renderer::Accumulate(Point* samples, size_t sampleCount, con
colorIndexFrac = colorIndex - (bucketT)intColorIndex;//Interpolate between intColorIndex and intColorIndex + 1.
}
- if (samples[i].m_VizAdjusted == 1)
+ if (p.m_VizAdjusted == 1)
m_HistBuckets[histIndex] += ((dmap[intColorIndex] * (1 - colorIndexFrac)) + (dmap[intColorIndex + 1] * colorIndexFrac));
else
- m_HistBuckets[histIndex] += (((dmap[intColorIndex] * (1 - colorIndexFrac)) + (dmap[intColorIndex + 1] * colorIndexFrac)) * (bucketT)samples[i].m_VizAdjusted);
+ m_HistBuckets[histIndex] += (((dmap[intColorIndex] * (1 - colorIndexFrac)) + (dmap[intColorIndex + 1] * colorIndexFrac)) * (bucketT)p.m_VizAdjusted);
}
else if (PaletteMode() == PALETTE_STEP)
{
- intColorIndex = Clamp((size_t)(samples[i].m_ColorX * COLORMAP_LENGTH), 0, COLORMAP_LENGTH_MINUS_1);
+ intColorIndex = Clamp((size_t)(p.m_ColorX * COLORMAP_LENGTH), 0, COLORMAP_LENGTH_MINUS_1);
- if (samples[i].m_VizAdjusted == 1)
+ if (p.m_VizAdjusted == 1)
m_HistBuckets[histIndex] += dmap[intColorIndex];
else
- m_HistBuckets[histIndex] += (dmap[intColorIndex] * (bucketT)samples[i].m_VizAdjusted);
+ m_HistBuckets[histIndex] += (dmap[intColorIndex] * (bucketT)p.m_VizAdjusted);
}
}
}
diff --git a/Source/Ember/Renderer.h b/Source/Ember/Renderer.h
index 8772eb1..7942c18 100644
--- a/Source/Ember/Renderer.h
+++ b/Source/Ember/Renderer.h
@@ -134,6 +134,8 @@ public:
virtual size_t TemporalSamples() const override;
virtual size_t FinalRasW() const override;
virtual size_t FinalRasH() const override;
+ virtual size_t SubBatchSize() const override;
+ virtual size_t FuseCount() const override;
//Non-virtual iterator wrappers.
const unsigned char* XformDistributions() const;
@@ -144,9 +146,9 @@ protected:
//Non-virtual functions that might be needed by a derived class.
void PrepFinalAccumVals(Color& background, T& g, T& linRange, T& vibrancy);
-private:
+ private:
//Miscellaneous non-virtual functions used only in this class.
- void Accumulate(Point* samples, size_t sampleCount, const Palette* palette);
+ void Accumulate(QTIsaac<ISAAC_SIZE, ISAAC_INT>& rand, Point<T>* samples, size_t sampleCount, const Palette<bucketT>* palette);
/*inline*/ void AddToAccum(const glm::detail::tvec4& bucket, intmax_t i, intmax_t ii, intmax_t j, intmax_t jj);
template void GammaCorrection(glm::detail::tvec4& bucket, Color& background, T g, T linRange, T vibrancy, bool doAlpha, bool scale, accumT* correctedChannels);
diff --git a/Source/Ember/RendererBase.cpp b/Source/Ember/RendererBase.cpp
index 2a4c9df..0110d3f 100644
--- a/Source/Ember/RendererBase.cpp
+++ b/Source/Ember/RendererBase.cpp
@@ -15,7 +15,6 @@ RendererBase::RendererBase()
m_YAxisUp = false;
m_InsertPalette = false;
m_ReclaimOnResize = false;
- m_SubBatchSize = 1024 * 10;
m_NumChannels = 3;
m_BytesPerChannel = 1;
m_SuperSize = 0;
@@ -412,17 +411,6 @@ void RendererBase::Transparency(bool transparency)
ChangeVal([&] { m_Transparency = transparency; }, ACCUM_ONLY);
}
-///
-/// Set the sub batch size. This is the size of of the chunks that the iteration
-/// trajectory will be broken up into.
-/// Reset the rendering process.
-///
-/// The sub batch size to set
-void RendererBase::SubBatchSize(size_t sbs)
-{
- ChangeVal([&] { m_SubBatchSize = sbs; }, FULL_RENDER);
-}
-
///
/// Set the callback object.
///
@@ -583,14 +571,6 @@ void RendererBase::NumChannels(size_t numChannels)
/// The number of threads used when rendering
size_t RendererBase::ThreadCount() const { return m_ThreadsToUse; }
-///
-/// Get the sub batch size. This is the size of of the chunks that the iteration
-/// trajectory will be broken up into.
-/// Default: 10k.
-///
-/// The sub batch size
-size_t RendererBase::SubBatchSize() const { return m_SubBatchSize; }
-
///
/// Get the renderer type enum.
/// CPU_RENDERER for this class, other values for derived classes.
diff --git a/Source/Ember/RendererBase.h b/Source/Ember/RendererBase.h
index 01c390f..ae235ad 100644
--- a/Source/Ember/RendererBase.h
+++ b/Source/Ember/RendererBase.h
@@ -149,7 +149,6 @@ public:
void ReclaimOnResize(bool reclaimOnResize);
bool Transparency() const;
void Transparency(bool transparency);
- void SubBatchSize(size_t subBatchSize);
void Callback(RenderCallback* callback);
void ThreadCount(size_t threads, const char* seedString = nullptr);
size_t BytesPerChannel() const;
@@ -161,7 +160,6 @@ public:
//Virtual render properties, getters and setters.
virtual void NumChannels(size_t numChannels);
virtual size_t ThreadCount() const;
- virtual size_t SubBatchSize() const;
virtual eRendererType RendererType() const;
//Abstract render properties, getters only.
@@ -169,6 +167,8 @@ public:
virtual size_t HistBucketSize() const = 0;
virtual size_t FinalRasW() const = 0;
virtual size_t FinalRasH() const = 0;
+ virtual size_t SubBatchSize() const = 0;
+ virtual size_t FuseCount() const = 0;
virtual double ScaledQuality() const = 0;
virtual double LowerLeftX(bool gutter = true) const = 0;
virtual double LowerLeftY(bool gutter = true) const = 0;
@@ -207,10 +207,8 @@ protected:
size_t m_DensityFilterOffset;
size_t m_NumChannels;
size_t m_BytesPerChannel;
- size_t m_SubBatchSize;
size_t m_ThreadsToUse;
size_t m_VibGamCount;
- size_t m_LastPass;
size_t m_LastTemporalSample;
double m_LastIterPercent;
size_t m_LastIter;
diff --git a/Source/Ember/SheepTools.h b/Source/Ember/SheepTools.h
index a7b0ba4..cdc83ab 100644
--- a/Source/Ember/SheepTools.h
+++ b/Source/Ember/SheepTools.h
@@ -879,7 +879,6 @@ public:
m_Renderer->EarlyClip(true);
m_Renderer->PixelAspectRatio(1);
m_Renderer->ThreadCount(Timing::ProcessorCount());
- m_Renderer->SubBatchSize(10000);
m_Renderer->Callback(nullptr);
if (m_Renderer->Run(m_FinalImage) != RENDER_OK)
@@ -1280,8 +1279,16 @@ public:
/// The number of iterations ran
size_t EstimateBoundingBox(Ember& ember, T eps, size_t samples, T* bmin, T* bmax)
{
+ bool newAlloc = false;
size_t i, lowTarget, highTarget;
T min[2], max[2];
+ IterParams<T> params;
+
+ m_Renderer->SetEmber(ember);
+ m_Renderer->CreateSpatialFilter(newAlloc);
+ m_Renderer->CreateDEFilter(newAlloc);
+ m_Renderer->ComputeBounds();
+ m_Renderer->ComputeCamera();
if (ember.XaosPresent())
m_Iterator = m_XaosIterator.get();
@@ -1290,8 +1297,12 @@ public:
m_Iterator->InitDistributions(ember);
m_Samples.resize(samples);
+ params.m_Count = samples;
+ params.m_Skip = 20;
+ //params.m_OneColDiv2 = m_Renderer->CoordMap()->OneCol() / 2;
+ //params.m_OneRowDiv2 = m_Renderer->CoordMap()->OneRow() / 2;
- size_t bv = m_Iterator->Iterate(ember, samples, 20, m_Samples.data(), m_Rand);//Use a special fuse of 20, all other calls to this will use 15, or 100.
+ size_t bv = m_Iterator->Iterate(ember, params, m_Samples.data(), m_Rand);//Use a special fuse of 20, all other calls to this will use 15, or 100.
if (bv / T(samples) > eps)
eps = 3 * bv / T(samples);
diff --git a/Source/Ember/Utils.h b/Source/Ember/Utils.h
index b3d5734..3d01023 100644
--- a/Source/Ember/Utils.h
+++ b/Source/Ember/Utils.h
@@ -33,6 +33,17 @@ static inline void ForEach(c& container, fn func)
std::for_each(container.begin(), container.end(), func);
}
+/// <summary>
+/// Thin wrapper around computing the total size of a vector.
+/// </summary>
+/// <param name="vec">The vector to compute the size of</param>
+/// <returns>The size of one element times the length.</returns>
+template <typename T>
+static inline size_t SizeOf(vector<T>& vec)
+{
+ return sizeof(vec[0]) * vec.size();
+}
+
///
/// After a run completes, information about what was run can be saved as strings to the comments
/// section of a jpg or png file. This class is just a container for those values.
@@ -276,7 +287,7 @@ static void ClearVec(vector& vec, bool arrayDelete = false)
template
static inline void Memset(vector& vec, int val = 0)
{
- memset((void*)vec.data(), val, vec.size() * sizeof(vec[0]));
+ memset((void*)vec.data(), val, SizeOf(vec));
}
///
diff --git a/Source/Ember/XmlToEmber.h b/Source/Ember/XmlToEmber.h
index 64de8b8..b80a14e 100644
--- a/Source/Ember/XmlToEmber.h
+++ b/Source/Ember/XmlToEmber.h
@@ -593,6 +593,8 @@ private:
else if (ParseAndAssignInt(curAtt->name, attStr, "oversample", currentEmber.m_Supersample , ret)) { }
else if (ParseAndAssignInt(curAtt->name, attStr, "supersample", currentEmber.m_Supersample , ret)) { }
else if (ParseAndAssignInt(curAtt->name, attStr, "temporal_samples", currentEmber.m_TemporalSamples, ret)) { }
+ else if (ParseAndAssignInt(curAtt->name, attStr, "sub_batch_size", currentEmber.m_SubBatchSize , ret)) { }
+ else if (ParseAndAssignInt(curAtt->name, attStr, "fuse", currentEmber.m_FuseCount , ret)) { }
else if (ParseAndAssignInt(curAtt->name, attStr, "soloxform", soloXform , ret)) { }
else if (ParseAndAssignInt(curAtt->name, attStr, "new_linear", newLinear , ret)) { }
diff --git a/Source/EmberAnimate/EmberAnimate.cpp b/Source/EmberAnimate/EmberAnimate.cpp
index c216715..6466385 100644
--- a/Source/EmberAnimate/EmberAnimate.cpp
+++ b/Source/EmberAnimate/EmberAnimate.cpp
@@ -186,6 +186,9 @@ bool EmberAnimate(EmberOptions& opt)
if (opt.Supersample() > 0)
embers[i].m_Supersample = opt.Supersample();
+ if (opt.SubBatchSize() != DEFAULT_SBS)
+ embers[i].m_SubBatchSize = opt.SubBatchSize();
+
embers[i].m_Quality *= T(opt.QualityScale());
embers[i].m_FinalRasW = (unsigned int)((T)embers[i].m_FinalRasW * opt.SizeScale());
embers[i].m_FinalRasH = (unsigned int)((T)embers[i].m_FinalRasH * opt.SizeScale());
@@ -250,13 +253,12 @@ bool EmberAnimate(EmberOptions& opt)
renderer->YAxisUp(opt.YAxisUp());
renderer->LockAccum(opt.LockAccum());
renderer->InsertPalette(opt.InsertPalette());
- renderer->SubBatchSize(opt.SubBatchSize());
renderer->PixelAspectRatio(T(opt.AspectRatio()));
renderer->Transparency(opt.Transparency());
renderer->NumChannels(channels);
renderer->BytesPerChannel(opt.BitsPerChannel() / 8);
renderer->Callback(opt.DoProgress() ? progress.get() : NULL);
-
+
//Begin run.
for (ftime = opt.FirstFrame(); ftime <= opt.LastFrame(); ftime += opt.Dtime())
{
diff --git a/Source/EmberAnimate/EmberAnimate.rc b/Source/EmberAnimate/EmberAnimate.rc
index 09e4dc1..7ec4eec 100644
--- a/Source/EmberAnimate/EmberAnimate.rc
+++ b/Source/EmberAnimate/EmberAnimate.rc
@@ -49,8 +49,8 @@ END
//
VS_VERSION_INFO VERSIONINFO
- FILEVERSION 0,4,1,4
- PRODUCTVERSION 0,4,1,4
+ FILEVERSION 0,4,1,5
+ PRODUCTVERSION 0,4,1,5
FILEFLAGSMASK 0x3fL
#ifdef _DEBUG
FILEFLAGS 0x1L
@@ -67,12 +67,12 @@ BEGIN
BEGIN
VALUE "CompanyName", "Open Source"
VALUE "FileDescription", "Renders fractal flames as animations with motion blur"
- VALUE "FileVersion", "0.4.1.4"
+ VALUE "FileVersion", "0.4.1.5"
VALUE "InternalName", "EmberAnimate.rc"
VALUE "LegalCopyright", "Copyright (C) Matt Feemster 2013, GPL v3"
VALUE "OriginalFilename", "EmberAnimate.rc"
VALUE "ProductName", "Ember Animate"
- VALUE "ProductVersion", "0.4.1.4"
+ VALUE "ProductVersion", "0.4.1.5"
END
END
BLOCK "VarFileInfo"
diff --git a/Source/EmberCL/DEOpenCLKernelCreator.cpp b/Source/EmberCL/DEOpenCLKernelCreator.cpp
index fb5b677..f1da81a 100644
--- a/Source/EmberCL/DEOpenCLKernelCreator.cpp
+++ b/Source/EmberCL/DEOpenCLKernelCreator.cpp
@@ -25,7 +25,6 @@ template <>
DEOpenCLKernelCreator::DEOpenCLKernelCreator(bool nVidia)
{
m_NVidia = nVidia;
- m_LogScaleSumDEEntryPoint = "LogScaleSumDensityFilterKernel";
m_LogScaleAssignDEEntryPoint = "LogScaleAssignDensityFilterKernel";
m_GaussianDEWithoutSsEntryPoint = "GaussianDEWithoutSsKernel";
m_GaussianDESsWithScfEntryPoint = "GaussianDESsWithScfKernel";
@@ -33,7 +32,6 @@ DEOpenCLKernelCreator::DEOpenCLKernelCreator(bool nVidia)
m_GaussianDEWithoutSsNoCacheEntryPoint = "GaussianDEWithoutSsNoCacheKernel";
m_GaussianDESsWithScfNoCacheEntryPoint = "GaussianDESsWithScfNoCacheKernel";
m_GaussianDESsWithoutScfNoCacheEntryPoint = "GaussianDESsWithoutScfNoCacheKernel";
- m_LogScaleSumDEKernel = CreateLogScaleSumDEKernelString();
m_LogScaleAssignDEKernel = CreateLogScaleAssignDEKernelString();
m_GaussianDEWithoutSsKernel = CreateGaussianDEKernel(1);
m_GaussianDESsWithScfKernel = CreateGaussianDEKernel(2);
@@ -56,25 +54,39 @@ DEOpenCLKernelCreator::DEOpenCLKernelCreator(bool nVidia)
template <>
DEOpenCLKernelCreator::DEOpenCLKernelCreator(bool nVidia)
{
+#ifdef ROW_ONLY_DE
+ m_NVidia = nVidia;
+ m_LogScaleAssignDEEntryPoint = "LogScaleAssignDensityFilterKernel";
+ m_GaussianDEWithoutSsEntryPoint = "GaussianDEWithoutSsKernel";
+ m_GaussianDESsWithScfEntryPoint = "GaussianDESsWithScfKernel";
+ m_GaussianDESsWithoutScfEntryPoint = "GaussianDESsWithoutScfKernel";
+ m_GaussianDEWithoutSsNoCacheEntryPoint = "GaussianDEWithoutSsNoCacheKernel";
+ m_GaussianDESsWithScfNoCacheEntryPoint = "GaussianDESsWithScfNoCacheKernel";
+ m_GaussianDESsWithoutScfNoCacheEntryPoint = "GaussianDESsWithoutScfNoCacheKernel";
+ m_LogScaleAssignDEKernel = CreateLogScaleAssignDEKernelString();
+ m_GaussianDEWithoutSsKernel = CreateGaussianDEKernel(1);
+ m_GaussianDESsWithScfKernel = CreateGaussianDEKernel(2);
+ m_GaussianDESsWithoutScfKernel = CreateGaussianDEKernel(3);
+ m_GaussianDEWithoutSsNoCacheKernel = CreateGaussianDEKernelNoLocalCache(1);
+ m_GaussianDESsWithScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache(2);
+ m_GaussianDESsWithoutScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache(3);
+#else
m_NVidia = nVidia;
- m_LogScaleSumDEEntryPoint = "LogScaleSumDensityFilterKernel";
m_LogScaleAssignDEEntryPoint = "LogScaleAssignDensityFilterKernel";
m_GaussianDEWithoutSsNoCacheEntryPoint = "GaussianDEWithoutSsNoCacheKernel";
m_GaussianDESsWithScfNoCacheEntryPoint = "GaussianDESsWithScfNoCacheKernel";
m_GaussianDESsWithoutScfNoCacheEntryPoint = "GaussianDESsWithoutScfNoCacheKernel";
- m_LogScaleSumDEKernel = CreateLogScaleSumDEKernelString();
m_LogScaleAssignDEKernel = CreateLogScaleAssignDEKernelString();
m_GaussianDEWithoutSsNoCacheKernel = CreateGaussianDEKernelNoLocalCache(1);
m_GaussianDESsWithScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache(2);
m_GaussianDESsWithoutScfNoCacheKernel = CreateGaussianDEKernelNoLocalCache(3);
+#endif
}
///
/// Kernel source and entry point properties, getters only.
///
-template string DEOpenCLKernelCreator::LogScaleSumDEKernel() { return m_LogScaleSumDEKernel; }
-template string DEOpenCLKernelCreator::LogScaleSumDEEntryPoint() { return m_LogScaleSumDEEntryPoint; }
template string DEOpenCLKernelCreator::LogScaleAssignDEKernel() { return m_LogScaleAssignDEKernel; }
template string DEOpenCLKernelCreator::LogScaleAssignDEEntryPoint() { return m_LogScaleAssignDEEntryPoint; }
@@ -87,6 +99,7 @@ template string DEOpenCLKernelCreator::LogScaleAssignDEEntryPoin
template
string DEOpenCLKernelCreator::GaussianDEKernel(size_t ss, unsigned int filterWidth)
{
+#ifndef ROW_ONLY_DE
if ((typeid(T) == typeid(double)) || (filterWidth > MaxDEFilterSize()))//Type double does not use cache.
{
if (ss > 1)
@@ -100,6 +113,7 @@ string DEOpenCLKernelCreator::GaussianDEKernel(size_t ss, unsigned int filter
return m_GaussianDEWithoutSsNoCacheKernel;
}
else
+#endif
{
if (ss > 1)
{
@@ -122,6 +136,7 @@ string DEOpenCLKernelCreator::GaussianDEKernel(size_t ss, unsigned int filter
template
string DEOpenCLKernelCreator::GaussianDEEntryPoint(size_t ss, unsigned int filterWidth)
{
+#ifndef ROW_ONLY_DE
if ((typeid(T) == typeid(double)) || (filterWidth > MaxDEFilterSize()))//Type double does not use cache.
{
if (ss > 1)
@@ -135,6 +150,7 @@ string DEOpenCLKernelCreator::GaussianDEEntryPoint(size_t ss, unsigned int fi
return m_GaussianDEWithoutSsNoCacheEntryPoint;
}
else
+#endif
{
if (ss > 1)
{
@@ -194,45 +210,6 @@ unsigned int DEOpenCLKernelCreator::SolveMaxBoxSize(unsigned int localMem)
return (unsigned int)floor(sqrt(floor((T)localMem / 16.0)));//Divide by 16 because each element is float4.
}
-///
-/// Create the log scale kernel string, using summation.
-/// This means each cell will be added to, rather than just assigned.
-/// Since adding is slower than assigning, this should only be used when Passes > 1,
-/// otherwise use the kernel created from CreateLogScaleAssignDEKernelString().
-///
-/// The kernel string
-template
-string DEOpenCLKernelCreator::CreateLogScaleSumDEKernelString()
-{
- ostringstream os;
-
- os <<
- ConstantDefinesString(typeid(T) == typeid(double)) <<
- DensityFilterCLStructString <<
- "__kernel void " << m_LogScaleSumDEEntryPoint << "(\n"
- " const __global real4* histogram,\n"
- " __global real4* accumulator,\n"
- " __constant DensityFilterCL* logFilter\n"
- "\t)\n"
- "{\n"
- " if ((GLOBAL_ID_X < logFilter->m_SuperRasW) && (GLOBAL_ID_Y < logFilter->m_SuperRasH))\n"
- " {\n"
- " uint index = (GLOBAL_ID_Y * logFilter->m_SuperRasW) + GLOBAL_ID_X;\n"
- "\n"
- " if (histogram[index].w != 0)\n"
- " {\n"
- " real_t logScale = (logFilter->m_K1 * log(1.0 + histogram[index].w * logFilter->m_K2)) / histogram[index].w;\n"
- "\n"
- " accumulator[index] += histogram[index] * logScale;\n"//Using a single real4 vector operation doubles the speed from doing each component individually.
- " }\n"
- "\n"
- " barrier(CLK_GLOBAL_MEM_FENCE);\n"//Just to be safe. Makes no speed difference to do all of the time or only when there's a hit.
- " }\n"
- "}\n";
-
- return os.str();
-}
-
///
/// Create the log scale kernel string, using assignment.
/// Use this when Passes == 1.
@@ -270,6 +247,215 @@ string DEOpenCLKernelCreator::CreateLogScaleAssignDEKernelString()
return os.str();
}
+#ifdef ROW_ONLY_DE
+template
+string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
+{
+ bool doSS = ss > 1;
+ bool doScf = !(ss & 1);
+ ostringstream os;
+
+ os <<
+ ConstantDefinesString(typeid(T) == typeid(double)) <<
+ DensityFilterCLStructString <<
+ UnionCLStructString <<
+ "__kernel void " << GaussianDEEntryPoint(ss, MaxDEFilterSize()) << "(\n" <<
+ " const __global real4* histogram,\n"
+ " __global real4reals* accumulator,\n"
+ " __constant DensityFilterCL* densityFilter,\n"
+ " const __global real_t* filterCoefs,\n"
+ " const __global real_t* filterWidths,\n"
+ " const __global uint* coefIndices,\n"
+ " const uint chunkSizeW,\n"
+ " const uint chunkSizeH,\n"
+ " const uint chunkW,\n"
+ " const uint chunkH\n"
+ "\t)\n"
+ "{\n"
+ " uint rowsToProcess = 32;\n"//Rows to process.
+ "\n"
+ " if (((((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) ||\n"
+ " ((((BLOCK_ID_Y * chunkSizeH) + chunkH) * rowsToProcess) + THREAD_ID_Y >= densityFilter->m_SuperRasH))\n"
+ " return;\n"
+ "\n";
+
+ if (doSS)
+ {
+ os <<
+ " uint ss = (uint)floor((real_t)densityFilter->m_Supersample / 2.0);\n"
+ " int densityBoxLeftX;\n"
+ " int densityBoxRightX;\n"
+ " int densityBoxTopY;\n"
+ " int densityBoxBottomY;\n"
+ "\n";
+
+ if (doScf)
+ os <<
+ " real_t scfact = pow(densityFilter->m_Supersample / (densityFilter->m_Supersample + 1.0), 2.0);\n";
+ }
+
+ os <<
+ " uint fullTempBoxWidth;\n"
+ " uint leftBound, rightBound, topBound, botBound;\n"
+ " uint blockHistStartRow, blockHistEndRow, histCol;\n"
+ " uint blockHistStartCol, boxReadStartCol, boxReadEndCol;\n"
+ " uint accumWriteStartCol, colsToWrite, colOffset, colsToWriteOffset;\n"
+ " int histRow, filterRow, accumWriteOffset;\n"
+ "\n"
+ " fullTempBoxWidth = BLOCK_SIZE_X + (densityFilter->m_FilterWidth * 2);\n"
+ //Compute the bounds of the area to be sampled, which is just the ends minus the super sample minus 1.
+ " leftBound = densityFilter->m_Supersample - 1;\n"
+ " rightBound = densityFilter->m_SuperRasW - (densityFilter->m_Supersample - 1);\n"
+ " topBound = densityFilter->m_Supersample - 1;\n"
+ " botBound = densityFilter->m_SuperRasH - (densityFilter->m_Supersample - 1);\n"
+ "\n"
+ //Start and end values are the indices in the histogram read from
+ //and written to in the accumulator. They are not the indices for the local block of data.
+ //Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds.
+ " blockHistStartRow = min(botBound, topBound + (((BLOCK_ID_Y * chunkSizeH) + chunkH) * rowsToProcess));\n"//The first histogram row this block will process.
+ " blockHistEndRow = min(botBound, blockHistStartRow + rowsToProcess);\n"//The last histogram row this block will process, clamped to the last row.
+ " blockHistStartCol = min(rightBound, leftBound + (((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X));\n"//The first histogram column this block will process.
+ " boxReadStartCol = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first box col this block will read from when copying to the accumulator.
+ " boxReadEndCol = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + BLOCK_SIZE_X, densityFilter->m_SuperRasW - blockHistStartCol);\n"//The last box col this block will read from when copying to the accumulator.
+ "\n"
+ //Last, the indices in the global accumulator that the local bounds will be writing to.
+ " accumWriteStartCol = blockHistStartCol - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first column in the accumulator this block will write to.
+ " colsToWrite = ceil((real_t)(boxReadEndCol - boxReadStartCol) / (real_t)BLOCK_SIZE_X);\n"//Elements per thread to be written to the accumulator.
+ " histCol = blockHistStartCol + THREAD_ID_X;\n"//The histogram column this individual thread will be reading from.
+ "\n"
+ " if (histCol >= rightBound)\n"
+ " return;\n"
+ "\n"
+ //Compute the col position in this local box to serve as the center position
+ //from which filter application offsets are computed.
+ //These are the local indices for the local data that are temporarily accumulated to before
+ //writing out to the global accumulator.
+ " uint boxCol = densityFilter->m_FilterWidth + THREAD_ID_X;\n"
+ " uint colsToZeroOffset, colsToZero = ceil((real_t)fullTempBoxWidth / (real_t)(BLOCK_SIZE_X));\n"//Usually is 2.
+ " int i, j, k, jmin, jmax;\n"
+ " uint filterSelectInt, filterCoefIndex;\n"
+ " real_t cacheLog;\n"
+ " real_t filterSelect;\n"
+ " real4 bucket;\n"
+ ;
+
+ os << " __local real4reals filterBox[192];\n";//Must be >= fullTempBoxWidth.
+
+ os <<
+ "\n"
+ " colsToZeroOffset = colsToZero * THREAD_ID_X;\n"
+ " colsToWriteOffset = colsToWrite * THREAD_ID_X;\n"
+ " k = (int)densityFilter->m_FilterWidth;\n"//Need a signed int to use below, really is filter width, but reusing a variable to save space.
+ "\n"
+ " for (histRow = blockHistStartRow; histRow < blockHistEndRow; histRow++)\n"//Process pixels by row, for 32 rows.
+ " {\n"
+ " bucket = histogram[(histRow * densityFilter->m_SuperRasW) + histCol];\n"
+ "\n"
+ " if (bucket.w != 0)\n"
+ " cacheLog = (densityFilter->m_K1 * log(1.0 + bucket.w * densityFilter->m_K2)) / bucket.w;\n"
+ "\n";
+
+ if (doSS)
+ {
+ os <<
+ " filterSelect = 0;\n"
+ " densityBoxLeftX = histCol - min(histCol, ss);\n"
+ " densityBoxRightX = histCol + min(ss, (densityFilter->m_SuperRasW - histCol) - 1);\n"
+ " densityBoxTopY = histRow - min((uint)histRow, ss);\n"
+ " densityBoxBottomY = histRow + min(ss, (densityFilter->m_SuperRasH - histRow) - 1);\n"
+ "\n"
+ " for (j = densityBoxTopY; j <= densityBoxBottomY; j++)\n"
+ " {\n"
+ " for (i = densityBoxLeftX; i <= densityBoxRightX; i++)\n"
+ " {\n"
+ " filterSelect += histogram[(j * densityFilter->m_SuperRasW) + i].w;\n"
+ " }\n"
+ " }\n"
+ "\n";
+
+ if (doScf)
+ os << " filterSelect *= scfact;\n";
+ }
+ else
+ {
+ os
+ << " filterSelect = bucket.w;\n";
+ }
+
+ os <<
+ "\n"
+ " if (filterSelect > densityFilter->m_MaxFilteredCounts)\n"
+ " filterSelectInt = densityFilter->m_MaxFilterIndex;\n"
+ " else if (filterSelect <= DE_THRESH)\n"
+ " filterSelectInt = (int)ceil(filterSelect) - 1;\n"
+ " else if (filterSelect != 0)\n"
+ " filterSelectInt = (int)DE_THRESH + (int)floor(pow((real_t)(filterSelect - DE_THRESH), densityFilter->m_Curve));\n"
+ " else\n"
+ " filterSelectInt = 0;\n"
+ "\n"
+ " if (filterSelectInt > densityFilter->m_MaxFilterIndex)\n"
+ " filterSelectInt = densityFilter->m_MaxFilterIndex;\n"
+ "\n"
+ " filterCoefIndex = filterSelectInt * densityFilter->m_KernelSize;\n"
+ "\n"
+ //With this new method, only accumulate to the temp local buffer first. Write to the final accumulator last.
+ //For each loop through, note that there is a local memory barrier call inside of each call to AddToAccumNoCheck().
+ //If this isn't done, pixel errors occur and even an out of resources error occurs because too many writes are done to the same place in memory at once.
+ " jmin = min(k, histRow);\n"
+ " jmax = (int)min((densityFilter->m_SuperRasH - 1) - histRow, densityFilter->m_FilterWidth);\n"
+ "\n"
+ " for (j = -jmin; j <= jmax; j++)\n"
+ " {\n"
+ " for (i = 0; i < colsToZero && (colsToZeroOffset + i) < fullTempBoxWidth; i++)\n"//Each thread zeroizes a few columns.
+ " {\n"
+ " filterBox[colsToZeroOffset + i].m_Real4 = 0;\n"
+ " }\n"
+ "\n"
+ " barrier(CLK_LOCAL_MEM_FENCE);\n"
+ "\n"
+ " if (bucket.w != 0)\n"
+ " {\n"
+ " filterRow = abs(j) * (densityFilter->m_FilterWidth + 1);\n"
+ "\n"
+ " for (i = -k; i <= k; i++)\n"
+ " {\n"
+ " filterSelectInt = filterCoefIndex + coefIndices[filterRow + abs(i)];\n"//Really is filterCoeffIndexPlusOffset, but reusing a variable to save space.
+ " filterBox[i + boxCol].m_Real4 += (bucket * (filterCoefs[filterSelectInt] * cacheLog));\n"
+ " }\n"
+ " }\n"
+ "\n"
+ " barrier(CLK_LOCAL_MEM_FENCE);\n"
+ "\n"
+ //At this point, all threads in this block have applied the filter to their surrounding pixels and stored the results in the temp local box.
+ //Add the cells of it that are in bounds to the global accumulator.
+ //Compute offsets in local box to read from, and offsets into global accumulator to write to.
+ //Use a method here that is similar to the zeroization above: Each thread (column) in the first row iterates through all of the
+ //rows and adds a few columns to the accumulator.
+ //" if (THREAD_ID_X == 0)\n"
+ //" {\n"
+ //" for (int kk = boxReadStartCol, i = 0; kk < boxReadEndCol; kk++, i++)\n"//Each thread writes a few columns.//Could do away with kk//TODO//OPT
+ //" {\n"
+ //" accumulator[((histRow + j) * densityFilter->m_SuperRasW) + (accumWriteStartCol + i)].m_Real4 += filterBox[kk].m_Real4;\n"
+ //" }\n"
+ //" }\n"
+ " accumWriteOffset = ((histRow + j) * densityFilter->m_SuperRasW) + accumWriteStartCol;\n"
+ "\n"
+ " for (i = 0; i < colsToWrite; i++)\n"//Each thread writes a few columns.
+ " {\n"
+ " colOffset = colsToWriteOffset + i;\n"
+ "\n"
+ " if (boxReadStartCol + colOffset < boxReadEndCol)\n"
+ " accumulator[accumWriteOffset + colOffset].m_Real4 += filterBox[boxReadStartCol + colOffset].m_Real4;\n"
+ " }\n"
+ " }\n"//for() filter rows.
+ " barrier(CLK_GLOBAL_MEM_FENCE);\n"
+ " }\n"//for() histogram rows.
+ "}\n";
+
+ return os.str();
+}
+
+#else
///
/// Create the gaussian density filtering kernel string.
/// 6 different methods of processing were tried before settling on this final and fastest 7th one.
@@ -281,7 +467,7 @@ string DEOpenCLKernelCreator::CreateLogScaleAssignDEKernelString()
/// This allows writing to the global buffer without ever overlapping or using atomics.
/// The supersample parameter will produce three different kernels.
/// SS = 1, SS > 1 && SS even, SS > 1 && SS odd.
-/// The width of the kernl this runs in must be evenly divisible by 16 or else artifacts will occur.
+/// The width of the kernel this runs in must be evenly divisible by 16 or else artifacts will occur.
/// Note that because this function uses so many variables and is so complex, OpenCL can easily run
/// out of resources in some cases. Certain variables had to be reused to condense the kernel footprint
/// down enough to be able to run a block size of 32x32.
@@ -311,18 +497,15 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
" const __global uint* coefIndices,\n"
" const uint chunkSizeW,\n"
" const uint chunkSizeH,\n"
- " const uint rowParity,\n"
- " const uint colParity\n"
+ " const uint chunkW,\n"
+ " const uint chunkH\n"
"\t)\n"
"{\n"
- //Parity determines if this function should execute.
- " if ((GLOBAL_ID_X >= densityFilter->m_SuperRasW) ||\n"
- " (GLOBAL_ID_Y >= densityFilter->m_SuperRasH) ||\n"
- " ((BLOCK_ID_X % chunkSizeW) != colParity) ||\n"
- " ((BLOCK_ID_Y % chunkSizeH) != rowParity)) \n"
+ " if (((((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) ||\n"
+ " ((((BLOCK_ID_Y * chunkSizeH) + chunkH) * BLOCK_SIZE_Y) + THREAD_ID_Y >= densityFilter->m_SuperRasH))\n"
" return;\n"
"\n";
-
+
if (doSS)
{
os <<
@@ -367,13 +550,13 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
//Start and end values are the indices in the histogram read from
//and written to in the accumulator. They are not the indices for the local block of data.
//Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds.
- " blockHistStartRow = min(botBound, topBound + (BLOCK_ID_Y * BLOCK_SIZE_Y));\n"//The first histogram row this block will process.
+ " blockHistStartRow = min(botBound, topBound + (((BLOCK_ID_Y * chunkSizeH) + chunkH) * BLOCK_SIZE_Y));\n"//The first histogram row this block will process.
" blockHistEndRow = min(botBound, blockHistStartRow + BLOCK_SIZE_Y);\n"//The last histogram row this block will process, clamped to the last row.
" boxReadStartRow = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartRow);\n"//The first row in the local box to read from when writing back to the final accumulator for this block.
" boxReadEndRow = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + BLOCK_SIZE_Y, densityFilter->m_SuperRasH - blockHistStartRow);\n"//The last row in the local box to read from when writing back to the final accumulator for this block.
- " blockHistStartCol = min(rightBound, leftBound + (BLOCK_ID_X * BLOCK_SIZE_X));\n"//The first histogram column this block will process.
- " boxReadStartCol = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first box row this block will read from when copying to the accumulator.
- " boxReadEndCol = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + BLOCK_SIZE_X, densityFilter->m_SuperRasW - blockHistStartCol);\n"//The last box row this block will read from when copying to the accumulator.
+ " blockHistStartCol = min(rightBound, leftBound + (((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X));\n"//The first histogram column this block will process.
+ " boxReadStartCol = densityFilter->m_FilterWidth - min(densityFilter->m_FilterWidth, blockHistStartCol);\n"//The first box col this block will read from when copying to the accumulator.
+ " boxReadEndCol = densityFilter->m_FilterWidth + min(densityFilter->m_FilterWidth + BLOCK_SIZE_X, densityFilter->m_SuperRasW - blockHistStartCol);\n"//The last box col this block will read from when copying to the accumulator.
"\n"
//Last, the indices in the global accumulator that the local bounds will be writing to.
" accumWriteStartRow = blockHistStartRow - min(densityFilter->m_FilterWidth, blockHistStartRow);\n"//Will be fw - 0 except for boundary columns, it will be less.
@@ -496,7 +679,7 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
" {\n"
" filterSelectInt = filterCoefIndex + coefIndices[(abs(j) * (densityFilter->m_FilterWidth + 1)) + abs(i)];\n"//Really is filterCoeffIndexPlusOffset, but reusing a variable to save space.
"\n"
- " if (filterCoefs[filterSelectInt] != 0)\n"
+ " if (filterCoefs[filterSelectInt] != 0)\n"//This conditional actually improves speed, despite SIMT being bad at conditionals.
" {\n"
" filterBox[(i + boxCol) + ((j + boxRow) * fullTempBoxWidth)].m_Real4 += (bucket * (filterCoefs[filterSelectInt] * cacheLog));\n"
" }\n"
@@ -511,14 +694,14 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
"\n"
" if (THREAD_ID_Y == 0)\n"
" {\n"
- //At this point, all threads in this block have applied the filter to their surrounding pixel and stored the results in the temp local box.
+ //At this point, all threads in this block have applied the filter to their surrounding pixels and stored the results in the temp local box.
//Add the cells of it that are in bounds to the global accumulator.
//Compute offsets in local box to read from, and offsets into global accumulator to write to.
//Use a method here that is similar to the zeroization above: Each thread (column) in the first row iterates through all of the
//rows and adds a few columns to the accumulator.
" for (i = boxReadStartRow, j = accumWriteStartRow; i < boxReadEndRow; i++, j++)\n"
" {\n"
- " for (k = 0; k < colsToWrite; k++)\n"//Write a few columns.
+ " for (k = 0; k < colsToWrite; k++)\n"//Each thread writes a few columns.
" {\n"
" boxCol = (colsToWrite * THREAD_ID_X) + k;\n"//Really is colOffset, but reusing a variable to save space.
"\n"
@@ -532,6 +715,7 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
return os.str();
}
+#endif
///
/// Create the gaussian density filtering kernel string, but use no local cache and perform
@@ -543,7 +727,7 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernel(size_t ss)
/// on the CPU because the frequent global memory access brings performance to a crawl.
/// The supersample parameter will produce three different kernels.
/// SS = 1, SS > 1 && SS even, SS > 1 && SS odd.
-/// The width of the kernl this runs in must be evenly divisible by 16 or else artifacts will occur.
+/// The width of the kernel this runs in must be evenly divisible by 16 or else artifacts will occur.
/// Note that because this function uses so many variables and is so complex, OpenCL can easily run
/// out of resources in some cases. Certain variables had to be reused to condense the kernel footprint
/// down enough to be able to run a block size of 32x32.
@@ -572,15 +756,12 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernelNoLocalCache(size_t ss)
" const __global uint* coefIndices,\n"
" const uint chunkSizeW,\n"
" const uint chunkSizeH,\n"
- " const uint rowParity,\n"
- " const uint colParity\n"
+ " const uint chunkW,\n"
+ " const uint chunkH\n"
"\t)\n"
"{\n"
- //Parity determines if this function should execute.
- " if ((GLOBAL_ID_X >= densityFilter->m_SuperRasW) ||\n"
- " (GLOBAL_ID_Y >= densityFilter->m_SuperRasH) ||\n"
- " ((BLOCK_ID_X % chunkSizeW) != colParity) ||\n"
- " ((BLOCK_ID_Y % chunkSizeH) != rowParity)) \n"
+ " if (((((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X) + THREAD_ID_X >= densityFilter->m_SuperRasW) ||\n"
+ " ((((BLOCK_ID_Y * chunkSizeH) + chunkH) * BLOCK_SIZE_Y) + THREAD_ID_Y >= densityFilter->m_SuperRasH))\n"
" return;\n"
"\n";
@@ -606,10 +787,10 @@ string DEOpenCLKernelCreator::CreateGaussianDEKernelNoLocalCache(size_t ss)
"\n"
//Start and end values are the indices in the histogram read from and written to in the accumulator.
//Before computing local offsets, compute the global offsets first to determine if any rows or cols fall outside of the bounds.
- " uint blockHistStartRow = min(botBound, topBound + (BLOCK_ID_Y * BLOCK_SIZE_Y));\n"//The first histogram row this block will process.
+ " uint blockHistStartRow = min(botBound, topBound + (((BLOCK_ID_Y * chunkSizeH) + chunkH) * BLOCK_SIZE_Y));\n"//The first histogram row this block will process.
" uint threadHistRow = blockHistStartRow + THREAD_ID_Y;\n"//The histogram row this individual thread will be reading from.
"\n"
- " uint blockHistStartCol = min(rightBound, leftBound + (BLOCK_ID_X * BLOCK_SIZE_X));\n"//The first histogram column this block will process.
+ " uint blockHistStartCol = min(rightBound, leftBound + (((BLOCK_ID_X * chunkSizeW) + chunkW) * BLOCK_SIZE_X));\n"//The first histogram column this block will process.
" uint threadHistCol = blockHistStartCol + THREAD_ID_X;\n"//The histogram column this individual thread will be reading from.
"\n"
" int i, j;\n"
diff --git a/Source/EmberCL/DEOpenCLKernelCreator.h b/Source/EmberCL/DEOpenCLKernelCreator.h
index 802ddd6..a56cef3 100644
--- a/Source/EmberCL/DEOpenCLKernelCreator.h
+++ b/Source/EmberCL/DEOpenCLKernelCreator.h
@@ -8,6 +8,8 @@
/// DEOpenCLKernelCreator class.
///
+//#define ROW_ONLY_DE 1
+
namespace EmberCLns
{
///
@@ -35,8 +37,6 @@ public:
DEOpenCLKernelCreator(bool nVidia);
//Accessors.
- string LogScaleSumDEKernel();
- string LogScaleSumDEEntryPoint();
string LogScaleAssignDEKernel();
string LogScaleAssignDEEntryPoint();
string GaussianDEKernel(size_t ss, unsigned int filterWidth);
@@ -49,14 +49,10 @@ public:
private:
//Kernel creators.
- string CreateLogScaleSumDEKernelString();
string CreateLogScaleAssignDEKernelString();
string CreateGaussianDEKernel(size_t ss);
string CreateGaussianDEKernelNoLocalCache(size_t ss);
-
- string m_LogScaleSumDEKernel;
- string m_LogScaleSumDEEntryPoint;
-
+
string m_LogScaleAssignDEKernel;
string m_LogScaleAssignDEEntryPoint;
diff --git a/Source/EmberCL/EmberCLStructs.h b/Source/EmberCL/EmberCLStructs.h
index 26328d4..6aa4c34 100644
--- a/Source/EmberCL/EmberCLStructs.h
+++ b/Source/EmberCL/EmberCLStructs.h
@@ -181,9 +181,6 @@ static const char* XformCLStructString =
"} XformCL;\n"
"\n";
-#define MAX_CL_XFORM 21//These must always match.
-#define MAX_CL_XFORM_STRING "21"
-
///
/// A structure on the host used to hold all of the needed information for an ember used on the device to iterate in OpenCL.
/// Template argument expected to be float or double.
@@ -191,7 +188,6 @@ static const char* XformCLStructString =
template
struct ALIGN EmberCL
{
- XformCL m_Xforms[MAX_CL_XFORM];
T m_CamZPos;
T m_CamPerspective;
T m_CamYaw;
@@ -209,7 +205,6 @@ struct ALIGN EmberCL
static const char* EmberCLStructString =
"typedef struct __attribute__ " ALIGN_CL " _EmberCL\n"
"{\n"
-" XformCL m_Xforms[" MAX_CL_XFORM_STRING "];\n"
" real_t m_CamZPos;\n"
" real_t m_CamPerspective;\n"
" real_t m_CamYaw;\n"
diff --git a/Source/EmberCL/IterOpenCLKernelCreator.cpp b/Source/EmberCL/IterOpenCLKernelCreator.cpp
index 6c944e6..220938b 100644
--- a/Source/EmberCL/IterOpenCLKernelCreator.cpp
+++ b/Source/EmberCL/IterOpenCLKernelCreator.cpp
@@ -1,6 +1,9 @@
#include "EmberCLPch.h"
#include "IterOpenCLKernelCreator.h"
+//#define STRAIGHT_RAND 1
+#define USE_CASE 1
+
namespace EmberCLns
{
///
@@ -233,8 +236,9 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin
"__kernel void " << m_IterEntryPoint << "(\n" <<
" uint iterCount,\n"
" uint fuseCount,\n"
- " uint seed,\n"
+ " __global uint2* seeds,\n"
" __constant EmberCL* ember,\n"
+ " __constant XformCL* xforms,\n"
" __constant real_t* parVars,\n"
" __global uchar* xformDistributions,\n"//Using uchar is quicker than uint. Can't be constant because the size can be too large to fit when using xaos.//FINALOPT
" __constant CarToRasCL* carToRas,\n"
@@ -246,13 +250,14 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin
"{\n"
" bool fuse, ok;\n"
" uint threadIndex = INDEX_IN_BLOCK_2D;\n"
+ " uint pointsIndex = INDEX_IN_GRID_2D;\n"
" uint i, itersToDo;\n"
" uint consec = 0;\n"
//" int badvals = 0;\n"
" uint histIndex;\n"
" real_t p00, p01;\n"
" Point firstPoint, secondPoint, tempPoint;\n"
- " uint2 mwc;\n"
+ " uint2 mwc = seeds[pointsIndex];\n"
" float4 palColor1;\n"
" int2 iPaletteCoord;\n"
" const sampler_t paletteSampler = CLK_NORMALIZED_COORDS_FALSE |\n"//Coords from 0 to 255.
@@ -265,12 +270,11 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin
os <<
"\n"
+#ifndef STRAIGHT_RAND
" __local Point swap[NTHREADS];\n"
" __local uint xfsel[NWARPS];\n"
+#endif
"\n"
- " uint pointsIndex = INDEX_IN_GRID_2D;\n"
- " mwc.x = (pointsIndex + 1 * seed);\n"
- " mwc.y = ((BLOCK_ID_X + 1) * (pointsIndex + 1) * seed);\n"
" iPaletteCoord.y = 0;\n"
"\n"
" if (fuseCount > 0)\n"
@@ -295,9 +299,11 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin
//This along with the randomness that the point shuffle provides gives sufficient randomness
//to produce results identical to those produced on the CPU.
os <<
+#ifndef STRAIGHT_RAND
" if (THREAD_ID_Y == 0 && THREAD_ID_X < NWARPS)\n"
" xfsel[THREAD_ID_X] = MwcNext(&mwc) % " << CHOOSE_XFORM_GRAIN << ";\n"//It's faster to do the % here ahead of time than every time an xform is looked up to use inside the loop.
"\n"
+#endif
" barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
" for (i = 0; i < itersToDo; i++)\n"
@@ -309,22 +315,51 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin
" do\n"
" {\n";
- //If xaos is present, the cuburn method is effectively ceased. Every thread will be picking a random xform.
+ //If xaos is present, a hybrid of the cuburn method is used.
+ //This makes each thread in a row pick the same offset into a distribution, using xfsel.
+ //However, the distribution the offset is in, is determined by firstPoint.m_LastXfUsed.
if (ember.XaosPresent())
{
os <<
+#ifdef STRAIGHT_RAND
" secondPoint.m_LastXfUsed = xformDistributions[MwcNext(&mwc) % " << CHOOSE_XFORM_GRAIN << " + (" << CHOOSE_XFORM_GRAIN << " * (firstPoint.m_LastXfUsed + 1u))];\n\n";
- //" secondPoint.m_LastXfUsed = xformDistributions[xfsel[THREAD_ID_Y] + (" << CHOOSE_XFORM_GRAIN << " * (firstPoint.m_LastXfUsed + 1u))];\n\n";//Partial cuburn hybrid.
+#else
+ " secondPoint.m_LastXfUsed = xformDistributions[xfsel[THREAD_ID_Y] + (" << CHOOSE_XFORM_GRAIN << " * (firstPoint.m_LastXfUsed + 1u))];\n\n";//Partial cuburn hybrid.
+#endif
}
else
{
os <<
- //" secondPoint.m_LastXfUsed = xformDistributions[MwcNext(&mwc) % " << CHOOSE_XFORM_GRAIN << "];\n\n";//For testing, using straight rand flam4/fractron style instead of cuburn.
+#ifdef STRAIGHT_RAND
+ " secondPoint.m_LastXfUsed = xformDistributions[MwcNext(&mwc) % " << CHOOSE_XFORM_GRAIN << "];\n\n";//For testing, using straight rand flam4/fractron style instead of cuburn.
+#else
" secondPoint.m_LastXfUsed = xformDistributions[xfsel[THREAD_ID_Y]];\n\n";
+#endif
}
for (i = 0; i < ember.XformCount(); i++)
{
+#ifdef USE_CASE
+ if (i == 0)
+ {
+ os <<
+ " switch (secondPoint.m_LastXfUsed)\n"
+ " {\n";
+ }
+
+ os <<
+ " case " << i << ":\n"
+ " {\n" <<
+ " Xform" << i << "(&(xforms[" << i << "]), parVars, &firstPoint, &secondPoint, &mwc);\n" <<
+ " break;\n"
+ " }\n";
+
+ if (i == ember.XformCount() - 1)
+ {
+ os <<
+ " }\n";
+ }
+#else
if (i == 0)
os <<
" if (secondPoint.m_LastXfUsed == " << i << ")\n";
@@ -334,9 +369,11 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin
os <<
" {\n" <<
- " Xform" << i << "(&(ember->m_Xforms[" << i << "]), parVars, &firstPoint, &secondPoint, &mwc);\n" <<
+ " Xform" << i << "(&(xforms[" << i << "]), parVars, &firstPoint, &secondPoint, &mwc);\n" <<
" }\n";
+#endif
}
+
os <<
"\n"
" ok = !BadVal(secondPoint.m_X) && !BadVal(secondPoint.m_Y);\n"
@@ -360,6 +397,7 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin
" secondPoint.m_Y = MwcNextNeg1Pos1(&mwc);\n"
" secondPoint.m_Z = 0.0;\n"
" }\n"
+#ifndef STRAIGHT_RAND
"\n"//Rotate points between threads. This is how randomization is achieved.
" uint swr = threadXY + ((i & 1u) * threadXDivRows);\n"
" uint sw = (swr * THREADS_PER_WARP + THREAD_ID_X) & threadsMinus1;\n"
@@ -368,16 +406,16 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin
//Write to another thread's location.
" swap[sw] = secondPoint;\n"
"\n"
-
//Populate randomized xform index buffer with new random values.
" if (THREAD_ID_Y == 0 && THREAD_ID_X < NWARPS)\n"
" xfsel[THREAD_ID_X] = MwcNext(&mwc) % " << CHOOSE_XFORM_GRAIN << ";\n"
"\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
- "\n"
-
//Another thread will have written to this thread's location, so read the new value and use it for accumulation below.
" firstPoint = swap[threadIndex];\n"
+#else
+ " firstPoint = secondPoint;\n"//For testing, using straight rand flam4/fractron style instead of cuburn.
+#endif
"\n"
" if (fuse)\n"
" {\n"
@@ -399,14 +437,14 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin
//CPU takes an extra step here to preserve the opacity of the randomly selected xform, rather than the final xform's opacity.
//The same thing takes place here automatically because secondPoint.m_LastXfUsed is used below to retrieve the opacity when accumulating.
- os <<
- " if ((ember->m_Xforms[" << finalIndex << "].m_Opacity == 1) || (MwcNext01(&mwc) < ember->m_Xforms[" << finalIndex << "].m_Opacity))\n"
- " {\n"
- " tempPoint.m_LastXfUsed = secondPoint.m_LastXfUsed;\n"
- " Xform" << finalIndex << "(&(ember->m_Xforms[" << finalIndex << "]), parVars, &secondPoint, &tempPoint, &mwc);\n"
- " secondPoint = tempPoint;\n"
- " }\n"
- "\n";
+ os <<
+ " if ((xforms[" << finalIndex << "].m_Opacity == 1) || (MwcNext01(&mwc) < xforms[" << finalIndex << "].m_Opacity))\n"
+ " {\n"
+ " tempPoint.m_LastXfUsed = secondPoint.m_LastXfUsed;\n"
+ " Xform" << finalIndex << "(&(xforms[" << finalIndex << "]), parVars, &secondPoint, &tempPoint, &mwc);\n"
+ " secondPoint = tempPoint;\n"
+ " }\n"
+ "\n";
}
os << CreateProjectionString(ember);
@@ -471,18 +509,18 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin
if (typeid(T) == typeid(double))
{
os <<
- " AtomicAdd(&(histogram[histIndex].m_Reals[0]), (real_t)palColor1.x * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"//Always apply opacity, even though it's usually 1.
- " AtomicAdd(&(histogram[histIndex].m_Reals[1]), (real_t)palColor1.y * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
- " AtomicAdd(&(histogram[histIndex].m_Reals[2]), (real_t)palColor1.z * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
- " AtomicAdd(&(histogram[histIndex].m_Reals[3]), (real_t)palColor1.w * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
+ " AtomicAdd(&(histogram[histIndex].m_Reals[0]), (real_t)palColor1.x * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"//Always apply opacity, even though it's usually 1.
+ " AtomicAdd(&(histogram[histIndex].m_Reals[1]), (real_t)palColor1.y * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
+ " AtomicAdd(&(histogram[histIndex].m_Reals[2]), (real_t)palColor1.z * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
+ " AtomicAdd(&(histogram[histIndex].m_Reals[3]), (real_t)palColor1.w * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
}
else
{
- os <<
- " AtomicAdd(&(histogram[histIndex].m_Reals[0]), palColor1.x * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"//Always apply opacity, even though it's usually 1.
- " AtomicAdd(&(histogram[histIndex].m_Reals[1]), palColor1.y * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
- " AtomicAdd(&(histogram[histIndex].m_Reals[2]), palColor1.z * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
- " AtomicAdd(&(histogram[histIndex].m_Reals[3]), palColor1.w * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
+ os <<
+ " AtomicAdd(&(histogram[histIndex].m_Reals[0]), palColor1.x * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"//Always apply opacity, even though it's usually 1.
+ " AtomicAdd(&(histogram[histIndex].m_Reals[1]), palColor1.y * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
+ " AtomicAdd(&(histogram[histIndex].m_Reals[2]), palColor1.z * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n"
+ " AtomicAdd(&(histogram[histIndex].m_Reals[3]), palColor1.w * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
}
}
else
@@ -496,12 +534,12 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin
" realColor.y = (real_t)palColor1.y;\n"
" realColor.z = (real_t)palColor1.z;\n"
" realColor.w = (real_t)palColor1.w;\n"
- " histogram[histIndex].m_Real4 += (realColor * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
+ " histogram[histIndex].m_Real4 += (realColor * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
}
else
{
- os <<
- " histogram[histIndex].m_Real4 += (palColor1 * ember->m_Xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
+ os <<
+ " histogram[histIndex].m_Real4 += (palColor1 * xforms[secondPoint.m_LastXfUsed].m_VizAdjusted);\n";
}
}
@@ -525,6 +563,7 @@ string IterOpenCLKernelCreator::CreateIterKernelString(Ember& ember, strin
" points[pointsIndex].m_ColorX = MwcNextNeg1Pos1(&mwc);\n"
#else
" points[pointsIndex] = firstPoint;\n"
+ " seeds[pointsIndex] = mwc;\n"
#endif
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
"}\n";
diff --git a/Source/EmberCL/OpenCLWrapper.cpp b/Source/EmberCL/OpenCLWrapper.cpp
index 0a043c5..57621a8 100644
--- a/Source/EmberCL/OpenCLWrapper.cpp
+++ b/Source/EmberCL/OpenCLWrapper.cpp
@@ -1121,12 +1121,12 @@ string OpenCLWrapper::DumpInfo()
///
/// OpenCL properties, getters only.
///
-bool OpenCLWrapper::Ok() { return m_Init; }
-bool OpenCLWrapper::Shared() { return m_Shared; }
-cl::Context OpenCLWrapper::Context() { return m_Context; }
-unsigned int OpenCLWrapper::PlatformIndex() { return m_PlatformIndex; }
-unsigned int OpenCLWrapper::DeviceIndex() { return m_DeviceIndex; }
-unsigned int OpenCLWrapper::LocalMemSize() { return m_LocalMemSize; }
+bool OpenCLWrapper::Ok() const { return m_Init; }
+bool OpenCLWrapper::Shared() const { return m_Shared; }
+cl::Context OpenCLWrapper::Context() const { return m_Context; }
+unsigned int OpenCLWrapper::PlatformIndex() const { return m_PlatformIndex; }
+unsigned int OpenCLWrapper::DeviceIndex() const { return m_DeviceIndex; }
+unsigned int OpenCLWrapper::LocalMemSize() const { return m_LocalMemSize; }
///
/// Makes the even grid dims.
diff --git a/Source/EmberCL/OpenCLWrapper.h b/Source/EmberCL/OpenCLWrapper.h
index e060d01..0369c18 100644
--- a/Source/EmberCL/OpenCLWrapper.h
+++ b/Source/EmberCL/OpenCLWrapper.h
@@ -184,12 +184,12 @@ public:
string DumpInfo();
//Accessors.
- bool Ok();
- bool Shared();
- cl::Context Context();
- unsigned int PlatformIndex();
- unsigned int DeviceIndex();
- unsigned int LocalMemSize();
+ bool Ok() const;
+ bool Shared() const;
+ cl::Context Context() const;
+ unsigned int PlatformIndex() const;
+ unsigned int DeviceIndex() const;
+ unsigned int LocalMemSize() const;
static void MakeEvenGridDims(unsigned int blockW, unsigned int blockH, unsigned int& gridW, unsigned int& gridH);
diff --git a/Source/EmberCL/RendererCL.cpp b/Source/EmberCL/RendererCL.cpp
index eebb711..bd43bb6 100644
--- a/Source/EmberCL/RendererCL.cpp
+++ b/Source/EmberCL/RendererCL.cpp
@@ -22,7 +22,9 @@ RendererCL::RendererCL(unsigned int platform, unsigned int device, bool share
//Buffer names.
m_EmberBufferName = "Ember";
+ m_XformsBufferName = "Xforms";
m_ParVarsBufferName = "ParVars";
+ m_SeedsBufferName = "Seeds";
m_DistBufferName = "Dist";
m_CarToRasBufferName = "CarToRas";
m_DEFilterParamsBufferName = "DEFilterParams";
@@ -50,6 +52,13 @@ RendererCL::RendererCL(unsigned int platform, unsigned int device, bool share
m_PaletteFormat.image_channel_data_type = CL_FLOAT;
m_FinalFormat.image_channel_order = CL_RGBA;
m_FinalFormat.image_channel_data_type = CL_UNORM_INT8;//Change if this ever supports 2BPC outputs for PNG.
+ m_Seeds.resize(IterGridKernelCount());
+
+ for (size_t i = 0; i < m_Seeds.size(); i++)
+ {
+ m_Seeds[i].x = m_Rand[0].Rand();
+ m_Seeds[i].y = m_Rand[0].Rand();
+ }
Init(platform, device, shared, outputTexID);//Init OpenCL upon construction and create programs that will not change.
}
@@ -100,14 +109,12 @@ bool RendererCL::Init(unsigned int platform, unsigned int device, bool shared
m_DEOpenCLKernelCreator = DEOpenCLKernelCreator(m_NVidia);
string zeroizeProgram = m_IterOpenCLKernelCreator.ZeroizeKernel();
- string logAssignProgram = m_DEOpenCLKernelCreator.LogScaleAssignDEKernel();
- string logSumProgram = m_DEOpenCLKernelCreator.LogScaleSumDEKernel();//Build a couple of simple programs to ensure OpenCL is working right.
+ string logAssignProgram = m_DEOpenCLKernelCreator.LogScaleAssignDEKernel();//Build a couple of simple programs to ensure OpenCL is working right.
if (b && !(b = m_Wrapper.AddProgram(m_IterOpenCLKernelCreator.ZeroizeEntryPoint(), zeroizeProgram, m_IterOpenCLKernelCreator.ZeroizeEntryPoint(), m_DoublePrecision))) { m_ErrorReport.push_back(loc); }
if (b && !(b = m_Wrapper.AddProgram(m_DEOpenCLKernelCreator.LogScaleAssignDEEntryPoint(), logAssignProgram, m_DEOpenCLKernelCreator.LogScaleAssignDEEntryPoint(), m_DoublePrecision))) { m_ErrorReport.push_back(loc); }
- if (b && !(b = m_Wrapper.AddProgram(m_DEOpenCLKernelCreator.LogScaleSumDEEntryPoint(), logSumProgram, m_DEOpenCLKernelCreator.LogScaleSumDEEntryPoint(), m_DoublePrecision))) { m_ErrorReport.push_back(loc); }
-
if (b && !(b = m_Wrapper.AddAndWriteImage("Palette", CL_MEM_READ_ONLY, m_PaletteFormat, 256, 1, 0, NULL))) { m_ErrorReport.push_back(loc); }
+ if (b && !(b = m_Wrapper.AddAndWriteBuffer(m_SeedsBufferName, (void*)m_Seeds.data(), SizeOf(m_Seeds)))) { m_ErrorReport.push_back(loc); }
//This is the maximum box dimension for density filtering which consists of (blockSize * blockSize) + (2 * filterWidth).
//These blocks must be square, and ideally, 32x32.
@@ -123,6 +130,11 @@ bool RendererCL::Init(unsigned int platform, unsigned int device, bool shared
return b;
}
+///
+/// Set the shared output texture where final accumulation will be written to.
+///
+/// The texture ID of the shared OpenGL texture if shared
+/// True if success, else false.
template
bool RendererCL::SetOutputTexture(GLuint outputTexID)
{
@@ -149,16 +161,28 @@ bool RendererCL::SetOutputTexture(GLuint outputTexID)
/// OpenCL property accessors, getters only.
///
-template unsigned int RendererCL::IterCountPerKernel() { return m_IterCountPerKernel; }
-template unsigned int RendererCL::IterBlocksWide() { return m_IterBlocksWide; }
-template unsigned int RendererCL::IterBlocksHigh() { return m_IterBlocksHigh; }
-template unsigned int RendererCL::IterBlockWidth() { return m_IterBlockWidth; }
-template unsigned int RendererCL::IterBlockHeight() { return m_IterBlockHeight; }
-template unsigned int RendererCL::IterGridWidth() { return IterBlocksWide() * IterBlockWidth(); }
-template unsigned int RendererCL::IterGridHeight() { return IterBlocksHigh() * IterBlockHeight(); }
-template unsigned int RendererCL::TotalIterKernelCount() { return IterGridWidth() * IterGridHeight(); }
-template unsigned int RendererCL::PlatformIndex() { return m_Wrapper.PlatformIndex(); }
-template unsigned int RendererCL::DeviceIndex() { return m_Wrapper.DeviceIndex(); }
+//Iters per kernel/block/grid.
+template unsigned int RendererCL::IterCountPerKernel() const { return m_IterCountPerKernel; }
+template unsigned int RendererCL::IterCountPerBlock() const { return IterCountPerKernel() * IterBlockKernelCount(); }
+template unsigned int RendererCL::IterCountPerGrid() const { return IterCountPerKernel() * IterGridKernelCount(); }
+
+//Kernels per block.
+template unsigned int RendererCL::IterBlockKernelWidth() const { return m_IterBlockWidth; }
+template unsigned int RendererCL::IterBlockKernelHeight() const { return m_IterBlockHeight; }
+template unsigned int RendererCL::IterBlockKernelCount() const { return IterBlockKernelWidth() * IterBlockKernelHeight(); }
+
+//Kernels per grid.
+template unsigned int RendererCL::IterGridKernelWidth() const { return IterGridBlockWidth() * IterBlockKernelWidth(); }
+template unsigned int RendererCL::IterGridKernelHeight() const { return IterGridBlockHeight() * IterBlockKernelHeight(); }
+template unsigned int RendererCL::IterGridKernelCount() const { return IterGridKernelWidth() * IterGridKernelHeight(); }
+
+//Blocks per grid.
+template unsigned int RendererCL::IterGridBlockWidth() const { return m_IterBlocksWide; }
+template unsigned int RendererCL::IterGridBlockHeight() const { return m_IterBlocksHigh; }
+template unsigned int RendererCL::IterGridBlockCount() const { return IterGridBlockWidth() * IterGridBlockHeight(); }
+
+template unsigned int RendererCL::PlatformIndex() { return m_Wrapper.PlatformIndex(); }
+template unsigned int RendererCL::DeviceIndex() { return m_Wrapper.DeviceIndex(); }
///
/// Read the histogram into the host side CPU buffer.
@@ -197,10 +221,10 @@ bool RendererCL::ReadAccum()
template
bool RendererCL::ReadPoints(vector>& vec)
{
- vec.resize(TotalIterKernelCount());//Allocate the memory to read into.
+ vec.resize(IterGridKernelCount());//Allocate the memory to read into.
- if (vec.size() >= TotalIterKernelCount())
- return m_Wrapper.ReadBuffer(m_PointsBufferName, (void*)vec.data(), TotalIterKernelCount() * sizeof(PointCL));
+ if (vec.size() >= IterGridKernelCount())
+ return m_Wrapper.ReadBuffer(m_PointsBufferName, (void*)vec.data(), IterGridKernelCount() * sizeof(PointCL));
return false;
}
@@ -237,6 +261,26 @@ bool RendererCL::WritePoints(vector>& vec)
return m_Wrapper.WriteBuffer(m_PointsBufferName, (void*)vec.data(), vec.size() * sizeof(vec[0]));
}
+#ifdef TEST_CL
+template
+bool RendererCL::WriteRandomPoints()
+{
+ size_t size = IterGridKernelCount();
+ vector> vec(size);
+
+ for (int i = 0; i < size; i++)
+ {
+ vec[i].m_X = m_Rand[0].Frand11();
+ vec[i].m_Y = m_Rand[0].Frand11();
+ vec[i].m_Z = 0;
+ vec[i].m_ColorX = m_Rand[0].Frand01