Features:

--Added panorama1 and panorama2 variations.

Bug fixes:
--crackle had a bug with Nvidia GPUs.

Code changes:
--crackle now uses real_t* for cache rather than real2*. Using real2* for the cache is what was causing the bug.
--Make the local offsets array used in crackle a precalc since it's the same for all. This reduces register pressure.
--Get rid of all usages of real3, just to be safe since Nvidia doesn't like them.
--#define TOTAL_GLOBAL_SIZE_END in the OpenCL iteration kernel just for debugging purposes to see how large the parvars buffer is.
This commit is contained in:
Person
2017-08-16 17:33:11 -07:00
parent d6d121ac95
commit 59f5bffc3c
8 changed files with 142 additions and 28 deletions

View File

@ -3768,22 +3768,24 @@ public:
virtual vector<string> OpenCLGlobalDataNames() const override
{
return vector<string> { "NOISE_INDEX", "NOISE_POINTS" };
return vector<string> { "NOISE_INDEX", "NOISE_POINTS", "OFFSETS" };
}
virtual string OpenCLFuncsString() const override
{
ostringstream os;
os <<
"static void Position(__constant real2* cache, __global real_t* p, __global real_t* grad, int x, int y, real_t z, real_t s, real_t d, real2* v)\n"
"static void Position(__constant real_t* cache, __global real_t* p, __global real_t* grad, int x, int y, real_t z, real_t s, real_t d, real2* v)\n"
"{\n"
" if (abs(x) <= " << CACHE_NUM << " && abs(y) <= " << CACHE_NUM << ")\n"
" {\n"
" *v = cache[((x + " << CACHE_NUM << ") * " << CACHE_WIDTH << ") + (y + " << CACHE_NUM << ")];\n"
" int index = (((x + " << CACHE_NUM << ") * " << CACHE_WIDTH << ") + (y + " << CACHE_NUM << ")) * 2;\n"
" (*v).x = cache[index];\n"
" (*v).y = cache[index + 1];\n"
" }\n"
" else\n"
" {\n"
" real3 e, f;\n"
" real4 e, f;\n"
" e.x = x * 2.5;\n"
" e.y = y * 2.5;\n"
" e.z = z * 2.5;\n"
@ -3818,6 +3820,7 @@ public:
<< "\t\treal2 u, dO;\n"
<< "\t\tint2 cv;\n"
<< "\t\treal2 p[" << VORONOI_MAXPOINTS << "];\n"
<< "\t\t__global real2* offset = (__global real2*)(globalShared + OFFSETS);\n"
<< "\n"
<< "\t\tif (" << cellSize << " == 0)\n"
<< "\t\t return;\n"
@ -3833,31 +3836,29 @@ public:
<< "\t\t{\n"
<< "\t\t for (dj = -1; dj < 2; dj++)\n"
<< "\t\t {\n"
<< "\t\t Position((__constant real2*)(&" << cache << "), globalShared + NOISE_INDEX, globalShared + NOISE_POINTS, cv.x + di, cv.y + dj, " << z << ", " << halfCellSize << ", " << distort << ", &p[i]); \n"
<< "\t\t Position(&" << cache << ", globalShared + NOISE_INDEX, globalShared + NOISE_POINTS, cv.x + di, cv.y + dj, " << z << ", " << halfCellSize << ", " << distort << ", &p[i]); \n"
<< "\t\t i++;\n"
<< "\t\t }\n"
<< "\t\t}\n"
<< "\n"
<< "\t\tint q = Closest(p, 9, &u);\n"
<< "\t\tint2 offset[9] = { { -1, -1 }, { -1, 0 }, { -1, 1 }, \n"
<< "\t\t{ 0, -1 }, { 0, 0 }, { 0, 1 },\n"
<< "\t\t{ 1, -1 }, { 1, 0 }, { 1, 1 } };\n"
<< "\t\tcv += offset[q];\n"
<< "\t\tcv.x += (int)offset[q].x;\n"
<< "\t\tcv.y += (int)offset[q].y;\n"
<< "\t\ti = 0;\n"
<< "\n"
<< "\t\tfor (di = -1; di < 2; di++)\n"
<< "\t\t{\n"
<< "\t\t for (dj = -1; dj < 2; dj++)\n"
<< "\t\t {\n"
<< "\t\t Position((__constant real2*)(&" << cache << "), globalShared + NOISE_INDEX, globalShared + NOISE_POINTS, cv.x + di, cv.y + dj, " << z << ", " << halfCellSize << ", " << distort << ", &p[i]);\n"
<< "\t\t Position(&" << cache << ", globalShared + NOISE_INDEX, globalShared + NOISE_POINTS, cv.x + di, cv.y + dj, " << z << ", " << halfCellSize << ", " << distort << ", &p[i]);\n"
<< "\t\t i++;\n"
<< "\t\t }\n"
<< "\t\t}\n"
<< "\n"
<< "\t\tl = Voronoi(p, 9, 4, &u);\n"
<< "\t\tl = Zeps(Voronoi(p, 9, 4, &u));\n"
<< "\t\tdO = u - p[4];\n"
<< "\t\ttrgL = pow(fabs(Zeps(l)), " << power << ") * " << scale << ";\n"
<< "\t\tr = trgL / Zeps(l);\n"
<< "\t\ttrgL = pow(fabs(l), " << power << ") * " << scale << ";\n"
<< "\t\tr = trgL / l;\n"
<< "\t\tdO *= r;\n"
<< "\t\tdO += p[4];\n"
<< "\t\tvOut.x = xform->m_VariationWeights[" << varIndex << "] * dO.x;\n"