Some optimizations for capture sharpening

This commit is contained in:
Ingo Weyrich 2019-12-18 17:33:41 +01:00
parent 5e0ad767ad
commit 3a207dace7

View File

@ -152,7 +152,7 @@ inline void gauss5x5div (float** RESTRICT src, float** RESTRICT dst, float** RES
for (int i = 2; i < tileSize - 2; ++i) { for (int i = 2; i < tileSize - 2; ++i) {
// I tried hand written SSE code but gcc vectorizes better // I tried hand written SSE code but gcc vectorizes better
for (int j = 2; j < tileSize - 2; ++j) { for (int j = 2; j < tileSize - 2; ++j) {
const float val = c21 * (src[i - 2][j - 1] + src[i - 2][j + 1] + src[i - 1][j - 2] + src[i - 1][j + 2] + src[i + 1][j - 2] + src[i + 1][j + 2] + src[i + 2][j - 1] + src[i + 2][j + 1]) + const float val = c21 * ((src[i - 2][j - 1] + src[i - 2][j + 1]) + (src[i - 1][j - 2] + src[i - 1][j + 2]) + (src[i + 1][j - 2] + src[i + 1][j + 2]) + (src[i + 2][j - 1] + src[i + 2][j + 1])) +
c20 * (src[i - 2][j] + src[i][j - 2] + src[i][j + 2] + src[i + 2][j]) + c20 * (src[i - 2][j] + src[i][j - 2] + src[i][j + 2] + src[i + 2][j]) +
c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) + c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) +
c10 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) + c10 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) +
@ -178,10 +178,10 @@ inline void gauss7x7div(float** RESTRICT src, float** RESTRICT dst, float** REST
for (int i = 3; i < tileSize - 3; ++i) { for (int i = 3; i < tileSize - 3; ++i) {
// I tried hand written SSE code but gcc vectorizes better // I tried hand written SSE code but gcc vectorizes better
for (int j = 3; j < tileSize - 3; ++j) { for (int j = 3; j < tileSize - 3; ++j) {
const float val = c31 * (src[i - 3][j - 1] + src[i - 3][j + 1] + src[i - 1][j - 3] + src[i - 1][j + 3] + src[i + 1][j - 3] + src[i + 1][j + 3] + src[i + 3][j - 1] + src[i + 3][j + 1]) + const float val = c31 * ((src[i - 3][j - 1] + src[i - 3][j + 1]) + (src[i - 1][j - 3] + src[i - 1][j + 3]) + (src[i + 1][j - 3] + src[i + 1][j + 3]) + (src[i + 3][j - 1] + src[i + 3][j + 1])) +
c30 * (src[i - 3][j] + src[i][j - 3] + src[i][j + 3] + src[i + 3][j]) + c30 * (src[i - 3][j] + src[i][j - 3] + src[i][j + 3] + src[i + 3][j]) +
c22 * (src[i - 2][j - 2] + src[i - 2][j + 2] + src[i + 2][j - 2] + src[i + 2][j + 2]) + c22 * (src[i - 2][j - 2] + src[i - 2][j + 2] + src[i + 2][j - 2] + src[i + 2][j + 2]) +
c21 * (src[i - 2][j - 1] + src[i - 2][j + 1] + src[i - 1][j - 2] + src[i - 1][j + 2] + src[i + 1][j - 2] + src[i + 1][j + 2] + src[i + 2][j - 1] + src[i + 2][j + 1]) + c21 * ((src[i - 2][j - 1] + src[i - 2][j + 1]) + (src[i - 1][j - 2] + src[i - 1][j + 2]) + (src[i + 1][j - 2] + src[i + 1][j + 2]) + (src[i + 2][j - 1] + src[i + 2][j + 1])) +
c20 * (src[i - 2][j] + src[i][j - 2] + src[i][j + 2] + src[i + 2][j]) + c20 * (src[i - 2][j] + src[i][j - 2] + src[i][j + 2] + src[i + 2][j]) +
c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) + c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) +
c10 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) + c10 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) +
@ -221,7 +221,7 @@ inline void gauss5x5mult (float** RESTRICT src, float** RESTRICT dst, const int
for (int i = 2; i < tileSize - 2; ++i) { for (int i = 2; i < tileSize - 2; ++i) {
// I tried hand written SSE code but gcc vectorizes better // I tried hand written SSE code but gcc vectorizes better
for (int j = 2; j < tileSize - 2; ++j) { for (int j = 2; j < tileSize - 2; ++j) {
const float val = c21 * (src[i - 2][j - 1] + src[i - 2][j + 1] + src[i - 1][j - 2] + src[i - 1][j + 2] + src[i + 1][j - 2] + src[i + 1][j + 2] + src[i + 2][j - 1] + src[i + 2][j + 1]) + const float val = c21 * ((src[i - 2][j - 1] + src[i - 2][j + 1]) + (src[i - 1][j - 2] + src[i - 1][j + 2]) + (src[i + 1][j - 2] + src[i + 1][j + 2]) + (src[i + 2][j - 1] + src[i + 2][j + 1])) +
c20 * (src[i - 2][j] + src[i][j - 2] + src[i][j + 2] + src[i + 2][j]) + c20 * (src[i - 2][j] + src[i][j - 2] + src[i][j + 2] + src[i + 2][j]) +
c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) + c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) +
c10 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) + c10 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) +
@ -247,10 +247,10 @@ inline void gauss7x7mult(float** RESTRICT src, float** RESTRICT dst, const int t
for (int i = 3; i < tileSize - 3; ++i) { for (int i = 3; i < tileSize - 3; ++i) {
// I tried hand written SSE code but gcc vectorizes better // I tried hand written SSE code but gcc vectorizes better
for (int j = 3; j < tileSize - 3; ++j) { for (int j = 3; j < tileSize - 3; ++j) {
const float val = c31 * (src[i - 3][j - 1] + src[i - 3][j + 1] + src[i - 1][j - 3] + src[i - 1][j + 3] + src[i + 1][j - 3] + src[i + 1][j + 3] + src[i + 3][j - 1] + src[i + 3][j + 1]) + const float val = c31 * ((src[i - 3][j - 1] + src[i - 3][j + 1]) + (src[i - 1][j - 3] + src[i - 1][j + 3]) + (src[i + 1][j - 3] + src[i + 1][j + 3]) + (src[i + 3][j - 1] + src[i + 3][j + 1])) +
c30 * (src[i - 3][j] + src[i][j - 3] + src[i][j + 3] + src[i + 3][j]) + c30 * (src[i - 3][j] + src[i][j - 3] + src[i][j + 3] + src[i + 3][j]) +
c22 * (src[i - 2][j - 2] + src[i - 2][j + 2] + src[i + 2][j - 2] + src[i + 2][j + 2]) + c22 * (src[i - 2][j - 2] + src[i - 2][j + 2] + src[i + 2][j - 2] + src[i + 2][j + 2]) +
c21 * (src[i - 2][j - 1] + src[i - 2][j + 1] + src[i - 1][j - 2] + src[i - 1][j + 2] + src[i + 1][j - 2] + src[i + 1][j + 2] + src[i + 2][j - 1] + src[i + 2][j + 1]) + c21 * ((src[i - 2][j - 1] + src[i - 2][j + 1]) + (src[i - 1][j - 2] + src[i - 1][j + 2]) + (src[i + 1][j - 2] + src[i + 1][j + 2]) + (src[i + 2][j - 1] + src[i + 2][j + 1])) +
c20 * (src[i - 2][j] + src[i][j - 2] + src[i][j + 2] + src[i + 2][j]) + c20 * (src[i - 2][j] + src[i][j - 2] + src[i][j + 2] + src[i + 2][j]) +
c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) + c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) +
c10 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) + c10 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) +
@ -563,19 +563,25 @@ BENCHFUN
// fill tiles // fill tiles
if (endOfRow || endOfCol) { if (endOfRow || endOfCol) {
// special handling for small tiles at end of row or column // special handling for small tiles at end of row or column
float maxVal = 0.f;
if (checkIterStop) { if (checkIterStop) {
float maxVal = 0.f;
for (int k = 0, ii = endOfCol ? H - fullTileSize + border : i; k < tileSize; ++k, ++ii) { for (int k = 0, ii = endOfCol ? H - fullTileSize + border : i; k < tileSize; ++k, ++ii) {
for (int l = 0, jj = endOfRow ? W - fullTileSize + border : j; l < tileSize; ++l, ++jj) { for (int l = 0, jj = endOfRow ? W - fullTileSize + border : j; l < tileSize; ++l, ++jj) {
iterCheck[k][l] = oldLuminance[ii][jj] * clipmask[ii][jj] * 0.5f; iterCheck[k][l] = oldLuminance[ii][jj] * clipmask[ii][jj] * 0.5f;
maxVal = std::max(maxVal, clipmask[ii][jj]); maxVal = std::max(maxVal, clipmask[ii][jj]);
} }
} }
if (maxVal < minBlend) { } else {
// no pixel of the tile has a blend factor >= minBlend => skip the tile for (int k = 0, ii = endOfCol ? H - fullTileSize + border : i; k < tileSize; ++k, ++ii) {
continue; for (int l = 0, jj = endOfRow ? W - fullTileSize + border : j; l < tileSize; ++l, ++jj) {
maxVal = std::max(maxVal, clipmask[ii][jj]);
}
} }
} }
if (maxVal < minBlend) {
// no pixel of the tile has a blend factor >= minBlend => skip the tile
continue;
}
for (int k = 0, ii = endOfCol ? H - fullTileSize : i - border; k < fullTileSize; ++k, ++ii) { for (int k = 0, ii = endOfCol ? H - fullTileSize : i - border; k < fullTileSize; ++k, ++ii) {
for (int l = 0, jj = endOfRow ? W - fullTileSize : j - border; l < fullTileSize; ++l, ++jj) { for (int l = 0, jj = endOfRow ? W - fullTileSize : j - border; l < fullTileSize; ++l, ++jj) {
tmpIThr[k][l] = oldLuminance[ii][jj]; tmpIThr[k][l] = oldLuminance[ii][jj];
@ -583,19 +589,25 @@ BENCHFUN
} }
} }
} else { } else {
float maxVal = 0.f;
if (checkIterStop) { if (checkIterStop) {
float maxVal = 0.f;
for (int ii = 0; ii < tileSize; ++ii) { for (int ii = 0; ii < tileSize; ++ii) {
for (int jj = 0; jj < tileSize; ++jj) { for (int jj = 0; jj < tileSize; ++jj) {
iterCheck[ii][jj] = oldLuminance[i + ii][j + jj] * clipmask[i + ii][j + jj] * 0.5f; iterCheck[ii][jj] = oldLuminance[i + ii][j + jj] * clipmask[i + ii][j + jj] * 0.5f;
maxVal = std::max(maxVal, clipmask[i + ii][j + jj]); maxVal = std::max(maxVal, clipmask[i + ii][j + jj]);
} }
} }
if (maxVal < minBlend) { } else {
// no pixel of the tile has a blend factor >= minBlend => skip the tile for (int ii = 0; ii < tileSize; ++ii) {
continue; for (int jj = 0; jj < tileSize; ++jj) {
maxVal = std::max(maxVal, clipmask[i + ii][j + jj]);
}
} }
} }
if (maxVal < minBlend) {
// no pixel of the tile has a blend factor >= minBlend => skip the tile
continue;
}
for (int ii = i; ii < i + fullTileSize; ++ii) { for (int ii = i; ii < i + fullTileSize; ++ii) {
for (int jj = j; jj < j + fullTileSize; ++jj) { for (int jj = j; jj < j + fullTileSize; ++jj) {
tmpIThr[ii - i][jj - j] = oldLuminance[ii - border][jj - border]; tmpIThr[ii - i][jj - j] = oldLuminance[ii - border][jj - border];