Some optimizations for capture sharpening
This commit is contained in:
parent
5e0ad767ad
commit
3a207dace7
@@ -152,7 +152,7 @@ inline void gauss5x5div (float** RESTRICT src, float** RESTRICT dst, float** RES
|
|||||||
for (int i = 2; i < tileSize - 2; ++i) {
|
for (int i = 2; i < tileSize - 2; ++i) {
|
||||||
// I tried hand written SSE code but gcc vectorizes better
|
// I tried hand written SSE code but gcc vectorizes better
|
||||||
for (int j = 2; j < tileSize - 2; ++j) {
|
for (int j = 2; j < tileSize - 2; ++j) {
|
||||||
const float val = c21 * (src[i - 2][j - 1] + src[i - 2][j + 1] + src[i - 1][j - 2] + src[i - 1][j + 2] + src[i + 1][j - 2] + src[i + 1][j + 2] + src[i + 2][j - 1] + src[i + 2][j + 1]) +
|
const float val = c21 * ((src[i - 2][j - 1] + src[i - 2][j + 1]) + (src[i - 1][j - 2] + src[i - 1][j + 2]) + (src[i + 1][j - 2] + src[i + 1][j + 2]) + (src[i + 2][j - 1] + src[i + 2][j + 1])) +
|
||||||
c20 * (src[i - 2][j] + src[i][j - 2] + src[i][j + 2] + src[i + 2][j]) +
|
c20 * (src[i - 2][j] + src[i][j - 2] + src[i][j + 2] + src[i + 2][j]) +
|
||||||
c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) +
|
c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) +
|
||||||
c10 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) +
|
c10 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) +
|
||||||
@@ -178,10 +178,10 @@ inline void gauss7x7div(float** RESTRICT src, float** RESTRICT dst, float** REST
|
|||||||
for (int i = 3; i < tileSize - 3; ++i) {
|
for (int i = 3; i < tileSize - 3; ++i) {
|
||||||
// I tried hand written SSE code but gcc vectorizes better
|
// I tried hand written SSE code but gcc vectorizes better
|
||||||
for (int j = 3; j < tileSize - 3; ++j) {
|
for (int j = 3; j < tileSize - 3; ++j) {
|
||||||
const float val = c31 * (src[i - 3][j - 1] + src[i - 3][j + 1] + src[i - 1][j - 3] + src[i - 1][j + 3] + src[i + 1][j - 3] + src[i + 1][j + 3] + src[i + 3][j - 1] + src[i + 3][j + 1]) +
|
const float val = c31 * ((src[i - 3][j - 1] + src[i - 3][j + 1]) + (src[i - 1][j - 3] + src[i - 1][j + 3]) + (src[i + 1][j - 3] + src[i + 1][j + 3]) + (src[i + 3][j - 1] + src[i + 3][j + 1])) +
|
||||||
c30 * (src[i - 3][j] + src[i][j - 3] + src[i][j + 3] + src[i + 3][j]) +
|
c30 * (src[i - 3][j] + src[i][j - 3] + src[i][j + 3] + src[i + 3][j]) +
|
||||||
c22 * (src[i - 2][j - 2] + src[i - 2][j + 2] + src[i + 2][j - 2] + src[i + 2][j + 2]) +
|
c22 * (src[i - 2][j - 2] + src[i - 2][j + 2] + src[i + 2][j - 2] + src[i + 2][j + 2]) +
|
||||||
c21 * (src[i - 2][j - 1] + src[i - 2][j + 1] + src[i - 1][j - 2] + src[i - 1][j + 2] + src[i + 1][j - 2] + src[i + 1][j + 2] + src[i + 2][j - 1] + src[i + 2][j + 1]) +
|
c21 * ((src[i - 2][j - 1] + src[i - 2][j + 1]) + (src[i - 1][j - 2] + src[i - 1][j + 2]) + (src[i + 1][j - 2] + src[i + 1][j + 2]) + (src[i + 2][j - 1] + src[i + 2][j + 1])) +
|
||||||
c20 * (src[i - 2][j] + src[i][j - 2] + src[i][j + 2] + src[i + 2][j]) +
|
c20 * (src[i - 2][j] + src[i][j - 2] + src[i][j + 2] + src[i + 2][j]) +
|
||||||
c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) +
|
c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) +
|
||||||
c10 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) +
|
c10 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) +
|
||||||
@@ -221,7 +221,7 @@ inline void gauss5x5mult (float** RESTRICT src, float** RESTRICT dst, const int
|
|||||||
for (int i = 2; i < tileSize - 2; ++i) {
|
for (int i = 2; i < tileSize - 2; ++i) {
|
||||||
// I tried hand written SSE code but gcc vectorizes better
|
// I tried hand written SSE code but gcc vectorizes better
|
||||||
for (int j = 2; j < tileSize - 2; ++j) {
|
for (int j = 2; j < tileSize - 2; ++j) {
|
||||||
const float val = c21 * (src[i - 2][j - 1] + src[i - 2][j + 1] + src[i - 1][j - 2] + src[i - 1][j + 2] + src[i + 1][j - 2] + src[i + 1][j + 2] + src[i + 2][j - 1] + src[i + 2][j + 1]) +
|
const float val = c21 * ((src[i - 2][j - 1] + src[i - 2][j + 1]) + (src[i - 1][j - 2] + src[i - 1][j + 2]) + (src[i + 1][j - 2] + src[i + 1][j + 2]) + (src[i + 2][j - 1] + src[i + 2][j + 1])) +
|
||||||
c20 * (src[i - 2][j] + src[i][j - 2] + src[i][j + 2] + src[i + 2][j]) +
|
c20 * (src[i - 2][j] + src[i][j - 2] + src[i][j + 2] + src[i + 2][j]) +
|
||||||
c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) +
|
c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) +
|
||||||
c10 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) +
|
c10 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) +
|
||||||
@@ -247,10 +247,10 @@ inline void gauss7x7mult(float** RESTRICT src, float** RESTRICT dst, const int t
|
|||||||
for (int i = 3; i < tileSize - 3; ++i) {
|
for (int i = 3; i < tileSize - 3; ++i) {
|
||||||
// I tried hand written SSE code but gcc vectorizes better
|
// I tried hand written SSE code but gcc vectorizes better
|
||||||
for (int j = 3; j < tileSize - 3; ++j) {
|
for (int j = 3; j < tileSize - 3; ++j) {
|
||||||
const float val = c31 * (src[i - 3][j - 1] + src[i - 3][j + 1] + src[i - 1][j - 3] + src[i - 1][j + 3] + src[i + 1][j - 3] + src[i + 1][j + 3] + src[i + 3][j - 1] + src[i + 3][j + 1]) +
|
const float val = c31 * ((src[i - 3][j - 1] + src[i - 3][j + 1]) + (src[i - 1][j - 3] + src[i - 1][j + 3]) + (src[i + 1][j - 3] + src[i + 1][j + 3]) + (src[i + 3][j - 1] + src[i + 3][j + 1])) +
|
||||||
c30 * (src[i - 3][j] + src[i][j - 3] + src[i][j + 3] + src[i + 3][j]) +
|
c30 * (src[i - 3][j] + src[i][j - 3] + src[i][j + 3] + src[i + 3][j]) +
|
||||||
c22 * (src[i - 2][j - 2] + src[i - 2][j + 2] + src[i + 2][j - 2] + src[i + 2][j + 2]) +
|
c22 * (src[i - 2][j - 2] + src[i - 2][j + 2] + src[i + 2][j - 2] + src[i + 2][j + 2]) +
|
||||||
c21 * (src[i - 2][j - 1] + src[i - 2][j + 1] + src[i - 1][j - 2] + src[i - 1][j + 2] + src[i + 1][j - 2] + src[i + 1][j + 2] + src[i + 2][j - 1] + src[i + 2][j + 1]) +
|
c21 * ((src[i - 2][j - 1] + src[i - 2][j + 1]) + (src[i - 1][j - 2] + src[i - 1][j + 2]) + (src[i + 1][j - 2] + src[i + 1][j + 2]) + (src[i + 2][j - 1] + src[i + 2][j + 1])) +
|
||||||
c20 * (src[i - 2][j] + src[i][j - 2] + src[i][j + 2] + src[i + 2][j]) +
|
c20 * (src[i - 2][j] + src[i][j - 2] + src[i][j + 2] + src[i + 2][j]) +
|
||||||
c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) +
|
c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) +
|
||||||
c10 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) +
|
c10 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) +
|
||||||
@@ -563,19 +563,25 @@ BENCHFUN
|
|||||||
// fill tiles
|
// fill tiles
|
||||||
if (endOfRow || endOfCol) {
|
if (endOfRow || endOfCol) {
|
||||||
// special handling for small tiles at end of row or column
|
// special handling for small tiles at end of row or column
|
||||||
|
float maxVal = 0.f;
|
||||||
if (checkIterStop) {
|
if (checkIterStop) {
|
||||||
float maxVal = 0.f;
|
|
||||||
for (int k = 0, ii = endOfCol ? H - fullTileSize + border : i; k < tileSize; ++k, ++ii) {
|
for (int k = 0, ii = endOfCol ? H - fullTileSize + border : i; k < tileSize; ++k, ++ii) {
|
||||||
for (int l = 0, jj = endOfRow ? W - fullTileSize + border : j; l < tileSize; ++l, ++jj) {
|
for (int l = 0, jj = endOfRow ? W - fullTileSize + border : j; l < tileSize; ++l, ++jj) {
|
||||||
iterCheck[k][l] = oldLuminance[ii][jj] * clipmask[ii][jj] * 0.5f;
|
iterCheck[k][l] = oldLuminance[ii][jj] * clipmask[ii][jj] * 0.5f;
|
||||||
maxVal = std::max(maxVal, clipmask[ii][jj]);
|
maxVal = std::max(maxVal, clipmask[ii][jj]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (maxVal < minBlend) {
|
} else {
|
||||||
// no pixel of the tile has a blend factor >= minBlend => skip the tile
|
for (int k = 0, ii = endOfCol ? H - fullTileSize + border : i; k < tileSize; ++k, ++ii) {
|
||||||
continue;
|
for (int l = 0, jj = endOfRow ? W - fullTileSize + border : j; l < tileSize; ++l, ++jj) {
|
||||||
|
maxVal = std::max(maxVal, clipmask[ii][jj]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (maxVal < minBlend) {
|
||||||
|
// no pixel of the tile has a blend factor >= minBlend => skip the tile
|
||||||
|
continue;
|
||||||
|
}
|
||||||
for (int k = 0, ii = endOfCol ? H - fullTileSize : i - border; k < fullTileSize; ++k, ++ii) {
|
for (int k = 0, ii = endOfCol ? H - fullTileSize : i - border; k < fullTileSize; ++k, ++ii) {
|
||||||
for (int l = 0, jj = endOfRow ? W - fullTileSize : j - border; l < fullTileSize; ++l, ++jj) {
|
for (int l = 0, jj = endOfRow ? W - fullTileSize : j - border; l < fullTileSize; ++l, ++jj) {
|
||||||
tmpIThr[k][l] = oldLuminance[ii][jj];
|
tmpIThr[k][l] = oldLuminance[ii][jj];
|
||||||
@@ -583,19 +589,25 @@ BENCHFUN
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
float maxVal = 0.f;
|
||||||
if (checkIterStop) {
|
if (checkIterStop) {
|
||||||
float maxVal = 0.f;
|
|
||||||
for (int ii = 0; ii < tileSize; ++ii) {
|
for (int ii = 0; ii < tileSize; ++ii) {
|
||||||
for (int jj = 0; jj < tileSize; ++jj) {
|
for (int jj = 0; jj < tileSize; ++jj) {
|
||||||
iterCheck[ii][jj] = oldLuminance[i + ii][j + jj] * clipmask[i + ii][j + jj] * 0.5f;
|
iterCheck[ii][jj] = oldLuminance[i + ii][j + jj] * clipmask[i + ii][j + jj] * 0.5f;
|
||||||
maxVal = std::max(maxVal, clipmask[i + ii][j + jj]);
|
maxVal = std::max(maxVal, clipmask[i + ii][j + jj]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (maxVal < minBlend) {
|
} else {
|
||||||
// no pixel of the tile has a blend factor >= minBlend => skip the tile
|
for (int ii = 0; ii < tileSize; ++ii) {
|
||||||
continue;
|
for (int jj = 0; jj < tileSize; ++jj) {
|
||||||
|
maxVal = std::max(maxVal, clipmask[i + ii][j + jj]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (maxVal < minBlend) {
|
||||||
|
// no pixel of the tile has a blend factor >= minBlend => skip the tile
|
||||||
|
continue;
|
||||||
|
}
|
||||||
for (int ii = i; ii < i + fullTileSize; ++ii) {
|
for (int ii = i; ii < i + fullTileSize; ++ii) {
|
||||||
for (int jj = j; jj < j + fullTileSize; ++jj) {
|
for (int jj = j; jj < j + fullTileSize; ++jj) {
|
||||||
tmpIThr[ii - i][jj - j] = oldLuminance[ii - border][jj - border];
|
tmpIThr[ii - i][jj - j] = oldLuminance[ii - border][jj - border];
|
||||||
|
Loading…
x
Reference in New Issue
Block a user