diff --git a/rtengine/capturesharpening.cc b/rtengine/capturesharpening.cc index 357a8593f..b1e1657b4 100644 --- a/rtengine/capturesharpening.cc +++ b/rtengine/capturesharpening.cc @@ -99,33 +99,48 @@ void compute3x3kernel(float sigma, float kernel[3][3]) { } } -inline void gauss3x3div (float** RESTRICT src, float** RESTRICT dst, float** RESTRICT divBuffer, const int W, const int H, const float kernel[3][3]) +inline void initTile(float** dst, const int tileSize) +{ + + // first rows + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < tileSize; ++j) { + dst[i][j] = 1.f; + } + } + + // left and right border + for (int i = 3; i < tileSize - 3; ++i) { + dst[i][0] = dst[i][1] = dst[i][2] = 1.f; + dst[i][tileSize - 3] = dst[i][tileSize - 2] = dst[i][tileSize - 1] = 1.f; + } + + // last rows + for (int i = tileSize - 3 ; i < tileSize; ++i) { + for (int j = 0; j < tileSize; ++j) { + dst[i][j] = 1.f; + } + } +} + +inline void gauss3x3div (float** RESTRICT src, float** RESTRICT dst, float** RESTRICT divBuffer, const int tileSize, const float kernel[3][3]) { const float c11 = kernel[0][0]; const float c10 = kernel[0][1]; const float c00 = kernel[1][1]; - for (int i = 1; i < H - 1; i++) { - dst[i][0] = 1.f; - for (int j = 1; j < W - 1; j++) { + for (int i = 1; i < tileSize - 1; i++) { + for (int j = 1; j < tileSize - 1; j++) { const float val = c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) + c10 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) + c00 * src[i][j]; dst[i][j] = divBuffer[i][j] / std::max(val, 0.00001f); } - dst[i][W - 1] = 1.f; - } - // first and last row - for (int j = 0; j < W; ++j) { - dst[0][j] = 1.f; - } - for (int j = 0; j < W; ++j) { - dst[H - 1][j] = 1.f; } } -inline void gauss5x5div (float** RESTRICT src, float** RESTRICT dst, float** RESTRICT divBuffer, const int W, const int H, const float kernel[5][5]) +inline void gauss5x5div (float** RESTRICT src, float** RESTRICT dst, float** RESTRICT divBuffer, const int tileSize, const float kernel[5][5]) { const float c21 = kernel[0][1]; @@ -134,10 +149,9 @@ inline void gauss5x5div (float** RESTRICT src, float** RESTRICT dst, float** RES const float c10 = kernel[1][2]; const float c00 = kernel[2][2]; - for (int i = 2; i < H - 2; ++i) { - dst[i][0] = dst[i][1] = 1.f; + for (int i = 2; i < tileSize - 2; ++i) { // I tried hand written SSE code but gcc vectorizes better - for (int j = 2; j < W - 2; ++j) { + for (int j = 2; j < tileSize - 2; ++j) { const float val = c21 * (src[i - 2][j - 1] + src[i - 2][j + 1] + src[i - 1][j - 2] + src[i - 1][j + 2] + src[i + 1][j - 2] + src[i + 1][j + 2] + src[i + 2][j - 1] + src[i + 2][j + 1]) + c20 * (src[i - 2][j] + src[i][j - 2] + src[i][j + 2] + src[i + 2][j]) + c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) + @@ -146,23 +160,10 @@ inline void gauss5x5div (float** RESTRICT src, float** RESTRICT dst, float** RES dst[i][j] = divBuffer[i][j] / std::max(val, 0.00001f); } - dst[i][W - 2] = dst[i][W - 1] = 1.f; - } - - // first and last rows - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < W; ++j) { - dst[i][j] = 1.f; - } - } - for (int i = H - 2 ; i < H; ++i) { - for (int j = 0; j < W; ++j) { - dst[i][j] = 1.f; - } } } -inline void gauss7x7div(float** RESTRICT src, float** RESTRICT dst, float** RESTRICT divBuffer, const int W, const int H, const float kernel[7][7]) +inline void gauss7x7div(float** RESTRICT src, float** RESTRICT dst, float** RESTRICT divBuffer, const int tileSize, const float kernel[7][7]) { const float c31 = kernel[0][2]; @@ -174,10 +175,9 @@ inline void gauss7x7div(float** RESTRICT src, float** RESTRICT dst, float** REST const float c10 = kernel[2][3]; const float c00 = kernel[3][3]; - for (int i = 3; i < H - 3; ++i) { - dst[i][0] = dst[i][1] = dst[i][2] = 1.f; + for (int i = 3; i < tileSize - 3; ++i) { // I tried hand written SSE code but gcc vectorizes better - for (int j = 3; j < W - 3; ++j) { + for (int j = 3; j < tileSize - 3; ++j) { const float val = c31 * (src[i - 3][j - 1] + src[i - 3][j + 1] + src[i - 1][j - 3] + src[i - 1][j + 3] + src[i + 1][j - 3] + src[i + 1][j + 3] + src[i + 3][j - 1] + src[i + 3][j + 1]) + c30 * (src[i - 3][j] + src[i][j - 3] + src[i][j + 3] + src[i + 3][j]) + c22 * (src[i - 2][j - 2] + src[i - 2][j + 2] + src[i + 2][j - 2] + src[i + 2][j + 2]) + @@ -189,30 +189,17 @@ inline void gauss7x7div(float** RESTRICT src, float** RESTRICT dst, float** REST dst[i][j] = divBuffer[i][j] / std::max(val, 0.00001f); } - dst[i][W - 3] = dst[i][W - 2] = dst[i][W - 1] = 1.f; - } - - // first and last rows - for (int i = 0; i < 3; ++i) { - for (int j = 0; j < W; ++j) { - dst[i][j] = 1.f; - } - } - for (int i = H - 3 ; i < H; ++i) { - for (int j = 0; j < W; ++j) { - dst[i][j] = 1.f; - } } } -inline void gauss3x3mult(float** RESTRICT src, float** RESTRICT dst, const int W, const int H, const float kernel[3][3]) +inline void gauss3x3mult(float** RESTRICT src, float** RESTRICT dst, const int tileSize, const float kernel[3][3]) { const float c11 = kernel[0][0]; const float c10 = kernel[0][1]; const float c00 = kernel[1][1]; - for (int i = 1; i < H - 1; i++) { - for (int j = 1; j < W - 1; j++) { + for (int i = 1; i < tileSize - 1; i++) { + for (int j = 1; j < tileSize - 1; j++) { const float val = c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) + c10 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) + c00 * src[i][j]; @@ -222,7 +209,7 @@ inline void gauss3x3mult(float** RESTRICT src, float** RESTRICT dst, const int W } -inline void gauss5x5mult (float** RESTRICT src, float** RESTRICT dst, const int W, const int H, const float kernel[5][5]) +inline void gauss5x5mult (float** RESTRICT src, float** RESTRICT dst, const int tileSize, const float kernel[5][5]) { const float c21 = kernel[0][1]; @@ -231,9 +218,9 @@ inline void gauss5x5mult (float** RESTRICT src, float** RESTRICT dst, const int const float c10 = kernel[1][2]; const float c00 = kernel[2][2]; - for (int i = 2; i < H - 2; ++i) { + for (int i = 2; i < tileSize - 2; ++i) { // I tried hand written SSE code but gcc vectorizes better - for (int j = 2; j < W - 2; ++j) { + for (int j = 2; j < tileSize - 2; ++j) { const float val = c21 * (src[i - 2][j - 1] + src[i - 2][j + 1] + src[i - 1][j - 2] + src[i - 1][j + 2] + src[i + 1][j - 2] + src[i + 1][j + 2] + src[i + 2][j - 1] + src[i + 2][j + 1]) + c20 * (src[i - 2][j] + src[i][j - 2] + src[i][j + 2] + src[i + 2][j]) + c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) + @@ -245,7 +232,7 @@ inline void gauss5x5mult (float** RESTRICT src, float** RESTRICT dst, const int } } -inline void gauss7x7mult(float** RESTRICT src, float** RESTRICT dst, const int W, const int H, const float kernel[7][7]) +inline void gauss7x7mult(float** RESTRICT src, float** RESTRICT dst, const int tileSize, const float kernel[7][7]) { const float c31 = kernel[0][2]; @@ -257,9 +244,9 @@ inline void gauss7x7mult(float** RESTRICT src, float** RESTRICT dst, const int W const float c10 = kernel[2][3]; const float c00 = kernel[3][3]; - for (int i = 3; i < H - 3; ++i) { + for (int i = 3; i < tileSize - 3; ++i) { // I tried hand written SSE code but gcc vectorizes better - for (int j = 3; j < W - 3; ++j) { + for (int j = 3; j < tileSize - 3; ++j) { const float val = c31 * (src[i - 3][j - 1] + src[i - 3][j + 1] + src[i - 1][j - 3] + src[i - 1][j + 3] + src[i + 1][j - 3] + src[i + 1][j + 3] + src[i + 3][j - 1] + src[i + 3][j + 1]) + c30 * (src[i - 3][j] + src[i][j - 3] + src[i][j + 3] + src[i + 3][j]) + c22 * (src[i - 2][j - 2] + src[i - 2][j + 2] + src[i + 2][j - 2] + src[i + 2][j + 2]) + @@ -543,6 +530,7 @@ BENCHFUN array2D tmpIThr(fullTileSize, fullTileSize); array2D tmpThr(fullTileSize, fullTileSize); array2D lumThr(fullTileSize, fullTileSize); + initTile(tmpThr, fullTileSize); #ifdef _OPENMP #pragma omp for schedule(dynamic,2) collapse(2) #endif @@ -570,14 +558,14 @@ BENCHFUN if (is3x3) { for (int k = 0; k < iterations; ++k) { // apply 3x3 gaussian blur and divide luminance by result of gaussian blur - gauss3x3div(tmpIThr, tmpThr, lumThr, fullTileSize, fullTileSize, kernel3); - gauss3x3mult(tmpThr, tmpIThr, fullTileSize, fullTileSize, kernel3); + gauss3x3div(tmpIThr, tmpThr, lumThr, fullTileSize, kernel3); + gauss3x3mult(tmpThr, tmpIThr, fullTileSize, kernel3); } } else if (is5x5) { for (int k = 0; k < iterations; ++k) { // apply 5x5 gaussian blur and divide luminance by result of gaussian blur - gauss5x5div(tmpIThr, tmpThr, lumThr, fullTileSize, fullTileSize, kernel5); - gauss5x5mult(tmpThr, tmpIThr, fullTileSize, fullTileSize, kernel5); + gauss5x5div(tmpIThr, tmpThr, lumThr, fullTileSize, kernel5); + gauss5x5mult(tmpThr, tmpIThr, fullTileSize, kernel5); } } else { if (sigmaCornerOffset != 0.0) { @@ -586,17 +574,17 @@ BENCHFUN if (sigmaTile >= 0.4f) { float lkernel7[7][7]; compute7x7kernel(static_cast(sigma) + distanceFactor * distance, lkernel7); - for (int k = 0; k < iterations - 1; ++k) { + for (int k = 0; k < iterations; ++k) { // apply 7x7 gaussian blur and divide luminance by result of gaussian blur - gauss7x7div(tmpIThr, tmpThr, lumThr, fullTileSize, fullTileSize, lkernel7); - gauss7x7mult(tmpThr, tmpIThr, fullTileSize, fullTileSize, lkernel7); + gauss7x7div(tmpIThr, tmpThr, lumThr, fullTileSize, lkernel7); + gauss7x7mult(tmpThr, tmpIThr, fullTileSize, lkernel7); } } } else { for (int k = 0; k < iterations; ++k) { // apply 7x7 gaussian blur and divide luminance by result of gaussian blur - gauss7x7div(tmpIThr, tmpThr, lumThr, fullTileSize, fullTileSize, kernel7); - gauss7x7mult(tmpThr, tmpIThr, fullTileSize, fullTileSize, kernel7); + gauss7x7div(tmpIThr, tmpThr, lumThr, fullTileSize, kernel7); + gauss7x7mult(tmpThr, tmpIThr, fullTileSize, kernel7); } } } @@ -643,7 +631,7 @@ void RawImageSource::captureSharpening(const procparams::CaptureSharpeningParams plistener->setProgress(0.0); } BENCHFUN - const float xyz_rgb[3][3] = { // XYZ from RGB + constexpr float xyz_rgb[3][3] = { // XYZ from RGB { 0.412453, 0.357580, 0.180423 }, { 0.212671, 0.715160, 0.072169 }, { 0.019334, 0.119193, 0.950227 } @@ -659,6 +647,8 @@ BENCHFUN array2D clipMask(W, H); constexpr float clipLimit = 0.95f; + constexpr float maxSigma = 1.15f; + if (getSensorType() == ST_BAYER) { const float whites[2][2] = { {(ri->get_white(FC(0,0)) - c_black[FC(0,0)]) * scale_mul[FC(0,0)] * clipLimit, (ri->get_white(FC(0,1)) - c_black[FC(0,1)]) * scale_mul[FC(0,1)] * clipLimit}, @@ -667,7 +657,7 @@ BENCHFUN buildClipMaskBayer(rawData, W, H, clipMask, whites); const unsigned int fc[2] = {FC(0,0), FC(1,0)}; if (sharpeningParams.autoRadius) { - radius = std::min(calcRadiusBayer(rawData, W, H, 1000.f, clipVal, fc), 1.15f); + radius = std::min(calcRadiusBayer(rawData, W, H, 1000.f, clipVal, fc), maxSigma); } } else if (getSensorType() == ST_FUJI_XTRANS) { float whites[6][6]; @@ -695,14 +685,14 @@ BENCHFUN } } if (sharpeningParams.autoRadius) { - radius = std::min(calcRadiusXtrans(rawData, W, H, 1000.f, clipVal, i, j), 1.15f); + radius = std::min(calcRadiusXtrans(rawData, W, H, 1000.f, clipVal, i, j), maxSigma); } } else if (ri->get_colors() == 1) { buildClipMaskMono(rawData, W, H, clipMask, (ri->get_white(0) - c_black[0]) * scale_mul[0] * clipLimit); if (sharpeningParams.autoRadius) { const unsigned int fc[2] = {0, 0}; - radius = std::min(calcRadiusBayer(rawData, W, H, 1000.f, clipVal, fc), 1.15f); + radius = std::min(calcRadiusBayer(rawData, W, H, 1000.f, clipVal, fc), maxSigma); } }