diff --git a/rtengine/gauss.cc b/rtengine/gauss.cc
index 171ed84e8..5665394cc 100644
--- a/rtengine/gauss.cc
+++ b/rtengine/gauss.cc
@@ -30,8 +30,12 @@ void compute7x7kernel(float sigma, float kernel[7][7]) {
     float sum = 0.f;
     for (int i = -3; i <= 3; ++i) {
         for (int j = -3; j <= 3; ++j) {
-            kernel[i + 3][j + 3] = std::exp((rtengine::SQR(i) + rtengine::SQR(j)) / temp);
-            sum += kernel[i + 3][j + 3];
+            if((rtengine::SQR(i) + rtengine::SQR(j)) <= rtengine::SQR(3.0 * 1.15)) {
+                kernel[i + 3][j + 3] = std::exp((rtengine::SQR(i) + rtengine::SQR(j)) / temp);
+                sum += kernel[i + 3][j + 3];
+            } else {
+                kernel[i + 3][j + 3] = 0.f;
+            }
         }
     }
 
@@ -47,8 +51,12 @@ void compute5x5kernel(float sigma, float kernel[5][5]) {
     float sum = 0.f;
     for (int i = -2; i <= 2; ++i) {
         for (int j = -2; j <= 2; ++j) {
-            kernel[i + 2][j + 2] = std::exp((rtengine::SQR(i) + rtengine::SQR(j)) / temp);
-            sum += kernel[i + 2][j + 2];
+            if((rtengine::SQR(i) + rtengine::SQR(j)) <= rtengine::SQR(3.0 * 0.84)) {
+                kernel[i + 2][j + 2] = std::exp((rtengine::SQR(i) + rtengine::SQR(j)) / temp);
+                sum += kernel[i + 2][j + 2];
+            } else {
+                kernel[i + 2][j + 2] = 0.f;
+            }
         }
     }
 
@@ -247,15 +255,14 @@ template<class T> void gauss7x7div (T** RESTRICT src, T** RESTRICT dst, T** REST
     float kernel[7][7];
     compute7x7kernel(sigma, kernel);
 
-#ifdef __SSE2__
-    vfloat kernelv[7][7] ALIGNED16;
-    for (int i = 0; i < 7; ++i) {
-        for (int j = 0; j < 7; ++j) {
-            kernelv[i][j] = F2V(kernel[i][j]);
-        }
-    }
-    const vfloat onev = F2V(1.f);
-#endif
+    const float c31 = kernel[0][2];
+    const float c30 = kernel[0][3];
+    const float c22 = kernel[1][1];
+    const float c21 = kernel[1][2];
+    const float c20 = kernel[1][3];
+    const float c11 = kernel[2][2];
+    const float c10 = kernel[2][3];
+    const float c00 = kernel[3][3];
 
 #ifdef _OPENMP
     #pragma omp for schedule(dynamic, 16) nowait
@@ -263,26 +270,54 @@ template<class T> void gauss7x7div (T** RESTRICT src, T** RESTRICT dst, T** REST
     for (int i = 3; i < H - 3; ++i) {
         dst[i][0] = dst[i][1] = dst[i][2] = 1.f;
 
-        int j = 3;
-#ifdef __SSE2__
-        for (; j < W - 6; j += 4) {
-            vfloat valv = ZEROV;
-            for (int k = -3; k <= 3; ++k) {
-                for (int l = -3; l <= 3; ++l) {
-                    valv += LVFU(src[i + k][j + l]) * LVF(kernelv[k + 3][l + 3]);
-                }
-            }
-            STVFU(dst[i][j], vmaxf(LVFU(divBuffer[i][j]) / (vself(vmaskf_gt(valv, ZEROV), valv, onev)), ZEROV));
-        }
-#endif
-        for (; j < W - 3; ++j) {
-            float val = 0;
-            for (int k = -3; k <= 3; ++k) {
-                for (int l = -3; l <= 3; ++l) {
-                    val += src[i + k][j + l] * kernel[k + 3][l + 3];
-                }
-            }
-            dst[i][j] = rtengine::max(divBuffer[i][j] / (val > 0.f ? val : 1.f), 0.f);
+        // I tried hand written SSE code but gcc vectorizes better
+        for (int j = 3; j < W - 3; ++j) {
+            float val = src[i - 3][j - 1] * c31;
+            val += src[i - 3][j - 0] * c30;
+            val += src[i - 3][j + 1] * c31;
+
+            val += src[i - 2][j - 2] * c22;
+            val += src[i - 2][j - 1] * c21;
+            val += src[i - 2][j - 0] * c20;
+            val += src[i - 2][j + 1] * c21;
+            val += src[i - 2][j + 2] * c22;
+
+            val += src[i - 1][j - 3] * c31;
+            val += src[i - 1][j - 2] * c21;
+            val += src[i - 1][j - 1] * c11;
+            val += src[i - 1][j - 0] * c10;
+            val += src[i - 1][j + 1] * c11;
+            val += src[i - 1][j + 2] * c21;
+            val += src[i - 1][j + 3] * c31;
+
+            val += src[i][j - 3] * c30;
+            val += src[i][j - 2] * c20;
+            val += src[i][j - 1] * c10;
+            val += src[i][j - 0] * c00;
+            val += src[i][j + 1] * c10;
+            val += src[i][j + 2] * c20;
+            val += src[i][j + 3] * c30;
+
+            val += src[i + 1][j - 3] * c31;
+            val += src[i + 1][j - 2] * c21;
+            val += src[i + 1][j - 1] * c11;
+            val += src[i + 1][j - 0] * c10;
+            val += src[i + 1][j + 1] * c11;
+            val += src[i + 1][j + 2] * c21;
+            val += src[i + 1][j + 3] * c31;
+
+            val += src[i + 2][j - 2] * c22;
+            val += src[i + 2][j - 1] * c21;
+            val += src[i + 2][j - 0] * c20;
+            val += src[i + 2][j + 1] * c21;
+            val += src[i + 2][j + 2] * c22;
+
+            val += src[i + 3][j - 1] * c31;
+            val += src[i + 3][j - 0] * c30;
+            val += src[i + 3][j + 1] * c31;
+
+            val = val > 0.f ? val : 1.f;
+            dst[i][j] = std::max(divBuffer[i][j] / val, 0.f);
         }
         dst[i][W - 3] = dst[i][W - 2] = dst[i][W - 1] = 1.f;
     }
@@ -311,15 +346,11 @@ template<class T> void gauss5x5div (T** RESTRICT src, T** RESTRICT dst, T** REST
     float kernel[5][5];
     compute5x5kernel(sigma, kernel);
 
-#ifdef __SSE2__
-    vfloat kernelv[5][5] ALIGNED16;
-    for (int i = 0; i < 5; ++i) {
-        for (int j = 0; j < 5; ++j) {
-            kernelv[i][j] = F2V(kernel[i][j]);
-        }
-    }
-    const vfloat onev = F2V(1.f);
-#endif
+    const float c21 = kernel[0][1];
+    const float c20 = kernel[0][2];
+    const float c11 = kernel[1][1];
+    const float c10 = kernel[1][2];
+    const float c00 = kernel[2][2];
 
 #ifdef _OPENMP
     #pragma omp for schedule(dynamic, 16) nowait
@@ -327,26 +358,36 @@ template<class T> void gauss5x5div (T** RESTRICT src, T** RESTRICT dst, T** REST
     for (int i = 2; i < H - 2; ++i) {
         dst[i][0] = dst[i][1] = 1.f;
 
-        int j = 2;
-#ifdef __SSE2__
-        for (; j < W - 5; j += 4) {
-            vfloat valv = ZEROV;
-            for (int k = -2; k <= 2; ++k) {
-                for (int l = -2; l <= 2; ++l) {
-                    valv += LVFU(src[i + k][j + l]) * LVF(kernelv[k + 2][l + 2]);
-                }
-            }
-            STVFU(dst[i][j], vmaxf(LVFU(divBuffer[i][j]) / (vself(vmaskf_gt(valv, ZEROV), valv, onev)), ZEROV));
-        }
-#endif
-        for (; j < W - 2; ++j) {
-            float val = 0;
-            for (int k = -2; k <= 2; ++k) {
-                for (int l = -2; l <= 2; ++l) {
-                    val += src[i + k][j + l] * kernel[k + 2][l + 2];
-                }
-            }
-            dst[i][j] = rtengine::max(divBuffer[i][j] / (val > 0.f ? val : 1.f), 0.f);
+        // I tried hand written SSE code but gcc vectorizes better
+        for (int j = 2; j < W - 2; ++j) {
+            float val = src[i - 2][j - 1] * c21;
+            val += src[i - 2][j - 0] * c20;
+            val += src[i - 2][j + 1] * c21;
+
+            val += src[i - 1][j - 2] * c21;
+            val += src[i - 1][j - 1] * c11;
+            val += src[i - 1][j - 0] * c10;
+            val += src[i - 1][j + 1] * c11;
+            val += src[i - 1][j + 2] * c21;
+
+            val += src[i - 0][j - 2] * c20;
+            val += src[i - 0][j - 1] * c10;
+            val += src[i - 0][j - 0] * c00;
+            val += src[i - 0][j + 1] * c10;
+            val += src[i - 0][j + 2] * c20;
+
+            val += src[i + 1][j - 2] * c21;
+            val += src[i + 1][j - 1] * c11;
+            val += src[i + 1][j - 0] * c10;
+            val += src[i + 1][j + 1] * c11;
+            val += src[i + 1][j + 2] * c21;
+
+            val += src[i + 2][j - 1] * c21;
+            val += src[i + 2][j - 0] * c20;
+            val += src[i + 2][j + 1] * c21;
+
+            val = val > 0.f ? val : 1.f;
+            dst[i][j] = std::max(divBuffer[i][j] / val, 0.f);
         }
         dst[i][W - 2] = dst[i][W - 1] = 1.f;
     }
@@ -374,39 +415,66 @@ template<class T> void gauss7x7mult (T** RESTRICT src, T** RESTRICT dst, const i
     float kernel[7][7];
     compute7x7kernel(sigma, kernel);
 
-#ifdef __SSE2__
-    vfloat kernelv[7][7] ALIGNED16;
-    for (int i = 0; i < 7; ++i) {
-        for (int j = 0; j < 7; ++j) {
-            kernelv[i][j] = F2V(kernel[i][j]);
-        }
-    }
-#endif
+    const float c31 = kernel[0][2];
+    const float c30 = kernel[0][3];
+    const float c22 = kernel[1][1];
+    const float c21 = kernel[1][2];
+    const float c20 = kernel[1][3];
+    const float c11 = kernel[2][2];
+    const float c10 = kernel[2][3];
+    const float c00 = kernel[3][3];
 
 #ifdef _OPENMP
     #pragma omp for schedule(dynamic, 16)
 #endif
 
     for (int i = 3; i < H - 3; ++i) {
-        int j = 3;
-#ifdef __SSE2__
-        for (; j < W - 6; j += 4) {
-            vfloat valv = ZEROV;
-            for (int k = -3; k <= 3; ++k) {
-                for (int l = -3; l <= 3; ++l) {
-                    valv += LVFU(src[i + k][j + l]) * LVF(kernelv[k + 3][l + 3]);
-                }
-            }
-            STVFU(dst[i][j], LVFU(dst[i][j]) * valv);
-        }
-#endif
-        for (; j < W - 3; ++j) {
-            float val = 0;
-            for (int k = -3; k <= 3; ++k) {
-                for (int l = -3; l <= 3; ++l) {
-                    val += src[i + k][j + l] * kernel[k + 3][l + 3];
-                }
-            }
+        // I tried hand written SSE code but gcc vectorizes better
+        for (int j = 3; j < W - 3; ++j) {
+            float val = src[i - 3][j - 1] * c31;
+            val += src[i - 3][j - 0] * c30;
+            val += src[i - 3][j + 1] * c31;
+
+            val += src[i - 2][j - 2] * c22;
+            val += src[i - 2][j - 1] * c21;
+            val += src[i - 2][j - 0] * c20;
+            val += src[i - 2][j + 1] * c21;
+            val += src[i - 2][j + 2] * c22;
+
+            val += src[i - 1][j - 3] * c31;
+            val += src[i - 1][j - 2] * c21;
+            val += src[i - 1][j - 1] * c11;
+            val += src[i - 1][j - 0] * c10;
+            val += src[i - 1][j + 1] * c11;
+            val += src[i - 1][j + 2] * c21;
+            val += src[i - 1][j + 3] * c31;
+
+            val += src[i][j - 3] * c30;
+            val += src[i][j - 2] * c20;
+            val += src[i][j - 1] * c10;
+            val += src[i][j - 0] * c00;
+            val += src[i][j + 1] * c10;
+            val += src[i][j + 2] * c20;
+            val += src[i][j + 3] * c30;
+
+            val += src[i + 1][j - 3] * c31;
+            val += src[i + 1][j - 2] * c21;
+            val += src[i + 1][j - 1] * c11;
+            val += src[i + 1][j - 0] * c10;
+            val += src[i + 1][j + 1] * c11;
+            val += src[i + 1][j + 2] * c21;
+            val += src[i + 1][j + 3] * c31;
+
+            val += src[i + 2][j - 2] * c22;
+            val += src[i + 2][j - 1] * c21;
+            val += src[i + 2][j - 0] * c20;
+            val += src[i + 2][j + 1] * c21;
+            val += src[i + 2][j + 2] * c22;
+
+            val += src[i + 3][j - 1] * c31;
+            val += src[i + 3][j - 0] * c30;
+            val += src[i + 3][j + 1] * c31;
+
             dst[i][j] *= val;
         }
     }
@@ -417,39 +485,46 @@ template<class T> void gauss5x5mult (T** RESTRICT src, T** RESTRICT dst, const i
     float kernel[5][5];
     compute5x5kernel(sigma, kernel);
 
-#ifdef __SSE2__
-    vfloat kernelv[5][5] ALIGNED16;
-    for (int i = 0; i < 5; ++i) {
-        for (int j = 0; j < 5; ++j) {
-            kernelv[i][j] = F2V(kernel[i][j]);
-        }
-    }
-#endif
+
+    const float c21 = kernel[0][1];
+    const float c20 = kernel[0][2];
+    const float c11 = kernel[1][1];
+    const float c10 = kernel[1][2];
+    const float c00 = kernel[2][2];
 
 #ifdef _OPENMP
    #pragma omp for schedule(dynamic, 16)
#endif
 
     for (int i = 2; i < H - 2; ++i) {
-        int j = 2;
-#ifdef __SSE2__
-        for (; j < W - 5; j += 4) {
-            vfloat valv = ZEROV;
-            for (int k = -2; k <= 2; ++k) {
-                for (int l = -2; l <= 2; ++l) {
-                    valv += LVFU(src[i + k][j + l]) * LVF(kernelv[k + 2][l + 2]);
-                }
-            }
-            STVFU(dst[i][j], LVFU(dst[i][j]) * valv);
-        }
-#endif
-        for (; j < W - 2; ++j) {
-            float val = 0;
-            for (int k = -2; k <= 2; ++k) {
-                for (int l = -2; l <= 2; ++l) {
-                    val += src[i + k][j + l] * kernel[k + 2][l + 2];
-                }
-            }
+        // I tried hand written SSE code but gcc vectorizes better
+        for (int j = 2; j < W - 2; ++j) {
+            float val = src[i - 2][j - 1] * c21;
+            val += src[i - 2][j - 0] * c20;
+            val += src[i - 2][j + 1] * c21;
+
+            val += src[i - 1][j - 2] * c21;
+            val += src[i - 1][j - 1] * c11;
+            val += src[i - 1][j - 0] * c10;
+            val += src[i - 1][j + 1] * c11;
+            val += src[i - 1][j + 2] * c21;
+
+            val += src[i - 0][j - 2] * c20;
+            val += src[i - 0][j - 1] * c10;
+            val += src[i - 0][j - 0] * c00;
+            val += src[i - 0][j + 1] * c10;
+            val += src[i - 0][j + 2] * c20;
+
+            val += src[i + 1][j - 2] * c21;
+            val += src[i + 1][j - 1] * c11;
+            val += src[i + 1][j - 0] * c10;
+            val += src[i + 1][j + 1] * c11;
+            val += src[i + 1][j + 2] * c21;
+
+            val += src[i + 2][j - 1] * c21;
+            val += src[i + 2][j - 0] * c20;
+            val += src[i + 2][j + 1] * c21;
+
             dst[i][j] *= val;
         }
     }
@@ -1390,6 +1465,8 @@ template<class T> void gaussianBlurImpl(T** src, T** dst, const int W, const int
 {
     static constexpr auto GAUSS_SKIP = 0.25;
     static constexpr auto GAUSS_3X3_LIMIT = 0.6;
+    static constexpr auto GAUSS_5X5_LIMIT = 0.84;
+    static constexpr auto GAUSS_7X7_LIMIT = 1.15;
     static constexpr auto GAUSS_DOUBLE = 25.0;
 
     if(buffer) {
@@ -1488,9 +1565,9 @@ template<class T> void gaussianBlurImpl(T** src, T** dst, const int W, const int
         if (sigma < GAUSS_DOUBLE) {
             switch (gausstype) {
             case GAUSS_MULT : {
-                if (sigma < 0.84 && src != dst) {
+                if (sigma <= GAUSS_5X5_LIMIT && src != dst) {
                     gauss5x5mult(src, dst, W, H, sigma);
-                } else if (sigma < 1.15f && src != dst) {
+                } else if (sigma <= GAUSS_7X7_LIMIT && src != dst) {
                     gauss7x7mult(src, dst, W, H, sigma);
                 } else {
                     gaussHorizontalSse (src, src, W, H, sigma);
@@ -1500,9 +1577,9 @@ template<class T> void gaussianBlurImpl(T** src, T** dst, const int W, const int
             }
 
             case GAUSS_DIV : {
-                if (sigma < 0.84f && src != dst) {
+                if (sigma <= GAUSS_5X5_LIMIT && src != dst) {
                     gauss5x5div (src, dst, buffer2, W, H, sigma);
-                } else if (sigma < 1.15f && src != dst) {
+                } else if (sigma <= GAUSS_7X7_LIMIT && src != dst) {
                     gauss7x7div (src, dst, buffer2, W, H, sigma);
                 } else {
                     gaussHorizontalSse (src, dst, W, H, sigma);
@@ -1527,14 +1604,26 @@ template<class T> void gaussianBlurImpl(T** src, T** dst, const int W, const int
         if (sigma < GAUSS_DOUBLE) {
             switch (gausstype) {
             case GAUSS_MULT : {
-                gaussHorizontal (src, src, W, H, sigma);
-                gaussVerticalmult (src, dst, W, H, sigma);
+                if (sigma <= GAUSS_5X5_LIMIT && src != dst) {
+                    gauss5x5mult(src, dst, W, H, sigma);
+                } else if (sigma <= GAUSS_7X7_LIMIT && src != dst) {
+                    gauss7x7mult(src, dst, W, H, sigma);
+                } else {
+                    gaussHorizontal (src, src, W, H, sigma);
+                    gaussVerticalmult (src, dst, W, H, sigma);
+                }
                 break;
             }
 
             case GAUSS_DIV : {
-                gaussHorizontal (src, dst, W, H, sigma);
-                gaussVerticaldiv (dst, dst, buffer2, W, H, sigma);
+                if (sigma <= GAUSS_5X5_LIMIT && src != dst) {
+                    gauss5x5div (src, dst, buffer2, W, H, sigma);
+                } else if (sigma <= GAUSS_7X7_LIMIT && src != dst) {
+                    gauss7x7div (src, dst, buffer2, W, H, sigma);
+                } else {
+                    gaussHorizontal (src, dst, W, H, sigma);
+                    gaussVerticaldiv (dst, dst, buffer2, W, H, sigma);
+                }
                 break;
            }
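
Note (illustration only, not part of the patch): the unrolled loops above rely on the kernel built by compute5x5kernel()/compute7x7kernel() depending only on i*i + j*j, so every offset at the same squared distance from the centre gets exactly the same weight, and only a handful of distinct coefficients (c21, c20, c11, c10, c00 for the 5x5 case) are needed. The sketch below rebuilds a truncated 5x5 kernel in the same way and checks that mapping. The denominator `temp` is not visible in the hunks above and is assumed here to be -2*sigma*sigma, the usual Gaussian form; the assert-based check and the test sigma of 0.8 are likewise just for illustration.

// Standalone sketch: verify the symmetry exploited by the 5x5 coefficient names.
// Assumes temp = -2 * sigma^2, which is not shown in the diff hunks above.
#include <cassert>
#include <cmath>
#include <cstdio>

static void compute5x5(float sigma, float kernel[5][5])
{
    const float temp = -2.f * sigma * sigma;    // assumed Gaussian denominator
    float sum = 0.f;

    for (int i = -2; i <= 2; ++i) {
        for (int j = -2; j <= 2; ++j) {
            if (i * i + j * j <= 2.52 * 2.52) { // same circular cutoff as SQR(3.0 * 0.84)
                kernel[i + 2][j + 2] = std::exp((i * i + j * j) / temp);
                sum += kernel[i + 2][j + 2];
            } else {
                kernel[i + 2][j + 2] = 0.f;     // the four corners fall outside the cutoff
            }
        }
    }

    for (int i = 0; i < 5; ++i) {
        for (int j = 0; j < 5; ++j) {
            kernel[i][j] /= sum;                // normalize so the weights sum to 1
        }
    }
}

int main()
{
    float k[5][5];
    compute5x5(0.8f, k);

    // The weight depends only on i*i + j*j, so sign flips and swapping the two
    // offsets give bit-identical values; that is why gauss5x5div/gauss5x5mult
    // only need c21, c20, c11, c10 and c00.
    const float c21 = k[0][1], c20 = k[0][2], c11 = k[1][1], c10 = k[1][2], c00 = k[2][2];
    assert(k[1][0] == c21 && k[3][4] == c21 && k[4][3] == c21);
    assert(k[2][0] == c20 && k[2][4] == c20 && k[4][2] == c20);
    assert(k[3][3] == c11 && k[1][3] == c11);
    assert(k[2][1] == c10 && k[3][2] == c10);
    assert(k[0][0] == 0.f && k[4][4] == 0.f);   // truncated corners

    // One centre weight, 4 each of c10/c20/c11, 8 of c21; should be ~1 after normalization.
    printf("c21=%g c20=%g c11=%g c10=%g c00=%g sum-check=%g\n",
           c21, c20, c11, c10, c00,
           c00 + 4 * (c10 + c20 + c11 + 2 * c21));
    return 0;
}

The same argument covers the 7x7 case: the eight coefficients c00 through c31 exhaust the distinct squared distances that survive the SQR(3.0 * 1.15) cutoff, and the zeroed positions at distances (3,2) and (3,3) are exactly the ones the unrolled 7x7 loops skip.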