From 0c1caf6c3658f8230eb16d0e1ba4ccac892463eb Mon Sep 17 00:00:00 2001 From: Ingo Weyrich Date: Tue, 20 Aug 2019 18:41:06 +0200 Subject: [PATCH] capture sharpening: further improvements and speedups --- rtengine/color.h | 25 ++++ rtengine/gauss.cc | 269 ++++++++++++++++++++++++++++++++++++- rtengine/improcfun.h | 2 +- rtengine/ipsharpen.cc | 2 +- rtengine/rawimagesource.cc | 27 ++-- 5 files changed, 307 insertions(+), 18 deletions(-) diff --git a/rtengine/color.h b/rtengine/color.h index 7cc7368b3..789bf27c6 100644 --- a/rtengine/color.h +++ b/rtengine/color.h @@ -1882,6 +1882,31 @@ public: } } + static inline void RGB2Y(const float* R, const float* G, const float* B, float* Y1, float * Y2, float gamma, int W) { + gamma = 1.f / gamma; + int i = 0; +#ifdef __SSE2__ + const vfloat gammav = F2V(gamma); + const vfloat c1v = F2V(0.2627f); + const vfloat c2v = F2V(0.6780f); + const vfloat c3v = F2V(0.0593f); + for (; i < W - 3; i += 4) { + const vfloat Rv = vmaxf(LVFU(R[i]), ZEROV); + const vfloat Gv = vmaxf(LVFU(G[i]), ZEROV); + const vfloat Bv = vmaxf(LVFU(B[i]), ZEROV); + vfloat yv = pow_F(c1v * Rv + c2v * Gv + c3v * Bv, gammav); + STVFU(Y1[i], yv); + STVFU(Y2[i], yv); + } +#endif + for (; i < W; ++i) { + const float r = std::max(R[i], 0.f); + const float g = std::max(G[i], 0.f); + const float b = std::max(B[i], 0.f); + Y1[i] = Y2[i] = pow_F(0.2627f * r + 0.6780f * g + 0.0593f * b, gamma); + } + } + }; } diff --git a/rtengine/gauss.cc b/rtengine/gauss.cc index b7de67851..171ed84e8 100644 --- a/rtengine/gauss.cc +++ b/rtengine/gauss.cc @@ -17,14 +17,48 @@ * along with RawTherapee. If not, see . */ #include "gauss.h" +#include "rt_math.h" #include #include #include "opthelper.h" #include "boxblur.h" - namespace { +void compute7x7kernel(float sigma, float kernel[7][7]) { + const double temp = -2.f * rtengine::SQR(sigma); + float sum = 0.f; + for (int i = -3; i <= 3; ++i) { + for (int j = -3; j <= 3; ++j) { + kernel[i + 3][j + 3] = std::exp((rtengine::SQR(i) + rtengine::SQR(j)) / temp); + sum += kernel[i + 3][j + 3]; + } + } + + for (int i = 0; i < 7; ++i) { + for (int j = 0; j < 7; ++j) { + kernel[i][j] /= sum; + } + } +} + +void compute5x5kernel(float sigma, float kernel[5][5]) { + const double temp = -2.f * rtengine::SQR(sigma); + float sum = 0.f; + for (int i = -2; i <= 2; ++i) { + for (int j = -2; j <= 2; ++j) { + kernel[i + 2][j + 2] = std::exp((rtengine::SQR(i) + rtengine::SQR(j)) / temp); + sum += kernel[i + 2][j + 2]; + } + } + + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 5; ++j) { + kernel[i][j] /= sum; + } + } +} + template void calculateYvVFactors( const T sigma, T &b1, T &b2, T &b3, T &B, T M[3][3]) { // coefficient calculation @@ -207,6 +241,219 @@ template void gauss3x3div (T** RESTRICT src, T** RESTRICT dst, T** REST } } +template void gauss7x7div (T** RESTRICT src, T** RESTRICT dst, T** RESTRICT divBuffer, const int W, const int H, float sigma) +{ + + float kernel[7][7]; + compute7x7kernel(sigma, kernel); + +#ifdef __SSE2__ + vfloat kernelv[7][7] ALIGNED16; + for (int i = 0; i < 7; ++i) { + for (int j = 0; j < 7; ++j) { + kernelv[i][j] = F2V(kernel[i][j]); + } + } + const vfloat onev = F2V(1.f); +#endif + +#ifdef _OPENMP + #pragma omp for schedule(dynamic, 16) nowait +#endif + + for (int i = 3; i < H - 3; ++i) { + dst[i][0] = dst[i][1] = dst[i][2] = 1.f; + int j = 3; +#ifdef __SSE2__ + for (; j < W - 6; j += 4) { + vfloat valv = ZEROV; + for (int k = -3; k <= 3; ++k) { + for (int l = -3; l <= 3; ++l) { + valv += LVFU(src[i + k][j + l]) * LVF(kernelv[k + 3][l + 3]); + } + } + STVFU(dst[i][j], vmaxf(LVFU(divBuffer[i][j]) / (vself(vmaskf_gt(valv, ZEROV), valv, onev)), ZEROV)); + } +#endif + for (; j < W - 3; ++j) { + float val = 0; + for (int k = -3; k <= 3; ++k) { + for (int l = -3; l <= 3; ++l) { + val += src[i + k][j + l] * kernel[k + 3][l + 3]; + } + } + dst[i][j] = rtengine::max(divBuffer[i][j] / (val > 0.f ? val : 1.f), 0.f); + } + dst[i][W - 3] = dst[i][W - 2] = dst[i][W - 1] = 1.f; + } + + // first and last rows +#ifdef _OPENMP + #pragma omp single +#endif + { + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < W; ++j) { + dst[i][j] = 1.f; + } + } + for (int i = H - 3 ; i < H; ++i) { + for (int j = 0; j < W; ++j) { + dst[i][j] = 1.f; + } + } + } +} + +template void gauss5x5div (T** RESTRICT src, T** RESTRICT dst, T** RESTRICT divBuffer, const int W, const int H, float sigma) +{ + + float kernel[5][5]; + compute5x5kernel(sigma, kernel); + +#ifdef __SSE2__ + vfloat kernelv[5][5] ALIGNED16; + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 5; ++j) { + kernelv[i][j] = F2V(kernel[i][j]); + } + } + const vfloat onev = F2V(1.f); +#endif + +#ifdef _OPENMP + #pragma omp for schedule(dynamic, 16) nowait +#endif + + for (int i = 2; i < H - 2; ++i) { + dst[i][0] = dst[i][1] = 1.f; + int j = 2; +#ifdef __SSE2__ + for (; j < W - 5; j += 4) { + vfloat valv = ZEROV; + for (int k = -2; k <= 2; ++k) { + for (int l = -2; l <= 2; ++l) { + valv += LVFU(src[i + k][j + l]) * LVF(kernelv[k + 2][l + 2]); + } + } + STVFU(dst[i][j], vmaxf(LVFU(divBuffer[i][j]) / (vself(vmaskf_gt(valv, ZEROV), valv, onev)), ZEROV)); + } +#endif + for (; j < W - 2; ++j) { + float val = 0; + for (int k = -2; k <= 2; ++k) { + for (int l = -2; l <= 2; ++l) { + val += src[i + k][j + l] * kernel[k + 2][l + 2]; + } + } + dst[i][j] = rtengine::max(divBuffer[i][j] / (val > 0.f ? val : 1.f), 0.f); + } + dst[i][W - 2] = dst[i][W - 1] = 1.f; + } + + // first and last rows +#ifdef _OPENMP + #pragma omp single +#endif + { + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < W; ++j) { + dst[i][j] = 1.f; + } + } + for (int i = H - 2 ; i < H; ++i) { + for (int j = 0; j < W; ++j) { + dst[i][j] = 1.f; + } + } + } +} + +template void gauss7x7mult (T** RESTRICT src, T** RESTRICT dst, const int W, const int H, float sigma) +{ + + float kernel[7][7]; + compute7x7kernel(sigma, kernel); +#ifdef __SSE2__ + vfloat kernelv[7][7] ALIGNED16; + for (int i = 0; i < 7; ++i) { + for (int j = 0; j < 7; ++j) { + kernelv[i][j] = F2V(kernel[i][j]); + } + } +#endif + +#ifdef _OPENMP + #pragma omp for schedule(dynamic, 16) +#endif + + for (int i = 3; i < H - 3; ++i) { + int j = 3; +#ifdef __SSE2__ + for (; j < W - 6; j += 4) { + vfloat valv = ZEROV; + for (int k = -3; k <= 3; ++k) { + for (int l = -3; l <= 3; ++l) { + valv += LVFU(src[i + k][j + l]) * LVF(kernelv[k + 3][l + 3]); + } + } + STVFU(dst[i][j], LVFU(dst[i][j]) * valv); + } +#endif + for (; j < W - 3; ++j) { + float val = 0; + for (int k = -3; k <= 3; ++k) { + for (int l = -3; l <= 3; ++l) { + val += src[i + k][j + l] * kernel[k + 3][l + 3]; + } + } + dst[i][j] *= val; + } + } +} + +template void gauss5x5mult (T** RESTRICT src, T** RESTRICT dst, const int W, const int H, float sigma) +{ + + float kernel[5][5]; + compute5x5kernel(sigma, kernel); +#ifdef __SSE2__ + vfloat kernelv[5][5] ALIGNED16; + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 5; ++j) { + kernelv[i][j] = F2V(kernel[i][j]); + } + } +#endif + +#ifdef _OPENMP + #pragma omp for schedule(dynamic, 16) +#endif + + for (int i = 2; i < H - 2; ++i) { + int j = 2; +#ifdef __SSE2__ + for (; j < W - 5; j += 4) { + vfloat valv = ZEROV; + for (int k = -2; k <= 2; ++k) { + for (int l = -2; l <= 2; ++l) { + valv += LVFU(src[i + k][j + l]) * LVF(kernelv[k + 2][l + 2]); + } + } + STVFU(dst[i][j], LVFU(dst[i][j]) * valv); + } +#endif + for (; j < W - 2; ++j) { + float val = 0; + for (int k = -2; k <= 2; ++k) { + for (int l = -2; l <= 2; ++l) { + val += src[i + k][j + l] * kernel[k + 2][l + 2]; + } + } + dst[i][j] *= val; + } + } +} // use separated filter if the support window is small and src == dst template void gaussHorizontal3 (T** src, T** dst, int W, int H, const float c0, const float c1) @@ -1241,14 +1488,26 @@ template void gaussianBlurImpl(T** src, T** dst, const int W, const int if (sigma < GAUSS_DOUBLE) { switch (gausstype) { case GAUSS_MULT : { - gaussHorizontalSse (src, src, W, H, sigma); - gaussVerticalSsemult (src, dst, W, H, sigma); + if (sigma < 0.84 && src != dst) { + gauss5x5mult(src, dst, W, H, sigma); + } else if (sigma < 1.15f && src != dst) { + gauss7x7mult(src, dst, W, H, sigma); + } else { + gaussHorizontalSse (src, src, W, H, sigma); + gaussVerticalSsemult (src, dst, W, H, sigma); + } break; } case GAUSS_DIV : { - gaussHorizontalSse (src, dst, W, H, sigma); - gaussVerticalSsediv (dst, dst, buffer2, W, H, sigma); + if (sigma < 0.84f && src != dst) { + gauss5x5div (src, dst, buffer2, W, H, sigma); + } else if (sigma < 1.15f && src != dst) { + gauss7x7div (src, dst, buffer2, W, H, sigma); + } else { + gaussHorizontalSse (src, dst, W, H, sigma); + gaussVerticalSsediv (dst, dst, buffer2, W, H, sigma); + } break; } diff --git a/rtengine/improcfun.h b/rtengine/improcfun.h index 6fe8a785d..5cd65cad8 100644 --- a/rtengine/improcfun.h +++ b/rtengine/improcfun.h @@ -248,7 +248,7 @@ public: void Lanczos(const LabImage* src, LabImage* dst, float scale); void Lanczos(const Imagefloat* src, Imagefloat* dst, float scale); - void deconvsharpening(float** luminance, float** buffer, float** blend, int W, int H, const procparams::SharpeningParams &sharpenParam, double Scale); + void deconvsharpening(float** luminance, float** buffer, const float* const * blend, int W, int H, const procparams::SharpeningParams &sharpenParam, double Scale); void MLsharpen(LabImage* lab); // Manuel's clarity / sharpening void MLmicrocontrast(float** luminance, int W, int H); //Manuel's microcontrast void MLmicrocontrast(LabImage* lab); //Manuel's microcontrast diff --git a/rtengine/ipsharpen.cc b/rtengine/ipsharpen.cc index 897aaf7b5..1d3d6375e 100644 --- a/rtengine/ipsharpen.cc +++ b/rtengine/ipsharpen.cc @@ -158,7 +158,7 @@ namespace rtengine extern const Settings* settings; -void ImProcFunctions::deconvsharpening (float** luminance, float** tmp, float ** blend, int W, int H, const SharpeningParams &sharpenParam, double Scale) +void ImProcFunctions::deconvsharpening (float** luminance, float** tmp, const float * const * blend, int W, int H, const SharpeningParams &sharpenParam, double Scale) { if (sharpenParam.deconvamount == 0 && sharpenParam.blurradius < 0.25f) { return; diff --git a/rtengine/rawimagesource.cc b/rtengine/rawimagesource.cc index d947d4195..0602c770b 100644 --- a/rtengine/rawimagesource.cc +++ b/rtengine/rawimagesource.cc @@ -4989,20 +4989,14 @@ BENCHFUN array2D L(W,H); array2D YOld(W,H); array2D YNew(W,H); -// array2D& Y = red; // red will be overridden anyway => we can use its buffer to store Y -// array2D& Cb = green; // green will be overridden anyway => we can use its buffer to store Cb -// array2D& Cr = blue; // blue will be overridden anyway => we can use its buffer to store Cr - StopWatch Stop1("rgb2Y"); + StopWatch Stop1("rgb2YL"); #ifdef _OPENMP #pragma omp parallel for schedule(dynamic, 16) #endif for (int i = 0; i < H; ++i) { Color::RGB2L(red[i], green[i], blue[i], L[i], xyz_rgb, W); - Color::RGB2Y(red[i], green[i], blue[i], YOld[i], sharpeningParams.gamma, W); - for (int j = 0; j < W; ++j) { - YNew[i][j] = YOld[i][j]; - } + Color::RGB2Y(red[i], green[i], blue[i], YOld[i], YNew[i], sharpeningParams.gamma, W); } // calculate contrast based blend factors to reduce sharpening in regions with low contrast JaggedArray blend(W, H); @@ -5017,11 +5011,22 @@ BENCHFUN StopWatch Stop2("Y2RGB"); const float gamma = sharpeningParams.gamma; #ifdef _OPENMP - #pragma omp parallel for + #pragma omp parallel for schedule(dynamic, 16) #endif for (int i = 0; i < H; ++i) { - for (int j = 0; j < W; ++j) { - const float factor = pow_F(YNew[i][j] / (YOld[i][j] == 0.f ? 0.00001f : YOld[i][j]), gamma); + int j = 0; +#ifdef __SSE2__ + const vfloat gammav = F2V(gamma); + for (; j < W - 3; j += 4) { + const vfloat factor = pow_F(LVFU(YNew[i][j]) / vmaxf(LVFU(YOld[i][j]), F2V(0.00001f)), gammav); + STVFU(red[i][j], LVFU(red[i][j]) * factor); + STVFU(green[i][j], LVFU(green[i][j]) * factor); + STVFU(blue[i][j], LVFU(blue[i][j]) * factor); + } + +#endif + for (; j < W; ++j) { + const float factor = pow_F(YNew[i][j] / std::max(YOld[i][j], 0.00001f), gamma); red[i][j] *= factor; green[i][j] *= factor; blue[i][j] *= factor;