diff --git a/rtengine/color.h b/rtengine/color.h
index 7cc7368b3..789bf27c6 100644
--- a/rtengine/color.h
+++ b/rtengine/color.h
@@ -1882,6 +1882,31 @@ public:
}
}
+ static inline void RGB2Y(const float* R, const float* G, const float* B, float* Y1, float * Y2, float gamma, int W) {
+ gamma = 1.f / gamma;
+ int i = 0;
+#ifdef __SSE2__
+ const vfloat gammav = F2V(gamma);
+ const vfloat c1v = F2V(0.2627f);
+ const vfloat c2v = F2V(0.6780f);
+ const vfloat c3v = F2V(0.0593f);
+ for (; i < W - 3; i += 4) {
+ const vfloat Rv = vmaxf(LVFU(R[i]), ZEROV);
+ const vfloat Gv = vmaxf(LVFU(G[i]), ZEROV);
+ const vfloat Bv = vmaxf(LVFU(B[i]), ZEROV);
+ vfloat yv = pow_F(c1v * Rv + c2v * Gv + c3v * Bv, gammav);
+ STVFU(Y1[i], yv);
+ STVFU(Y2[i], yv);
+ }
+#endif
+ for (; i < W; ++i) {
+ const float r = std::max(R[i], 0.f);
+ const float g = std::max(G[i], 0.f);
+ const float b = std::max(B[i], 0.f);
+ Y1[i] = Y2[i] = pow_F(0.2627f * r + 0.6780f * g + 0.0593f * b, gamma);
+ }
+ }
+
};
}
diff --git a/rtengine/gauss.cc b/rtengine/gauss.cc
index b7de67851..171ed84e8 100644
--- a/rtengine/gauss.cc
+++ b/rtengine/gauss.cc
@@ -17,14 +17,48 @@
* along with RawTherapee. If not, see .
*/
#include "gauss.h"
+#include "rt_math.h"
#include
#include
#include "opthelper.h"
#include "boxblur.h"
-
namespace
{
+void compute7x7kernel(float sigma, float kernel[7][7]) {
+ const double temp = -2.f * rtengine::SQR(sigma);
+ float sum = 0.f;
+ for (int i = -3; i <= 3; ++i) {
+ for (int j = -3; j <= 3; ++j) {
+ kernel[i + 3][j + 3] = std::exp((rtengine::SQR(i) + rtengine::SQR(j)) / temp);
+ sum += kernel[i + 3][j + 3];
+ }
+ }
+
+ for (int i = 0; i < 7; ++i) {
+ for (int j = 0; j < 7; ++j) {
+ kernel[i][j] /= sum;
+ }
+ }
+}
+
+void compute5x5kernel(float sigma, float kernel[5][5]) {
+ const double temp = -2.f * rtengine::SQR(sigma);
+ float sum = 0.f;
+ for (int i = -2; i <= 2; ++i) {
+ for (int j = -2; j <= 2; ++j) {
+ kernel[i + 2][j + 2] = std::exp((rtengine::SQR(i) + rtengine::SQR(j)) / temp);
+ sum += kernel[i + 2][j + 2];
+ }
+ }
+
+ for (int i = 0; i < 5; ++i) {
+ for (int j = 0; j < 5; ++j) {
+ kernel[i][j] /= sum;
+ }
+ }
+}
+
template void calculateYvVFactors( const T sigma, T &b1, T &b2, T &b3, T &B, T M[3][3])
{
// coefficient calculation
@@ -207,6 +241,219 @@ template void gauss3x3div (T** RESTRICT src, T** RESTRICT dst, T** REST
}
}
+template void gauss7x7div (T** RESTRICT src, T** RESTRICT dst, T** RESTRICT divBuffer, const int W, const int H, float sigma)
+{
+
+ float kernel[7][7];
+ compute7x7kernel(sigma, kernel);
+
+#ifdef __SSE2__
+ vfloat kernelv[7][7] ALIGNED16;
+ for (int i = 0; i < 7; ++i) {
+ for (int j = 0; j < 7; ++j) {
+ kernelv[i][j] = F2V(kernel[i][j]);
+ }
+ }
+ const vfloat onev = F2V(1.f);
+#endif
+
+#ifdef _OPENMP
+ #pragma omp for schedule(dynamic, 16) nowait
+#endif
+
+ for (int i = 3; i < H - 3; ++i) {
+ dst[i][0] = dst[i][1] = dst[i][2] = 1.f;
+ int j = 3;
+#ifdef __SSE2__
+ for (; j < W - 6; j += 4) {
+ vfloat valv = ZEROV;
+ for (int k = -3; k <= 3; ++k) {
+ for (int l = -3; l <= 3; ++l) {
+ valv += LVFU(src[i + k][j + l]) * LVF(kernelv[k + 3][l + 3]);
+ }
+ }
+ STVFU(dst[i][j], vmaxf(LVFU(divBuffer[i][j]) / (vself(vmaskf_gt(valv, ZEROV), valv, onev)), ZEROV));
+ }
+#endif
+ for (; j < W - 3; ++j) {
+ float val = 0;
+ for (int k = -3; k <= 3; ++k) {
+ for (int l = -3; l <= 3; ++l) {
+ val += src[i + k][j + l] * kernel[k + 3][l + 3];
+ }
+ }
+ dst[i][j] = rtengine::max(divBuffer[i][j] / (val > 0.f ? val : 1.f), 0.f);
+ }
+ dst[i][W - 3] = dst[i][W - 2] = dst[i][W - 1] = 1.f;
+ }
+
+ // first and last rows
+#ifdef _OPENMP
+ #pragma omp single
+#endif
+ {
+ for (int i = 0; i < 3; ++i) {
+ for (int j = 0; j < W; ++j) {
+ dst[i][j] = 1.f;
+ }
+ }
+ for (int i = H - 3 ; i < H; ++i) {
+ for (int j = 0; j < W; ++j) {
+ dst[i][j] = 1.f;
+ }
+ }
+ }
+}
+
+template void gauss5x5div (T** RESTRICT src, T** RESTRICT dst, T** RESTRICT divBuffer, const int W, const int H, float sigma)
+{
+
+ float kernel[5][5];
+ compute5x5kernel(sigma, kernel);
+
+#ifdef __SSE2__
+ vfloat kernelv[5][5] ALIGNED16;
+ for (int i = 0; i < 5; ++i) {
+ for (int j = 0; j < 5; ++j) {
+ kernelv[i][j] = F2V(kernel[i][j]);
+ }
+ }
+ const vfloat onev = F2V(1.f);
+#endif
+
+#ifdef _OPENMP
+ #pragma omp for schedule(dynamic, 16) nowait
+#endif
+
+ for (int i = 2; i < H - 2; ++i) {
+ dst[i][0] = dst[i][1] = 1.f;
+ int j = 2;
+#ifdef __SSE2__
+ for (; j < W - 5; j += 4) {
+ vfloat valv = ZEROV;
+ for (int k = -2; k <= 2; ++k) {
+ for (int l = -2; l <= 2; ++l) {
+ valv += LVFU(src[i + k][j + l]) * LVF(kernelv[k + 2][l + 2]);
+ }
+ }
+ STVFU(dst[i][j], vmaxf(LVFU(divBuffer[i][j]) / (vself(vmaskf_gt(valv, ZEROV), valv, onev)), ZEROV));
+ }
+#endif
+ for (; j < W - 2; ++j) {
+ float val = 0;
+ for (int k = -2; k <= 2; ++k) {
+ for (int l = -2; l <= 2; ++l) {
+ val += src[i + k][j + l] * kernel[k + 2][l + 2];
+ }
+ }
+ dst[i][j] = rtengine::max(divBuffer[i][j] / (val > 0.f ? val : 1.f), 0.f);
+ }
+ dst[i][W - 2] = dst[i][W - 1] = 1.f;
+ }
+
+ // first and last rows
+#ifdef _OPENMP
+ #pragma omp single
+#endif
+ {
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < W; ++j) {
+ dst[i][j] = 1.f;
+ }
+ }
+ for (int i = H - 2 ; i < H; ++i) {
+ for (int j = 0; j < W; ++j) {
+ dst[i][j] = 1.f;
+ }
+ }
+ }
+}
+
+template void gauss7x7mult (T** RESTRICT src, T** RESTRICT dst, const int W, const int H, float sigma)
+{
+
+ float kernel[7][7];
+ compute7x7kernel(sigma, kernel);
+#ifdef __SSE2__
+ vfloat kernelv[7][7] ALIGNED16;
+ for (int i = 0; i < 7; ++i) {
+ for (int j = 0; j < 7; ++j) {
+ kernelv[i][j] = F2V(kernel[i][j]);
+ }
+ }
+#endif
+
+#ifdef _OPENMP
+ #pragma omp for schedule(dynamic, 16)
+#endif
+
+ for (int i = 3; i < H - 3; ++i) {
+ int j = 3;
+#ifdef __SSE2__
+ for (; j < W - 6; j += 4) {
+ vfloat valv = ZEROV;
+ for (int k = -3; k <= 3; ++k) {
+ for (int l = -3; l <= 3; ++l) {
+ valv += LVFU(src[i + k][j + l]) * LVF(kernelv[k + 3][l + 3]);
+ }
+ }
+ STVFU(dst[i][j], LVFU(dst[i][j]) * valv);
+ }
+#endif
+ for (; j < W - 3; ++j) {
+ float val = 0;
+ for (int k = -3; k <= 3; ++k) {
+ for (int l = -3; l <= 3; ++l) {
+ val += src[i + k][j + l] * kernel[k + 3][l + 3];
+ }
+ }
+ dst[i][j] *= val;
+ }
+ }
+}
+
+template void gauss5x5mult (T** RESTRICT src, T** RESTRICT dst, const int W, const int H, float sigma)
+{
+
+ float kernel[5][5];
+ compute5x5kernel(sigma, kernel);
+#ifdef __SSE2__
+ vfloat kernelv[5][5] ALIGNED16;
+ for (int i = 0; i < 5; ++i) {
+ for (int j = 0; j < 5; ++j) {
+ kernelv[i][j] = F2V(kernel[i][j]);
+ }
+ }
+#endif
+
+#ifdef _OPENMP
+ #pragma omp for schedule(dynamic, 16)
+#endif
+
+ for (int i = 2; i < H - 2; ++i) {
+ int j = 2;
+#ifdef __SSE2__
+ for (; j < W - 5; j += 4) {
+ vfloat valv = ZEROV;
+ for (int k = -2; k <= 2; ++k) {
+ for (int l = -2; l <= 2; ++l) {
+ valv += LVFU(src[i + k][j + l]) * LVF(kernelv[k + 2][l + 2]);
+ }
+ }
+ STVFU(dst[i][j], LVFU(dst[i][j]) * valv);
+ }
+#endif
+ for (; j < W - 2; ++j) {
+ float val = 0;
+ for (int k = -2; k <= 2; ++k) {
+ for (int l = -2; l <= 2; ++l) {
+ val += src[i + k][j + l] * kernel[k + 2][l + 2];
+ }
+ }
+ dst[i][j] *= val;
+ }
+ }
+}
// use separated filter if the support window is small and src == dst
template void gaussHorizontal3 (T** src, T** dst, int W, int H, const float c0, const float c1)
@@ -1241,14 +1488,26 @@ template void gaussianBlurImpl(T** src, T** dst, const int W, const int
if (sigma < GAUSS_DOUBLE) {
switch (gausstype) {
case GAUSS_MULT : {
- gaussHorizontalSse (src, src, W, H, sigma);
- gaussVerticalSsemult (src, dst, W, H, sigma);
+ if (sigma < 0.84 && src != dst) {
+ gauss5x5mult(src, dst, W, H, sigma);
+ } else if (sigma < 1.15f && src != dst) {
+ gauss7x7mult(src, dst, W, H, sigma);
+ } else {
+ gaussHorizontalSse (src, src, W, H, sigma);
+ gaussVerticalSsemult (src, dst, W, H, sigma);
+ }
break;
}
case GAUSS_DIV : {
- gaussHorizontalSse (src, dst, W, H, sigma);
- gaussVerticalSsediv (dst, dst, buffer2, W, H, sigma);
+ if (sigma < 0.84f && src != dst) {
+ gauss5x5div (src, dst, buffer2, W, H, sigma);
+ } else if (sigma < 1.15f && src != dst) {
+ gauss7x7div (src, dst, buffer2, W, H, sigma);
+ } else {
+ gaussHorizontalSse (src, dst, W, H, sigma);
+ gaussVerticalSsediv (dst, dst, buffer2, W, H, sigma);
+ }
break;
}
diff --git a/rtengine/improcfun.h b/rtengine/improcfun.h
index 6fe8a785d..5cd65cad8 100644
--- a/rtengine/improcfun.h
+++ b/rtengine/improcfun.h
@@ -248,7 +248,7 @@ public:
void Lanczos(const LabImage* src, LabImage* dst, float scale);
void Lanczos(const Imagefloat* src, Imagefloat* dst, float scale);
- void deconvsharpening(float** luminance, float** buffer, float** blend, int W, int H, const procparams::SharpeningParams &sharpenParam, double Scale);
+ void deconvsharpening(float** luminance, float** buffer, const float* const * blend, int W, int H, const procparams::SharpeningParams &sharpenParam, double Scale);
void MLsharpen(LabImage* lab); // Manuel's clarity / sharpening
void MLmicrocontrast(float** luminance, int W, int H); //Manuel's microcontrast
void MLmicrocontrast(LabImage* lab); //Manuel's microcontrast
diff --git a/rtengine/ipsharpen.cc b/rtengine/ipsharpen.cc
index 897aaf7b5..1d3d6375e 100644
--- a/rtengine/ipsharpen.cc
+++ b/rtengine/ipsharpen.cc
@@ -158,7 +158,7 @@ namespace rtengine
extern const Settings* settings;
-void ImProcFunctions::deconvsharpening (float** luminance, float** tmp, float ** blend, int W, int H, const SharpeningParams &sharpenParam, double Scale)
+void ImProcFunctions::deconvsharpening (float** luminance, float** tmp, const float * const * blend, int W, int H, const SharpeningParams &sharpenParam, double Scale)
{
if (sharpenParam.deconvamount == 0 && sharpenParam.blurradius < 0.25f) {
return;
diff --git a/rtengine/rawimagesource.cc b/rtengine/rawimagesource.cc
index d947d4195..0602c770b 100644
--- a/rtengine/rawimagesource.cc
+++ b/rtengine/rawimagesource.cc
@@ -4989,20 +4989,14 @@ BENCHFUN
array2D L(W,H);
array2D YOld(W,H);
array2D YNew(W,H);
-// array2D& Y = red; // red will be overridden anyway => we can use its buffer to store Y
-// array2D& Cb = green; // green will be overridden anyway => we can use its buffer to store Cb
-// array2D& Cr = blue; // blue will be overridden anyway => we can use its buffer to store Cr
- StopWatch Stop1("rgb2Y");
+ StopWatch Stop1("rgb2YL");
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic, 16)
#endif
for (int i = 0; i < H; ++i) {
Color::RGB2L(red[i], green[i], blue[i], L[i], xyz_rgb, W);
- Color::RGB2Y(red[i], green[i], blue[i], YOld[i], sharpeningParams.gamma, W);
- for (int j = 0; j < W; ++j) {
- YNew[i][j] = YOld[i][j];
- }
+ Color::RGB2Y(red[i], green[i], blue[i], YOld[i], YNew[i], sharpeningParams.gamma, W);
}
// calculate contrast based blend factors to reduce sharpening in regions with low contrast
JaggedArray blend(W, H);
@@ -5017,11 +5011,22 @@ BENCHFUN
StopWatch Stop2("Y2RGB");
const float gamma = sharpeningParams.gamma;
#ifdef _OPENMP
- #pragma omp parallel for
+ #pragma omp parallel for schedule(dynamic, 16)
#endif
for (int i = 0; i < H; ++i) {
- for (int j = 0; j < W; ++j) {
- const float factor = pow_F(YNew[i][j] / (YOld[i][j] == 0.f ? 0.00001f : YOld[i][j]), gamma);
+ int j = 0;
+#ifdef __SSE2__
+ const vfloat gammav = F2V(gamma);
+ for (; j < W - 3; j += 4) {
+ const vfloat factor = pow_F(LVFU(YNew[i][j]) / vmaxf(LVFU(YOld[i][j]), F2V(0.00001f)), gammav);
+ STVFU(red[i][j], LVFU(red[i][j]) * factor);
+ STVFU(green[i][j], LVFU(green[i][j]) * factor);
+ STVFU(blue[i][j], LVFU(blue[i][j]) * factor);
+ }
+
+#endif
+ for (; j < W; ++j) {
+ const float factor = pow_F(YNew[i][j] / std::max(YOld[i][j], 0.00001f), gamma);
red[i][j] *= factor;
green[i][j] *= factor;
blue[i][j] *= factor;