From 9a624ca01e36208a8727f200554ed088f9f618e2 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Tue, 27 Aug 2019 13:25:34 +0200
Subject: [PATCH 01/15] Speedup for transform

---
 rtengine/improcfun.h    | 110 -------------------------------------
 rtengine/iptransform.cc | 119 +++++++++++++++++++++++++++++-----------
 2 files changed, 86 insertions(+), 143 deletions(-)

diff --git a/rtengine/improcfun.h b/rtengine/improcfun.h
index 987a460d7..338df2416 100644
--- a/rtengine/improcfun.h
+++ b/rtengine/improcfun.h
@@ -77,116 +77,6 @@ class ImProcFunctions
     bool needsLensfun();
 //   static cmsUInt8Number* Mempro = NULL;
 
-    inline void interpolateTransformCubic(Imagefloat* src, int xs, int ys, double Dx, double Dy, float *r, float *g, float *b, double mul)
-    {
-        const double A = -0.85;
-
-        double w[4];
-
-        {
-            double t1, t2;
-            t1 = -A * (Dx - 1.0) * Dx;
-            t2 = (3.0 - 2.0 * Dx) * Dx * Dx;
-            w[3] = t1 * Dx;
-            w[2] = t1 * (Dx - 1.0) + t2;
-            w[1] = -t1 * Dx + 1.0 - t2;
-            w[0] = -t1 * (Dx - 1.0);
-        }
-
-        double rd, gd, bd;
-        double yr[4] = {0.0}, yg[4] = {0.0}, yb[4] = {0.0};
-
-        for (int k = ys, kx = 0; k < ys + 4; k++, kx++) {
-            rd = gd = bd = 0.0;
-
-            for (int i = xs, ix = 0; i < xs + 4; i++, ix++) {
-                rd += src->r(k, i) * w[ix];
-                gd += src->g(k, i) * w[ix];
-                bd += src->b(k, i) * w[ix];
-            }
-
-            yr[kx] = rd;
-            yg[kx] = gd;
-            yb[kx] = bd;
-        }
-
-
-        {
-            double t1, t2;
-
-            t1 = -A * (Dy - 1.0) * Dy;
-            t2 = (3.0 - 2.0 * Dy) * Dy * Dy;
-            w[3] = t1 * Dy;
-            w[2] = t1 * (Dy - 1.0) + t2;
-            w[1] = -t1 * Dy + 1.0 - t2;
-            w[0] = -t1 * (Dy - 1.0);
-        }
-
-        rd = gd = bd = 0.0;
-
-        for (int i = 0; i < 4; i++) {
-            rd += yr[i] * w[i];
-            gd += yg[i] * w[i];
-            bd += yb[i] * w[i];
-        }
-
-        *r = rd * mul;
-        *g = gd * mul;
-        *b = bd * mul;
-
-        //  if (xs==100 && ys==100)
-        //    printf ("r=%g, g=%g\n", *r, *g);
-    }
-
-    inline void interpolateTransformChannelsCubic(float** src, int xs, int ys, double Dx, double Dy, float *r, double mul)
-    {
-        const double A = -0.85;
-
-        double w[4];
-
-        {
-            double t1, t2;
-            t1 = -A * (Dx - 1.0) * Dx;
-            t2 = (3.0 - 2.0 * Dx) * Dx * Dx;
-            w[3] = t1 * Dx;
-            w[2] = t1 * (Dx - 1.0) + t2;
-            w[1] = -t1 * Dx + 1.0 - t2;
-            w[0] = -t1 * (Dx - 1.0);
-        }
-
-        double rd;
-        double yr[4] = {0.0};
-
-        for (int k = ys, kx = 0; k < ys + 4; k++, kx++) {
-            rd = 0.0;
-
-            for (int i = xs, ix = 0; i < xs + 4; i++, ix++) {
-                rd += src[k][i] * w[ix];
-            }
-
-            yr[kx] = rd;
-        }
-
-
-        {
-            double t1, t2;
-            t1 = -A * (Dy - 1.0) * Dy;
-            t2 = (3.0 - 2.0 * Dy) * Dy * Dy;
-            w[3] = t1 * Dy;
-            w[2] = t1 * (Dy - 1.0) + t2;
-            w[1] = -t1 * Dy + 1.0 - t2;
-            w[0] = -t1 * (Dy - 1.0);
-        }
-
-        rd = 0.0;
-
-        for (int i = 0; i < 4; i++) {
-            rd += yr[i] * w[i];
-        }
-
-        *r = rd * mul;
-    }
-
 
 public:
     enum class Median {
diff --git a/rtengine/iptransform.cc b/rtengine/iptransform.cc
index 5b1e7c458..1d7ac9d56 100644
--- a/rtengine/iptransform.cc
+++ b/rtengine/iptransform.cc
@@ -26,7 +26,8 @@
 #include "rt_math.h"
 #include "sleef.c"
 #include "rtlensfun.h"
-
+#define BENCHMARK
+#include "StopWatch.h"
 
 using namespace std;
 
@@ -87,6 +88,66 @@ float normn (float a, float b, int n)
     }
 }
 
+inline void interpolateTransformCubic(rtengine::Imagefloat* src, int xs, int ys, float Dx, float Dy, float &r, float &g, float &b, float mul)
+{
+    constexpr float A = -0.85f;
+
+    // Vertical
+    const float t1Vert = A * (Dy - Dy * Dy);
+    const float t2Vert = (3.f - 2.f * Dy) * Dy * Dy;
+    const float w3Vert = t1Vert * Dy;
+    const float w2Vert = t1Vert * Dy - t1Vert + t2Vert;
+    const float w1Vert = 1.f - (t1Vert * Dy) - t2Vert;
+    const float w0Vert = t1Vert - (t1Vert * Dy);
+
+    float rv[4], gv[4], bv[4];
+    for (int i = 0; i < 4; ++i) {
+        rv[i] = w0Vert * src->r(ys, xs + i) + w1Vert * src->r(ys + 1, xs + i) + w2Vert * src->r(ys + 2, xs + i) + w3Vert * src->r(ys + 3, xs + i);
+        gv[i] = w0Vert * src->g(ys, xs + i) + w1Vert * src->g(ys + 1, xs + i) + w2Vert * src->g(ys + 2, xs + i) + w3Vert * src->g(ys + 3, xs + i);
+        bv[i] = w0Vert * src->b(ys, xs + i) + w1Vert * src->b(ys + 1, xs + i) + w2Vert * src->b(ys + 2, xs + i) + w3Vert * src->b(ys + 3, xs + i);
+    }
+
+    // Horizontal
+    const float t1Hor = A * (Dx - Dx * Dx);
+    const float t2Hor = (3.f - 2.f * Dx) * Dx * Dx;
+    const float w3Hor = t1Hor * Dx;
+    const float w2Hor = t1Hor * Dx - t1Hor + t2Hor;
+    const float w1Hor = 1.f - (t1Hor * Dx) - t2Hor;
+    const float w0Hor = t1Hor - (t1Hor * Dx);
+
+    r = mul * (rv[0] * w0Hor + rv[1] * w1Hor + rv[2] * w2Hor + rv[3] * w3Hor);
+    g = mul * (gv[0] * w0Hor + gv[1] * w1Hor + gv[2] * w2Hor + gv[3] * w3Hor);
+    b = mul * (bv[0] * w0Hor + bv[1] * w1Hor + bv[2] * w2Hor + bv[3] * w3Hor);
+}
+
+inline void interpolateTransformChannelsCubic(const float* const * src, int xs, int ys, float Dx, float Dy, float &dest, float mul)
+{
+    constexpr float A = -0.85f;
+
+    // Vertical
+    const float t1Vert = A * (Dy - Dy * Dy);
+    const float t2Vert = (3.f - 2.f * Dy) * Dy * Dy;
+    const float w3Vert = t1Vert * Dy;
+    const float w2Vert = t1Vert * Dy - t1Vert + t2Vert;
+    const float w1Vert = 1.f - (t1Vert * Dy) - t2Vert;
+    const float w0Vert = t1Vert - (t1Vert * Dy);
+
+    float cv[4];
+    for (int i = 0; i < 4; ++i) {
+        cv[i] = w0Vert * src[ys][xs + i] + w1Vert * src[ys + 1][xs + i] + w2Vert * src[ys + 2][xs + i] + w3Vert * src[ys + 3][xs + i];
+    }
+
+    // Horizontal
+    const float t1Hor = A * (Dx - Dx * Dx);
+    const float t2Hor = (3.f - 2.f * Dx) * Dx * Dx;
+    const float w3Hor = t1Hor * Dx;
+    const float w2Hor = t1Hor * Dx - t1Hor + t2Hor;
+    const float w1Hor = 1.f - (t1Hor * Dx) - t2Hor;
+    const float w0Hor = t1Hor - (t1Hor * Dx);
+
+    dest = mul * (cv[0] * w0Hor + cv[1] * w1Hor + cv[2] * w2Hor + cv[3] * w3Hor);
+}
+
 
 }
 
@@ -740,17 +801,18 @@ void ImProcFunctions::transformLuminanceOnly (Imagefloat* original, Imagefloat*
 
 void ImProcFunctions::transformGeneral(bool highQuality, Imagefloat *original, Imagefloat *transformed, int cx, int cy, int sx, int sy, int oW, int oH, int fW, int fH, const LensCorrection *pLCPMap)
 {
+    BENCHFUN
     // set up stuff, depending on the mode we are
-    bool enableLCPDist = pLCPMap && params->lensProf.useDist;
-    bool enableCA = highQuality && needsCA();
-    bool enableGradient = needsGradient();
-    bool enablePCVignetting = needsPCVignetting();
-    bool enableVignetting = needsVignetting();
-    bool enablePerspective = needsPerspective();
-    bool enableDistortion = needsDistortion();
+    const bool enableLCPDist = pLCPMap && params->lensProf.useDist;
+    const bool enableCA = highQuality && needsCA();
+    const bool enableGradient = needsGradient();
+    const bool enablePCVignetting = needsPCVignetting();
+    const bool enableVignetting = needsVignetting();
+    const bool enablePerspective = needsPerspective();
+    const bool enableDistortion = needsDistortion();
 
-    double w2 = (double) oW  / 2.0 - 0.5;
-    double h2 = (double) oH  / 2.0 - 0.5;
+    const double w2 = (double) oW  / 2.0 - 0.5;
+    const double h2 = (double) oH  / 2.0 - 0.5;
 
     double vig_w2, vig_h2, maxRadius, v, b, mul;
     calcVignettingParams (oW, oH, params->vignetting, vig_w2, vig_h2, maxRadius, v, b, mul);
@@ -784,11 +846,11 @@ void ImProcFunctions::transformGeneral(bool highQuality, Imagefloat *original, I
     chDist[2] = enableCA ? params->cacorrection.blue : 0.0;
 
     // auxiliary variables for distortion correction
-    double distAmount = params->distortion.amount;
+    const double distAmount = params->distortion.amount;
 
     // auxiliary variables for rotation
-    double cost = cos (params->rotate.degree * rtengine::RT_PI / 180.0);
-    double sint = sin (params->rotate.degree * rtengine::RT_PI / 180.0);
+    const double cost = cos (params->rotate.degree * rtengine::RT_PI / 180.0);
+    const double sint = sin (params->rotate.degree * rtengine::RT_PI / 180.0);
 
     // auxiliary variables for vertical perspective correction
     double vpdeg = params->perspective.vertical / 100.0 * 45.0;
@@ -814,9 +876,10 @@ void ImProcFunctions::transformGeneral(bool highQuality, Imagefloat *original, I
 #pragma GCC diagnostic pop
 #endif
     // main cycle
-    bool darkening = (params->vignetting.amount <= 0.0);
+    const bool darkening = (params->vignetting.amount <= 0.0);
+
 #ifdef _OPENMP
-    #pragma omp parallel for if (multiThread)
+    #pragma omp parallel for schedule(dynamic, 16) if(multiThread)
 #endif
 
     for (int y = 0; y < transformed->getHeight(); y++) {
@@ -833,13 +896,6 @@ void ImProcFunctions::transformGeneral(bool highQuality, Imagefloat *original, I
             x_d += ascale * (cx - w2);     // centering x coord & scale
             y_d += ascale * (cy - h2);     // centering y coord & scale
 
-            double vig_x_d = 0., vig_y_d = 0.;
-
-            if (enableVignetting) {
-                vig_x_d = ascale * (x + cx - vig_w2);       // centering x coord & scale
-                vig_y_d = ascale * (y + cy - vig_h2);       // centering y coord & scale
-            }
-
             if (enablePerspective) {
                 // horizontal perspective transformation
                 y_d *= maxRadius / (maxRadius + x_d * hptanpt);
@@ -862,14 +918,6 @@ void ImProcFunctions::transformGeneral(bool highQuality, Imagefloat *original, I
                 s = 1.0 - distAmount + distAmount * r ;
             }
 
-            double r2 = 0.;
-
-            if (enableVignetting) {
-                double vig_Dx = vig_x_d * cost - vig_y_d * sint;
-                double vig_Dy = vig_x_d * sint + vig_y_d * cost;
-                r2 = sqrt (vig_Dx * vig_Dx + vig_Dy * vig_Dy);
-            }
-
             for (int c = 0; c < (enableCA ? 3 : 1); c++) {
                 double Dx = Dxc * (s + chDist[c]);
                 double Dy = Dyc * (s + chDist[c]);
@@ -893,6 +941,11 @@ void ImProcFunctions::transformGeneral(bool highQuality, Imagefloat *original, I
                     double vignmul = 1.0;
 
                     if (enableVignetting) {
+                        const double vig_x_d = ascale * (x + cx - vig_w2);       // centering x coord & scale
+                        const double vig_y_d = ascale * (y + cy - vig_h2);       // centering y coord & scale
+                        const double vig_Dx = vig_x_d * cost - vig_y_d * sint;
+                        const double vig_Dy = vig_x_d * sint + vig_y_d * cost;
+                        const double r2 = sqrt (vig_Dx * vig_Dx + vig_Dy * vig_Dy);
                         if (darkening) {
                             vignmul /= std::max (v + mul * tanh (b * (maxRadius - s * r2) / maxRadius), 0.001);
                         } else {
@@ -911,13 +964,13 @@ void ImProcFunctions::transformGeneral(bool highQuality, Imagefloat *original, I
                     if (yc > 0 && yc < original->getHeight() - 2 && xc > 0 && xc < original->getWidth() - 2) {
                         // all interpolation pixels inside image
                         if (enableCA) {
-                            interpolateTransformChannelsCubic (chOrig[c], xc - 1, yc - 1, Dx, Dy, & (chTrans[c][y][x]), vignmul);
+                            interpolateTransformChannelsCubic (chOrig[c], xc - 1, yc - 1, Dx, Dy, chTrans[c][y][x], vignmul);
                         } else if (!highQuality) {
                             transformed->r (y, x) = vignmul * (original->r (yc, xc) * (1.0 - Dx) * (1.0 - Dy) + original->r (yc, xc + 1) * Dx * (1.0 - Dy) + original->r (yc + 1, xc) * (1.0 - Dx) * Dy + original->r (yc + 1, xc + 1) * Dx * Dy);
                             transformed->g (y, x) = vignmul * (original->g (yc, xc) * (1.0 - Dx) * (1.0 - Dy) + original->g (yc, xc + 1) * Dx * (1.0 - Dy) + original->g (yc + 1, xc) * (1.0 - Dx) * Dy + original->g (yc + 1, xc + 1) * Dx * Dy);
                             transformed->b (y, x) = vignmul * (original->b (yc, xc) * (1.0 - Dx) * (1.0 - Dy) + original->b (yc, xc + 1) * Dx * (1.0 - Dy) + original->b (yc + 1, xc) * (1.0 - Dx) * Dy + original->b (yc + 1, xc + 1) * Dx * Dy);
                         } else {
-                            interpolateTransformCubic (original, xc - 1, yc - 1, Dx, Dy, & (transformed->r (y, x)), & (transformed->g (y, x)), & (transformed->b (y, x)), vignmul);
+                            interpolateTransformCubic (original, xc - 1, yc - 1, Dx, Dy, transformed->r (y, x), transformed->g (y, x), transformed->b (y, x), vignmul);
                         }
                     } else {
                         // edge pixels
@@ -988,7 +1041,7 @@ void ImProcFunctions::transformLCPCAOnly(Imagefloat *original, Imagefloat *trans
                     // multiplier for vignetting correction
                     if (yc > 0 && yc < original->getHeight() - 2 && xc > 0 && xc < original->getWidth() - 2) {
                         // all interpolation pixels inside image
-                        interpolateTransformChannelsCubic (chOrig[c], xc - 1, yc - 1, Dx, Dy, & (chTrans[c][y][x]), 1.0);
+                        interpolateTransformChannelsCubic (chOrig[c], xc - 1, yc - 1, Dx, Dy, chTrans[c][y][x], 1.0);
                     } else {
                         // edge pixels
                         int y1 = LIM (yc,   0, original->getHeight() - 1);

From 5a5952dddbbeb5ba901950e43592aea338d22e60 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Tue, 27 Aug 2019 13:36:52 +0200
Subject: [PATCH 02/15] added two comments

---
 rtengine/iptransform.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/rtengine/iptransform.cc b/rtengine/iptransform.cc
index 1d7ac9d56..9484599d6 100644
--- a/rtengine/iptransform.cc
+++ b/rtengine/iptransform.cc
@@ -90,6 +90,7 @@ float normn (float a, float b, int n)
 
 inline void interpolateTransformCubic(rtengine::Imagefloat* src, int xs, int ys, float Dx, float Dy, float &r, float &g, float &b, float mul)
 {
+    // I tried hand written SSE code but gcc vectorizes better
     constexpr float A = -0.85f;
 
     // Vertical
@@ -122,6 +123,7 @@ inline void interpolateTransformCubic(rtengine::Imagefloat* src, int xs, int ys,
 
 inline void interpolateTransformChannelsCubic(const float* const * src, int xs, int ys, float Dx, float Dy, float &dest, float mul)
 {
+    // I tried hand written SSE code but gcc vectorizes better
     constexpr float A = -0.85f;
 
     // Vertical

From 4312e682657590ef8c30f4056baa49613bdd16ed Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Tue, 27 Aug 2019 19:59:10 +0200
Subject: [PATCH 03/15] SSE code for interpolateTransformCubic and
 interpolateTransformChannelsCubic, also some cleanups

---
 rtengine/iptransform.cc | 165 ++++++++++++++++++++++++----------------
 1 file changed, 98 insertions(+), 67 deletions(-)

diff --git a/rtengine/iptransform.cc b/rtengine/iptransform.cc
index 9484599d6..608fa0337 100644
--- a/rtengine/iptransform.cc
+++ b/rtengine/iptransform.cc
@@ -88,9 +88,34 @@ float normn (float a, float b, int n)
     }
 }
 
+#ifdef __SSE2__
+inline void interpolateTransformCubic(rtengine::Imagefloat* src, int xs, int ys, float Dx, float Dy, float &r, float &g, float &b, float mul)
+{
+    constexpr float A = -0.85f;
+
+    // Vertical
+    const float t1Vert = A * (Dy - Dy * Dy);
+    const float t2Vert = (3.f - 2.f * Dy) * Dy * Dy;
+    const vfloat w3Vert = F2V(t1Vert * Dy);
+    const vfloat w2Vert = F2V(t1Vert * Dy - t1Vert + t2Vert);
+    const vfloat w1Vert = F2V(1.f - (t1Vert * Dy) - t2Vert);
+    const vfloat w0Vert = F2V(t1Vert - (t1Vert * Dy));
+
+    const vfloat rv = (w0Vert * LVFU(src->r(ys, xs)) + w1Vert * LVFU(src->r(ys + 1, xs))) + (w2Vert * LVFU(src->r(ys + 2, xs)) + w3Vert * LVFU(src->r(ys + 3, xs)));
+    const vfloat gv = (w0Vert * LVFU(src->g(ys, xs)) + w1Vert * LVFU(src->g(ys + 1, xs))) + (w2Vert * LVFU(src->g(ys + 2, xs)) + w3Vert * LVFU(src->g(ys + 3, xs)));
+    const vfloat bv = (w0Vert * LVFU(src->b(ys, xs)) + w1Vert * LVFU(src->b(ys + 1, xs))) + (w2Vert * LVFU(src->b(ys + 2, xs)) + w3Vert * LVFU(src->b(ys + 3, xs)));
+
+    // Horizontal
+    const float t1Hor = A * (Dx - Dx * Dx);
+    const float t2Hor = (3.f - 2.f * Dx) * Dx * Dx;
+    const vfloat weight = _mm_set_ps(t1Hor * Dx, t1Hor * Dx - t1Hor + t2Hor, 1.f - (t1Hor * Dx) - t2Hor, t1Hor - (t1Hor * Dx)) * F2V(mul);
+    r = vhadd(weight * rv);
+    g = vhadd(weight * gv);
+    b = vhadd(weight * bv);
+}
+#else
 inline void interpolateTransformCubic(rtengine::Imagefloat* src, int xs, int ys, float Dx, float Dy, float &r, float &g, float &b, float mul)
 {
-    // I tried hand written SSE code but gcc vectorizes better
     constexpr float A = -0.85f;
 
     // Vertical
@@ -120,10 +145,31 @@ inline void interpolateTransformCubic(rtengine::Imagefloat* src, int xs, int ys,
     g = mul * (gv[0] * w0Hor + gv[1] * w1Hor + gv[2] * w2Hor + gv[3] * w3Hor);
     b = mul * (bv[0] * w0Hor + bv[1] * w1Hor + bv[2] * w2Hor + bv[3] * w3Hor);
 }
-
+#endif
+#ifdef __SSE2__
+inline void interpolateTransformChannelsCubic(const float* const * src, int xs, int ys, float Dx, float Dy, float &dest, float mul)
+{
+    constexpr float A = -0.85f;
+
+    // Vertical
+    const float t1Vert = A * (Dy - Dy * Dy);
+    const float t2Vert = (3.f - 2.f * Dy) * Dy * Dy;
+    const vfloat w3Vert = F2V(t1Vert * Dy);
+    const vfloat w2Vert = F2V(t1Vert * Dy - t1Vert + t2Vert);
+    const vfloat w1Vert = F2V(1.f - (t1Vert * Dy) - t2Vert);
+    const vfloat w0Vert = F2V(t1Vert - (t1Vert * Dy));
+
+    const vfloat cv = (w0Vert * LVFU(src[ys][xs]) + w1Vert * LVFU(src[ys + 1][xs])) + (w2Vert * LVFU(src[ys + 2][xs]) + w3Vert * LVFU(src[ys + 3][xs]));
+
+    // Horizontal
+    const float t1Hor = A * (Dx - Dx * Dx);
+    const float t2Hor = (3.f - 2.f * Dx) * Dx * Dx;
+    const vfloat weight = _mm_set_ps(t1Hor * Dx, t1Hor * Dx - t1Hor + t2Hor, 1.f - (t1Hor * Dx) - t2Hor, t1Hor - (t1Hor * Dx));
+    dest = mul * vhadd(weight * cv);
+}
+#else
 inline void interpolateTransformChannelsCubic(const float* const * src, int xs, int ys, float Dx, float Dy, float &dest, float mul)
 {
-    // I tried hand written SSE code but gcc vectorizes better
     constexpr float A = -0.85f;
 
     // Vertical
@@ -149,7 +195,7 @@ inline void interpolateTransformChannelsCubic(const float* const * src, int xs,
 
     dest = mul * (cv[0] * w0Hor + cv[1] * w1Hor + cv[2] * w2Hor + cv[3] * w3Hor);
 }
-
+#endif
 
 }
 
@@ -817,69 +863,54 @@ void ImProcFunctions::transformGeneral(bool highQuality, Imagefloat *original, I
     const double h2 = (double) oH  / 2.0 - 0.5;
 
     double vig_w2, vig_h2, maxRadius, v, b, mul;
-    calcVignettingParams (oW, oH, params->vignetting, vig_w2, vig_h2, maxRadius, v, b, mul);
+    calcVignettingParams(oW, oH, params->vignetting, vig_w2, vig_h2, maxRadius, v, b, mul);
 
     struct grad_params gp;
 
     if (enableGradient) {
-        calcGradientParams (oW, oH, params->gradient, gp);
+        calcGradientParams(oW, oH, params->gradient, gp);
     }
 
     struct pcv_params pcv;
 
     if (enablePCVignetting) {
-        calcPCVignetteParams (fW, fH, oW, oH, params->pcvignette, params->crop, pcv);
+        calcPCVignetteParams(fW, fH, oW, oH, params->pcvignette, params->crop, pcv);
     }
 
-    float** chOrig[3];
-    chOrig[0] = original->r.ptrs;
-    chOrig[1] = original->g.ptrs;
-    chOrig[2] = original->b.ptrs;
-
-    float** chTrans[3];
-    chTrans[0] = transformed->r.ptrs;
-    chTrans[1] = transformed->g.ptrs;
-    chTrans[2] = transformed->b.ptrs;
+    float** chOrig[3] = {original->r.ptrs, original->g.ptrs, original->b.ptrs};
+    float** chTrans[3] = {transformed->r.ptrs, transformed->g.ptrs, transformed->b.ptrs};
 
     // auxiliary variables for c/a correction
-    double chDist[3];
-    chDist[0] = enableCA ? params->cacorrection.red : 0.0;
-    chDist[1] = 0.0;
-    chDist[2] = enableCA ? params->cacorrection.blue : 0.0;
+    const double chDist[3] = {enableCA ? params->cacorrection.red : 0.0, 0.0, enableCA ? params->cacorrection.blue : 0.0};
 
     // auxiliary variables for distortion correction
     const double distAmount = params->distortion.amount;
 
     // auxiliary variables for rotation
-    const double cost = cos (params->rotate.degree * rtengine::RT_PI / 180.0);
-    const double sint = sin (params->rotate.degree * rtengine::RT_PI / 180.0);
+    const double cost = cos(params->rotate.degree * rtengine::RT_PI / 180.0);
+    const double sint = sin(params->rotate.degree * rtengine::RT_PI / 180.0);
 
     // auxiliary variables for vertical perspective correction
-    double vpdeg = params->perspective.vertical / 100.0 * 45.0;
-    double vpalpha = (90.0 - vpdeg) / 180.0 * rtengine::RT_PI;
-    double vpteta  = fabs (vpalpha - rtengine::RT_PI / 2) < 3e-4 ? 0.0 : acos ((vpdeg > 0 ? 1.0 : -1.0) * sqrt ((-SQR (oW * tan (vpalpha)) + (vpdeg > 0 ? 1.0 : -1.0) *
-                     oW * tan (vpalpha) * sqrt (SQR (4 * maxRadius) + SQR (oW * tan (vpalpha)))) / (SQR (maxRadius) * 8)));
-    double vpcospt = (vpdeg >= 0 ? 1.0 : -1.0) * cos (vpteta), vptanpt = tan (vpteta);
+    const double vpdeg = params->perspective.vertical / 100.0 * 45.0;
+    const double vpalpha = (90.0 - vpdeg) / 180.0 * rtengine::RT_PI;
+    const double vpteta  = fabs(vpalpha - rtengine::RT_PI / 2) < 3e-4 ? 0.0 : acos((vpdeg > 0 ? 1.0 : -1.0) * sqrt((-SQR(oW * tan(vpalpha)) + (vpdeg > 0 ? 1.0 : -1.0) *
+                           oW * tan(vpalpha) * sqrt(SQR(4 * maxRadius) + SQR(oW * tan(vpalpha)))) / (SQR(maxRadius) * 8)));
+    const double vpcospt = (vpdeg >= 0 ? 1.0 : -1.0) * cos(vpteta), vptanpt = tan(vpteta);
 
     // auxiliary variables for horizontal perspective correction
-    double hpdeg = params->perspective.horizontal / 100.0 * 45.0;
-    double hpalpha = (90.0 - hpdeg) / 180.0 * rtengine::RT_PI;
-    double hpteta  = fabs (hpalpha - rtengine::RT_PI / 2) < 3e-4 ? 0.0 : acos ((hpdeg > 0 ? 1.0 : -1.0) * sqrt ((-SQR (oH * tan (hpalpha)) + (hpdeg > 0 ? 1.0 : -1.0) *
-                     oH * tan (hpalpha) * sqrt (SQR (4 * maxRadius) + SQR (oH * tan (hpalpha)))) / (SQR (maxRadius) * 8)));
-    double hpcospt = (hpdeg >= 0 ? 1.0 : -1.0) * cos (hpteta), hptanpt = tan (hpteta);
+    const double hpdeg = params->perspective.horizontal / 100.0 * 45.0;
+    const double hpalpha = (90.0 - hpdeg) / 180.0 * rtengine::RT_PI;
+    const double hpteta  = fabs(hpalpha - rtengine::RT_PI / 2) < 3e-4 ? 0.0 : acos((hpdeg > 0 ? 1.0 : -1.0) * sqrt((-SQR(oH * tan(hpalpha)) + (hpdeg > 0 ? 1.0 : -1.0) *
+                           oH * tan(hpalpha) * sqrt(SQR(4 * maxRadius) + SQR(oH * tan(hpalpha)))) / (SQR(maxRadius) * 8)));
+    const double hpcospt = (hpdeg >= 0 ? 1.0 : -1.0) * cos(hpteta), hptanpt = tan(hpteta);
 
-    double ascale = params->commonTrans.autofill ? getTransformAutoFill (oW, oH, pLCPMap) : 1.0;
+    const double ascale = params->commonTrans.autofill ? getTransformAutoFill(oW, oH, pLCPMap) : 1.0;
 
-#if defined( __GNUC__ ) && __GNUC__ >= 7// silence warning
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
-#endif
-#if defined( __GNUC__ ) && __GNUC__ >= 7
-#pragma GCC diagnostic pop
-#endif
-    // main cycle
     const bool darkening = (params->vignetting.amount <= 0.0);
+    const double centerFactorx = cx - w2;
+    const double centerFactory = cy - h2;
 
+    // main cycle
 #ifdef _OPENMP
     #pragma omp parallel for schedule(dynamic, 16) if(multiThread)
 #endif
@@ -895,8 +926,8 @@ void ImProcFunctions::transformGeneral(bool highQuality, Imagefloat *original, I
                 y_d *= ascale;
             }
 
-            x_d += ascale * (cx - w2);     // centering x coord & scale
-            y_d += ascale * (cy - h2);     // centering y coord & scale
+            x_d += ascale * centerFactorx; // centering x coord & scale
+            y_d += ascale * centerFactory; // centering y coord & scale
 
             if (enablePerspective) {
                 // horizontal perspective transformation
@@ -909,15 +940,15 @@ void ImProcFunctions::transformGeneral(bool highQuality, Imagefloat *original, I
             }
 
             // rotate
-            double Dxc = x_d * cost - y_d * sint;
-            double Dyc = x_d * sint + y_d * cost;
+            const double Dxc = x_d * cost - y_d * sint;
+            const double Dyc = x_d * sint + y_d * cost;
 
             // distortion correction
             double s = 1;
 
             if (enableDistortion) {
-                double r = sqrt (Dxc * Dxc + Dyc * Dyc) / maxRadius; // sqrt is slow
-                s = 1.0 - distAmount + distAmount * r ;
+                double r = sqrt(Dxc * Dxc + Dyc * Dyc) / maxRadius;
+                s = 1.0 - distAmount + distAmount * r;
             }
 
             for (int c = 0; c < (enableCA ? 3 : 1); c++) {
@@ -947,46 +978,46 @@ void ImProcFunctions::transformGeneral(bool highQuality, Imagefloat *original, I
                         const double vig_y_d = ascale * (y + cy - vig_h2);       // centering y coord & scale
                         const double vig_Dx = vig_x_d * cost - vig_y_d * sint;
                         const double vig_Dy = vig_x_d * sint + vig_y_d * cost;
-                        const double r2 = sqrt (vig_Dx * vig_Dx + vig_Dy * vig_Dy);
+                        const double r2 = sqrt(vig_Dx * vig_Dx + vig_Dy * vig_Dy);
                         if (darkening) {
-                            vignmul /= std::max (v + mul * tanh (b * (maxRadius - s * r2) / maxRadius), 0.001);
+                            vignmul /= std::max(v + mul * tanh(b * (maxRadius - s * r2) / maxRadius), 0.001);
                         } else {
-                            vignmul *= (v + mul * tanh (b * (maxRadius - s * r2) / maxRadius));
+                            vignmul *= (v + mul * tanh(b * (maxRadius - s * r2) / maxRadius));
                         }
                     }
 
                     if (enableGradient) {
-                        vignmul *= calcGradientFactor (gp, cx + x, cy + y);
+                        vignmul *= calcGradientFactor(gp, cx + x, cy + y);
                     }
 
                     if (enablePCVignetting) {
-                        vignmul *= calcPCVignetteFactor (pcv, cx + x, cy + y);
+                        vignmul *= calcPCVignetteFactor(pcv, cx + x, cy + y);
                     }
 
                     if (yc > 0 && yc < original->getHeight() - 2 && xc > 0 && xc < original->getWidth() - 2) {
                         // all interpolation pixels inside image
                         if (enableCA) {
-                            interpolateTransformChannelsCubic (chOrig[c], xc - 1, yc - 1, Dx, Dy, chTrans[c][y][x], vignmul);
+                            interpolateTransformChannelsCubic(chOrig[c], xc - 1, yc - 1, Dx, Dy, chTrans[c][y][x], vignmul);
                         } else if (!highQuality) {
-                            transformed->r (y, x) = vignmul * (original->r (yc, xc) * (1.0 - Dx) * (1.0 - Dy) + original->r (yc, xc + 1) * Dx * (1.0 - Dy) + original->r (yc + 1, xc) * (1.0 - Dx) * Dy + original->r (yc + 1, xc + 1) * Dx * Dy);
-                            transformed->g (y, x) = vignmul * (original->g (yc, xc) * (1.0 - Dx) * (1.0 - Dy) + original->g (yc, xc + 1) * Dx * (1.0 - Dy) + original->g (yc + 1, xc) * (1.0 - Dx) * Dy + original->g (yc + 1, xc + 1) * Dx * Dy);
-                            transformed->b (y, x) = vignmul * (original->b (yc, xc) * (1.0 - Dx) * (1.0 - Dy) + original->b (yc, xc + 1) * Dx * (1.0 - Dy) + original->b (yc + 1, xc) * (1.0 - Dx) * Dy + original->b (yc + 1, xc + 1) * Dx * Dy);
+                            transformed->r(y, x) = vignmul * (original->r(yc, xc) * (1.0 - Dx) * (1.0 - Dy) + original->r(yc, xc + 1) * Dx * (1.0 - Dy) + original->r(yc + 1, xc) * (1.0 - Dx) * Dy + original->r(yc + 1, xc + 1) * Dx * Dy);
+                            transformed->g(y, x) = vignmul * (original->g(yc, xc) * (1.0 - Dx) * (1.0 - Dy) + original->g(yc, xc + 1) * Dx * (1.0 - Dy) + original->g(yc + 1, xc) * (1.0 - Dx) * Dy + original->g(yc + 1, xc + 1) * Dx * Dy);
+                            transformed->b(y, x) = vignmul * (original->b(yc, xc) * (1.0 - Dx) * (1.0 - Dy) + original->b(yc, xc + 1) * Dx * (1.0 - Dy) + original->b(yc + 1, xc) * (1.0 - Dx) * Dy + original->b(yc + 1, xc + 1) * Dx * Dy);
                         } else {
-                            interpolateTransformCubic (original, xc - 1, yc - 1, Dx, Dy, transformed->r (y, x), transformed->g (y, x), transformed->b (y, x), vignmul);
+                            interpolateTransformCubic(original, xc - 1, yc - 1, Dx, Dy, transformed->r(y, x), transformed->g(y, x), transformed->b(y, x), vignmul);
                         }
                     } else {
                         // edge pixels
-                        int y1 = LIM (yc,   0, original->getHeight() - 1);
-                        int y2 = LIM (yc + 1, 0, original->getHeight() - 1);
-                        int x1 = LIM (xc,   0, original->getWidth() - 1);
-                        int x2 = LIM (xc + 1, 0, original->getWidth() - 1);
+                        int y1 = LIM(yc,   0, original->getHeight() - 1);
+                        int y2 = LIM(yc + 1, 0, original->getHeight() - 1);
+                        int x1 = LIM(xc,   0, original->getWidth() - 1);
+                        int x2 = LIM(xc + 1, 0, original->getWidth() - 1);
 
                         if (enableCA) {
                             chTrans[c][y][x] = vignmul * (chOrig[c][y1][x1] * (1.0 - Dx) * (1.0 - Dy) + chOrig[c][y1][x2] * Dx * (1.0 - Dy) + chOrig[c][y2][x1] * (1.0 - Dx) * Dy + chOrig[c][y2][x2] * Dx * Dy);
                         } else {
-                            transformed->r (y, x) = vignmul * (original->r (y1, x1) * (1.0 - Dx) * (1.0 - Dy) + original->r (y1, x2) * Dx * (1.0 - Dy) + original->r (y2, x1) * (1.0 - Dx) * Dy + original->r (y2, x2) * Dx * Dy);
-                            transformed->g (y, x) = vignmul * (original->g (y1, x1) * (1.0 - Dx) * (1.0 - Dy) + original->g (y1, x2) * Dx * (1.0 - Dy) + original->g (y2, x1) * (1.0 - Dx) * Dy + original->g (y2, x2) * Dx * Dy);
-                            transformed->b (y, x) = vignmul * (original->b (y1, x1) * (1.0 - Dx) * (1.0 - Dy) + original->b (y1, x2) * Dx * (1.0 - Dy) + original->b (y2, x1) * (1.0 - Dx) * Dy + original->b (y2, x2) * Dx * Dy);
+                            transformed->r(y, x) = vignmul * (original->r(y1, x1) * (1.0 - Dx) * (1.0 - Dy) + original->r(y1, x2) * Dx * (1.0 - Dy) + original->r(y2, x1) * (1.0 - Dx) * Dy + original->r(y2, x2) * Dx * Dy);
+                            transformed->g(y, x) = vignmul * (original->g(y1, x1) * (1.0 - Dx) * (1.0 - Dy) + original->g(y1, x2) * Dx * (1.0 - Dy) + original->g(y2, x1) * (1.0 - Dx) * Dy + original->g(y2, x2) * Dx * Dy);
+                            transformed->b(y, x) = vignmul * (original->b(y1, x1) * (1.0 - Dx) * (1.0 - Dy) + original->b(y1, x2) * Dx * (1.0 - Dy) + original->b(y2, x1) * (1.0 - Dx) * Dy + original->b(y2, x2) * Dx * Dy);
                         }
                     }
                 } else {
@@ -994,9 +1025,9 @@ void ImProcFunctions::transformGeneral(bool highQuality, Imagefloat *original, I
                         // not valid (source pixel x,y not inside source image, etc.)
                         chTrans[c][y][x] = 0;
                     } else {
-                        transformed->r (y, x) = 0;
-                        transformed->g (y, x) = 0;
-                        transformed->b (y, x) = 0;
+                        transformed->r(y, x) = 0;
+                        transformed->g(y, x) = 0;
+                        transformed->b(y, x) = 0;
                     }
                 }
             }

From 2e34aa8a1de8c0f8c0786b050fad6e1bdea63e22 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Tue, 10 Sep 2019 23:23:37 +0200
Subject: [PATCH 04/15] Soft Light: speedup and reduce memory usage

---
 rtengine/ipsoftlight.cc | 115 +++++++++++++++++++++++++++++++---------
 1 file changed, 90 insertions(+), 25 deletions(-)

diff --git a/rtengine/ipsoftlight.cc b/rtengine/ipsoftlight.cc
index c7a4d1af7..1d94a29c2 100644
--- a/rtengine/ipsoftlight.cc
+++ b/rtengine/ipsoftlight.cc
@@ -3,6 +3,7 @@
  *  This file is part of RawTherapee.
  *
  *  Copyright 2018 Alberto Griggio <alberto.griggio@gmail.com>
+  * Optimized 2019 Ingo Weyrich <heckflosse67@gmx.de>
  *
  *  RawTherapee is free software: you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -18,14 +19,10 @@
  *  along with RawTherapee.  If not, see <https://www.gnu.org/licenses/>.
  */
 
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
 #include "improcfun.h"
-
 #include "procparams.h"
-
+#define BENCHMARK
+#include "StopWatch.h"
 namespace rtengine {
 
 namespace {
@@ -33,18 +30,29 @@ namespace {
 inline float sl(float blend, float x)
 {
     if (!OOG(x)) {
-        const float orig = 1.f - blend;
         float v = Color::gamma_srgb(x) / MAXVALF;
-        // Pegtop's formula from
+        // using Pegtop's formula from
         // https://en.wikipedia.org/wiki/Blend_modes#Soft_Light
-        float v2 = v * v;
-        float v22 = v2 * 2.f;
-        v = v2 + v22 - v22 * v;
-        x = blend * Color::igamma_srgb(v * MAXVALF) + orig * x;
+        // const float orig = 1.f - blend;
+        // float v2 = v * v;
+        // float v22 = v2 * 2.f;
+        // v = v2 + v22 - v22 * v;
+        // return blend * Color::igamma_srgb(v * MAXVALF) + orig * x;
+
+        // using optimized formula (heckflosse67@gmx.de)
+        return intp(blend, Color::igamma_srgb(v * v * (3.f - 2.f * v) * MAXVALF), x);
     }
     return x;
 }
 
+#ifdef __SSE2__
+inline vfloat sl(vfloat blend, vfloat x)
+{
+    const vfloat v = Color::gammatab_srgb[x] / F2V(MAXVALF);
+    return vself(vmaskf_gt(x, F2V(MAXVALF)), x, vself(vmaskf_lt(x, ZEROV), x, vintpf(blend, Color::igammatab_srgb[v * v * (F2V(3.f) - (v + v)) * MAXVALF], x)));
+}
+#endif
+
 } // namespace
 
 
@@ -53,24 +61,81 @@ void ImProcFunctions::softLight(LabImage *lab)
     if (!params->softlight.enabled || !params->softlight.strength) {
         return;
     }
+    BENCHFUN
 
-    Imagefloat working(lab->W, lab->H);
-    lab2rgb(*lab, working, params->icm.workingProfile);
+    TMatrix wprof = ICCStore::getInstance()->workingSpaceMatrix(params->icm.workingProfile);
+    const float wp[3][3] = {
+        {static_cast<float> (wprof[0][0]), static_cast<float> (wprof[0][1]), static_cast<float> (wprof[0][2])},
+        {static_cast<float> (wprof[1][0]), static_cast<float> (wprof[1][1]), static_cast<float> (wprof[1][2])},
+        {static_cast<float> (wprof[2][0]), static_cast<float> (wprof[2][1]), static_cast<float> (wprof[2][2])}
+    };
 
-    const float blend = params->softlight.strength / 100.f;
+    TMatrix wiprof = ICCStore::getInstance()->workingSpaceInverseMatrix(params->icm.workingProfile);
+    const float wip[3][3] = {
+        {static_cast<float> (wiprof[0][0]), static_cast<float> (wiprof[0][1]), static_cast<float> (wiprof[0][2])},
+        {static_cast<float> (wiprof[1][0]), static_cast<float> (wiprof[1][1]), static_cast<float> (wiprof[1][2])},
+        {static_cast<float> (wiprof[2][0]), static_cast<float> (wiprof[2][1]), static_cast<float> (wiprof[2][2])}
+    };
 
-#ifdef _OPENMP
-    #pragma omp parallel for
-#endif
-    for (int y = 0; y < working.getHeight(); ++y) {
-        for (int x = 0; x < working.getWidth(); ++x) {
-            working.r(y, x) = sl(blend, working.r(y, x));
-            working.g(y, x) = sl(blend, working.g(y, x));
-            working.b(y, x) = sl(blend, working.b(y, x));
+#ifdef __SSE2__
+    vfloat wipv[3][3];
+
+    for (int i = 0; i < 3; i++) {
+        for (int j = 0; j < 3; j++) {
+            wipv[i][j] = F2V(wiprof[i][j]);
+        }
+    }
+
+    vfloat wpv[3][3];
+
+    for (int i = 0; i < 3; i++) {
+        for (int j = 0; j < 3; j++) {
+            wpv[i][j] = F2V(wprof[i][j]);
+        }
+    }
+#endif
+
+#ifdef _OPENMP
+    #pragma omp parallel
+#endif
+    {
+        const float blend = params->softlight.strength / 100.f;
+#ifdef __SSE2__
+        const vfloat blendv = F2V(blend);
+#endif
+#ifdef _OPENMP
+        #pragma omp for schedule(dynamic,16)
+#endif
+        for (int i = 0; i < lab->H; ++i) {
+            int j = 0;
+#ifdef __SSE2__
+            for (; j < lab->W - 3; j += 4) {
+                vfloat Xv, Yv, Zv;
+                vfloat Rv, Gv, Bv;
+                Color::Lab2XYZ(LVFU(lab->L[i][j]),LVFU (lab->a[i][j]),LVFU (lab->b[i][j]), Xv, Yv, Zv);
+                Color::xyz2rgb(Xv, Yv, Zv, Rv, Gv, Bv, wipv);
+                Rv = sl(blendv, Rv);
+                Gv = sl(blendv, Gv);
+                Bv = sl(blendv, Bv);
+                Color::rgbxyz(Rv, Gv, Bv, Xv, Yv, Zv, wpv);
+                for (int k = 0; k < 4; ++k) {
+                    Color::XYZ2Lab(Xv[k], Yv[k], Zv[k], lab->L[i][j + k], lab->a[i][j + k], lab->b[i][j+ k]);
+                }
+            }
+#endif
+            for (; j < lab->W; j++) {
+                float X, Y, Z;
+                float R, G, B;
+                Color::Lab2XYZ(lab->L[i][j], lab->a[i][j], lab->b[i][j], X, Y, Z);
+                Color::xyz2rgb(X, Y, Z, R, G, B, wip);
+                R = sl(blend, R);
+                G = sl(blend, G);
+                B = sl(blend, B);
+                Color::rgbxyz(R, G, B, X, Y, Z, wp);
+                Color::XYZ2Lab(X, Y, Z, lab->L[i][j], lab->a[i][j], lab->b[i][j]);
+            }
         }
     }
-    
-    rgb2lab(working, *lab, params->icm.workingProfile);
 }
 
 } // namespace rtengine

From 9a020899a32447be884d0e4b9b3c744cefbe00f2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fl=C3=B6ssie?= <floessie.mail@gmail.com>
Date: Wed, 11 Sep 2019 09:16:06 +0200
Subject: [PATCH 05/15] Minor cleanups

---
 rtengine/iptransform.cc | 95 +++++++++++++++++++++++++----------------
 1 file changed, 59 insertions(+), 36 deletions(-)

diff --git a/rtengine/iptransform.cc b/rtengine/iptransform.cc
index 608fa0337..1e739dac6 100644
--- a/rtengine/iptransform.cc
+++ b/rtengine/iptransform.cc
@@ -16,16 +16,21 @@
  *  You should have received a copy of the GNU General Public License
  *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
  */
-#include "rtengine.h"
-#include "improcfun.h"
-#include "procparams.h"
+#include <array>
+
 #ifdef _OPENMP
 #include <omp.h>
 #endif
+
+#include "improcfun.h"
+
 #include "mytime.h"
+#include "procparams.h"
 #include "rt_math.h"
-#include "sleef.c"
+#include "rtengine.h"
 #include "rtlensfun.h"
+#include "sleef.c"
+
 #define BENCHMARK
 #include "StopWatch.h"
 
@@ -147,7 +152,7 @@ inline void interpolateTransformCubic(rtengine::Imagefloat* src, int xs, int ys,
 }
 #endif
 #ifdef __SSE2__
-inline void interpolateTransformChannelsCubic(const float* const * src, int xs, int ys, float Dx, float Dy, float &dest, float mul)
+inline void interpolateTransformChannelsCubic(const float* const* src, int xs, int ys, float Dx, float Dy, float& dest, float mul)
 {
     constexpr float A = -0.85f;
 
@@ -168,7 +173,7 @@ inline void interpolateTransformChannelsCubic(const float* const * src, int xs,
     dest = mul * vhadd(weight * cv);
 }
 #else
-inline void interpolateTransformChannelsCubic(const float* const * src, int xs, int ys, float Dx, float Dy, float &dest, float mul)
+inline void interpolateTransformChannelsCubic(const float* const* src, int xs, int ys, float Dx, float Dy, float& dest, float mul)
 {
     constexpr float A = -0.85f;
 
@@ -859,29 +864,45 @@ void ImProcFunctions::transformGeneral(bool highQuality, Imagefloat *original, I
     const bool enablePerspective = needsPerspective();
     const bool enableDistortion = needsDistortion();
 
-    const double w2 = (double) oW  / 2.0 - 0.5;
-    const double h2 = (double) oH  / 2.0 - 0.5;
+    const double w2 = static_cast<double>(oW)  / 2.0 - 0.5;
+    const double h2 = static_cast<double>(oH)  / 2.0 - 0.5;
 
     double vig_w2, vig_h2, maxRadius, v, b, mul;
     calcVignettingParams(oW, oH, params->vignetting, vig_w2, vig_h2, maxRadius, v, b, mul);
 
-    struct grad_params gp;
+    grad_params gp;
 
     if (enableGradient) {
         calcGradientParams(oW, oH, params->gradient, gp);
     }
 
-    struct pcv_params pcv;
+    pcv_params pcv;
 
     if (enablePCVignetting) {
         calcPCVignetteParams(fW, fH, oW, oH, params->pcvignette, params->crop, pcv);
     }
 
-    float** chOrig[3] = {original->r.ptrs, original->g.ptrs, original->b.ptrs};
-    float** chTrans[3] = {transformed->r.ptrs, transformed->g.ptrs, transformed->b.ptrs};
+    const std::array<const float* const*, 3> chOrig = {
+        original->r.ptrs,
+        original->g.ptrs,
+        original->b.ptrs
+    };
+    const std::array<float* const*, 3> chTrans = {
+        transformed->r.ptrs,
+        transformed->g.ptrs,
+        transformed->b.ptrs
+    };
 
     // auxiliary variables for c/a correction
-    const double chDist[3] = {enableCA ? params->cacorrection.red : 0.0, 0.0, enableCA ? params->cacorrection.blue : 0.0};
+    const std::array<double, 3> chDist = {
+        enableCA
+            ? params->cacorrection.red
+            : 0.0,
+        0.0,
+        enableCA
+            ? params->cacorrection.blue
+            : 0.0
+    };
 
     // auxiliary variables for distortion correction
     const double distAmount = params->distortion.amount;
@@ -893,16 +914,18 @@ void ImProcFunctions::transformGeneral(bool highQuality, Imagefloat *original, I
     // auxiliary variables for vertical perspective correction
     const double vpdeg = params->perspective.vertical / 100.0 * 45.0;
     const double vpalpha = (90.0 - vpdeg) / 180.0 * rtengine::RT_PI;
-    const double vpteta  = fabs(vpalpha - rtengine::RT_PI / 2) < 3e-4 ? 0.0 : acos((vpdeg > 0 ? 1.0 : -1.0) * sqrt((-SQR(oW * tan(vpalpha)) + (vpdeg > 0 ? 1.0 : -1.0) *
-                           oW * tan(vpalpha) * sqrt(SQR(4 * maxRadius) + SQR(oW * tan(vpalpha)))) / (SQR(maxRadius) * 8)));
-    const double vpcospt = (vpdeg >= 0 ? 1.0 : -1.0) * cos(vpteta), vptanpt = tan(vpteta);
+    const double vpteta = fabs(vpalpha - rtengine::RT_PI / 2) < 3e-4 ? 0.0 : acos((vpdeg > 0 ? 1.0 : -1.0) * sqrt((-SQR(oW * tan(vpalpha)) + (vpdeg > 0 ? 1.0 : -1.0) *
+                          oW * tan(vpalpha) * sqrt(SQR(4 * maxRadius) + SQR(oW * tan(vpalpha)))) / (SQR(maxRadius) * 8)));
+    const double vpcospt = (vpdeg >= 0 ? 1.0 : -1.0) * cos(vpteta);
+    const double vptanpt = tan(vpteta);
 
     // auxiliary variables for horizontal perspective correction
     const double hpdeg = params->perspective.horizontal / 100.0 * 45.0;
     const double hpalpha = (90.0 - hpdeg) / 180.0 * rtengine::RT_PI;
-    const double hpteta  = fabs(hpalpha - rtengine::RT_PI / 2) < 3e-4 ? 0.0 : acos((hpdeg > 0 ? 1.0 : -1.0) * sqrt((-SQR(oH * tan(hpalpha)) + (hpdeg > 0 ? 1.0 : -1.0) *
-                           oH * tan(hpalpha) * sqrt(SQR(4 * maxRadius) + SQR(oH * tan(hpalpha)))) / (SQR(maxRadius) * 8)));
-    const double hpcospt = (hpdeg >= 0 ? 1.0 : -1.0) * cos(hpteta), hptanpt = tan(hpteta);
+    const double hpteta = fabs(hpalpha - rtengine::RT_PI / 2) < 3e-4 ? 0.0 : acos((hpdeg > 0 ? 1.0 : -1.0) * sqrt((-SQR(oH * tan(hpalpha)) + (hpdeg > 0 ? 1.0 : -1.0) *
+                          oH * tan(hpalpha) * sqrt(SQR(4 * maxRadius) + SQR(oH * tan(hpalpha)))) / (SQR(maxRadius) * 8)));
+    const double hpcospt = (hpdeg >= 0 ? 1.0 : -1.0) * cos(hpteta);
+    const double hptanpt = tan(hpteta);
 
     const double ascale = params->commonTrans.autofill ? getTransformAutoFill(oW, oH, pLCPMap) : 1.0;
 
@@ -915,9 +938,10 @@ void ImProcFunctions::transformGeneral(bool highQuality, Imagefloat *original, I
     #pragma omp parallel for schedule(dynamic, 16) if(multiThread)
 #endif
 
-    for (int y = 0; y < transformed->getHeight(); y++) {
-        for (int x = 0; x < transformed->getWidth(); x++) {
-            double x_d = x, y_d = y;
+    for (int y = 0; y < transformed->getHeight(); ++y) {
+        for (int x = 0; x < transformed->getWidth(); ++x) {
+            double x_d = x;
+            double y_d = y;
 
             if (enableLCPDist) {
                 pLCPMap->correctDistortion(x_d, y_d, cx, cy, ascale); // must be first transform
@@ -944,14 +968,14 @@ void ImProcFunctions::transformGeneral(bool highQuality, Imagefloat *original, I
             const double Dyc = x_d * sint + y_d * cost;
 
             // distortion correction
-            double s = 1;
+            double s = 1.0;
 
             if (enableDistortion) {
-                double r = sqrt(Dxc * Dxc + Dyc * Dyc) / maxRadius;
+                const double r = sqrt(Dxc * Dxc + Dyc * Dyc) / maxRadius;
                 s = 1.0 - distAmount + distAmount * r;
             }
 
-            for (int c = 0; c < (enableCA ? 3 : 1); c++) {
+            for (int c = 0; c < (enableCA ? 3 : 1); ++c) {
                 double Dx = Dxc * (s + chDist[c]);
                 double Dy = Dyc * (s + chDist[c]);
 
@@ -960,22 +984,21 @@ void ImProcFunctions::transformGeneral(bool highQuality, Imagefloat *original, I
                 Dy += h2;
 
                 // Extract integer and fractions of source screen coordinates
-                int xc = (int)Dx;
-                Dx -= (double)xc;
+                int xc = Dx;
+                Dx -= xc;
                 xc -= sx;
-                int yc = (int)Dy;
-                Dy -= (double)yc;
+                int yc = Dy;
+                Dy -= yc;
                 yc -= sy;
 
                 // Convert only valid pixels
                 if (yc >= 0 && yc < original->getHeight() && xc >= 0 && xc < original->getWidth()) {
-
                     // multiplier for vignetting correction
                     double vignmul = 1.0;
 
                     if (enableVignetting) {
-                        const double vig_x_d = ascale * (x + cx - vig_w2);       // centering x coord & scale
-                        const double vig_y_d = ascale * (y + cy - vig_h2);       // centering y coord & scale
+                        const double vig_x_d = ascale * (x + cx - vig_w2); // centering x coord & scale
+                        const double vig_y_d = ascale * (y + cy - vig_h2); // centering y coord & scale
                         const double vig_Dx = vig_x_d * cost - vig_y_d * sint;
                         const double vig_Dy = vig_x_d * sint + vig_y_d * cost;
                         const double r2 = sqrt(vig_Dx * vig_Dx + vig_Dy * vig_Dy);
@@ -1007,10 +1030,10 @@ void ImProcFunctions::transformGeneral(bool highQuality, Imagefloat *original, I
                         }
                     } else {
                         // edge pixels
-                        int y1 = LIM(yc,   0, original->getHeight() - 1);
-                        int y2 = LIM(yc + 1, 0, original->getHeight() - 1);
-                        int x1 = LIM(xc,   0, original->getWidth() - 1);
-                        int x2 = LIM(xc + 1, 0, original->getWidth() - 1);
+                        const int y1 = LIM(yc, 0, original->getHeight() - 1);
+                        const int y2 = LIM(yc + 1, 0, original->getHeight() - 1);
+                        const int x1 = LIM(xc, 0, original->getWidth() - 1);
+                        const int x2 = LIM(xc + 1, 0, original->getWidth() - 1);
 
                         if (enableCA) {
                             chTrans[c][y][x] = vignmul * (chOrig[c][y1][x1] * (1.0 - Dx) * (1.0 - Dy) + chOrig[c][y1][x2] * Dx * (1.0 - Dy) + chOrig[c][y2][x1] * (1.0 - Dx) * Dy + chOrig[c][y2][x2] * Dx * Dy);

From 4079bb9920be8cf6c568c1b0f91bfbe3a2544b30 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Wed, 11 Sep 2019 18:56:07 +0200
Subject: [PATCH 06/15] Capture Sharpening: automatic radius calculation

---
 rtdata/languages/default      |   2 +
 rtengine/capturesharpening.cc | 307 ++++++++++++++++++++++++++++++++--
 rtengine/imagesource.h        |   2 +-
 rtengine/improccoordinator.cc |   7 +-
 rtengine/improccoordinator.h  |   6 +
 rtengine/procparams.cc        |   4 +
 rtengine/procparams.h         |   1 +
 rtengine/rawimagesource.h     |   2 +-
 rtengine/refreshmap.cc        |   2 +-
 rtengine/rt_algo.cc           |  23 ++-
 rtengine/rt_algo.h            |   2 +-
 rtengine/rtengine.h           |   8 +
 rtengine/simpleprocess.cc     |   2 +-
 rtengine/stdimagesource.h     |   2 +-
 rtgui/paramsedited.cc         |   8 +-
 rtgui/paramsedited.h          |   1 +
 rtgui/pdsharpening.cc         |  74 ++++++--
 rtgui/pdsharpening.h          |   5 +-
 rtgui/toolpanelcoord.cc       |   1 +
 rtgui/toolpanelcoord.h        |   1 -
 20 files changed, 416 insertions(+), 44 deletions(-)

diff --git a/rtdata/languages/default b/rtdata/languages/default
index 5bf1863df..3749a706a 100644
--- a/rtdata/languages/default
+++ b/rtdata/languages/default
@@ -766,6 +766,7 @@ HISTORY_MSG_METADATA_MODE;Metadata copy mode
 HISTORY_MSG_MICROCONTRAST_CONTRAST;Microcontrast - Contrast threshold
 HISTORY_MSG_PDSHARPEN_CONTRAST;CAS - Contrast threshold
 HISTORY_MSG_PDSHARPEN_AUTO_CONTRAST;CAS - Auto threshold
+HISTORY_MSG_PDSHARPEN_AUTO_RADIUS;CAS - Auto radius
 HISTORY_MSG_PDSHARPEN_GAMMA;CAS - Gamma
 HISTORY_MSG_PDSHARPEN_ITERATIONS;CAS - Iterations
 HISTORY_MSG_PDSHARPEN_RADIUS;CAS - Radius
@@ -1800,6 +1801,7 @@ TP_PCVIGNETTE_ROUNDNESS_TOOLTIP;Roundness:\n0 = rectangle,\n50 = fitted ellipse,
 TP_PCVIGNETTE_STRENGTH;Strength
 TP_PCVIGNETTE_STRENGTH_TOOLTIP;Filter strength in stops (reached in corners).
 TP_PDSHARPENING_LABEL;Capture Sharpening
+TP_PDSHARPENING_AUTORADIUS_TOOLTIP;If the checkbox is checked, RawTherapee calculates a value based on the raw data of the image.
 TP_PERSPECTIVE_HORIZONTAL;Horizontal
 TP_PERSPECTIVE_LABEL;Perspective
 TP_PERSPECTIVE_VERTICAL;Vertical
diff --git a/rtengine/capturesharpening.cc b/rtengine/capturesharpening.cc
index eb95d2633..e741016db 100644
--- a/rtengine/capturesharpening.cc
+++ b/rtengine/capturesharpening.cc
@@ -37,6 +37,243 @@
 #include "../rtgui/multilangmgr.h"
 
 namespace {
+
+void buildClipMaskBayer(const float * const *rawData, int W, int H, float** clipMask, const float whites[2][2])
+{
+
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(dynamic, 16)
+#endif
+    for (int row = 0; row < H; ++row) {
+        for (int col = 0; col < W; ++col) {
+            clipMask[row][col] = 1.f;
+        }
+    }
+
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(dynamic, 16)
+#endif
+    for (int row = 2; row < H - 2; ++row) {
+        float clip0 = whites[row & 1][0];
+        float clip1 = whites[row & 1][1];
+        for (int col = 2; col < W - 2; ++col) {
+            if (rawData[row][col] >= clip0) {
+                clipMask[row - 2][col - 1] = clipMask[row - 2][col] = clipMask[row - 2][col + 1] = 0.f;
+                clipMask[row - 1][col - 2] = clipMask[row - 1][col - 1] = clipMask[row - 1][col] = clipMask[row - 1][col + 1] = clipMask[row - 1][col + 2] = 0.f;
+                clipMask[row][col - 2] = clipMask[row][col - 1] = clipMask[row][col] = clipMask[row][col + 1] = clipMask[row][col + 2] = 0.f;
+                clipMask[row + 1][col - 2] = clipMask[row + 1][col - 1] = clipMask[row + 1][col] = clipMask[row + 1][col + 1] = clipMask[row + 1][col + 2] = 0.f;
+                clipMask[row + 2][col - 1] = clipMask[row + 2][col] = clipMask[row + 2][col + 1] = 0.f;
+            }
+            std::swap(clip0, clip1);
+        }
+    }
+}
+
+void buildClipMaskXtrans(const float * const *rawData, int W, int H, float** clipMask, const float whites[6][6])
+{
+
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(dynamic, 16)
+#endif
+    for (int row = 0; row < H; ++row) {
+        for (int col = 0; col < W; ++col) {
+            clipMask[row][col] = 1.f;
+        }
+    }
+
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(dynamic, 16)
+#endif
+    for (int row = 2; row < H - 2; ++row) {
+        for (int col = 2; col < W - 2; ++col) {
+            const float clip = whites[row % 6][col % 6];
+            if (rawData[row][col] >= clip) {
+                clipMask[row - 2][col - 1] = clipMask[row - 2][col] = clipMask[row - 2][col + 1] = 0.f;
+                clipMask[row - 1][col - 2] = clipMask[row - 1][col - 1] = clipMask[row - 1][col] = clipMask[row - 1][col + 1] = clipMask[row - 1][col + 2] = 0.f;
+                clipMask[row][col - 2] = clipMask[row][col - 1] = clipMask[row][col] = clipMask[row][col + 1] = clipMask[row][col + 2] = 0.f;
+                clipMask[row + 1][col - 2] = clipMask[row + 1][col - 1] = clipMask[row + 1][col] = clipMask[row + 1][col + 1] = clipMask[row + 1][col + 2] = 0.f;
+                clipMask[row + 2][col - 1] = clipMask[row + 2][col] = clipMask[row + 2][col + 1] = 0.f;
+            }
+        }
+    }
+}
+
+void buildClipMaskMono(const float * const *rawData, int W, int H, float** clipMask, float white)
+{
+
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(dynamic, 16)
+#endif
+    for (int row = 0; row < H; ++row) {
+        for (int col = 0; col < W; ++col) {
+            clipMask[row][col] = 1.f;
+        }
+    }
+
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(dynamic, 16)
+#endif
+    for (int row = 2; row < H - 2; ++row) {
+        for (int col = 2; col < W - 2; ++col) {
+            if (rawData[row][col] >= white) {
+                clipMask[row - 2][col - 1] = clipMask[row - 2][col] = clipMask[row - 2][col + 1] = 0.f;
+                clipMask[row - 1][col - 2] = clipMask[row - 1][col - 1] = clipMask[row - 1][col] = clipMask[row - 1][col + 1] = clipMask[row - 1][col + 2] = 0.f;
+                clipMask[row][col - 2] = clipMask[row][col - 1] = clipMask[row][col] = clipMask[row][col + 1] = clipMask[row][col + 2] = 0.f;
+                clipMask[row + 1][col - 2] = clipMask[row + 1][col - 1] = clipMask[row + 1][col] = clipMask[row + 1][col + 1] = clipMask[row + 1][col + 2] = 0.f;
+                clipMask[row + 2][col - 1] = clipMask[row + 2][col] = clipMask[row + 2][col + 1] = 0.f;
+            }
+        }
+    }
+}
+
+float calcRadiusBayer(const float * const *rawData, int W, int H, float lowerLimit, float upperLimit, const unsigned int fc[2])
+{
+
+    float maxRatio = 1.f;
+#ifdef _OPENMP
+    #pragma omp parallel for reduction(max:maxRatio) schedule(dynamic, 16)
+#endif
+    for (int row = 4; row < H - 4; ++row) {
+        for (int col = 5 + (fc[row & 1] & 1); col < W - 4; col += 2) {
+            const float val00 = rawData[row][col];
+            if (val00 > 0.f) {
+                const float val1m1 = rawData[row + 1][col - 1];
+                const float val1p1 = rawData[row + 1][col + 1];
+                const float maxVal0 = std::max(val00, val1m1);
+                if (val1m1 > 0.f && maxVal0 > lowerLimit) {
+                    const float minVal = std::min(val00, val1m1);
+                    if (UNLIKELY(maxVal0 > maxRatio * minVal)) {
+                        bool clipped = false;
+                        if (maxVal0 == val00) { // check for influence by clipped green in neighborhood
+                            if (rtengine::max(rawData[row - 1][col - 1], rawData[row - 1][col + 1], val1p1) >= upperLimit) {
+                                clipped = true;
+                            }
+                        } else { // check for influence by clipped green in neighborhood
+                            if (rtengine::max(rawData[row][col - 2], val00, rawData[row + 2][col - 2], rawData[row + 2][col]) >= upperLimit) {
+                                clipped = true;
+                            }
+                        }
+                        if (!clipped) {
+                            maxRatio = maxVal0 / minVal;
+                        }
+                    }
+                }
+                const float maxVal1 = std::max(val00, val1p1);
+                if (val1p1 > 0.f && maxVal1 > lowerLimit) {
+                    const float minVal = std::min(val00, val1p1);
+                    if (UNLIKELY(maxVal1 > maxRatio * minVal)) {
+                        if (maxVal1 == val00) { // check for influence by clipped green in neighborhood
+                            if (rtengine::max(rawData[row - 1][col - 1], rawData[row - 1][col + 1], val1p1) >= upperLimit) {
+                                continue;
+                            }
+                        } else { // check for influence by clipped green in neighborhood
+                            if (rtengine::max(val00, rawData[row][col + 2], rawData[row + 2][col], rawData[row + 2][col + 2]) >= upperLimit) {
+                                continue;
+                            }
+                        }
+                        maxRatio = maxVal1 / minVal;
+                    }
+                }
+            }
+        }
+    }
+    return std::sqrt((1.f / (std::log(1.f / maxRatio) /  2.f)) / -2.f);
+}
+
+float calcRadiusXtrans(const float * const *rawData, int W, int H, float lowerLimit, float upperLimit, unsigned int starty, unsigned int startx)
+{
+
+    float maxRatio = 1.f;
+#ifdef _OPENMP
+    #pragma omp parallel for reduction(max:maxRatio) schedule(dynamic, 16)
+#endif
+    for (int row = starty + 3; row < H - 4; row += 3) {
+        for (int col = startx + 3; col < W - 4; col += 3) {
+            const float valtl = rawData[row][col];
+            const float valtr = rawData[row][col + 1];
+            const float valbl = rawData[row + 1][col];
+            const float valbr = rawData[row + 1][col + 1];
+            if (valtl > 1.f) {
+                const float maxValtltr = std::max(valtl, valtr);
+                if (valtr > 1.f && maxValtltr > lowerLimit) {
+                    const float minVal = std::min(valtl, valtr);
+                    if (UNLIKELY(maxValtltr > maxRatio * minVal)) {
+                        bool clipped = false;
+                        if (maxValtltr == valtl) { // check for influence by clipped green in neighborhood
+                            if (rtengine::max(rawData[row - 1][col - 1], valtr, valbl, valbr) >= upperLimit) {
+                                clipped = true;
+                            }
+                        } else { // check for influence by clipped green in neighborhood
+                            if (rtengine::max(rawData[row - 1][col + 2], valtl, valbl, valbr) >= upperLimit) {
+                                clipped = true;
+                            }
+                        }
+                        if (!clipped) {
+                            maxRatio = maxValtltr / minVal;
+                        }
+                    }
+                }
+                const float maxValtlbl = std::max(valtl, valbl);
+                if (valbl > 1.f && maxValtlbl > lowerLimit) {
+                    const float minVal = std::min(valtl, valbl);
+                    if (UNLIKELY(maxValtlbl > maxRatio * minVal)) {
+                        bool clipped = false;
+                        if (maxValtlbl == valtl) { // check for influence by clipped green in neighborhood
+                            if (rtengine::max(rawData[row - 1][col - 1], valtr, valbl, valbr) >= upperLimit) {
+                                clipped = true;
+                            }
+                        } else { // check for influence by clipped green in neighborhood
+                            if (rtengine::max(valtl, valtr, rawData[row + 2][col - 1], valbr) >= upperLimit) {
+                                clipped = true;
+                            }
+                        }
+                        if (!clipped) {
+                            maxRatio = maxValtlbl / minVal;
+                        }
+                    }
+                }
+            }
+            if (valbr > 1.f) {
+                const float maxValblbr = std::max(valbl, valbr);
+                if (valbl > 1.f && maxValblbr > lowerLimit) {
+                    const float minVal = std::min(valbl, valbr);
+                    if (UNLIKELY(maxValblbr > maxRatio * minVal)) {
+                        bool clipped = false;
+                        if (maxValblbr == valbr) { // check for influence by clipped green in neighborhood
+                            if (rtengine::max(valtl, valtr, valbl, rawData[row + 2][col + 2]) >= upperLimit) {
+                                clipped = true;
+                            }
+                        } else { // check for influence by clipped green in neighborhood
+                            if (rtengine::max(valtl, valtr, rawData[row + 2][col - 1], valbr) >= upperLimit) {
+                                clipped = true;
+                            }
+                        }
+                        if (!clipped) {
+                            maxRatio = maxValblbr / minVal;
+                        }
+                    }
+                }
+                const float maxValtrbr = std::max(valtr, valbr);
+                if (valtr > 1.f && maxValtrbr > lowerLimit) {
+                    const float minVal = std::min(valtr, valbr);
+                    if (UNLIKELY(maxValtrbr > maxRatio * minVal)) {
+                        if (maxValtrbr == valbr) { // check for influence by clipped green in neighborhood
+                            if (rtengine::max(valtl, valtr, valbl, rawData[row + 2][col + 2]) >= upperLimit) {
+                                continue;
+                            }
+                        } else { // check for influence by clipped green in neighborhood
+                            if (rtengine::max(rawData[row - 1][col + 2], valtl, valbl, valbr) >= upperLimit) {
+                                continue;
+                            }
+                        }
+                        maxRatio = maxValtrbr / minVal;
+                    }
+                }
+            }
+        }
+    }
+    return std::sqrt((1.f / (std::log(1.f / maxRatio))) / -2.f);
+}
 void CaptureDeconvSharpening (float** luminance, float** tmp, const float * const * blend, int W, int H, double sigma, int iterations, rtengine::ProgressListener* plistener, double start, double step)
 {
 
@@ -54,7 +291,7 @@ void CaptureDeconvSharpening (float** luminance, float** tmp, const float * cons
                 tmpI[i][j] = max(luminance[i][j], 0.f);
             }
         }
-        
+
         for (int k = 0; k < iterations; k++) {
             // apply gaussian blur and divide luminance by result of gaussian blur
             gaussianBlur(tmpI, tmp, W, H, sigma, nullptr, GAUSS_DIV, luminance);
@@ -85,9 +322,7 @@ void CaptureDeconvSharpening (float** luminance, float** tmp, const float * cons
 namespace rtengine
 {
 
-void RawImageSource::captureSharpening(const procparams::CaptureSharpeningParams &sharpeningParams, bool showMask, double &conrastThreshold) {
-BENCHFUN
-
+void RawImageSource::captureSharpening(const procparams::CaptureSharpeningParams &sharpeningParams, bool showMask, double &conrastThreshold, double &radius) {
 
     if (plistener) {
         plistener->setProgressStr(M("TP_PDSHARPENING_LABEL"));
@@ -102,12 +337,62 @@ BENCHFUN
 
     float contrast = conrastThreshold / 100.f;
 
+    const float clipVal = (ri->get_white(1) - ri->get_cblack(1)) * scale_mul[1];
+
     array2D<float>& redVals = redCache ? *redCache : red;
     array2D<float>& greenVals = greenCache ? *greenCache : green;
     array2D<float>& blueVals = blueCache ? *blueCache : blue;
 
+    array2D<float> clipMask(W, H);
+    constexpr float clipLimit = 0.95f;
+    if (ri->getSensorType() == ST_BAYER) {
+        const float whites[2][2] = {
+                                    {(ri->get_white(FC(0,0)) - c_black[FC(0,0)]) * scale_mul[FC(0,0)] * clipLimit, (ri->get_white(FC(0,1)) - c_black[FC(0,1)]) * scale_mul[FC(0,1)] * clipLimit},
+                                    {(ri->get_white(FC(1,0)) - c_black[FC(1,0)]) * scale_mul[FC(1,0)] * clipLimit, (ri->get_white(FC(1,1)) - c_black[FC(1,1)]) * scale_mul[FC(1,1)] * clipLimit}
+                                   };
+        buildClipMaskBayer(rawData, W, H, clipMask, whites);
+        const unsigned int fc[2] = {FC(0,0), FC(1,0)};
+        if (sharpeningParams.autoRadius) {
+            radius = calcRadiusBayer(rawData, W, H, 1000.f, clipVal, fc);
+        }
+    } else if (ri->getSensorType() == ST_FUJI_XTRANS) {
+        float whites[6][6];
+        for (int i = 0; i < 6; ++i) {
+            for (int j = 0; j < 6; ++j) {
+                const auto color = ri->XTRANSFC(i, j);
+                whites[i][j] = (ri->get_white(color) - c_black[color]) * scale_mul[color] * clipLimit;
+            }
+        }
+        buildClipMaskXtrans(rawData, W, H, clipMask, whites);
+        bool found = false;
+        int i, j;
+        for (i = 6; i < 12 && !found; ++i) {
+            for (j = 6; j < 12 && !found; ++j) {
+                if (ri->XTRANSFC(i, j) == 1) {
+                    if (ri->XTRANSFC(i, j - 1) != ri->XTRANSFC(i, j + 1)) {
+                        if (ri->XTRANSFC(i - 1, j) != 1) {
+                            if (ri->XTRANSFC(i, j - 1) != 1) {
+                                found = true;
+                                break;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        if (sharpeningParams.autoRadius) {
+            radius = calcRadiusXtrans(rawData, W, H, 1000.f, clipVal, i, j);
+        }
+
+    } else if (ri->get_colors() == 1) {
+        buildClipMaskMono(rawData, W, H, clipMask, (ri->get_white(0) - c_black[0]) * scale_mul[0] * clipLimit);
+        if (sharpeningParams.autoRadius) {
+            const unsigned int fc[2] = {0, 0};
+            radius = calcRadiusBayer(rawData, W, H, 1000.f, clipVal, fc);
+        }
+    }
+
     if (showMask) {
-        StopWatch Stop1("Show mask");
         array2D<float>& L = blue; // blue will be overridden anyway => we can use its buffer to store L
 #ifdef _OPENMP
         #pragma omp parallel for
@@ -119,8 +404,9 @@ BENCHFUN
         if (plistener) {
             plistener->setProgress(0.1);
         }
+
         array2D<float>& blend = red; // red will be overridden anyway => we can use its buffer to store the blend mask
-        buildBlendMask(L, blend, W, H, contrast, 1.f, sharpeningParams.autoContrast);
+        buildBlendMask(L, blend, W, H, contrast, 1.f, sharpeningParams.autoContrast, clipMask);
         if (plistener) {
             plistener->setProgress(0.2);
         }
@@ -157,7 +443,6 @@ BENCHFUN
     array2D<float>& YOld = YOldbuffer ? * YOldbuffer : green;
     array2D<float>& YNew = YNewbuffer ? * YNewbuffer : blue;
 
-    StopWatch Stop1("rgb2YL");
 #ifdef _OPENMP
     #pragma omp parallel for schedule(dynamic, 16)
 #endif
@@ -170,19 +455,17 @@ BENCHFUN
     }
     // calculate contrast based blend factors to reduce sharpening in regions with low contrast
     JaggedArray<float> blend(W, H);
-    buildBlendMask(L, blend, W, H, contrast, 1.f, sharpeningParams.autoContrast);
+    buildBlendMask(L, blend, W, H, contrast, 1.f, sharpeningParams.autoContrast, clipMask);
     if (plistener) {
         plistener->setProgress(0.2);
     }
     conrastThreshold = contrast * 100.f;
 
-    Stop1.stop();
     array2D<float>& tmp = L; // L is not used anymore now => we can use its buffer as the needed temporary buffer
-    CaptureDeconvSharpening(YNew, tmp, blend, W, H, sharpeningParams.deconvradius, sharpeningParams.deconviter, plistener, 0.2, (0.9 - 0.2) / sharpeningParams.deconviter);
+    CaptureDeconvSharpening(YNew, tmp, blend, W, H, radius, sharpeningParams.deconviter, plistener, 0.2, (0.9 - 0.2) / sharpeningParams.deconviter);
     if (plistener) {
         plistener->setProgress(0.9);
     }
-    StopWatch Stop2("Y2RGB");
     const float gamma = sharpeningParams.gamma;
 #ifdef _OPENMP
     #pragma omp parallel for schedule(dynamic, 16)
@@ -206,7 +489,7 @@ BENCHFUN
             blue[i][j] = blueVals[i][j] * factor;
         }
     }
-    Stop2.stop();
+
     delete Lbuffer;
     delete YOldbuffer;
     delete YNewbuffer;
diff --git a/rtengine/imagesource.h b/rtengine/imagesource.h
index c62dfe7d8..edc1102c4 100644
--- a/rtengine/imagesource.h
+++ b/rtengine/imagesource.h
@@ -182,7 +182,7 @@ public:
         return this;
     }
     virtual void getRawValues(int x, int y, int rotate, int &R, int &G, int &B) = 0;
-    virtual void captureSharpening(const procparams::CaptureSharpeningParams &sharpeningParams, bool showMask, double &conrastThreshold) = 0;
+    virtual void captureSharpening(const procparams::CaptureSharpeningParams &sharpeningParams, bool showMask, double &conrastThreshold, double &radius) = 0;
 };
 
 }
diff --git a/rtengine/improccoordinator.cc b/rtengine/improccoordinator.cc
index c6268576a..8b9e49124 100644
--- a/rtengine/improccoordinator.cc
+++ b/rtengine/improccoordinator.cc
@@ -122,6 +122,7 @@ ImProcCoordinator::ImProcCoordinator() :
     bayerAutoContrastListener(nullptr),
     xtransAutoContrastListener(nullptr),
     pdSharpenAutoContrastListener(nullptr),
+    pdSharpenAutoRadiusListener(nullptr),
     frameCountListener(nullptr),
     imageTypeListener(nullptr),
     actListener(nullptr),
@@ -346,10 +347,14 @@ void ImProcCoordinator::updatePreviewImage(int todo, bool panningRelatedChange)
 
         if ((todo & (M_RAW | M_CSHARP)) && params->pdsharpening.enabled) {
             double pdSharpencontrastThreshold = params->pdsharpening.contrast;
-            imgsrc->captureSharpening(params->pdsharpening, sharpMask, pdSharpencontrastThreshold);
+            double pdSharpenRadius = params->pdsharpening.deconvradius;
+            imgsrc->captureSharpening(params->pdsharpening, sharpMask, pdSharpencontrastThreshold, pdSharpenRadius);
             if (pdSharpenAutoContrastListener && params->pdsharpening.autoContrast) {
                 pdSharpenAutoContrastListener->autoContrastChanged(pdSharpencontrastThreshold);
             }
+            if (pdSharpenAutoRadiusListener && params->pdsharpening.autoRadius) {
+                pdSharpenAutoRadiusListener->autoRadiusChanged(pdSharpenRadius);
+            }
         }
 
 
diff --git a/rtengine/improccoordinator.h b/rtengine/improccoordinator.h
index df4ec8134..379a3fb20 100644
--- a/rtengine/improccoordinator.h
+++ b/rtengine/improccoordinator.h
@@ -162,6 +162,7 @@ protected:
     AutoContrastListener *bayerAutoContrastListener;
     AutoContrastListener *xtransAutoContrastListener;
     AutoContrastListener *pdSharpenAutoContrastListener;
+    AutoRadiusListener *pdSharpenAutoRadiusListener;
     FrameCountListener *frameCountListener;
     ImageTypeListener *imageTypeListener;
 
@@ -364,6 +365,11 @@ public:
         xtransAutoContrastListener = acl;
     }
 
+    void setpdSharpenAutoRadiusListener  (AutoRadiusListener* acl) override
+    {
+        pdSharpenAutoRadiusListener = acl;
+    }
+
     void setpdSharpenAutoContrastListener  (AutoContrastListener* acl) override
     {
         pdSharpenAutoContrastListener = acl;
diff --git a/rtengine/procparams.cc b/rtengine/procparams.cc
index 2fcc5d358..f88220c4e 100644
--- a/rtengine/procparams.cc
+++ b/rtengine/procparams.cc
@@ -1152,6 +1152,7 @@ bool SharpeningParams::operator !=(const SharpeningParams& other) const
 CaptureSharpeningParams::CaptureSharpeningParams() :
     enabled(false),
     autoContrast(true),
+    autoRadius(true),
     contrast(10.0),
     gamma(1.00),
     deconvradius(0.75),
@@ -1166,6 +1167,7 @@ bool CaptureSharpeningParams::operator ==(const CaptureSharpeningParams& other)
         && contrast == other.contrast
         && gamma == other.gamma
         && autoContrast == other.autoContrast
+        && autoRadius == other.autoRadius
         && deconvradius == other.deconvradius
         && deconviter == other.deconviter;
 }
@@ -3370,6 +3372,7 @@ int ProcParams::save(const Glib::ustring& fname, const Glib::ustring& fname2, bo
         saveToKeyfile(!pedited || pedited->pdsharpening.enabled, "PostDemosaicSharpening", "Enabled", pdsharpening.enabled, keyFile);
         saveToKeyfile(!pedited || pedited->pdsharpening.contrast, "PostDemosaicSharpening", "Contrast", pdsharpening.contrast, keyFile);
         saveToKeyfile(!pedited || pedited->pdsharpening.autoContrast, "PostDemosaicSharpening", "AutoContrast", pdsharpening.autoContrast, keyFile);
+        saveToKeyfile(!pedited || pedited->pdsharpening.autoRadius, "PostDemosaicSharpening", "AutoRadius", pdsharpening.autoRadius, keyFile);
         saveToKeyfile(!pedited || pedited->pdsharpening.gamma, "PostDemosaicSharpening", "DeconvGamma", pdsharpening.gamma, keyFile);
         saveToKeyfile(!pedited || pedited->pdsharpening.deconvradius, "PostDemosaicSharpening", "DeconvRadius", pdsharpening.deconvradius, keyFile);
         saveToKeyfile(!pedited || pedited->pdsharpening.deconviter, "PostDemosaicSharpening", "DeconvIterations", pdsharpening.deconviter, keyFile);
@@ -4458,6 +4461,7 @@ int ProcParams::load(const Glib::ustring& fname, ParamsEdited* pedited)
             assignFromKeyfile(keyFile, "PostDemosaicSharpening", "Enabled", pedited, pdsharpening.enabled, pedited->pdsharpening.enabled);
             assignFromKeyfile(keyFile, "PostDemosaicSharpening", "Contrast", pedited, pdsharpening.contrast, pedited->pdsharpening.contrast);
             assignFromKeyfile(keyFile, "PostDemosaicSharpening", "AutoContrast", pedited, pdsharpening.autoContrast, pedited->pdsharpening.autoContrast);
+            assignFromKeyfile(keyFile, "PostDemosaicSharpening", "AutoRadius", pedited, pdsharpening.autoRadius, pedited->pdsharpening.autoRadius);
 
             assignFromKeyfile(keyFile, "PostDemosaicSharpening", "DeconvGamma", pedited, pdsharpening.gamma, pedited->pdsharpening.gamma);
             assignFromKeyfile(keyFile, "PostDemosaicSharpening", "DeconvRadius", pedited, pdsharpening.deconvradius, pedited->pdsharpening.deconvradius);
diff --git a/rtengine/procparams.h b/rtengine/procparams.h
index e8f846a1d..ce03efc7d 100644
--- a/rtengine/procparams.h
+++ b/rtengine/procparams.h
@@ -545,6 +545,7 @@ struct SharpenMicroParams {
 struct CaptureSharpeningParams {
     bool           enabled;
     bool           autoContrast;
+    bool           autoRadius;
     double         contrast;
     double         gamma;
     double         deconvradius;
diff --git a/rtengine/rawimagesource.h b/rtengine/rawimagesource.h
index e20024f3c..e52adea18 100644
--- a/rtengine/rawimagesource.h
+++ b/rtengine/rawimagesource.h
@@ -311,7 +311,7 @@ protected:
     void    hflip       (Imagefloat* im);
     void    vflip       (Imagefloat* im);
     void getRawValues(int x, int y, int rotate, int &R, int &G, int &B) override;
-    void captureSharpening(const procparams::CaptureSharpeningParams &sharpeningParams, bool showMask, double &conrastThreshold) override;
+    void captureSharpening(const procparams::CaptureSharpeningParams &sharpeningParams, bool showMask, double &conrastThreshold, double &radius) override;
 };
 
 }
diff --git a/rtengine/refreshmap.cc b/rtengine/refreshmap.cc
index db56500b8..6917d856e 100644
--- a/rtengine/refreshmap.cc
+++ b/rtengine/refreshmap.cc
@@ -521,7 +521,7 @@ int refreshmap[rtengine::NUMOFEVENTS] = {
     RGBCURVE,         // EvRGBEnabled
     LUMINANCECURVE,   // EvLEnabled
     DEMOSAIC,         // EvPdShrEnabled
-    ALLNORAW          // EvPdShrMaskToggled
+    CAPTURESHARPEN    // EvPdShrMaskToggled
 
 };
 
diff --git a/rtengine/rt_algo.cc b/rtengine/rt_algo.cc
index 9fdb59652..4afd8ac73 100644
--- a/rtengine/rt_algo.cc
+++ b/rtengine/rt_algo.cc
@@ -156,7 +156,7 @@ float calcContrastThreshold(const float* const * luminance, int tileY, int tileX
         }
     }
 
-    return c / 100.f;
+    return (c + 1) / 100.f;
 }
 }
 
@@ -299,7 +299,7 @@ void findMinMaxPercentile(const float* data, size_t size, float minPrct, float&
     maxOut = rtengine::LIM(maxOut, minVal, maxVal);
 }
 
-void buildBlendMask(const float* const * luminance, float **blend, int W, int H, float &contrastThreshold, float amount, bool autoContrast) {
+void buildBlendMask(const float* const * luminance, float **blend, int W, int H, float &contrastThreshold, float amount, bool autoContrast, float ** clipMask) {
 
     if (autoContrast) {
         constexpr float minLuminance = 2000.f;
@@ -424,11 +424,20 @@ void buildBlendMask(const float* const * luminance, float **blend, int W, int H,
             for(int j = 2; j < H - 2; ++j) {
                 int i = 2;
 #ifdef __SSE2__
-                for(; i < W - 5; i += 4) {
-                    vfloat contrastv = vsqrtf(SQRV(LVFU(luminance[j][i+1]) - LVFU(luminance[j][i-1])) + SQRV(LVFU(luminance[j+1][i]) - LVFU(luminance[j-1][i])) +
-                                              SQRV(LVFU(luminance[j][i+2]) - LVFU(luminance[j][i-2])) + SQRV(LVFU(luminance[j+2][i]) - LVFU(luminance[j-2][i]))) * scalev;
+                if (clipMask) {
+                    for(; i < W - 5; i += 4) {
+                        vfloat contrastv = vsqrtf(SQRV(LVFU(luminance[j][i+1]) - LVFU(luminance[j][i-1])) + SQRV(LVFU(luminance[j+1][i]) - LVFU(luminance[j-1][i])) +
+                                                  SQRV(LVFU(luminance[j][i+2]) - LVFU(luminance[j][i-2])) + SQRV(LVFU(luminance[j+2][i]) - LVFU(luminance[j-2][i]))) * scalev;
 
-                    STVFU(blend[j][i], amountv * calcBlendFactor(contrastv, contrastThresholdv));
+                        STVFU(blend[j][i], LVFU(clipMask[j][i]) * amountv * calcBlendFactor(contrastv, contrastThresholdv));
+                    }
+                } else {
+                    for(; i < W - 5; i += 4) {
+                        vfloat contrastv = vsqrtf(SQRV(LVFU(luminance[j][i+1]) - LVFU(luminance[j][i-1])) + SQRV(LVFU(luminance[j+1][i]) - LVFU(luminance[j-1][i])) +
+                                                  SQRV(LVFU(luminance[j][i+2]) - LVFU(luminance[j][i-2])) + SQRV(LVFU(luminance[j+2][i]) - LVFU(luminance[j-2][i]))) * scalev;
+
+                        STVFU(blend[j][i], amountv * calcBlendFactor(contrastv, contrastThresholdv));
+                    }
                 }
 #endif
                 for(; i < W - 2; ++i) {
@@ -436,7 +445,7 @@ void buildBlendMask(const float* const * luminance, float **blend, int W, int H,
                     float contrast = sqrtf(rtengine::SQR(luminance[j][i+1] - luminance[j][i-1]) + rtengine::SQR(luminance[j+1][i] - luminance[j-1][i]) + 
                                            rtengine::SQR(luminance[j][i+2] - luminance[j][i-2]) + rtengine::SQR(luminance[j+2][i] - luminance[j-2][i])) * scale;
 
-                    blend[j][i] = amount * calcBlendFactor(contrast, contrastThreshold);
+                    blend[j][i] = (clipMask ? clipMask[j][i] : 1.f) * amount * calcBlendFactor(contrast, contrastThreshold);
                 }
             }
 
diff --git a/rtengine/rt_algo.h b/rtengine/rt_algo.h
index 0ea000a26..5485346c7 100644
--- a/rtengine/rt_algo.h
+++ b/rtengine/rt_algo.h
@@ -24,5 +24,5 @@
 namespace rtengine
 {
 void findMinMaxPercentile(const float* data, size_t size, float minPrct, float& minOut, float maxPrct, float& maxOut, bool multiThread = true);
-void buildBlendMask(const float* const * luminance, float **blend, int W, int H, float &contrastThreshold, float amount = 1.f, bool autoContrast = false);
+void buildBlendMask(const float* const * luminance, float **blend, int W, int H, float &contrastThreshold, float amount = 1.f, bool autoContrast = false, float ** clipmask = nullptr);
 }
diff --git a/rtengine/rtengine.h b/rtengine/rtengine.h
index 33cca1aba..c188622af 100644
--- a/rtengine/rtengine.h
+++ b/rtengine/rtengine.h
@@ -415,6 +415,13 @@ public :
     virtual void autoContrastChanged (double autoContrast) = 0;
 };
 
+class AutoRadiusListener
+{
+public :
+    virtual ~AutoRadiusListener() = default;
+    virtual void autoRadiusChanged (double autoRadius) = 0;
+};
+
 class WaveletListener
 {
 public:
@@ -524,6 +531,7 @@ public:
     virtual void        setBayerAutoContrastListener (AutoContrastListener* l) = 0;
     virtual void        setXtransAutoContrastListener (AutoContrastListener* l) = 0;
     virtual void        setpdSharpenAutoContrastListener (AutoContrastListener* l) = 0;
+    virtual void        setpdSharpenAutoRadiusListener (AutoRadiusListener* l) = 0;
     virtual void        setAutoBWListener       (AutoBWListener* l) = 0;
     virtual void        setAutoWBListener       (AutoWBListener* l) = 0;
     virtual void        setAutoColorTonListener (AutoColorTonListener* l) = 0;
diff --git a/rtengine/simpleprocess.cc b/rtengine/simpleprocess.cc
index 2022d2d15..6dbb0e649 100644
--- a/rtengine/simpleprocess.cc
+++ b/rtengine/simpleprocess.cc
@@ -222,7 +222,7 @@ private:
 
         imgsrc->demosaic (params.raw, autoContrast, contrastThreshold, params.pdsharpening.enabled && pl);
         if (params.pdsharpening.enabled) {
-            imgsrc->captureSharpening(params.pdsharpening, false, params.pdsharpening.contrast);
+            imgsrc->captureSharpening(params.pdsharpening, false, params.pdsharpening.contrast, params.pdsharpening.deconvradius);
         }
 
 
diff --git a/rtengine/stdimagesource.h b/rtengine/stdimagesource.h
index ef1294dea..0dffe2fd0 100644
--- a/rtengine/stdimagesource.h
+++ b/rtengine/stdimagesource.h
@@ -102,7 +102,7 @@ public:
     void getRawValues(int x, int y, int rotate, int &R, int &G, int &B) override { R = G = B = 0;}
 
     void        flushRGB          () override;
-    void captureSharpening(const procparams::CaptureSharpeningParams &sharpeningParams, bool showMask, double &conrastThreshold) override {};
+    void captureSharpening(const procparams::CaptureSharpeningParams &sharpeningParams, bool showMask, double &conrastThreshold, double &radius) override {};
 };
 }
 #endif
diff --git a/rtgui/paramsedited.cc b/rtgui/paramsedited.cc
index 84860329f..2ab5702ea 100644
--- a/rtgui/paramsedited.cc
+++ b/rtgui/paramsedited.cc
@@ -168,6 +168,7 @@ void ParamsEdited::set(bool v)
     pdsharpening.enabled            = v;
     pdsharpening.contrast           = v;
     pdsharpening.autoContrast           = v;
+    pdsharpening.autoRadius           = v;
     pdsharpening.gamma   = v;
     pdsharpening.deconvradius   = v;
     pdsharpening.deconviter     = v;
@@ -752,6 +753,7 @@ void ParamsEdited::initFrom(const std::vector<rtengine::procparams::ProcParams>&
         pdsharpening.enabled = pdsharpening.enabled && p.pdsharpening.enabled == other.pdsharpening.enabled;
         pdsharpening.contrast = pdsharpening.contrast && p.pdsharpening.contrast == other.pdsharpening.contrast;
         pdsharpening.autoContrast = pdsharpening.autoContrast && p.pdsharpening.autoContrast == other.pdsharpening.autoContrast;
+        pdsharpening.autoRadius = pdsharpening.autoRadius && p.pdsharpening.autoRadius == other.pdsharpening.autoRadius;
         pdsharpening.gamma = pdsharpening.gamma && p.pdsharpening.gamma == other.pdsharpening.gamma;
         pdsharpening.deconvradius = pdsharpening.deconvradius && p.pdsharpening.deconvradius == other.pdsharpening.deconvradius;
         pdsharpening.deconviter = pdsharpening.deconviter && p.pdsharpening.deconviter == other.pdsharpening.deconviter;
@@ -1732,6 +1734,10 @@ void ParamsEdited::combine(rtengine::procparams::ProcParams& toEdit, const rteng
         toEdit.pdsharpening.autoContrast = mods.pdsharpening.autoContrast;
     }
 
+    if (pdsharpening.autoRadius) {
+        toEdit.pdsharpening.autoRadius = mods.pdsharpening.autoRadius;
+    }
+
     if (pdsharpening.gamma) {
         toEdit.pdsharpening.gamma = dontforceSet && options.baBehav[ADDSET_SHARP_GAMMA] ? toEdit.pdsharpening.gamma + mods.pdsharpening.gamma : mods.pdsharpening.gamma;
     }
@@ -3289,5 +3295,5 @@ bool FilmNegativeParamsEdited::isUnchanged() const
 
 bool CaptureSharpeningParamsEdited::isUnchanged() const
 {
-    return enabled && contrast && autoContrast && gamma && deconvradius && deconviter;
+    return enabled && contrast && autoContrast && autoRadius && gamma && deconvradius && deconviter;
 }
\ No newline at end of file
diff --git a/rtgui/paramsedited.h b/rtgui/paramsedited.h
index 860ffc551..1bd7170d4 100644
--- a/rtgui/paramsedited.h
+++ b/rtgui/paramsedited.h
@@ -202,6 +202,7 @@ struct CaptureSharpeningParamsEdited {
     bool enabled;
     bool contrast;
     bool autoContrast;
+    bool autoRadius;
     bool gamma;
     bool deconvradius;
     bool deconviter;
diff --git a/rtgui/pdsharpening.cc b/rtgui/pdsharpening.cc
index c85b42e95..ef0ad90c2 100644
--- a/rtgui/pdsharpening.cc
+++ b/rtgui/pdsharpening.cc
@@ -35,6 +35,7 @@ PdSharpening::PdSharpening() : FoldableToolPanel(this, "pdsharpening", M("TP_PDS
     EvPdShrDRadius = m->newEvent(CAPTURESHARPEN, "HISTORY_MSG_PDSHARPEN_RADIUS");
     EvPdShrDIterations = m->newEvent(CAPTURESHARPEN, "HISTORY_MSG_PDSHARPEN_ITERATIONS");
     EvPdShrAutoContrast = m->newEvent(CAPTURESHARPEN, "HISTORY_MSG_PDSHARPEN_AUTO_CONTRAST");
+    EvPdShrAutoRadius = m->newEvent(CAPTURESHARPEN, "HISTORY_MSG_PDSHARPEN_AUTO_RADIUS");
 
     Gtk::HBox* hb = Gtk::manage(new Gtk::HBox());
     hb->show();
@@ -51,6 +52,8 @@ PdSharpening::PdSharpening() : FoldableToolPanel(this, "pdsharpening", M("TP_PDS
     Gtk::VBox* rld = Gtk::manage(new Gtk::VBox());
     gamma = Gtk::manage(new Adjuster(M("TP_SHARPENING_GAMMA"), 0.5, 6.0, 0.05, 1.00));
     dradius = Gtk::manage(new Adjuster(M("TP_SHARPENING_EDRADIUS"), 0.4, 1.15, 0.01, 0.75));
+    dradius->addAutoButton(M("TP_PDSHARPENING_AUTORADIUS_TOOLTIP"));
+    dradius->setAutoValue(true);
     diter = Gtk::manage(new Adjuster(M("TP_SHARPENING_RLD_ITERATIONS"), 1, 100, 1, 20));
     rld->pack_start(*gamma);
     rld->pack_start(*dradius);
@@ -85,6 +88,7 @@ void PdSharpening::read(const ProcParams* pp, const ParamsEdited* pedited)
     if (pedited) {
         contrast->setEditedState(pedited->pdsharpening.contrast ? Edited : UnEdited);
         contrast->setAutoInconsistent(multiImage && !pedited->pdsharpening.autoContrast);
+        dradius->setAutoInconsistent(multiImage && !pedited->pdsharpening.autoRadius);
         gamma->setEditedState(pedited->pdsharpening.gamma ? Edited : UnEdited);
         dradius->setEditedState(pedited->pdsharpening.deconvradius ? Edited : UnEdited);
         diter->setEditedState(pedited->pdsharpening.deconviter ? Edited : UnEdited);
@@ -98,8 +102,10 @@ void PdSharpening::read(const ProcParams* pp, const ParamsEdited* pedited)
     contrast->setAutoValue(pp->pdsharpening.autoContrast);
     gamma->setValue(pp->pdsharpening.gamma);
     dradius->setValue(pp->pdsharpening.deconvradius);
+    dradius->setAutoValue(pp->pdsharpening.autoRadius);
     diter->setValue(pp->pdsharpening.deconviter);
     lastAutoContrast = pp->pdsharpening.autoContrast;
+    lastAutoRadius = pp->pdsharpening.autoRadius;
 
     enableListener();
 }
@@ -112,6 +118,7 @@ void PdSharpening::write(ProcParams* pp, ParamsEdited* pedited)
     pp->pdsharpening.enabled = getEnabled();
     pp->pdsharpening.gamma = gamma->getValue();
     pp->pdsharpening.deconvradius = dradius->getValue();
+    pp->pdsharpening.autoRadius = dradius->getAutoValue();
     pp->pdsharpening.deconviter =(int)diter->getValue();
 
     if (pedited) {
@@ -119,6 +126,7 @@ void PdSharpening::write(ProcParams* pp, ParamsEdited* pedited)
         pedited->pdsharpening.autoContrast = !contrast->getAutoInconsistent();
         pedited->pdsharpening.gamma = gamma->getEditedState();
         pedited->pdsharpening.deconvradius = dradius->getEditedState();
+        pedited->pdsharpening.autoRadius = !dradius->getAutoInconsistent();
         pedited->pdsharpening.deconviter = diter->getEditedState();
         pedited->pdsharpening.enabled = !get_inconsistent();
     }
@@ -224,26 +232,62 @@ void PdSharpening::autoContrastChanged(double autoContrast)
     );
 }
 
+void PdSharpening::autoRadiusChanged(double autoRadius)
+{
+    idle_register.add(
+        [this, autoRadius]() -> bool
+        {
+            disableListener();
+            dradius->setValue(autoRadius);
+            enableListener();
+            return false;
+        }
+    );
+}
+
 void PdSharpening::adjusterAutoToggled(Adjuster* a, bool newval)
 {
-    if (multiImage) {
-        if (contrast->getAutoInconsistent()) {
-            contrast->setAutoInconsistent(false);
-            contrast->setAutoValue(false);
-        } else if (lastAutoContrast) {
-            contrast->setAutoInconsistent(true);
+    if (a == contrast) {
+        if (multiImage) {
+            if (contrast->getAutoInconsistent()) {
+                contrast->setAutoInconsistent(false);
+                contrast->setAutoValue(false);
+            } else if (lastAutoContrast) {
+                contrast->setAutoInconsistent(true);
+            }
+
+            lastAutoContrast = contrast->getAutoValue();
         }
 
-        lastAutoContrast = contrast->getAutoValue();
-    }
+        if (listener) {
+            if (contrast->getAutoInconsistent()) {
+                listener->panelChanged(EvPdShrAutoContrast, M("GENERAL_UNCHANGED"));
+            } else if (contrast->getAutoValue()) {
+                listener->panelChanged(EvPdShrAutoContrast, M("GENERAL_ENABLED"));
+            } else {
+                listener->panelChanged(EvPdShrAutoContrast, M("GENERAL_DISABLED"));
+            }
+        }
+    } else { // must be dradius
+        if (multiImage) {
+            if (dradius->getAutoInconsistent()) {
+                dradius->setAutoInconsistent(false);
+                dradius->setAutoValue(false);
+            } else if (lastAutoRadius) {
+                dradius->setAutoInconsistent(true);
+            }
 
-    if (listener) {
-        if (contrast->getAutoInconsistent()) {
-            listener->panelChanged(EvPdShrAutoContrast, M("GENERAL_UNCHANGED"));
-        } else if (contrast->getAutoValue()) {
-            listener->panelChanged(EvPdShrAutoContrast, M("GENERAL_ENABLED"));
-        } else {
-            listener->panelChanged(EvPdShrAutoContrast, M("GENERAL_DISABLED"));
+            lastAutoRadius = dradius->getAutoValue();
+        }
+
+        if (listener) {
+            if (dradius->getAutoInconsistent()) {
+                listener->panelChanged(EvPdShrAutoRadius, M("GENERAL_UNCHANGED"));
+            } else if (dradius->getAutoValue()) {
+                listener->panelChanged(EvPdShrAutoRadius, M("GENERAL_ENABLED"));
+            } else {
+                listener->panelChanged(EvPdShrAutoRadius, M("GENERAL_DISABLED"));
+            }
         }
     }
 }
diff --git a/rtgui/pdsharpening.h b/rtgui/pdsharpening.h
index b2f7b6e57..e56b4b085 100644
--- a/rtgui/pdsharpening.h
+++ b/rtgui/pdsharpening.h
@@ -21,7 +21,7 @@
 #include "adjuster.h"
 #include "toolpanel.h"
 
-class PdSharpening final : public ToolParamBlock, public AdjusterListener, public FoldableToolPanel, public rtengine::AutoContrastListener
+class PdSharpening final : public ToolParamBlock, public AdjusterListener, public FoldableToolPanel, public rtengine::AutoContrastListener, public rtengine::AutoRadiusListener
 {
 
 protected:
@@ -31,11 +31,13 @@ protected:
     Adjuster* diter;
 
     bool lastAutoContrast;
+    bool lastAutoRadius;
     rtengine::ProcEvent EvPdShrContrast;
     rtengine::ProcEvent EvPdShrDRadius;
     rtengine::ProcEvent EvPdSharpenGamma;
     rtengine::ProcEvent EvPdShrDIterations;
     rtengine::ProcEvent EvPdShrAutoContrast;
+    rtengine::ProcEvent EvPdShrAutoRadius;
     IdleRegister idle_register;
 
 public:
@@ -53,6 +55,7 @@ public:
     void enabledChanged () override;
 
     void autoContrastChanged (double autoContrast) override;
+    void autoRadiusChanged (double autoRadius) override;
 
     void setAdjusterBehavior (bool contrastadd, bool gammaadd, bool radiusadd, bool iteradds);
     void trimValues (rtengine::procparams::ProcParams* pp) override;
diff --git a/rtgui/toolpanelcoord.cc b/rtgui/toolpanelcoord.cc
index a664942a1..4fec4a18a 100644
--- a/rtgui/toolpanelcoord.cc
+++ b/rtgui/toolpanelcoord.cc
@@ -573,6 +573,7 @@ void ToolPanelCoordinator::initImage (rtengine::StagedImageProcessor* ipc_, bool
         ipc->setBayerAutoContrastListener (bayerprocess);
         ipc->setXtransAutoContrastListener (xtransprocess);
         ipc->setpdSharpenAutoContrastListener (pdSharpening);
+        ipc->setpdSharpenAutoRadiusListener (pdSharpening);
         ipc->setAutoWBListener (whitebalance);
         ipc->setAutoColorTonListener (colortoning);
         ipc->setAutoChromaListener (dirpyrdenoise);
diff --git a/rtgui/toolpanelcoord.h b/rtgui/toolpanelcoord.h
index 390967395..3a4ddc05f 100644
--- a/rtgui/toolpanelcoord.h
+++ b/rtgui/toolpanelcoord.h
@@ -242,7 +242,6 @@ public:
 
     void imageTypeChanged (bool isRaw, bool isBayer, bool isXtrans, bool isMono = false) override;
 
-//    void autoContrastChanged (double autoContrast);
     // profilechangelistener interface
     void profileChange(
         const rtengine::procparams::PartialProfile* nparams,

From af10bf8c7c8f06c5fc2effecda4533937016ee9a Mon Sep 17 00:00:00 2001
From: "luz.paz" <luzpaz@users.noreply.github.com>
Date: Wed, 11 Sep 2019 17:38:55 -0400
Subject: [PATCH 07/15] Fix misc. source comment typo

Found via `codespell -q 3 -I ../rawtherapy-whitelist.txt -S ./rtdata/languages -L bord,hist,fo,reall,bloc,alph,dof,thre,makro,chang,currentry,portugues,vektor,ue`
---
 rtengine/demosaic_algos.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/rtengine/demosaic_algos.cc b/rtengine/demosaic_algos.cc
index 49b386b0a..51db8bb3f 100644
--- a/rtengine/demosaic_algos.cc
+++ b/rtengine/demosaic_algos.cc
@@ -2024,7 +2024,8 @@ void RawImageSource::refinement(int PassCount)
 
 // Refinement based on EECI demozaicing algorithm by L. Chang and Y.P. Tan
 // from "Lassus" : Luis Sanz Rodriguez, adapted by Jacques Desmis - JDC - and Oliver Duis for RawTherapee
-// increases the signal to noise ratio (PSNR) # +1 to +2 dB : tested with Dcraw : eg: Lighthouse + AMaZE : whitout refinement:39.96dB, with refinement:41.86 dB
+// increases the signal to noise ratio (PSNR) # +1 to +2 dB : tested with Dcraw : 
+// eg: Lighthouse + AMaZE : without refinement:39.96 dB, with refinement:41.86 dB
 // reduce color artifacts, improves the interpolation
 // but it's relatively slow
 //

From 1e75f38dba6b95d380f1d97fb9ec61748a139ff1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fl=C3=B6ssie?= <floessie.mail@gmail.com>
Date: Thu, 12 Sep 2019 14:49:51 +0200
Subject: [PATCH 08/15] Softlight cleanups

- More `const`
- Removed `using namespace`
- Whitespace cleanups
---
 rtengine/ipsoftlight.cc | 57 ++++++++++++++++++-----------------------
 1 file changed, 25 insertions(+), 32 deletions(-)

diff --git a/rtengine/ipsoftlight.cc b/rtengine/ipsoftlight.cc
index 1d94a29c2..556790eb4 100644
--- a/rtengine/ipsoftlight.cc
+++ b/rtengine/ipsoftlight.cc
@@ -20,17 +20,17 @@
  */
 
 #include "improcfun.h"
+
 #include "procparams.h"
 #define BENCHMARK
 #include "StopWatch.h"
-namespace rtengine {
 
 namespace {
 
 inline float sl(float blend, float x)
 {
-    if (!OOG(x)) {
-        float v = Color::gamma_srgb(x) / MAXVALF;
+    if (!rtengine::OOG(x)) {
+        float v = rtengine::Color::gamma_srgb(x) / rtengine::MAXVALF;
         // using Pegtop's formula from
         // https://en.wikipedia.org/wiki/Blend_modes#Soft_Light
         // const float orig = 1.f - blend;
@@ -40,7 +40,7 @@ inline float sl(float blend, float x)
         // return blend * Color::igamma_srgb(v * MAXVALF) + orig * x;
 
         // using optimized formula (heckflosse67@gmx.de)
-        return intp(blend, Color::igamma_srgb(v * v * (3.f - 2.f * v) * MAXVALF), x);
+        return rtengine::intp(blend, rtengine::Color::igamma_srgb(v * v * (3.f - 2.f * v) * rtengine::MAXVALF), x);
     }
     return x;
 }
@@ -48,51 +48,46 @@ inline float sl(float blend, float x)
 #ifdef __SSE2__
 inline vfloat sl(vfloat blend, vfloat x)
 {
-    const vfloat v = Color::gammatab_srgb[x] / F2V(MAXVALF);
-    return vself(vmaskf_gt(x, F2V(MAXVALF)), x, vself(vmaskf_lt(x, ZEROV), x, vintpf(blend, Color::igammatab_srgb[v * v * (F2V(3.f) - (v + v)) * MAXVALF], x)));
+    const vfloat v = rtengine::Color::gammatab_srgb[x] / F2V(rtengine::MAXVALF);
+    return vself(vmaskf_gt(x, F2V(rtengine::MAXVALF)), x, vself(vmaskf_lt(x, ZEROV), x, vintpf(blend, rtengine::Color::igammatab_srgb[v * v * (F2V(3.f) - (v + v)) * rtengine::MAXVALF], x)));
 }
 #endif
 
 } // namespace
 
-
-void ImProcFunctions::softLight(LabImage *lab)
+void rtengine::ImProcFunctions::softLight(LabImage *lab)
 {
     if (!params->softlight.enabled || !params->softlight.strength) {
         return;
     }
     BENCHFUN
 
-    TMatrix wprof = ICCStore::getInstance()->workingSpaceMatrix(params->icm.workingProfile);
+    const TMatrix wprof = ICCStore::getInstance()->workingSpaceMatrix(params->icm.workingProfile);
     const float wp[3][3] = {
-        {static_cast<float> (wprof[0][0]), static_cast<float> (wprof[0][1]), static_cast<float> (wprof[0][2])},
-        {static_cast<float> (wprof[1][0]), static_cast<float> (wprof[1][1]), static_cast<float> (wprof[1][2])},
-        {static_cast<float> (wprof[2][0]), static_cast<float> (wprof[2][1]), static_cast<float> (wprof[2][2])}
+        {static_cast<float>(wprof[0][0]), static_cast<float>(wprof[0][1]), static_cast<float>(wprof[0][2])},
+        {static_cast<float>(wprof[1][0]), static_cast<float>(wprof[1][1]), static_cast<float>(wprof[1][2])},
+        {static_cast<float>(wprof[2][0]), static_cast<float>(wprof[2][1]), static_cast<float>(wprof[2][2])}
     };
 
-    TMatrix wiprof = ICCStore::getInstance()->workingSpaceInverseMatrix(params->icm.workingProfile);
+    const TMatrix wiprof = ICCStore::getInstance()->workingSpaceInverseMatrix(params->icm.workingProfile);
     const float wip[3][3] = {
-        {static_cast<float> (wiprof[0][0]), static_cast<float> (wiprof[0][1]), static_cast<float> (wiprof[0][2])},
-        {static_cast<float> (wiprof[1][0]), static_cast<float> (wiprof[1][1]), static_cast<float> (wiprof[1][2])},
-        {static_cast<float> (wiprof[2][0]), static_cast<float> (wiprof[2][1]), static_cast<float> (wiprof[2][2])}
+        {static_cast<float>(wiprof[0][0]), static_cast<float>(wiprof[0][1]), static_cast<float>(wiprof[0][2])},
+        {static_cast<float>(wiprof[1][0]), static_cast<float>(wiprof[1][1]), static_cast<float>(wiprof[1][2])},
+        {static_cast<float>(wiprof[2][0]), static_cast<float>(wiprof[2][1]), static_cast<float>(wiprof[2][2])}
     };
 
 #ifdef __SSE2__
-    vfloat wipv[3][3];
+    const vfloat wpv[3][3] = {
+        {F2V(wprof[0][0]), F2V(wprof[0][1]), F2V(wprof[0][2])},
+        {F2V(wprof[1][0]), F2V(wprof[1][1]), F2V(wprof[1][2])},
+        {F2V(wprof[2][0]), F2V(wprof[2][1]), F2V(wprof[2][2])}
+    };
 
-    for (int i = 0; i < 3; i++) {
-        for (int j = 0; j < 3; j++) {
-            wipv[i][j] = F2V(wiprof[i][j]);
-        }
-    }
-
-    vfloat wpv[3][3];
-
-    for (int i = 0; i < 3; i++) {
-        for (int j = 0; j < 3; j++) {
-            wpv[i][j] = F2V(wprof[i][j]);
-        }
-    }
+    const vfloat wipv[3][3] = {
+        {F2V(wiprof[0][0]), F2V(wiprof[0][1]), F2V(wiprof[0][2])},
+        {F2V(wiprof[1][0]), F2V(wiprof[1][1]), F2V(wiprof[1][2])},
+        {F2V(wiprof[2][0]), F2V(wiprof[2][1]), F2V(wiprof[2][2])}
+    };
 #endif
 
 #ifdef _OPENMP
@@ -137,5 +132,3 @@ void ImProcFunctions::softLight(LabImage *lab)
         }
     }
 }
-
-} // namespace rtengine

From 62ec0f0fd295d6e86437953bc3254a3d32b5339e Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Sat, 14 Sep 2019 14:43:32 +0200
Subject: [PATCH 09/15] Capture sharpening: tiled deconvolution

---
 rtengine/capturesharpening.cc | 381 ++++++++++++++++++++++++++++++----
 rtengine/rt_algo.cc           |   2 +-
 2 files changed, 346 insertions(+), 37 deletions(-)

diff --git a/rtengine/capturesharpening.cc b/rtengine/capturesharpening.cc
index e741016db..ef35695f3 100644
--- a/rtengine/capturesharpening.cc
+++ b/rtengine/capturesharpening.cc
@@ -28,7 +28,7 @@
 #include "color.h"
 #include "gauss.h"
 #include "rt_algo.h"
-#define BENCHMARK
+//#define BENCHMARK
 #include "StopWatch.h"
 #ifdef _OPENMP
 #include <omp.h>
@@ -38,6 +38,257 @@
 
 namespace {
 
+void compute7x7kernel(float sigma, float kernel[7][7]) {
+
+    const double temp = -2.f * rtengine::SQR(sigma);
+    float sum = 0.f;
+    for (int i = -3; i <= 3; ++i) {
+        for (int j = -3; j <= 3; ++j) {
+            if((rtengine::SQR(i) + rtengine::SQR(j)) <= rtengine::SQR(3.0 * 1.15)) {
+                kernel[i + 3][j + 3] = std::exp((rtengine::SQR(i) + rtengine::SQR(j)) / temp);
+                sum += kernel[i + 3][j + 3];
+            } else {
+                kernel[i + 3][j + 3] = 0.f;
+            }
+        }
+    }
+
+    for (int i = 0; i < 7; ++i) {
+        for (int j = 0; j < 7; ++j) {
+            kernel[i][j] /= sum;
+        }
+    }
+}
+
+void compute5x5kernel(float sigma, float kernel[5][5]) {
+
+    const double temp = -2.f * rtengine::SQR(sigma);
+    float sum = 0.f;
+    for (int i = -2; i <= 2; ++i) {
+        for (int j = -2; j <= 2; ++j) {
+            if((rtengine::SQR(i) + rtengine::SQR(j)) <= rtengine::SQR(3.0 * 0.84)) {
+                kernel[i + 2][j + 2] = std::exp((rtengine::SQR(i) + rtengine::SQR(j)) / temp);
+                sum += kernel[i + 2][j + 2];
+            } else {
+                kernel[i + 2][j + 2] = 0.f;
+            }
+        }
+    }
+
+    for (int i = 0; i < 5; ++i) {
+        for (int j = 0; j < 5; ++j) {
+            kernel[i][j] /= sum;
+        }
+    }
+}
+
+void compute3x3kernel(float sigma, float kernel[3][3]) {
+
+    const double temp = -2.f * rtengine::SQR(sigma);
+    float sum = 0.f;
+    for (int i = -1; i <= 1; ++i) {
+        for (int j = -1; j <= 1; ++j) {
+            if((rtengine::SQR(i) + rtengine::SQR(j)) <= rtengine::SQR(3.0 * 0.84)) {
+                kernel[i + 1][j + 1] = std::exp((rtengine::SQR(i) + rtengine::SQR(j)) / temp);
+                sum += kernel[i + 1][j + 1];
+            } else {
+                kernel[i + 1][j + 1] = 0.f;
+            }
+        }
+    }
+
+    for (int i = 0; i < 3; ++i) {
+        for (int j = 0; j < 3; ++j) {
+            kernel[i][j] /= sum;
+        }
+    }
+}
+
+inline void gauss3x3div (float** RESTRICT src, float** RESTRICT dst, float** RESTRICT divBuffer, const int W, const int H, const float kernel[3][3])
+{
+
+    const float c11 = kernel[0][0];
+    const float c10 = kernel[0][1];
+    const float c00 = kernel[1][1];
+
+    for (int i = 1; i < H - 1; i++) {
+        dst[i][0] = 1.f;
+        for (int j = 1; j < W - 1; j++) {
+            const float val = c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) + 
+                              c10 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) + 
+                              c00 * src[i][j];
+            dst[i][j] = divBuffer[i][j] / std::max(val, 0.00001f);
+        }
+        dst[i][W - 1] = 1.f;
+    }
+    // first and last rows
+    {
+        for (int i = 0; i < 1; ++i) {
+            for (int j = 0; j < W; ++j) {
+                dst[i][j] = 1.f;
+            }
+        }
+        for (int i = H - 1 ; i < H; ++i) {
+            for (int j = 0; j < W; ++j) {
+                dst[i][j] = 1.f;
+            }
+        }
+    }
+}
+
+inline void gauss5x5div (float** RESTRICT src, float** RESTRICT dst, float** RESTRICT divBuffer, const int W, const int H, const float kernel[5][5])
+{
+
+    const float c21 = kernel[0][1];
+    const float c20 = kernel[0][2];
+    const float c11 = kernel[1][1];
+    const float c10 = kernel[1][2];
+    const float c00 = kernel[2][2];
+
+    for (int i = 2; i < H - 2; ++i) {
+        dst[i][0] = dst[i][1] = 1.f;
+        // I tried hand written SSE code but gcc vectorizes better
+        for (int j = 2; j < W - 2; ++j) {
+            const float val = c21 * (src[i - 2][j - 1] + src[i - 2][j + 1] + src[i - 1][j - 2] + src[i - 1][j + 2] + src[i + 1][j - 2] + src[i + 1][j + 2] + src[i + 2][j - 1] + src[i + 2][j + 1]) +
+                              c20 * (src[i - 2][j] + src[i][j - 2] + src[i][j + 2] + src[i + 2][j]) +
+                              c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) +
+                              c10 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) +
+                              c00 * src[i][j];
+
+            dst[i][j] = divBuffer[i][j] / std::max(val, 0.00001f);
+        }
+        dst[i][W - 2] = dst[i][W - 1] = 1.f;
+    }
+
+    // first and last rows
+    {
+        for (int i = 0; i < 2; ++i) {
+            for (int j = 0; j < W; ++j) {
+                dst[i][j] = 1.f;
+            }
+        }
+        for (int i = H - 2 ; i < H; ++i) {
+            for (int j = 0; j < W; ++j) {
+                dst[i][j] = 1.f;
+            }
+        }
+    }
+}
+
+inline void gauss7x7div(float** RESTRICT src, float** RESTRICT dst, float** RESTRICT divBuffer, const int W, const int H, const float kernel[7][7])
+{
+
+    const float c31 = kernel[0][2];
+    const float c30 = kernel[0][3];
+    const float c22 = kernel[1][1];
+    const float c21 = kernel[1][2];
+    const float c20 = kernel[1][3];
+    const float c11 = kernel[2][2];
+    const float c10 = kernel[2][3];
+    const float c00 = kernel[3][3];
+
+    for (int i = 3; i < H - 3; ++i) {
+        dst[i][0] = dst[i][1] = dst[i][2] = 1.f;
+        // I tried hand written SSE code but gcc vectorizes better
+        for (int j = 3; j < W - 3; ++j) {
+            const float val = c31 * (src[i - 3][j - 1] + src[i - 3][j + 1] + src[i - 1][j - 3] + src[i - 1][j + 3] + src[i + 1][j - 3] + src[i + 1][j + 3] + src[i + 3][j - 1] + src[i + 3][j + 1]) +
+                              c30 * (src[i - 3][j] + src[i][j - 3] + src[i][j + 3] + src[i + 3][j]) +
+                              c22 * (src[i - 2][j - 2] + src[i - 2][j + 2] + src[i + 2][j - 2] + src[i + 2][j + 2]) +
+                              c21 * (src[i - 2][j - 1] + src[i - 2][j + 1] * c21 + src[i - 1][j - 2] + src[i - 1][j + 2] + src[i + 1][j - 2] + src[i + 1][j + 2] + src[i + 2][j - 1] + src[i + 2][j + 1]) +
+                              c20 * (src[i - 2][j] + src[i][j - 2] + src[i][j + 2] + src[i + 2][j]) +
+                              c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) +
+                              c10 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) +
+                              c00 * src[i][j];
+
+            dst[i][j] = divBuffer[i][j] / std::max(val, 0.00001f);
+        }
+        dst[i][W - 3] = dst[i][W - 2] = dst[i][W - 1] = 1.f;
+    }
+
+    // first and last rows
+    {
+        for (int i = 0; i < 3; ++i) {
+            for (int j = 0; j < W; ++j) {
+                dst[i][j] = 1.f;
+            }
+        }
+        for (int i = H - 3 ; i < H; ++i) {
+            for (int j = 0; j < W; ++j) {
+                dst[i][j] = 1.f;
+            }
+        }
+    }
+}
+
+inline void gauss3x3mult(float** RESTRICT src, float** RESTRICT dst, const int W, const int H, const float kernel[3][3])
+{
+    const float c11 = kernel[0][0];
+    const float c10 = kernel[0][1];
+    const float c00 = kernel[1][1];
+
+    for (int i = 1; i < H - 1; i++) {
+        for (int j = 1; j < W - 1; j++) {
+            const float val = c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) + 
+                              c10 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) + 
+                              c00 * src[i][j];
+            dst[i][j] *= val;
+        }
+    }
+
+}
+
+inline void gauss5x5mult (float** RESTRICT src, float** RESTRICT dst, const int W, const int H, const float kernel[5][5])
+{
+
+    const float c21 = kernel[0][1];
+    const float c20 = kernel[0][2];
+    const float c11 = kernel[1][1];
+    const float c10 = kernel[1][2];
+    const float c00 = kernel[2][2];
+
+    for (int i = 2; i < H - 2; ++i) {
+        // I tried hand written SSE code but gcc vectorizes better
+        for (int j = 2; j < W - 2; ++j) {
+            const float val = c21 * (src[i - 2][j - 1] + src[i - 2][j + 1] + src[i - 1][j - 2] + src[i - 1][j + 2] + src[i + 1][j - 2] + src[i + 1][j + 2] + src[i + 2][j - 1] + src[i + 2][j + 1]) +
+                              c20 * (src[i - 2][j] + src[i][j - 2] + src[i][j + 2] + src[i + 2][j]) +
+                              c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) +
+                              c10 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) +
+                              c00 * src[i][j];
+
+            dst[i][j] *= val;
+        }
+    }
+}
+
+inline void gauss7x7mult(float** RESTRICT src, float** RESTRICT dst, const int W, const int H, const float kernel[7][7])
+{
+
+    const float c31 = kernel[0][2];
+    const float c30 = kernel[0][3];
+    const float c22 = kernel[1][1];
+    const float c21 = kernel[1][2];
+    const float c20 = kernel[1][3];
+    const float c11 = kernel[2][2];
+    const float c10 = kernel[2][3];
+    const float c00 = kernel[3][3];
+
+    for (int i = 3; i < H - 3; ++i) {
+        // I tried hand written SSE code but gcc vectorizes better
+        for (int j = 3; j < W - 3; ++j) {
+            const float val = c31 * (src[i - 3][j - 1] + src[i - 3][j + 1] + src[i - 1][j - 3] + src[i - 1][j + 3] + src[i + 1][j - 3] + src[i + 1][j + 3] + src[i + 3][j - 1] + src[i + 3][j + 1]) +
+                              c30 * (src[i - 3][j] + src[i][j - 3] + src[i][j + 3] + src[i + 3][j]) +
+                              c22 * (src[i - 2][j - 2] + src[i - 2][j + 2] + src[i + 2][j - 2] + src[i + 2][j + 2]) +
+                              c21 * (src[i - 2][j - 1] + src[i - 2][j + 1] * c21 + src[i - 1][j - 2] + src[i - 1][j + 2] + src[i + 1][j - 2] + src[i + 1][j + 2] + src[i + 2][j - 1] + src[i + 2][j + 1]) +
+                              c20 * (src[i - 2][j] + src[i][j - 2] + src[i][j + 2] + src[i + 2][j]) +
+                              c11 * (src[i - 1][j - 1] + src[i - 1][j + 1] + src[i + 1][j - 1] + src[i + 1][j + 1]) +
+                              c10 * (src[i - 1][j] + src[i][j - 1] + src[i][j + 1] + src[i + 1][j]) +
+                              c00 * src[i][j];
+
+            dst[i][j] *= val;
+        }
+    }
+}
+
 void buildClipMaskBayer(const float * const *rawData, int W, int H, float** clipMask, const float whites[2][2])
 {
 
@@ -274,47 +525,106 @@ float calcRadiusXtrans(const float * const *rawData, int W, int H, float lowerLi
     }
     return std::sqrt((1.f / (std::log(1.f / maxRatio))) / -2.f);
 }
-void CaptureDeconvSharpening (float** luminance, float** tmp, const float * const * blend, int W, int H, double sigma, int iterations, rtengine::ProgressListener* plistener, double start, double step)
+void CaptureDeconvSharpening (float** luminance, float** oldLuminance, const float * const * blend, int W, int H, double sigma, int iterations, rtengine::ProgressListener* plistener, double startVal, double endVal)
 {
+BENCHFUN
+    const bool is5x5 = (sigma <= 0.84);
+    const bool is3x3 = (sigma < 0.6);
+    float kernel7[7][7];
+    float kernel5[5][5];
+    float kernel3[3][3];
+    if (is3x3) {
+        compute3x3kernel(sigma, kernel3);
+    } else if (is5x5) {
+        compute5x5kernel(sigma, kernel5);
+    } else {
+        compute7x7kernel(sigma, kernel7);
+    }
 
-    rtengine::JaggedArray<float> tmpI(W, H);
+    constexpr int tileSize = 194;
+    constexpr int border = 3;
+    constexpr int fullTileSize = tileSize + 2 * border;
 
+    double progress = startVal;
+    const double progressStep = (endVal - startVal) * rtengine::SQR(tileSize) / (W * H);
 #ifdef _OPENMP
     #pragma omp parallel
 #endif
     {
+        int progresscounter = 0;
+        rtengine::JaggedArray<float> tmpIThr(fullTileSize, fullTileSize);
+        rtengine::JaggedArray<float> tmpThr(fullTileSize, fullTileSize);
+        rtengine::JaggedArray<float> lumThr(fullTileSize, fullTileSize);
+#pragma omp for schedule(dynamic,2) collapse(2)
+        for (int i = border; i < H - border; i+= tileSize) {
+            for(int j = border; j < W - border; j+= tileSize) {
+                const bool endOfCol = (i + tileSize + border) >= H;
+                const bool endOfRow = (j + tileSize + border) >= W;
+                // fill tiles
+                if (endOfRow || endOfCol) {
+                    // special handling for small tiles at end of row or column
+                    for (int k = 0, ii = endOfCol ? H - fullTileSize : i; k < fullTileSize; ++k, ++ii) {
+                        for (int l = 0, jj = endOfRow ? W - fullTileSize : j; l < fullTileSize; ++l, ++jj) {
+                            tmpIThr[k][l] = oldLuminance[ii - border][jj - border];
+                            lumThr[k][l] = oldLuminance[ii - border][jj - border];
+                        }
+                    }
+                } else {
+                    for (int ii = i; ii < i + fullTileSize; ++ii) {
+                        for (int jj = j; jj < j + fullTileSize; ++jj) {
+                            tmpIThr[ii - i][jj - j] = oldLuminance[ii - border][jj - border];
+                            lumThr[ii - i][jj - j] = oldLuminance[ii - border][jj - border];
+                        }
+                    }
+                }
+                if (is3x3) {
+                    for (int k = 0; k < iterations; ++k) {
+                        // apply 3x3 gaussian blur and divide luminance by result of gaussian blur
+                        gauss3x3div(tmpIThr, tmpThr, lumThr, fullTileSize, fullTileSize, kernel3);
+                        gauss3x3mult(tmpThr, tmpIThr, fullTileSize, fullTileSize, kernel3);
+                    }
+                } else if (is5x5) {
+                    for (int k = 0; k < iterations; ++k) {
+                        // apply 5x5 gaussian blur and divide luminance by result of gaussian blur
+                        gauss5x5div(tmpIThr, tmpThr, lumThr, fullTileSize, fullTileSize, kernel5);
+                        gauss5x5mult(tmpThr, tmpIThr, fullTileSize, fullTileSize, kernel5);
+                    }
+                } else {
+                    for (int k = 0; k < iterations; ++k) {
+                        // apply 7x7 gaussian blur and divide luminance by result of gaussian blur
+                        gauss7x7div(tmpIThr, tmpThr, lumThr, fullTileSize, fullTileSize, kernel7);
+                        gauss7x7mult(tmpThr, tmpIThr, fullTileSize, fullTileSize, kernel7);
+                    }
+                }
+                if (endOfRow || endOfCol) {
+                    // special handling for small tiles at end of row or column
+                    for (int k = border, ii = endOfCol ? H - fullTileSize - border : i - border; k < fullTileSize - border; ++k) {
+                        for (int l = border, jj = endOfRow ? W - fullTileSize - border : j - border; l < fullTileSize - border; ++l) {
+                            luminance[ii + k][jj + l] = rtengine::intp(blend[ii + k][jj + l], max(tmpIThr[k][l], 0.0f), luminance[ii + k][jj + l]);
+                        }
+                    }
+                } else {
+                    for (int ii = border; ii < fullTileSize - border; ++ii) {
+                        for (int jj = border; jj < fullTileSize - border; ++jj) {
+                            luminance[i + ii - border][j + jj - border] = rtengine::intp(blend[i + ii - border][j + jj - border], max(tmpIThr[ii][jj], 0.0f), luminance[i + ii - border][j + jj - border]);
+                        }
+                    }
+                }
+                if (plistener) {
+                    if (++progresscounter % 16 == 0) {
 #ifdef _OPENMP
-        #pragma omp for
+                        #pragma omp critical(csprogress)
 #endif
-        for (int i = 0; i < H; i++) {
-            for(int j = 0; j < W; j++) {
-                tmpI[i][j] = max(luminance[i][j], 0.f);
+                        {
+                            progress += 16.0 * progressStep;
+                            progress = rtengine::min(progress, endVal);
+                            plistener->setProgress(progress);
+                        }
+                    }
+                }
             }
         }
-
-        for (int k = 0; k < iterations; k++) {
-            // apply gaussian blur and divide luminance by result of gaussian blur
-            gaussianBlur(tmpI, tmp, W, H, sigma, nullptr, GAUSS_DIV, luminance);
-            gaussianBlur(tmp, tmpI, W, H, sigma, nullptr, GAUSS_MULT);
-            if (plistener) {
-#ifdef _OPENMP
-                #pragma omp single
-#endif
-                start += step;
-                plistener->setProgress(start);
-            }
-        } // end for
-
-#ifdef _OPENMP
-        #pragma omp for
-#endif
-
-        for (int i = 0; i < H; ++i) {
-            for (int j = 0; j < W; ++j) {
-                luminance[i][j] = rtengine::intp(blend[i][j], max(tmpI[i][j], 0.0f), luminance[i][j]);
-            }
-        }
-    } // end parallel
+    }
 }
 
 }
@@ -328,7 +638,7 @@ void RawImageSource::captureSharpening(const procparams::CaptureSharpeningParams
         plistener->setProgressStr(M("TP_PDSHARPENING_LABEL"));
         plistener->setProgress(0.0);
     }
-
+BENCHFUN
     const float xyz_rgb[3][3] = {          // XYZ from RGB
                                     { 0.412453, 0.357580, 0.180423 },
                                     { 0.212671, 0.715160, 0.072169 },
@@ -442,13 +752,14 @@ void RawImageSource::captureSharpening(const procparams::CaptureSharpeningParams
     array2D<float>& L = Lbuffer ? *Lbuffer : red;
     array2D<float>& YOld = YOldbuffer ? * YOldbuffer : green;
     array2D<float>& YNew = YNewbuffer ? * YNewbuffer : blue;
+    const float gamma = sharpeningParams.gamma;
 
 #ifdef _OPENMP
     #pragma omp parallel for schedule(dynamic, 16)
 #endif
     for (int i = 0; i < H; ++i) {
         Color::RGB2L(redVals[i], greenVals[i], blueVals[i], L[i], xyz_rgb, W);
-        Color::RGB2Y(redVals[i], greenVals[i], blueVals[i], YOld[i], YNew[i], sharpeningParams.gamma, W);
+        Color::RGB2Y(redVals[i], greenVals[i], blueVals[i], YOld[i], YNew[i], gamma, W);
     }
     if (plistener) {
         plistener->setProgress(0.1);
@@ -461,12 +772,10 @@ void RawImageSource::captureSharpening(const procparams::CaptureSharpeningParams
     }
     conrastThreshold = contrast * 100.f;
 
-    array2D<float>& tmp = L; // L is not used anymore now => we can use its buffer as the needed temporary buffer
-    CaptureDeconvSharpening(YNew, tmp, blend, W, H, radius, sharpeningParams.deconviter, plistener, 0.2, (0.9 - 0.2) / sharpeningParams.deconviter);
+    CaptureDeconvSharpening(YNew, YOld, blend, W, H, radius, sharpeningParams.deconviter, plistener, 0.2, 0.9);
     if (plistener) {
         plistener->setProgress(0.9);
     }
-    const float gamma = sharpeningParams.gamma;
 #ifdef _OPENMP
     #pragma omp parallel for schedule(dynamic, 16)
 #endif
diff --git a/rtengine/rt_algo.cc b/rtengine/rt_algo.cc
index 4afd8ac73..dd7783c1d 100644
--- a/rtengine/rt_algo.cc
+++ b/rtengine/rt_algo.cc
@@ -394,7 +394,7 @@ void buildBlendMask(const float* const * luminance, float **blend, int W, int H,
                         }
                     }
 
-                    contrastThreshold = minvar <= 4.f ? calcContrastThreshold(luminance, topLeftYStart + minI, topLeftXStart + minJ, tilesize) : 0.f;
+                    contrastThreshold = minvar <= 8.f ? calcContrastThreshold(luminance, topLeftYStart + minI, topLeftXStart + minJ, tilesize) : 0.f;
                 }
             }
         }

From aadeb539a92b1729fa8a8a2d7b4a0b2d71682dfd Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Sat, 14 Sep 2019 17:51:28 +0200
Subject: [PATCH 10/15] Capture sharpening: small speedup, also reduced memory
 usage by width * height * 4 byte

---
 rtengine/capturesharpening.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rtengine/capturesharpening.cc b/rtengine/capturesharpening.cc
index ef35695f3..4bfe91aba 100644
--- a/rtengine/capturesharpening.cc
+++ b/rtengine/capturesharpening.cc
@@ -765,7 +765,7 @@ BENCHFUN
         plistener->setProgress(0.1);
     }
     // calculate contrast based blend factors to reduce sharpening in regions with low contrast
-    JaggedArray<float> blend(W, H);
+    array2D<float>& blend = clipMask; // we can share blend and clipMask buffer here
     buildBlendMask(L, blend, W, H, contrast, 1.f, sharpeningParams.autoContrast, clipMask);
     if (plistener) {
         plistener->setProgress(0.2);

From 51daaf5fa1e4e75a3e225dd4b3f98ab8cfa64c89 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Sat, 14 Sep 2019 19:35:19 +0200
Subject: [PATCH 11/15] Capture Sharpening: some code cleanups

---
 rtengine/capturesharpening.cc | 58 ++++++++++++++---------------------
 1 file changed, 23 insertions(+), 35 deletions(-)

diff --git a/rtengine/capturesharpening.cc b/rtengine/capturesharpening.cc
index 4bfe91aba..8d9ee5b64 100644
--- a/rtengine/capturesharpening.cc
+++ b/rtengine/capturesharpening.cc
@@ -19,7 +19,6 @@
 #include <cmath>
 #include <iostream>
 
-#include "jaggedarray.h"
 #include "rtengine.h"
 #include "rawimagesource.h"
 #include "rt_math.h"
@@ -121,18 +120,12 @@ inline void gauss3x3div (float** RESTRICT src, float** RESTRICT dst, float** RES
         }
         dst[i][W - 1] = 1.f;
     }
-    // first and last rows
-    {
-        for (int i = 0; i < 1; ++i) {
-            for (int j = 0; j < W; ++j) {
-                dst[i][j] = 1.f;
-            }
-        }
-        for (int i = H - 1 ; i < H; ++i) {
-            for (int j = 0; j < W; ++j) {
-                dst[i][j] = 1.f;
-            }
-        }
+    // first and last row
+    for (int j = 0; j < W; ++j) {
+        dst[0][j] = 1.f;
+    }
+    for (int j = 0; j < W; ++j) {
+        dst[H - 1][j] = 1.f;
     }
 }
 
@@ -161,16 +154,14 @@ inline void gauss5x5div (float** RESTRICT src, float** RESTRICT dst, float** RES
     }
 
     // first and last rows
-    {
-        for (int i = 0; i < 2; ++i) {
-            for (int j = 0; j < W; ++j) {
-                dst[i][j] = 1.f;
-            }
+    for (int i = 0; i < 2; ++i) {
+        for (int j = 0; j < W; ++j) {
+            dst[i][j] = 1.f;
         }
-        for (int i = H - 2 ; i < H; ++i) {
-            for (int j = 0; j < W; ++j) {
-                dst[i][j] = 1.f;
-            }
+    }
+    for (int i = H - 2 ; i < H; ++i) {
+        for (int j = 0; j < W; ++j) {
+            dst[i][j] = 1.f;
         }
     }
 }
@@ -206,16 +197,14 @@ inline void gauss7x7div(float** RESTRICT src, float** RESTRICT dst, float** REST
     }
 
     // first and last rows
-    {
-        for (int i = 0; i < 3; ++i) {
-            for (int j = 0; j < W; ++j) {
-                dst[i][j] = 1.f;
-            }
+    for (int i = 0; i < 3; ++i) {
+        for (int j = 0; j < W; ++j) {
+            dst[i][j] = 1.f;
         }
-        for (int i = H - 3 ; i < H; ++i) {
-            for (int j = 0; j < W; ++j) {
-                dst[i][j] = 1.f;
-            }
+    }
+    for (int i = H - 3 ; i < H; ++i) {
+        for (int j = 0; j < W; ++j) {
+            dst[i][j] = 1.f;
         }
     }
 }
@@ -300,7 +289,6 @@ void buildClipMaskBayer(const float * const *rawData, int W, int H, float** clip
             clipMask[row][col] = 1.f;
         }
     }
-
 #ifdef _OPENMP
     #pragma omp parallel for schedule(dynamic, 16)
 #endif
@@ -552,9 +540,9 @@ BENCHFUN
 #endif
     {
         int progresscounter = 0;
-        rtengine::JaggedArray<float> tmpIThr(fullTileSize, fullTileSize);
-        rtengine::JaggedArray<float> tmpThr(fullTileSize, fullTileSize);
-        rtengine::JaggedArray<float> lumThr(fullTileSize, fullTileSize);
+        array2D<float> tmpIThr(fullTileSize, fullTileSize);
+        array2D<float> tmpThr(fullTileSize, fullTileSize);
+        array2D<float> lumThr(fullTileSize, fullTileSize);
 #pragma omp for schedule(dynamic,2) collapse(2)
         for (int i = border; i < H - border; i+= tileSize) {
             for(int j = border; j < W - border; j+= tileSize) {

From 19e01124952069794dbfc980e4e0a315566fdd83 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Sun, 15 Sep 2019 12:55:20 +0200
Subject: [PATCH 12/15] xtrans: border artifacts when using 1-pass Markesteijn,
 fixes #5452

---
 rtengine/xtrans_demosaic.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rtengine/xtrans_demosaic.cc b/rtengine/xtrans_demosaic.cc
index e84b5217e..9a3b341cc 100644
--- a/rtengine/xtrans_demosaic.cc
+++ b/rtengine/xtrans_demosaic.cc
@@ -959,7 +959,7 @@ void RawImageSource::xtrans_interpolate (const int passes, const bool useCieLab,
         free(buffer);
     }
 
-    xtransborder_interpolate(8, red, green, blue);
+    xtransborder_interpolate(passes > 1 ? 8 : 11, red, green, blue);
 }
 #undef CLIP
 void RawImageSource::fast_xtrans_interpolate (const array2D<float> &rawData, array2D<float> &red, array2D<float> &green, array2D<float> &blue)

From a06c2714e27a7415554fc036c7749bc209f33f2a Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse@users.noreply.github.com>
Date: Sun, 15 Sep 2019 16:29:29 +0200
Subject: [PATCH 13/15] Update capturesharpening.cc

Fix artifacts at radius > 0.85
---
 rtengine/capturesharpening.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rtengine/capturesharpening.cc b/rtengine/capturesharpening.cc
index 8d9ee5b64..df2b1d908 100644
--- a/rtengine/capturesharpening.cc
+++ b/rtengine/capturesharpening.cc
@@ -530,7 +530,7 @@ BENCHFUN
     }
 
     constexpr int tileSize = 194;
-    constexpr int border = 3;
+    constexpr int border = 5;
     constexpr int fullTileSize = tileSize + 2 * border;
 
     double progress = startVal;

From f55afb91c306c3449f579bff0f1f6cfdcde8c831 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Sun, 15 Sep 2019 22:30:03 +0200
Subject: [PATCH 14/15] Soft Light: remove benchmark code, closes #5447

---
 rtengine/ipsoftlight.cc | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/rtengine/ipsoftlight.cc b/rtengine/ipsoftlight.cc
index 556790eb4..cd49e858f 100644
--- a/rtengine/ipsoftlight.cc
+++ b/rtengine/ipsoftlight.cc
@@ -22,8 +22,6 @@
 #include "improcfun.h"
 
 #include "procparams.h"
-#define BENCHMARK
-#include "StopWatch.h"
 
 namespace {
 
@@ -60,7 +58,6 @@ void rtengine::ImProcFunctions::softLight(LabImage *lab)
     if (!params->softlight.enabled || !params->softlight.strength) {
         return;
     }
-    BENCHFUN
 
     const TMatrix wprof = ICCStore::getInstance()->workingSpaceMatrix(params->icm.workingProfile);
     const float wp[3][3] = {

From 796e8f02896c0506dd9c403572adabf80187e03b Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Mon, 16 Sep 2019 17:17:49 +0200
Subject: [PATCH 15/15] Removed timing code

---
 rtengine/iptransform.cc | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/rtengine/iptransform.cc b/rtengine/iptransform.cc
index 1e739dac6..1a38d8e2f 100644
--- a/rtengine/iptransform.cc
+++ b/rtengine/iptransform.cc
@@ -31,9 +31,6 @@
 #include "rtlensfun.h"
 #include "sleef.c"
 
-#define BENCHMARK
-#include "StopWatch.h"
-
 using namespace std;
 
 namespace
@@ -854,7 +851,6 @@ void ImProcFunctions::transformLuminanceOnly (Imagefloat* original, Imagefloat*
 
 void ImProcFunctions::transformGeneral(bool highQuality, Imagefloat *original, Imagefloat *transformed, int cx, int cy, int sx, int sy, int oW, int oH, int fW, int fH, const LensCorrection *pLCPMap)
 {
-    BENCHFUN
     // set up stuff, depending on the mode we are
     const bool enableLCPDist = pLCPMap && params->lensProf.useDist;
     const bool enableCA = highQuality && needsCA();