From 0c1c2152622fbbc09f3fac99cf4687fdf77b0fff Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Sun, 15 Sep 2019 21:57:17 +0200
Subject: [PATCH 01/31] Improve readability of hasselblad_load_raw() code,
 #5434

---
 rtengine/dcraw.cc | 121 +++++++++++++++++++++++++++-------------------
 1 file changed, 70 insertions(+), 51 deletions(-)

diff --git a/rtengine/dcraw.cc b/rtengine/dcraw.cc
index e15a2bb0f..5e85b1e2e 100644
--- a/rtengine/dcraw.cc
+++ b/rtengine/dcraw.cc
@@ -2417,59 +2417,78 @@ void CLASS hasselblad_correct()
 
 void CLASS hasselblad_load_raw()
 {
-  struct jhead jh;
-  int shot, row, col, *back[5], len[2], diff[12], pred, sh, f, s, c;
-  unsigned upix, urow, ucol;
-  ushort *ip;
+    struct jhead jh;
+    int *back[5], diff[12];
 
-  if (!ljpeg_start (&jh, 0)) return;
-  order = 0x4949;
-  ph1_bithuff_t ph1_bithuff(this, ifp, order);
-  hb_bits(-1);
-  back[4] = (int *) calloc (raw_width, 3*sizeof **back);
-  merror (back[4], "hasselblad_load_raw()");
-  FORC3 back[c] = back[4] + c*raw_width;
-  cblack[6] >>= sh = tiff_samples > 1;
-  shot = LIM(shot_select, 1, tiff_samples) - 1;
-  for (row=0; row < raw_height; row++) {
-    FORC4 back[(c+3) & 3] = back[c];
-    for (col=0; col < raw_width; col+=2) {
-      for (s=0; s < tiff_samples*2; s+=2) {
-	FORC(2) len[c] = ph1_huff(jh.huff[0]);
-	FORC(2) {
-	  diff[s+c] = hb_bits(len[c]);
-	  if ((diff[s+c] & (1 << (len[c]-1))) == 0)
-	    diff[s+c] -= (1 << len[c]) - 1;
-	  if (diff[s+c] == 65535) diff[s+c] = -32768;
-	}
-      }
-      for (s=col; s < col+2; s++) {
-	pred = 0x8000 + load_flags;
-	if (col) pred = back[2][s-2];
-	if (col && row > 1) switch (jh.psv) {
-	  case 11: pred += back[0][s]/2 - back[0][s-2]/2;  break;
-	}
-	f = (row & 1)*3 ^ ((col+s) & 1);
-	FORC (tiff_samples) {
-	  pred += diff[(s & 1)*tiff_samples+c];
-	  upix = pred >> sh & 0xffff;
-	  if (raw_image && c == shot)
-	    RAW(row,s) = upix;
-	  if (image) {
-	    urow = row-top_margin  + (c & 1);
-	    ucol = col-left_margin - ((c >> 1) & 1);
-	    ip = &image[urow*width+ucol][f];
-	    if (urow < height && ucol < width)
-	      *ip = c < 4 ? upix : (*ip + upix) >> 1;
-	  }
-	}
-	back[2][s] = pred;
-      }
+    if (!ljpeg_start (&jh, 0)) {
+        return;
+    }
+    order = 0x4949;
+    ph1_bithuff_t ph1_bithuff(this, ifp, order);
+    hb_bits(-1);
+    back[4] = (int *) calloc(raw_width, 3 * sizeof **back);
+    merror(back[4], "hasselblad_load_raw()");
+    for (int c = 0; c < 3; ++c) {
+        back[c] = back[4] + c * raw_width;
+    }
+    const int sh = tiff_samples > 1;
+    cblack[6] >>= sh;
+    const int shot = LIM(shot_select, 1, tiff_samples) - 1;
+    for (int row = 0; row < raw_height; ++row) {
+        for (int c = 0; c < 4; ++c) {
+            back[(c + 3) & 3] = back[c];
+        }
+        for (int col = 0; col < raw_width; col += 2) {
+            for (int s = 0; s < tiff_samples * 2; s += 2) {
+                int len[2];
+	            for (int c = 0; c < 2; ++c) {
+	                len[c] = ph1_huff(jh.huff[0]);
+	            }
+                for (int c = 0; c < 2; ++c) {
+                    diff[s + c] = hb_bits(len[c]);
+                    if ((diff[s + c] & (1 << (len[c] - 1))) == 0) {
+                        diff[s + c] -= (1 << len[c]) - 1;
+                    }
+                    if (diff[s + c] == 65535) {
+                        diff[s + c] = -32768;
+                    }
+                }
+            }
+            for (int s = col; s < col + 2; ++s) {
+	            int pred;
+	            if (col) {
+                    pred = back[2][s - 2];
+                    if (row > 1 && jh.psv == 11) {
+                        pred += back[0][s] / 2 - back[0][s - 2] / 2;
+                    }
+	            } else {
+	                 pred = 0x8000 + load_flags;
+	            }
+                for (int c = 0; c < tiff_samples; ++c) {
+                    pred += diff[(s & 1) * tiff_samples + c];
+                    const unsigned upix = pred >> sh & 0xffff;
+                    if (raw_image && c == shot) {
+                        RAW(row, s) = upix;
+                    }
+                    if (image) {
+                        const int f = (row & 1) * 3 ^ ((col + s) & 1);
+                        const unsigned urow = row - top_margin  + (c & 1);
+                        const unsigned ucol = col - left_margin - ((c >> 1) & 1);
+                        ushort* const ip = &image[urow * width + ucol][f];
+                        if (urow < height && ucol < width) {
+                            *ip = c < 4 ? upix : (*ip + upix) >> 1;
+                        }
+                    }
+                }
+                back[2][s] = pred;
+            }
+        }
+    }
+    free(back[4]);
+    ljpeg_end(&jh);
+    if (image) {
+        mix_green = 1;
     }
-  }
-  free (back[4]);
-  ljpeg_end (&jh);
-  if (image) mix_green = 1;
 }
 
 void CLASS leaf_hdr_load_raw()

From a0c6c1569c1f7574952341e4be4a8a0859ba811a Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Mon, 16 Sep 2019 21:43:03 +0200
Subject: [PATCH 02/31] Fix indentations

---
 rtengine/dcraw.cc | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/rtengine/dcraw.cc b/rtengine/dcraw.cc
index 5e85b1e2e..5275c42c1 100644
--- a/rtengine/dcraw.cc
+++ b/rtengine/dcraw.cc
@@ -2441,9 +2441,9 @@ void CLASS hasselblad_load_raw()
         for (int col = 0; col < raw_width; col += 2) {
             for (int s = 0; s < tiff_samples * 2; s += 2) {
                 int len[2];
-	            for (int c = 0; c < 2; ++c) {
-	                len[c] = ph1_huff(jh.huff[0]);
-	            }
+                for (int c = 0; c < 2; ++c) {
+                    len[c] = ph1_huff(jh.huff[0]);
+                }
                 for (int c = 0; c < 2; ++c) {
                     diff[s + c] = hb_bits(len[c]);
                     if ((diff[s + c] & (1 << (len[c] - 1))) == 0) {
@@ -2455,15 +2455,15 @@ void CLASS hasselblad_load_raw()
                 }
             }
             for (int s = col; s < col + 2; ++s) {
-	            int pred;
-	            if (col) {
+                int pred;
+                if (col) {
                     pred = back[2][s - 2];
                     if (row > 1 && jh.psv == 11) {
                         pred += back[0][s] / 2 - back[0][s - 2] / 2;
                     }
-	            } else {
-	                 pred = 0x8000 + load_flags;
-	            }
+                } else {
+                     pred = 0x8000 + load_flags;
+                }
                 for (int c = 0; c < tiff_samples; ++c) {
                     pred += diff[(s & 1) * tiff_samples + c];
                     const unsigned upix = pred >> sh & 0xffff;

From 991fc94d89e08ce625256ca4a146d7151ae5c5fd Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Tue, 17 Sep 2019 15:11:12 +0200
Subject: [PATCH 03/31] Speedup for guided filter

---
 rtengine/guidedfilter.cc | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/rtengine/guidedfilter.cc b/rtengine/guidedfilter.cc
index 0ebe6c172..bc7f64f05 100644
--- a/rtengine/guidedfilter.cc
+++ b/rtengine/guidedfilter.cc
@@ -207,9 +207,6 @@ void guidedFilter(const array2D<float> &guide, const array2D<float> &src, array2
     apply(SUBMUL, b, a, meanI, meanp);
     DEBUG_DUMP(b);
 
-    meanI.free(); // frees w * h * 4 byte
-    meanp.free(); // frees w * h * 4 byte
-
     array2D<float> &meana = a;
     f_mean(meana, a, r1);
     DEBUG_DUMP(meana);
@@ -218,18 +215,25 @@ void guidedFilter(const array2D<float> &guide, const array2D<float> &src, array2
     f_mean(meanb, b, r1);
     DEBUG_DUMP(meanb);
 
-    blur_buf.resize(0); // frees w * h * 4 byte
+    const int Ws = meana.width();
+    const int Hs = meana.height();
+    const int Wd = q.width();
+    const int Hd = q.height();
 
-    array2D<float> meanA(W, H);
-    f_upsample(meanA, meana);
-    DEBUG_DUMP(meanA);
+    float col_scale = float (Ws) / float (Wd);
+    float row_scale = float (Hs) / float (Hd);
 
-    array2D<float> &meanB = q;
-    f_upsample(meanB, meanb);
-    DEBUG_DUMP(meanB);
+#ifdef _OPENMP
+    #pragma omp parallel for if (multithread)
+#endif
 
-    apply(ADDMUL, q, meanA, I, meanB);
-    DEBUG_DUMP(q);
+    for (int y = 0; y < Hd; ++y) {
+        float ymrs = y * row_scale;
+
+        for (int x = 0; x < Wd; ++x) {
+            q[y][x] = getBilinearValue(meana, x * col_scale, ymrs) * I[y][x] + getBilinearValue(meanb, x * col_scale, ymrs);
+        }
+    }
 }
 
 } // namespace rtengine

From 3ab379ad0a3871f4cd8b1ce933dfa7a8d37b001b Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Thu, 19 Sep 2019 20:56:33 +0200
Subject: [PATCH 04/31] Dehaze: further speedup, #5456

---
 rtengine/boxblur.h       | 255 +++++++++++++++++++++++++++++++++++---
 rtengine/guidedfilter.cc | 187 ++++++++++------------------
 rtengine/ipdehaze.cc     | 260 +++++++++++++++++++++------------------
 3 files changed, 442 insertions(+), 260 deletions(-)

diff --git a/rtengine/boxblur.h b/rtengine/boxblur.h
index da302964b..3020278b2 100644
--- a/rtengine/boxblur.h
+++ b/rtengine/boxblur.h
@@ -204,15 +204,15 @@ template<class T, class A> void boxblur (T** src, A** dst, T* buffer, int radx,
 
             tempv = tempv / lenv;
             temp1v = temp1v / lenv;
-            STVFU( dst[0][col], tempv);
-            STVFU( dst[0][col + 4], temp1v);
+            STVFU(dst[0][col], tempv);
+            STVFU(dst[0][col + 4], temp1v);
 
             for (int row = 1; row <= rady; row++) {
                 lenp1v = lenv + onev;
                 tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v;
                 temp1v = (temp1v * lenv + LVFU(temp[(row + rady) * W + col + 4])) / lenp1v;
-                STVFU( dst[row][col], tempv);
-                STVFU( dst[row][col + 4], temp1v);
+                STVFU(dst[row][col], tempv);
+                STVFU(dst[row][col + 4], temp1v);
                 lenv = lenp1v;
             }
 
@@ -221,16 +221,16 @@ template<class T, class A> void boxblur (T** src, A** dst, T* buffer, int radx,
             for (int row = rady + 1; row < H - rady; row++) {
                 tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ;
                 temp1v = temp1v + (LVFU(temp[(row + rady) * W + col + 4]) - LVFU(temp[(row - rady - 1) * W + col + 4])) * rlenv ;
-                STVFU( dst[row][col], tempv);
-                STVFU( dst[row][col + 4], temp1v);
+                STVFU(dst[row][col], tempv);
+                STVFU(dst[row][col + 4], temp1v);
             }
 
             for (int row = H - rady; row < H; row++) {
                 lenm1v = lenv - onev;
                 tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v;
                 temp1v = (temp1v * lenv - LVFU(temp[(row - rady - 1) * W + col + 4])) / lenm1v;
-                STVFU( dst[row][col], tempv);
-                STVFU( dst[row][col + 4], temp1v);
+                STVFU(dst[row][col], tempv);
+                STVFU(dst[row][col + 4], temp1v);
                 lenv = lenm1v;
             }
         }
@@ -312,6 +312,221 @@ template<class T, class A> void boxblur (T** src, A** dst, T* buffer, int radx,
 
 }
 
+inline void boxblur (float** src, float** dst, int radius, int W, int H, bool multiThread)
+{
+    // Box blur with box size (2 * radius + 1) x (2 * radius + 1): a horizontal pass followed by a vertical pass, using small
+    // per-thread line/row buffers instead of a full size buffer. Works in place (src == dst); indexing assumes 2 * radius < min(W, H).
+    if (radius == 0) {
+        if (src != dst) {
+#ifdef _OPENMP
+            #pragma omp parallel for if (multiThread)
+#endif
+
+            for (int row = 0; row < H; row++) {
+                for (int col = 0; col < W; col++) {
+                    dst[row][col] = src[row][col];
+                }
+            }
+        }
+        return;
+    }
+    // every thread gets its own scratch buffer: W floats for the horizontal pass, 8 floats per image row for the vertical pass
+    constexpr int numCols = 8; // process numCols columns at once for better usage of L1 cpu cache
+#ifdef _OPENMP
+    #pragma omp parallel if (multiThread)
+#endif
+    {
+        float* const buffer = new float[std::max(W, 8 * H)];
+        //horizontal blur
+        float* const lineBuffer = buffer;
+#ifdef _OPENMP
+        #pragma omp for
+#endif
+        for (int row = 0; row < H; row++) {
+            float len = radius + 1;
+            float tempval = src[row][0];
+            lineBuffer[0] = tempval;
+            for (int j = 1; j <= radius; j++) {
+                tempval += src[row][j];
+            }
+
+            tempval /= len;
+            dst[row][0] = tempval;
+            // left border: the window grows by one sample per column
+            for (int col = 1; col <= radius; col++) {
+                lineBuffer[col] = src[row][col];
+                dst[row][col] = tempval = (tempval * len + src[row][col + radius]) / (len + 1);
+                len ++;
+            }
+            // interior: full window; lineBuffer supplies the source value leaving the window (dst may alias src)
+            for (int col = radius + 1; col < W - radius; col++) {
+                lineBuffer[col] = src[row][col];
+                dst[row][col] = tempval = tempval + (src[row][col + radius] - lineBuffer[col - radius - 1]) / len;
+            }
+            // right border: the window shrinks by one sample per column
+            for (int col = W - radius; col < W; col++) {
+                dst[row][col] = tempval = (tempval * len - lineBuffer[col - radius - 1]) / (len - 1);
+                len --;
+            }
+        }
+
+        //vertical blur, 8 columns at once; rowBuffer keeps the values of dst which are overwritten but still needed
+#ifdef __SSE2__
+        vfloat (* const rowBuffer)[2] = (vfloat(*)[2]) buffer;
+        vfloat leninitv = F2V(radius + 1);
+        vfloat onev = F2V(1.f);
+        vfloat tempv, temp1v, lenv, lenp1v, lenm1v, rlenv;
+
+#ifdef _OPENMP
+        #pragma omp for nowait
+#endif
+
+        for (int col = 0; col < W - 7; col += 8) {
+            lenv = leninitv;
+            tempv = LVFU(dst[0][col]);
+            temp1v = LVFU(dst[0][col + 4]);
+            rowBuffer[0][0] = tempv;
+            rowBuffer[0][1] = temp1v;
+
+            for (int i = 1; i <= radius; i++) {
+                tempv = tempv + LVFU(dst[i][col]);
+                temp1v = temp1v + LVFU(dst[i][col + 4]);
+            }
+
+            tempv = tempv / lenv;
+            temp1v = temp1v / lenv;
+            STVFU(dst[0][col], tempv);
+            STVFU(dst[0][col + 4], temp1v);
+
+            for (int row = 1; row <= radius; row++) {
+                rowBuffer[row][0] = LVFU(dst[row][col]);
+                rowBuffer[row][1] = LVFU(dst[row][col + 4]);
+                lenp1v = lenv + onev;
+                tempv = (tempv * lenv + LVFU(dst[row + radius][col])) / lenp1v;
+                temp1v = (temp1v * lenv + LVFU(dst[row + radius][col + 4])) / lenp1v;
+                STVFU(dst[row][col], tempv);
+                STVFU(dst[row][col + 4], temp1v);
+                lenv = lenp1v;
+            }
+
+            rlenv = onev / lenv;
+
+            for (int row = radius + 1; row < H - radius; row++) {
+                rowBuffer[row][0] = LVFU(dst[row][col]);
+                rowBuffer[row][1] = LVFU(dst[row][col + 4]);
+                tempv = tempv + (LVFU(dst[row + radius][col]) - rowBuffer[row - radius - 1][0]) * rlenv ;
+                temp1v = temp1v + (LVFU(dst[row + radius][col + 4]) - rowBuffer[row - radius - 1][1]) * rlenv ;
+                STVFU(dst[row][col], tempv);
+                STVFU(dst[row][col + 4], temp1v);
+            }
+
+            for (int row = H - radius; row < H; row++) {
+                lenm1v = lenv - onev;
+                tempv = (tempv * lenv - rowBuffer[row - radius - 1][0]) / lenm1v;
+                temp1v = (temp1v * lenv - rowBuffer[row - radius - 1][1]) / lenm1v;
+                STVFU(dst[row][col], tempv);
+                STVFU(dst[row][col + 4], temp1v);
+                lenv = lenm1v;
+            }
+        }
+
+#else
+        float (* const rowBuffer)[8] = (float(*)[8]) buffer;
+#ifdef _OPENMP
+        #pragma omp for nowait
+#endif
+
+        for (int col = 0; col < W - numCols + 1; col += 8) {
+            float len = radius + 1;
+
+            for(int k = 0; k < numCols; k++) {
+                rowBuffer[0][k] = dst[0][col + k];
+            }
+
+            for (int i = 1; i <= radius; i++) {
+                for(int k = 0; k < numCols; k++) {
+                    dst[0][col + k] += dst[i][col + k];
+                }
+            }
+
+            for(int k = 0; k < numCols; k++) {
+                dst[0][col + k] /= len;
+            }
+
+            for (int row = 1; row <= radius; row++) {
+                for(int k = 0; k < numCols; k++) {
+                    rowBuffer[row][k] = dst[row][col + k];
+                    dst[row][col + k] = (dst[row - 1][col + k] * len + dst[row + radius][col + k]) / (len + 1);
+                }
+
+                len ++;
+            }
+
+            for (int row = radius + 1; row < H - radius; row++) {
+                for(int k = 0; k < numCols; k++) {
+                    rowBuffer[row][k] = dst[row][col + k];
+                    dst[row][col + k] = dst[row - 1][col + k] + (dst[row + radius][col + k] - rowBuffer[row - radius - 1][k]) / len;
+                }
+            }
+
+            for (int row = H - radius; row < H; row++) {
+                for(int k = 0; k < numCols; k++) {
+                    dst[row][col + k] = (dst[row - 1][col + k] * len - rowBuffer[row - radius - 1][k]) / (len - 1);
+                }
+
+                len --;
+            }
+        }
+
+#endif
+        //vertical blur, remaining columns
+#ifdef _OPENMP
+        #pragma omp single
+#endif
+        {
+            const int remaining = W % numCols;
+            if (remaining > 0) {
+                float (* const rowBuffer)[8] = (float(*)[8]) buffer;
+                const int col = W - remaining;
+
+                float len = radius + 1;
+                for(int k = 0; k < remaining; k++) {
+                    rowBuffer[0][k] = dst[0][col + k];
+                }
+                for (int i = 1; i <= radius; i++) {
+                    for(int k = 0; k < remaining; k++) {
+                        dst[0][col + k] += dst[i][col + k];
+                    }
+                }
+                for(int k = 0; k < remaining; k++) {
+                    dst[0][col + k] /= len;
+                }
+                for (int row = 1; row <= radius; row++) {
+                    for(int k = 0; k < remaining; k++) {
+                        rowBuffer[row][k] = dst[row][col + k];
+                        dst[row][col + k] = (dst[(row - 1)][col + k] * len + dst[row + radius][col + k]) / (len + 1);
+                    }
+                    len ++; // the window grows once per row, not once per column
+                }
+                const float rlen = 1.f / len;
+                for (int row = radius + 1; row < H - radius; row++) {
+                    for(int k = 0; k < remaining; k++) {
+                        rowBuffer[row][k] = dst[row][col + k];
+                        dst[row][col + k] = dst[(row - 1)][col + k] + (dst[row + radius][col + k] - rowBuffer[row - radius - 1][k]) * rlen;
+                    }
+                }
+                for (int row = H - radius; row < H; row++) {
+                    for(int k = 0; k < remaining; k++) {
+                        dst[row][col + k] = (dst[(row - 1)][col + k] * len - rowBuffer[row - radius - 1][k]) / (len - 1);
+                    }
+                    len --; // the window shrinks once per row, not once per column
+                }
+            }
+        }
+        delete [] buffer;
+    }
+}
+
 template<class T, class A> void boxblur (T* src, A* dst, A* buffer, int radx, int rady, int W, int H)
 {
     //box blur image; box range = (radx,rady) i.e. box size is (2*radx+1)x(2*rady+1)
@@ -382,15 +597,15 @@ template<class T, class A> void boxblur (T* src, A* dst, A* buffer, int radx, in
 
             tempv = tempv / lenv;
             temp1v = temp1v / lenv;
-            STVFU( dst[0 * W + col], tempv);
-            STVFU( dst[0 * W + col + 4], temp1v);
+            STVFU(dst[0 * W + col], tempv);
+            STVFU(dst[0 * W + col + 4], temp1v);
 
             for (int row = 1; row <= rady; row++) {
                 lenp1v = lenv + onev;
                 tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v;
                 temp1v = (temp1v * lenv + LVFU(temp[(row + rady) * W + col + 4])) / lenp1v;
-                STVFU( dst[row * W + col], tempv);
-                STVFU( dst[row * W + col + 4], temp1v);
+                STVFU(dst[row * W + col], tempv);
+                STVFU(dst[row * W + col + 4], temp1v);
                 lenv = lenp1v;
             }
 
@@ -399,16 +614,16 @@ template<class T, class A> void boxblur (T* src, A* dst, A* buffer, int radx, in
             for (int row = rady + 1; row < H - rady; row++) {
                 tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ;
                 temp1v = temp1v + (LVFU(temp[(row + rady) * W + col + 4]) - LVFU(temp[(row - rady - 1) * W + col + 4])) * rlenv ;
-                STVFU( dst[row * W + col], tempv);
-                STVFU( dst[row * W + col + 4], temp1v);
+                STVFU(dst[row * W + col], tempv);
+                STVFU(dst[row * W + col + 4], temp1v);
             }
 
             for (int row = H - rady; row < H; row++) {
                 lenm1v = lenv - onev;
                 tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v;
                 temp1v = (temp1v * lenv - LVFU(temp[(row - rady - 1) * W + col + 4])) / lenm1v;
-                STVFU( dst[row * W + col], tempv);
-                STVFU( dst[row * W + col + 4], temp1v);
+                STVFU(dst[row * W + col], tempv);
+                STVFU(dst[row * W + col + 4], temp1v);
                 lenv = lenm1v;
             }
         }
@@ -422,12 +637,12 @@ template<class T, class A> void boxblur (T* src, A* dst, A* buffer, int radx, in
             }
 
             tempv = tempv / lenv;
-            STVFU( dst[0 * W + col], tempv);
+            STVFU(dst[0 * W + col], tempv);
 
             for (int row = 1; row <= rady; row++) {
                 lenp1v = lenv + onev;
                 tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v;
-                STVFU( dst[row * W + col], tempv);
+                STVFU(dst[row * W + col], tempv);
                 lenv = lenp1v;
             }
 
@@ -435,13 +650,13 @@ template<class T, class A> void boxblur (T* src, A* dst, A* buffer, int radx, in
 
             for (int row = rady + 1; row < H - rady; row++) {
                 tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ;
-                STVFU( dst[row * W + col], tempv);
+                STVFU(dst[row * W + col], tempv);
             }
 
             for (int row = H - rady; row < H; row++) {
                 lenm1v = lenv - onev;
                 tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v;
-                STVFU( dst[row * W + col], tempv);
+                STVFU(dst[row * W + col], tempv);
                 lenv = lenm1v;
             }
         }
diff --git a/rtengine/guidedfilter.cc b/rtengine/guidedfilter.cc
index bc7f64f05..8d19fc7a5 100644
--- a/rtengine/guidedfilter.cc
+++ b/rtengine/guidedfilter.cc
@@ -3,6 +3,7 @@
  *  This file is part of RawTherapee.
  *
  *  Copyright (c) 2018 Alberto Griggio <alberto.griggio@gmail.com>
+ *  Optimized 2019 Ingo Weyrich <heckflosse67@gmx.de>
  *
  *  RawTherapee is free software: you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -16,9 +17,9 @@
  *
  *  You should have received a copy of the GNU General Public License
  *  along with RawTherapee.  If not, see <https://www.gnu.org/licenses/>.
- */
+*/
 
-/**
+/*
  * This is a Fast Guided Filter implementation, derived directly from the
  * pseudo-code of the paper:
  *
@@ -26,32 +27,16 @@
  * by Kaiming He, Jian Sun
  *
  * available at https://arxiv.org/abs/1505.00996
- */
+*/
 
 #include "guidedfilter.h"
 #include "boxblur.h"
 #include "rescale.h"
 #include "imagefloat.h"
-
+#define BENCHMARK
+#include "StopWatch.h"
 namespace rtengine {
 
-#if 0
-#  define DEBUG_DUMP(arr)                                                 \
-    do {                                                                \
-        Imagefloat im(arr.width(), arr.height());                      \
-        const char *out = "/tmp/" #arr ".tif";                     \
-        for (int y = 0; y < im.getHeight(); ++y) {                      \
-            for (int x = 0; x < im.getWidth(); ++x) {                   \
-                im.r(y, x) = im.g(y, x) = im.b(y, x) = arr[y][x] * 65535.f; \
-            }                                                           \
-        }                                                               \
-        im.saveTIFF(out, 16);                                           \
-    } while (false)
-#else
-#  define DEBUG_DUMP(arr)
-#endif
-
-
 namespace {
 
 int calculate_subsampling(int w, int h, int r)
@@ -78,15 +63,7 @@ int calculate_subsampling(int w, int h, int r)
 
 void guidedFilter(const array2D<float> &guide, const array2D<float> &src, array2D<float> &dst, int r, float epsilon, bool multithread, int subsampling)
 {
-
-    const int W = src.width();
-    const int H = src.height();
-
-    if (subsampling <= 0) {
-        subsampling = calculate_subsampling(W, H, r);
-    }
-
-    enum Op { MUL, DIVEPSILON, ADD, SUB, ADDMUL, SUBMUL };
+    enum Op {MUL, DIVEPSILON, SUBMUL};
 
     const auto apply =
         [=](Op op, array2D<float> &res, const array2D<float> &a, const array2D<float> &b, const array2D<float> &c=array2D<float>()) -> void
@@ -99,139 +76,107 @@ void guidedFilter(const array2D<float> &guide, const array2D<float> &src, array2
 #endif
             for (int y = 0; y < h; ++y) {
                 for (int x = 0; x < w; ++x) {
-                    float r;
-                    float aa = a[y][x];
-                    float bb = b[y][x];
                     switch (op) {
-                    case MUL:
-                        r = aa * bb;
-                        break;
-                    case DIVEPSILON:
-                        r = aa / (bb + epsilon);
-                        break;
-                    case ADD:
-                        r = aa + bb;
-                        break;
-                    case SUB:
-                        r = aa - bb;
-                        break;
-                    case ADDMUL:
-                        r = aa * bb + c[y][x];
-                        break;
-                    case SUBMUL:
-                        r = c[y][x] - (aa * bb);
-                        break;
-                    default:
-                        assert(false);
-                        r = 0;
-                        break;
+                        case MUL:
+                            res[y][x] = a[y][x] * b[y][x];
+                            break;
+                        case DIVEPSILON:
+                            res[y][x] = a[y][x] / (b[y][x] + epsilon); // note: the value of epsilon intentionally has an impact on the result. It is not only to avoid divisions by zero
+                            break;
+                        case SUBMUL:
+                            res[y][x] = c[y][x] - (a[y][x] * b[y][x]);
+                            break;
+                        default:
+                            assert(false);
+                            res[y][x] = 0;
+                            break;
                     }
-                    res[y][x] = r;
                 }
             }
         };
 
-    // use the terminology of the paper (Algorithm 2)
-    const array2D<float> &I = guide;
-    const array2D<float> &p = src;
-    array2D<float> &q = dst;
-
     const auto f_subsample =
         [=](array2D<float> &d, const array2D<float> &s) -> void
         {
             rescaleBilinear(s, d, multithread);
         };
 
-    const auto f_upsample = f_subsample;
-    
-    const size_t w = W / subsampling;
-    const size_t h = H / subsampling;
-
-    AlignedBuffer<float> blur_buf(w * h);
     const auto f_mean =
         [&](array2D<float> &d, array2D<float> &s, int rad) -> void
         {
             rad = LIM(rad, 0, (min(s.width(), s.height()) - 1) / 2 - 1);
-            float **src = s;
-            float **dst = d;
-#ifdef _OPENMP
-            #pragma omp parallel if (multithread)
-#endif
-            boxblur<float, float>(src, dst, blur_buf.data, rad, rad, s.width(), s.height());
+            boxblur(s, d, rad, s.width(), s.height(), multithread);
         };
 
+    const int W = src.width();
+    const int H = src.height();
+
+    if (subsampling <= 0) {
+        subsampling = calculate_subsampling(W, H, r);
+    }
+
+    const size_t w = W / subsampling;
+    const size_t h = H / subsampling;
+    const float r1 = float(r) / subsampling;
+
     array2D<float> I1(w, h);
     array2D<float> p1(w, h);
 
-    f_subsample(I1, I);
-    f_subsample(p1, p);
+    f_subsample(I1, guide);
 
-    DEBUG_DUMP(I);
-    DEBUG_DUMP(p);
-    DEBUG_DUMP(I1);
-    DEBUG_DUMP(p1);
+    if (&guide == &src) {
+        f_mean(p1, I1, r1);
 
-    float r1 = float(r) / subsampling;
+        apply(MUL, I1, I1, I1);        // I1 = I1 * I1
 
-    array2D<float> meanI(w, h);
-    f_mean(meanI, I1, r1);
-    DEBUG_DUMP(meanI);
+        f_mean(I1, I1, r1);
 
-    array2D<float> meanp(w, h);
-    f_mean(meanp, p1, r1);
-    DEBUG_DUMP(meanp);
+        apply(SUBMUL, I1, p1, p1, I1); // I1 = I1 - p1 * p1
+        apply(DIVEPSILON, I1, I1, I1); // I1 = I1 / (I1 + epsilon)
+        apply(SUBMUL, p1, I1, p1, p1); // p1 = p1 - I1 * p1
 
-    array2D<float> &corrIp = p1;
-    apply(MUL, corrIp, I1, p1);
-    f_mean(corrIp, corrIp, r1);
-    DEBUG_DUMP(corrIp);
+    } else {
+        f_subsample(p1, src);
 
-    array2D<float> &corrI = I1;
-    apply(MUL, corrI, I1, I1);
-    f_mean(corrI, corrI, r1);
-    DEBUG_DUMP(corrI);
+        array2D<float> meanI(w, h);
+        f_mean(meanI, I1, r1);
 
-    array2D<float> &varI = corrI;
-    apply(SUBMUL, varI, meanI, meanI, corrI);
-    DEBUG_DUMP(varI);
+        array2D<float> meanp(w, h);
+        f_mean(meanp, p1, r1);
 
-    array2D<float> &covIp = corrIp;
-    apply(SUBMUL, covIp, meanI, meanp, corrIp);
-    DEBUG_DUMP(covIp);
+        apply(MUL, p1, I1, p1);
 
-    array2D<float> &a = varI;
-    apply(DIVEPSILON, a, covIp, varI);
-    DEBUG_DUMP(a);
+        f_mean(p1, p1, r1);
 
-    array2D<float> &b = covIp;
-    apply(SUBMUL, b, a, meanI, meanp);
-    DEBUG_DUMP(b);
+        apply(MUL, I1, I1, I1);
 
-    array2D<float> &meana = a;
-    f_mean(meana, a, r1);
-    DEBUG_DUMP(meana);
+        f_mean(I1, I1, r1);
 
-    array2D<float> &meanb = b;
-    f_mean(meanb, b, r1);
-    DEBUG_DUMP(meanb);
+        apply(SUBMUL, I1, meanI, meanI, I1);
+        apply(SUBMUL, p1, meanI, meanp, p1);
+        apply(DIVEPSILON, I1, p1, I1);
+        apply(SUBMUL, p1, I1, meanI, meanp);
+    }
 
-    const int Ws = meana.width();
-    const int Hs = meana.height();
-    const int Wd = q.width();
-    const int Hd = q.height();
+    f_mean(I1, I1, r1);
+    f_mean(p1, p1, r1);
 
-    float col_scale = float (Ws) / float (Wd);
-    float row_scale = float (Hs) / float (Hd);
+    const int Ws = I1.width();
+    const int Hs = I1.height();
+    const int Wd = dst.width();
+    const int Hd = dst.height();
+
+    const float col_scale = static_cast<float>(Ws) / static_cast<float>(Wd);
+    const float row_scale = static_cast<float>(Hs) / static_cast<float>(Hd);
 
 #ifdef _OPENMP
     #pragma omp parallel for if (multithread)
 #endif
 
     for (int y = 0; y < Hd; ++y) {
-        float ymrs = y * row_scale;
-
+        const float ymrs = y * row_scale;
         for (int x = 0; x < Wd; ++x) {
-            q[y][x] = getBilinearValue(meana, x * col_scale, ymrs) * I[y][x] + getBilinearValue(meanb, x * col_scale, ymrs);
+            dst[y][x] = getBilinearValue(I1, x * col_scale, ymrs) * guide[y][x] + getBilinearValue(p1, x * col_scale, ymrs);
         }
     }
 }
diff --git a/rtengine/ipdehaze.cc b/rtengine/ipdehaze.cc
index 60d4cb9ff..68af84970 100644
--- a/rtengine/ipdehaze.cc
+++ b/rtengine/ipdehaze.cc
@@ -35,7 +35,10 @@
 #include "improcfun.h"
 #include "procparams.h"
 #include "rt_algo.h"
+#include "rt_algo.h"
 #include "rt_math.h"
+#define BENCHMARK
+#include "StopWatch.h"
 
 extern Options options;
 
@@ -43,24 +46,7 @@ namespace rtengine {
 
 namespace {
 
-#if 0
-#  define DEBUG_DUMP(arr)                                                 \
-    do {                                                                \
-        Imagefloat im(arr.width(), arr.height());                      \
-        const char *out = "/tmp/" #arr ".tif";                     \
-        for (int y = 0; y < im.getHeight(); ++y) {                      \
-            for (int x = 0; x < im.getWidth(); ++x) {                   \
-                im.r(y, x) = im.g(y, x) = im.b(y, x) = arr[y][x] * 65535.f; \
-            }                                                           \
-        }                                                               \
-        im.saveTIFF(out, 16);                                           \
-    } while (false)
-#else
-#  define DEBUG_DUMP(arr)
-#endif
-
-
-int get_dark_channel(const array2D<float> &R, const array2D<float> &G, const array2D<float> &B, array2D<float> &dst, int patchsize, const float ambient[3], bool clip, bool multithread)
+int get_dark_channel(const array2D<float> &R, const array2D<float> &G, const array2D<float> &B, array2D<float> &dst, int patchsize, const float ambient[3], bool clip, bool multithread, float strength)
 {
     const int W = R.width();
     const int H = R.height();
@@ -73,22 +59,12 @@ int get_dark_channel(const array2D<float> &R, const array2D<float> &G, const arr
         for (int x = 0; x < W; x += patchsize) {
             float val = RT_INFINITY_F;
             const int pW = min(x + patchsize, W);
-            for (int yy = y; yy < pH; ++yy) {
-                for (int xx = x; xx < pW; ++xx) {
-                    float r = R[yy][xx];
-                    float g = G[yy][xx];
-                    float b = B[yy][xx];
-                    if (ambient) {
-                        r /= ambient[0];
-                        g /= ambient[1];
-                        b /= ambient[2];
-                    }
-                    val = min(val, r, g, b);
+            for (int xx = x; xx < pW; ++xx) {
+                for (int yy = y; yy < pH; ++yy) {
+                    val = min(val, R[yy][xx] / ambient[0], G[yy][xx] / ambient[1], B[yy][xx] / ambient[2]);
                 }
             }
-            if (clip) {
-                val = LIM01(val);
-            }
+            val = 1.f - strength * LIM01(val);
             for (int yy = y; yy < pH; ++yy) {
                 std::fill(dst[yy] + x, dst[yy] + pW, val);
             }
@@ -98,41 +74,59 @@ int get_dark_channel(const array2D<float> &R, const array2D<float> &G, const arr
     return (W / patchsize + ((W % patchsize) > 0)) *  (H / patchsize + ((H % patchsize) > 0));
 }
 
+int get_dark_channel_downsized(const array2D<float> &R, const array2D<float> &G, const array2D<float> &B, array2D<float> &dst, int patchsize, bool multithread)
+{
+    const int W = R.width();
+    const int H = R.height();
+
+#ifdef _OPENMP
+    #pragma omp parallel for if (multithread)
+#endif
+    for (int y = 0; y < H; y += patchsize) {
+        int yy = y / patchsize;
+        const int pH = min(y + patchsize, H);
+        for (int x = 0, xx = 0; x < W; x += patchsize, ++xx) {
+            float val = RT_INFINITY_F;
+            const int pW = min(x + patchsize, W);
+            for (int xp = x; xp < pW; ++xp) {
+                for (int yp = y; yp < pH; ++yp) {
+                    val = min(val, R[yp][xp], G[yp][xp], B[yp][xp]);
+                }
+            }
+            dst[yy][xx] = val;
+        }
+    }
+
+    return (W / patchsize + ((W % patchsize) > 0)) *  (H / patchsize + ((H % patchsize) > 0));
+}
+
 
 float estimate_ambient_light(const array2D<float> &R, const array2D<float> &G, const array2D<float> &B, const array2D<float> &dark, int patchsize, int npatches, float ambient[3])
 {
     const int W = R.width();
     const int H = R.height();
 
-    const auto get_percentile =
-        [](std::priority_queue<float> &q, float prcnt) -> float
-        {
-            size_t n = LIM<size_t>(q.size() * prcnt, 1, q.size());
-            while (q.size() > n) {
-                q.pop();
-            }
-            return q.top();
-        };
-    
     float darklim = RT_INFINITY_F;
     {
-        std::priority_queue<float> p;
-        for (int y = 0; y < H; y += patchsize) {
-            for (int x = 0; x < W; x += patchsize) {
-                if (!OOG(dark[y][x], 1.f - 1e-5f)) {
-                    p.push(dark[y][x]);
+        std::vector<float> p;
+        for (int y = 0, yy = 0; y < H; y += patchsize, ++yy) {
+            for (int x = 0, xx = 0; x < W; x += patchsize, ++xx) {
+                if (!OOG(dark[yy][xx], 1.f - 1e-5f)) {
+                    p.push_back(dark[yy][xx]);
                 }
             }
         }
-        darklim = get_percentile(p, 0.95);
+        const int pos = p.size() * 0.95;
+        std::nth_element(p.begin(), p.begin() + pos, p.end());
+        darklim = p[pos];
     }
 
     std::vector<std::pair<int, int>> patches;
     patches.reserve(npatches);
 
-    for (int y = 0; y < H; y += patchsize) {
-        for (int x = 0; x < W; x += patchsize) {
-            if (dark[y][x] >= darklim && !OOG(dark[y][x], 1.f)) {
+    for (int y = 0, yy = 0; y < H; y += patchsize, ++yy) {
+        for (int x = 0, xx = 0; x < W; x += patchsize, ++xx) {
+            if (dark[yy][xx] >= darklim && !OOG(dark[yy][xx], 1.f)) {
                 patches.push_back(std::make_pair(x, y));
             }
         }
@@ -145,33 +139,38 @@ float estimate_ambient_light(const array2D<float> &R, const array2D<float> &G, c
 
     float bright_lim = RT_INFINITY_F;
     {
-        std::priority_queue<float> l;
+        std::vector<float> l;
+        l.reserve(patches.size() * patchsize * patchsize);
         
-        for (auto &p : patches) {
-            const int pW = min(p.first+patchsize, W);
-            const int pH = min(p.second+patchsize, H);
+        for (const auto &p : patches) {
+            const int pW = min(p.first + patchsize, W);
+            const int pH = min(p.second + patchsize, H);
             
             for (int y = p.second; y < pH; ++y) {
                 for (int x = p.first; x < pW; ++x) {
-                    l.push(R[y][x] + G[y][x] + B[y][x]);
+                    l.push_back(R[y][x] + G[y][x] + B[y][x]);
                 }
             }
         }
-
-        bright_lim = get_percentile(l, 0.95);
+        const int pos = l.size() * 0.95;
+        std::nth_element(l.begin(), l.begin() + pos, l.end());
+        bright_lim = l[pos];
     }
 
     double rr = 0, gg = 0, bb = 0;
     int n = 0;
-    for (auto &p : patches) {
-        const int pW = min(p.first+patchsize, W);
-        const int pH = min(p.second+patchsize, H);
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(dynamic) reduction(+:rr,gg,bb,n)
+#endif
+    for (const auto &p : patches) {
+        const int pW = min(p.first + patchsize, W);
+        const int pH = min(p.second + patchsize, H);
             
         for (int y = p.second; y < pH; ++y) {
             for (int x = p.first; x < pW; ++x) {
-                float r = R[y][x];
-                float g = G[y][x];
-                float b = B[y][x];
+                const float r = R[y][x];
+                const float g = G[y][x];
+                const float b = B[y][x];
                 if (r + g + b >= bright_lim) {
                     rr += r;
                     gg += g;
@@ -181,6 +180,7 @@ float estimate_ambient_light(const array2D<float> &R, const array2D<float> &G, c
             }
         }
     }
+
     n = std::max(n, 1);
     ambient[0] = rr / n;
     ambient[1] = gg / n;
@@ -211,12 +211,12 @@ void extract_channels(Imagefloat *img, array2D<float> &r, array2D<float> &g, arr
 
 void ImProcFunctions::dehaze(Imagefloat *img)
 {
-    if (!params->dehaze.enabled) {
+    if (!params->dehaze.enabled || params->dehaze.strength == 0.0) {
         return;
     }
-
+BENCHFUN
     img->normalizeFloatTo1();
-    
+
     const int W = img->getWidth();
     const int H = img->getHeight();
     const float strength = LIM01(float(params->dehaze.strength) / 100.f * 0.9f);
@@ -229,21 +229,19 @@ void ImProcFunctions::dehaze(Imagefloat *img)
 
     int patchsize = max(int(5 / scale), 2);
     float ambient[3];
-    array2D<float> &t_tilde = dark;
     float max_t = 0.f;
 
     {
-        int npatches = 0;
         array2D<float> R(W, H);
         array2D<float> G(W, H);
         array2D<float> B(W, H);
         extract_channels(img, R, G, B, patchsize, 1e-1, multiThread);
-    
-        patchsize = max(max(W, H) / 600, 2);
-        npatches = get_dark_channel(R, G, B, dark, patchsize, nullptr, false, multiThread);
-        DEBUG_DUMP(dark);
 
-        max_t = estimate_ambient_light(R, G, B, dark, patchsize, npatches, ambient);
+        patchsize = max(max(W, H) / 600, 2);
+        array2D<float> darkDownsized(W / patchsize + 1, H / patchsize + 1);
+        const int npatches = get_dark_channel_downsized(R, G, B, darkDownsized, patchsize, multiThread);
+
+        max_t = estimate_ambient_light(R, G, B, darkDownsized, patchsize, npatches, ambient);
 
         if (options.rtSettings.verbose) {
             std::cout << "dehaze: ambient light is "
@@ -251,78 +249,102 @@ void ImProcFunctions::dehaze(Imagefloat *img)
                       << std::endl;
         }
 
-        get_dark_channel(R, G, B, dark, patchsize, ambient, true, multiThread);
-    }
-
-    if (min(ambient[0], ambient[1], ambient[2]) < 0.01f) {
-        if (options.rtSettings.verbose) {
-            std::cout << "dehaze: no haze detected" << std::endl;
+        if (min(ambient[0], ambient[1], ambient[2]) < 0.01f) {
+            if (options.rtSettings.verbose) {
+                std::cout << "dehaze: no haze detected" << std::endl;
+            }
+            img->normalizeFloatTo65535();
+            return; // probably no haze at all
         }
-        img->normalizeFloatTo65535();
-        return; // probably no haze at all
-    }
 
-    DEBUG_DUMP(t_tilde);
-
-#ifdef _OPENMP
-    #pragma omp parallel for if (multiThread)
-#endif
-    for (int y = 0; y < H; ++y) {
-        for (int x = 0; x < W; ++x) {
-            dark[y][x] = 1.f - strength * dark[y][x];
-        }
+        get_dark_channel(R, G, B, dark, patchsize, ambient, true, multiThread, strength);
     }
 
     const int radius = patchsize * 4;
-    const float epsilon = 1e-5;
-    array2D<float> &t = t_tilde;
+    constexpr float epsilon = 1e-5f;
 
     {
         array2D<float> guideB(W, H, img->b.ptrs, ARRAY2D_BYREFERENCE);
-        guidedFilter(guideB, t_tilde, t, radius, epsilon, multiThread);
+        guidedFilter(guideB, dark, dark, radius, epsilon, multiThread);
     }
         
-    DEBUG_DUMP(t);
-
     if (options.rtSettings.verbose) {
         std::cout << "dehaze: max distance is " << max_t << std::endl;
     }
 
-    float depth = -float(params->dehaze.depth) / 100.f;
+    const float depth = -float(params->dehaze.depth) / 100.f;
     const float t0 = max(1e-3f, std::exp(depth * max_t));
     const float teps = 1e-3f;
 #ifdef _OPENMP
     #pragma omp parallel for if (multiThread)
 #endif
     for (int y = 0; y < H; ++y) {
-        for (int x = 0; x < W; ++x) {
+        int x = 0;
+#ifdef __SSE2__
+        const vfloat onev = F2V(1.f);
+        const vfloat ambient0v = F2V(ambient[0]);
+        const vfloat ambient1v = F2V(ambient[1]);
+        const vfloat ambient2v = F2V(ambient[2]);
+        const vfloat t0v = F2V(t0);
+        const vfloat tepsv = F2V(teps);
+        const vfloat c65535v = F2V(65535.f);
+        for (; x < W - 3; x += 4) {
             // ensure that the transmission is such that to avoid clipping...
-            float rgb[3] = { img->r(y, x), img->g(y, x), img->b(y, x) };
+            vfloat r = LVFU(img->r(y, x));
+            vfloat g = LVFU(img->g(y, x));
+            vfloat b = LVFU(img->b(y, x));
             // ... t >= tl to avoid negative values
-            float tl = 1.f - min(rgb[0]/ambient[0], rgb[1]/ambient[1], rgb[2]/ambient[2]);
+            const vfloat tlv = onev - vminf(r / ambient0v, vminf(g / ambient1v, b / ambient2v));
             // ... t >= tu to avoid values > 1
-            float tu = t0 - teps;
-            for (int c = 0; c < 3; ++c) {
-                if (ambient[c] < 1) {
-                    tu = max(tu, (rgb[c] - ambient[c])/(1.f - ambient[c]));
-                }
-            }
-            float mt = max(t[y][x], t0, tl + teps, tu + teps);
-            if (params->dehaze.showDepthMap) {
-                img->r(y, x) = img->g(y, x) = img->b(y, x) = LIM01(1.f - mt);
-            } else {
-                float r = (rgb[0] - ambient[0]) / mt + ambient[0];
-                float g = (rgb[1] - ambient[1]) / mt + ambient[1];
-                float b = (rgb[2] - ambient[2]) / mt + ambient[2];
+            r -= ambient0v;
+            g -= ambient1v;
+            b -= ambient2v;
 
-                img->r(y, x) = r;
-                img->g(y, x) = g;
-                img->b(y, x) = b;
+            vfloat tuv = t0v - tepsv;
+            tuv = vself(vmaskf_lt(ambient0v, onev), vmaxf(tuv, r / (onev - ambient0v)), tuv);
+            tuv = vself(vmaskf_lt(ambient1v, onev), vmaxf(tuv, g / (onev - ambient1v)), tuv);
+            tuv = vself(vmaskf_lt(ambient2v, onev), vmaxf(tuv, b / (onev - ambient2v)), tuv);
+
+            const vfloat mtv = vmaxf(LVFU(dark[y][x]), vmaxf(tlv, tuv) + tepsv);
+            if (params->dehaze.showDepthMap) {
+                const vfloat valv = vclampf(onev - mtv, ZEROV, onev) * c65535v;
+                STVFU(img->r(y, x), valv);
+                STVFU(img->g(y, x), valv);
+                STVFU(img->b(y, x), valv);
+            } else {
+                STVFU(img->r(y, x), (r / mtv + ambient0v) * c65535v);
+                STVFU(img->g(y, x), (g / mtv + ambient1v) * c65535v);
+                STVFU(img->b(y, x), (b / mtv + ambient2v) * c65535v);
+            }
+        }
+#endif
+        for (; x < W; ++x) {
+            // ensure that the transmission is such that to avoid clipping...
+            float r = img->r(y, x);
+            float g = img->g(y, x);
+            float b = img->b(y, x);
+            // ... t >= tl to avoid negative values
+            const float tl = 1.f - min(r / ambient[0], g / ambient[1], b / ambient[2]);
+            // ... t >= tu to avoid values > 1
+            r -= ambient[0];
+            g -= ambient[1];
+            b -= ambient[2];
+
+            float tu = t0 - teps;
+            tu = ambient[0] < 1.f ? max(tu, r / (1.f - ambient[0])) : tu;
+            tu = ambient[1] < 1.f ? max(tu, g / (1.f - ambient[1])) : tu;
+            tu = ambient[2] < 1.f ? max(tu, b / (1.f - ambient[2])) : tu;
+
+            const float mt = max(dark[y][x], tl + teps, tu + teps);
+            if (params->dehaze.showDepthMap) {
+                img->r(y, x) = img->g(y, x) = img->b(y, x) = LIM01(1.f - mt) * 65535.f;
+            } else {
+                img->r(y, x) = (r / mt + ambient[0]) * 65535.f;
+                img->g(y, x) = (g / mt + ambient[1]) * 65535.f;
+                img->b(y, x) = (b / mt + ambient[2]) * 65535.f;
             }
         }
     }
-
-    img->normalizeFloatTo65535();
 }
 
 

From 3981e285b9bda62d3126ad9704f58bf232ba67e0 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Thu, 19 Sep 2019 22:06:41 +0200
Subject: [PATCH 05/31] dehaze: fix broken build on gcc < 9.x, #5456

---
 rtengine/ipdehaze.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/rtengine/ipdehaze.cc b/rtengine/ipdehaze.cc
index 68af84970..cf333a5d5 100644
--- a/rtengine/ipdehaze.cc
+++ b/rtengine/ipdehaze.cc
@@ -162,7 +162,8 @@ float estimate_ambient_light(const array2D<float> &R, const array2D<float> &G, c
 #ifdef _OPENMP
     #pragma omp parallel for schedule(dynamic) reduction(+:rr,gg,bb,n)
 #endif
-    for (const auto &p : patches) {
+    for (size_t i = 0; i < patches.size(); ++i) {
+        const auto &p = patches[i];
         const int pW = min(p.first + patchsize, W);
         const int pH = min(p.second + patchsize, H);
             

From 7d5ec6c0678f5a5d8a0eabccc197b51128949e3b Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Thu, 19 Sep 2019 22:21:45 +0200
Subject: [PATCH 06/31] Fix bug at right border in new boxblur function, #5456

---
 rtengine/boxblur.h | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/rtengine/boxblur.h b/rtengine/boxblur.h
index 3020278b2..5cc7430e2 100644
--- a/rtengine/boxblur.h
+++ b/rtengine/boxblur.h
@@ -26,6 +26,7 @@
 #include "alignedbuffer.h"
 #include "rt_math.h"
 #include "opthelper.h"
+#include "StopWatch.h"
 
 
 namespace rtengine
@@ -485,41 +486,42 @@ inline void boxblur (float** src, float** dst, int radius, int W, int H, bool mu
 #endif
         {
             const int remaining = W % numCols;
+
             if (remaining > 0) {
                 float (* const rowBuffer)[8] = (float(*)[8]) buffer;
                 const int col = W - remaining;
 
                 float len = radius + 1;
-                for(int k = 0; k < remaining; k++) {
+                for(int k = 0; k < remaining; ++k) {
                     rowBuffer[0][k] = dst[0][col + k];
                 }
-                for (int i = 1; i <= radius; i++) {
-                    for(int k = 0; k < remaining; k++) {
-                        dst[0][col + k] += dst[i][col + k];
+                for (int row = 1; row <= radius; ++row) {
+                    for(int k = 0; k < remaining; ++k) {
+                        dst[0][col + k] += dst[row][col + k];
                     }
                 }
-                for(int k = 0; k < remaining; k++) {
+                for(int k = 0; k < remaining; ++k) {
                     dst[0][col + k] /= len;
                 }
-                for (int row = 1; row <= radius; row++) {
-                    for(int k = 0; k < remaining; k++) {
+                for (int row = 1; row <= radius; ++row) {
+                    for(int k = 0; k < remaining; ++k) {
                         rowBuffer[row][k] = dst[row][col + k];
-                        dst[row][col + k] = (dst[(row - 1)][col + k] * len + dst[row + radius][col + k]) / (len + 1);
-                        len ++;
+                        dst[row][col + k] = (dst[row - 1][col + k] * len + dst[row + radius][col + k]) / (len + 1);
                     }
+                    len ++;
                 }
                 const float rlen = 1.f / len;
-                for (int row = radius + 1; row < H - radius; row++) {
-                    for(int k = 0; k < remaining; k++) {
+                for (int row = radius + 1; row < H - radius; ++row) {
+                    for(int k = 0; k < remaining; ++k) {
                         rowBuffer[row][k] = dst[row][col + k];
-                        dst[row][col + k] = dst[(row - 1)][col + k] + (dst[row + radius][col + k] - rowBuffer[row - radius - 1][k]) * rlen;
+                        dst[row][col + k] = dst[row - 1][col + k] + (dst[row + radius][col + k] - rowBuffer[row - radius - 1][k]) * rlen;
                     }
                 }
-                for (int row = H - radius; row < H; row++) {
-                    for(int k = 0; k < remaining; k++) {
+                for (int row = H - radius; row < H; ++row) {
+                    for(int k = 0; k < remaining; ++k) {
                         dst[row][col + k] = (dst[(row - 1)][col + k] * len - rowBuffer[row - radius - 1][k]) / (len - 1);
-                        len --;
                     }
+                    len --;
                 }
             }
         }

From 7ff3192cc96d7f9339775a3469e52715ddf3e08a Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Fri, 20 Sep 2019 14:03:09 +0200
Subject: [PATCH 07/31] dehaze: added luminance mode from ART, #5456, thanks to
 @agriggio

---
 rtdata/languages/default |  2 ++
 rtengine/color.h         |  5 +++++
 rtengine/ipdehaze.cc     | 43 +++++++++++++++++++++++++++++++---------
 rtengine/procparams.cc   |  9 ++++++---
 rtengine/procparams.h    |  2 +-
 rtgui/dehaze.cc          | 16 +++++++++++++++
 rtgui/dehaze.h           |  5 ++++-
 rtgui/paramsedited.cc    |  6 ++++++
 rtgui/paramsedited.h     |  1 +
 9 files changed, 75 insertions(+), 14 deletions(-)

diff --git a/rtdata/languages/default b/rtdata/languages/default
index 3749a706a..49dabe9b9 100644
--- a/rtdata/languages/default
+++ b/rtdata/languages/default
@@ -744,6 +744,7 @@ HISTORY_MSG_COLORTONING_LABREGION_SHOWMASK;CT - region show mask
 HISTORY_MSG_COLORTONING_LABREGION_SLOPE;CT - region slope
 HISTORY_MSG_DEHAZE_DEPTH;Dehaze - Depth
 HISTORY_MSG_DEHAZE_ENABLED;Haze Removal
+HISTORY_MSG_DEHAZE_LUMINANCE;Dehaze - Luminance only
 HISTORY_MSG_DEHAZE_SHOW_DEPTH_MAP;Dehaze - Show depth map
 HISTORY_MSG_DEHAZE_STRENGTH;Dehaze - Strength
 HISTORY_MSG_DUALDEMOSAIC_AUTO_CONTRAST;Dual demosaic - Auto threshold
@@ -1538,6 +1539,7 @@ TP_DEFRINGE_RADIUS;Radius
 TP_DEFRINGE_THRESHOLD;Threshold
 TP_DEHAZE_DEPTH;Depth
 TP_DEHAZE_LABEL;Haze Removal
+TP_DEHAZE_LUMINANCE;Luminance only
 TP_DEHAZE_SHOW_DEPTH_MAP;Show depth map
 TP_DEHAZE_STRENGTH;Strength
 TP_DIRPYRDENOISE_CHROMINANCE_AMZ;Auto multi-zones
diff --git a/rtengine/color.h b/rtengine/color.h
index b859fb0cf..1031ca150 100644
--- a/rtengine/color.h
+++ b/rtengine/color.h
@@ -210,6 +210,11 @@ public:
         return r * workingspace[1][0] + g * workingspace[1][1] + b * workingspace[1][2];
     }
 
+    static vfloat rgbLuminance(vfloat r, vfloat g, vfloat b, const vfloat workingspace[3])
+    {
+        return r * workingspace[0] + g * workingspace[1] + b * workingspace[2];
+    }
+
     /**
     * @brief Convert red/green/blue to L*a*b
     * @brief Convert red/green/blue to hue/saturation/luminance
diff --git a/rtengine/ipdehaze.cc b/rtengine/ipdehaze.cc
index cf333a5d5..4eb5ed8e9 100644
--- a/rtengine/ipdehaze.cc
+++ b/rtengine/ipdehaze.cc
@@ -276,6 +276,13 @@ BENCHFUN
     const float depth = -float(params->dehaze.depth) / 100.f;
     const float t0 = max(1e-3f, std::exp(depth * max_t));
     const float teps = 1e-3f;
+
+    const bool luminance = params->dehaze.luminance;
+    TMatrix ws = ICCStore::getInstance()->workingSpaceMatrix(params->icm.workingProfile);
+#ifdef __SSE2__
+    const vfloat wsv[3] = {F2V(ws[1][0]), F2V(ws[1][1]),F2V(ws[1][2])};
+#endif
+    const float ambientY = Color::rgbLuminance(ambient[0], ambient[1], ambient[2], ws);
 #ifdef _OPENMP
     #pragma omp parallel for if (multiThread)
 #endif
@@ -286,6 +293,8 @@ BENCHFUN
         const vfloat ambient0v = F2V(ambient[0]);
         const vfloat ambient1v = F2V(ambient[1]);
         const vfloat ambient2v = F2V(ambient[2]);
+        const vfloat ambientYv = F2V(ambientY);
+        const vfloat epsYv = F2V(1e-5f);
         const vfloat t0v = F2V(t0);
         const vfloat tepsv = F2V(teps);
         const vfloat c65535v = F2V(65535.f);
@@ -297,14 +306,14 @@ BENCHFUN
             // ... t >= tl to avoid negative values
             const vfloat tlv = onev - vminf(r / ambient0v, vminf(g / ambient1v, b / ambient2v));
             // ... t >= tu to avoid values > 1
-            r -= ambient0v;
-            g -= ambient1v;
-            b -= ambient2v;
+//            r -= ambient0v;
+//            g -= ambient1v;
+//            b -= ambient2v;
 
             vfloat tuv = t0v - tepsv;
-            tuv = vself(vmaskf_lt(ambient0v, onev), vmaxf(tuv, r / (onev - ambient0v)), tuv);
-            tuv = vself(vmaskf_lt(ambient1v, onev), vmaxf(tuv, g / (onev - ambient1v)), tuv);
-            tuv = vself(vmaskf_lt(ambient2v, onev), vmaxf(tuv, b / (onev - ambient2v)), tuv);
+            tuv = vself(vmaskf_lt(ambient0v, onev), vmaxf(tuv, (r - ambient0v) / (onev - ambient0v)), tuv);
+            tuv = vself(vmaskf_lt(ambient1v, onev), vmaxf(tuv, (g - ambient1v) / (onev - ambient1v)), tuv);
+            tuv = vself(vmaskf_lt(ambient2v, onev), vmaxf(tuv, (b - ambient2v) / (onev - ambient2v)), tuv);
 
             const vfloat mtv = vmaxf(LVFU(dark[y][x]), vmaxf(tlv, tuv) + tepsv);
             if (params->dehaze.showDepthMap) {
@@ -312,10 +321,17 @@ BENCHFUN
                 STVFU(img->r(y, x), valv);
                 STVFU(img->g(y, x), valv);
                 STVFU(img->b(y, x), valv);
+            } else if (luminance) {
+                const vfloat Yv = Color::rgbLuminance(r, g, b, wsv);
+                const vfloat YYv = (Yv - ambientYv) / mtv + ambientYv;
+                const vfloat fv = vself(vmaskf_gt(Yv, epsYv), c65535v * YYv / Yv, c65535v);
+                STVFU(img->r(y, x), r * fv);
+                STVFU(img->g(y, x), g * fv);
+                STVFU(img->b(y, x), b * fv);
             } else {
-                STVFU(img->r(y, x), (r / mtv + ambient0v) * c65535v);
-                STVFU(img->g(y, x), (g / mtv + ambient1v) * c65535v);
-                STVFU(img->b(y, x), (b / mtv + ambient2v) * c65535v);
+                STVFU(img->r(y, x), ((r - ambient0v) / mtv + ambient0v) * c65535v);
+                STVFU(img->g(y, x), ((g - ambient1v) / mtv + ambient1v) * c65535v);
+                STVFU(img->b(y, x), ((b - ambient2v) / mtv + ambient2v) * c65535v);
             }
         }
 #endif
@@ -339,6 +355,15 @@ BENCHFUN
             const float mt = max(dark[y][x], tl + teps, tu + teps);
             if (params->dehaze.showDepthMap) {
                 img->r(y, x) = img->g(y, x) = img->b(y, x) = LIM01(1.f - mt) * 65535.f;
+            } else if (luminance) {
+                const float Y = Color::rgbLuminance(img->r(y, x), img->g(y, x), img->b(y, x), ws);
+                const float YY = (Y - ambientY) / mt + ambientY;
+                // Near-black pixels (Y <= 1e-5) keep their colour ratios but must still be
+                // scaled back to [0;65535], exactly like the SSE2 path above does.
+                const float f = Y > 1e-5f ? 65535.f * YY / Y : 65535.f;
+                img->r(y, x) *= f;
+                img->g(y, x) *= f;
+                img->b(y, x) *= f;
             } else {
                 img->r(y, x) = (r / mt + ambient[0]) * 65535.f;
                 img->g(y, x) = (g / mt + ambient[1]) * 65535.f;
diff --git a/rtengine/procparams.cc b/rtengine/procparams.cc
index f88220c4e..fcf6dd4db 100644
--- a/rtengine/procparams.cc
+++ b/rtengine/procparams.cc
@@ -2517,7 +2517,8 @@ DehazeParams::DehazeParams() :
     enabled(false),
     strength(50),
     showDepthMap(false),
-    depth(25)
+    depth(25),
+    luminance(false)
 {
 }
 
@@ -2527,7 +2528,8 @@ bool DehazeParams::operator ==(const DehazeParams& other) const
         enabled == other.enabled
         && strength == other.strength
         && showDepthMap == other.showDepthMap
-        && depth == other.depth;
+        && depth == other.depth
+        && luminance == other.luminance;
 }
 
 bool DehazeParams::operator !=(const DehazeParams& other) const
@@ -3238,7 +3240,7 @@ int ProcParams::save(const Glib::ustring& fname, const Glib::ustring& fname2, bo
         saveToKeyfile(!pedited || pedited->dehaze.strength, "Dehaze", "Strength", dehaze.strength, keyFile);        
         saveToKeyfile(!pedited || pedited->dehaze.showDepthMap, "Dehaze", "ShowDepthMap", dehaze.showDepthMap, keyFile);        
         saveToKeyfile(!pedited || pedited->dehaze.depth, "Dehaze", "Depth", dehaze.depth, keyFile);        
-
+        saveToKeyfile(!pedited || pedited->dehaze.luminance, "Dehaze", "Luminance", dehaze.luminance, keyFile);
 // Directional pyramid denoising
         saveToKeyfile(!pedited || pedited->dirpyrDenoise.enabled, "Directional Pyramid Denoising", "Enabled", dirpyrDenoise.enabled, keyFile);
         saveToKeyfile(!pedited || pedited->dirpyrDenoise.enhance, "Directional Pyramid Denoising", "Enhance", dirpyrDenoise.enhance, keyFile);
@@ -4878,6 +4880,7 @@ int ProcParams::load(const Glib::ustring& fname, ParamsEdited* pedited)
             assignFromKeyfile(keyFile, "Dehaze", "Strength", pedited, dehaze.strength, pedited->dehaze.strength);
             assignFromKeyfile(keyFile, "Dehaze", "ShowDepthMap", pedited, dehaze.showDepthMap, pedited->dehaze.showDepthMap);
             assignFromKeyfile(keyFile, "Dehaze", "Depth", pedited, dehaze.depth, pedited->dehaze.depth);
+            assignFromKeyfile(keyFile, "Dehaze", "Luminance", pedited, dehaze.luminance, pedited->dehaze.luminance);
         }
         
         if (keyFile.has_group("Film Simulation")) {
diff --git a/rtengine/procparams.h b/rtengine/procparams.h
index ce03efc7d..a60c497bc 100644
--- a/rtengine/procparams.h
+++ b/rtengine/procparams.h
@@ -1342,7 +1342,7 @@ struct DehazeParams {
     int strength;
     bool showDepthMap;
     int depth;
-
+    bool luminance;
     DehazeParams();
 
     bool operator==(const DehazeParams &other) const;
diff --git a/rtgui/dehaze.cc b/rtgui/dehaze.cc
index 6f60d08d6..6b7fcd64f 100644
--- a/rtgui/dehaze.cc
+++ b/rtgui/dehaze.cc
@@ -36,6 +36,7 @@ Dehaze::Dehaze(): FoldableToolPanel(this, "dehaze", M("TP_DEHAZE_LABEL"), false,
     EvDehazeStrength = m->newEvent(HDR, "HISTORY_MSG_DEHAZE_STRENGTH");
     EvDehazeShowDepthMap = m->newEvent(HDR, "HISTORY_MSG_DEHAZE_SHOW_DEPTH_MAP");
     EvDehazeDepth = m->newEvent(HDR, "HISTORY_MSG_DEHAZE_DEPTH");
+    EvDehazeLuminance = m->newEvent(HDR, "HISTORY_MSG_DEHAZE_LUMINANCE");
     
     strength = Gtk::manage(new Adjuster(M("TP_DEHAZE_STRENGTH"), 0., 100., 1., 50.));
     strength->setAdjusterListener(this);
@@ -45,12 +46,17 @@ Dehaze::Dehaze(): FoldableToolPanel(this, "dehaze", M("TP_DEHAZE_LABEL"), false,
     depth->setAdjusterListener(this);
     depth->show();
 
+    luminance = Gtk::manage(new Gtk::CheckButton(M("TP_DEHAZE_LUMINANCE")));
+    luminance->signal_toggled().connect(sigc::mem_fun(*this, &Dehaze::luminanceChanged));
+    luminance->show();
+
     showDepthMap = Gtk::manage(new Gtk::CheckButton(M("TP_DEHAZE_SHOW_DEPTH_MAP")));
     showDepthMap->signal_toggled().connect(sigc::mem_fun(*this, &Dehaze::showDepthMapChanged));
     showDepthMap->show();
     
     pack_start(*strength);
     pack_start(*depth);
+    pack_start(*luminance);
     pack_start(*showDepthMap);
 }
 
@@ -64,12 +70,14 @@ void Dehaze::read(const ProcParams *pp, const ParamsEdited *pedited)
         depth->setEditedState(pedited->dehaze.depth ? Edited : UnEdited);
         set_inconsistent(multiImage && !pedited->dehaze.enabled);
         showDepthMap->set_inconsistent(!pedited->dehaze.showDepthMap);
+        luminance->set_inconsistent(!pedited->dehaze.luminance);
     }
 
     setEnabled(pp->dehaze.enabled);
     strength->setValue(pp->dehaze.strength);
     depth->setValue(pp->dehaze.depth);
     showDepthMap->set_active(pp->dehaze.showDepthMap);
+    luminance->set_active(pp->dehaze.luminance);
 
     enableListener();
 }
@@ -81,12 +89,14 @@ void Dehaze::write(ProcParams *pp, ParamsEdited *pedited)
     pp->dehaze.depth = depth->getValue();
     pp->dehaze.enabled = getEnabled();
     pp->dehaze.showDepthMap = showDepthMap->get_active();
+    pp->dehaze.luminance = luminance->get_active();
 
     if (pedited) {
         pedited->dehaze.strength = strength->getEditedState();
         pedited->dehaze.depth = depth->getEditedState();
         pedited->dehaze.enabled = !get_inconsistent();
         pedited->dehaze.showDepthMap = !showDepthMap->get_inconsistent();
+        pedited->dehaze.luminance = !luminance->get_inconsistent();
     }
 }
 
@@ -138,6 +148,12 @@ void Dehaze::showDepthMapChanged()
     }
 }
 
+void Dehaze::luminanceChanged()
+{
+    if (listener) {
+        listener->panelChanged(EvDehazeLuminance, luminance->get_active() ? M("GENERAL_ENABLED") : M("GENERAL_DISABLED"));
+    }
+}
 
 void Dehaze::setBatchMode(bool batchMode)
 {
diff --git a/rtgui/dehaze.h b/rtgui/dehaze.h
index 3120dfc91..6a9d31cd1 100644
--- a/rtgui/dehaze.h
+++ b/rtgui/dehaze.h
@@ -28,12 +28,14 @@ class Dehaze: public ToolParamBlock, public AdjusterListener, public FoldableToo
 private:
     Adjuster *strength;
     Adjuster *depth;
-    Gtk::CheckButton *showDepthMap;    
+    Gtk::CheckButton *showDepthMap;
+    Gtk::CheckButton *luminance;
 
     rtengine::ProcEvent EvDehazeEnabled;
     rtengine::ProcEvent EvDehazeStrength;
     rtengine::ProcEvent EvDehazeDepth;
     rtengine::ProcEvent EvDehazeShowDepthMap;
+    rtengine::ProcEvent EvDehazeLuminance;
     
 public:
 
@@ -47,6 +49,7 @@ public:
     void adjusterChanged(Adjuster *a, double newval) override;
     void enabledChanged() override;
     void showDepthMapChanged();
+    void luminanceChanged();
     void setAdjusterBehavior(bool strengthAdd);
 };
 
diff --git a/rtgui/paramsedited.cc b/rtgui/paramsedited.cc
index 2ab5702ea..9ea89f267 100644
--- a/rtgui/paramsedited.cc
+++ b/rtgui/paramsedited.cc
@@ -587,6 +587,7 @@ void ParamsEdited::set(bool v)
     dehaze.strength = v;
     dehaze.showDepthMap = v;
     dehaze.depth = v;
+    dehaze.luminance = v;
     metadata.mode = v;
     filmNegative.enabled = v;
     filmNegative.redRatio = v;
@@ -1158,6 +1159,7 @@ void ParamsEdited::initFrom(const std::vector<rtengine::procparams::ProcParams>&
         dehaze.strength = dehaze.strength && p.dehaze.strength == other.dehaze.strength;
         dehaze.showDepthMap = dehaze.showDepthMap && p.dehaze.showDepthMap == other.dehaze.showDepthMap;
         dehaze.depth = dehaze.depth && p.dehaze.depth == other.dehaze.depth;
+        dehaze.luminance = dehaze.luminance && p.dehaze.luminance == other.dehaze.luminance;
         metadata.mode = metadata.mode && p.metadata.mode == other.metadata.mode;
         filmNegative.enabled = filmNegative.enabled && p.filmNegative.enabled == other.filmNegative.enabled;
         filmNegative.redRatio = filmNegative.redRatio && p.filmNegative.redRatio == other.filmNegative.redRatio;
@@ -3224,6 +3226,10 @@ void ParamsEdited::combine(rtengine::procparams::ProcParams& toEdit, const rteng
         toEdit.dehaze.showDepthMap = mods.dehaze.showDepthMap;
     }
 
+    if (dehaze.luminance) {
+        toEdit.dehaze.luminance = mods.dehaze.luminance;
+    }
+
     if (metadata.mode) {
         toEdit.metadata.mode = mods.metadata.mode;
     }
diff --git a/rtgui/paramsedited.h b/rtgui/paramsedited.h
index 1bd7170d4..41af0510d 100644
--- a/rtgui/paramsedited.h
+++ b/rtgui/paramsedited.h
@@ -596,6 +596,7 @@ struct DehazeParamsEdited {
     bool strength;
     bool showDepthMap;
     bool depth;
+    bool luminance;
 };
 
 struct RAWParamsEdited {

From 83a8ca8ef52a03b600610a40720149bee30c5bec Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Fri, 20 Sep 2019 15:29:35 +0200
Subject: [PATCH 08/31] dehaze: Fix artifacts when blue channel is clipped,
 #5456, thanks to @agriggio

---
 rtengine/ipdehaze.cc | 50 +++++++++++++-------------------------------
 1 file changed, 14 insertions(+), 36 deletions(-)

diff --git a/rtengine/ipdehaze.cc b/rtengine/ipdehaze.cc
index 4eb5ed8e9..81074a15b 100644
--- a/rtengine/ipdehaze.cc
+++ b/rtengine/ipdehaze.cc
@@ -264,10 +264,8 @@ BENCHFUN
     const int radius = patchsize * 4;
     constexpr float epsilon = 1e-5f;
 
-    {
-        array2D<float> guideB(W, H, img->b.ptrs, ARRAY2D_BYREFERENCE);
-        guidedFilter(guideB, dark, dark, radius, epsilon, multiThread);
-    }
+    array2D<float> guideB(W, H, img->b.ptrs, ARRAY2D_BYREFERENCE);
+    guidedFilter(guideB, dark, dark, radius, epsilon, multiThread);
         
     if (options.rtSettings.verbose) {
         std::cout << "dehaze: max distance is " << max_t << std::endl;
@@ -300,22 +298,12 @@ BENCHFUN
         const vfloat c65535v = F2V(65535.f);
         for (; x < W - 3; x += 4) {
             // ensure that the transmission is such that to avoid clipping...
-            vfloat r = LVFU(img->r(y, x));
-            vfloat g = LVFU(img->g(y, x));
-            vfloat b = LVFU(img->b(y, x));
+            const vfloat r = LVFU(img->r(y, x));
+            const vfloat g = LVFU(img->g(y, x));
+            const vfloat b = LVFU(img->b(y, x));
             // ... t >= tl to avoid negative values
             const vfloat tlv = onev - vminf(r / ambient0v, vminf(g / ambient1v, b / ambient2v));
-            // ... t >= tu to avoid values > 1
-//            r -= ambient0v;
-//            g -= ambient1v;
-//            b -= ambient2v;
-
-            vfloat tuv = t0v - tepsv;
-            tuv = vself(vmaskf_lt(ambient0v, onev), vmaxf(tuv, (r - ambient0v) / (onev - ambient0v)), tuv);
-            tuv = vself(vmaskf_lt(ambient1v, onev), vmaxf(tuv, (g - ambient1v) / (onev - ambient1v)), tuv);
-            tuv = vself(vmaskf_lt(ambient2v, onev), vmaxf(tuv, (b - ambient2v) / (onev - ambient2v)), tuv);
-
-            const vfloat mtv = vmaxf(LVFU(dark[y][x]), vmaxf(tlv, tuv) + tepsv);
+            const vfloat mtv = vmaxf(LVFU(dark[y][x]), vmaxf(tlv + tepsv, t0v));
             if (params->dehaze.showDepthMap) {
                 const vfloat valv = vclampf(onev - mtv, ZEROV, onev) * c65535v;
                 STVFU(img->r(y, x), valv);
@@ -337,37 +325,27 @@ BENCHFUN
 #endif
         for (; x < W; ++x) {
             // ensure that the transmission is such that to avoid clipping...
-            float r = img->r(y, x);
-            float g = img->g(y, x);
-            float b = img->b(y, x);
+            const float r = img->r(y, x);
+            const float g = img->g(y, x);
+            const float b = img->b(y, x);
             // ... t >= tl to avoid negative values
             const float tl = 1.f - min(r / ambient[0], g / ambient[1], b / ambient[2]);
-            // ... t >= tu to avoid values > 1
-            r -= ambient[0];
-            g -= ambient[1];
-            b -= ambient[2];
-
-            float tu = t0 - teps;
-            tu = ambient[0] < 1.f ? max(tu, r / (1.f - ambient[0])) : tu;
-            tu = ambient[1] < 1.f ? max(tu, g / (1.f - ambient[1])) : tu;
-            tu = ambient[2] < 1.f ? max(tu, b / (1.f - ambient[2])) : tu;
-
-            const float mt = max(dark[y][x], tl + teps, tu + teps);
+            const float mt = max(dark[y][x], t0, tl + teps);
             if (params->dehaze.showDepthMap) {
                 img->r(y, x) = img->g(y, x) = img->b(y, x) = LIM01(1.f - mt) * 65535.f;
             } else if (luminance) {
                 const float Y = Color::rgbLuminance(img->r(y, x), img->g(y, x), img->b(y, x), ws);
-                const float YY = (Y - ambientY) / mt + ambientY;
                 if (Y > 1e-5f) {
+                    const float YY = (Y - ambientY) / mt + ambientY;
                     const float f = 65535.f * YY / Y;
                     img->r(y, x) *= f;
                     img->g(y, x) *= f;
                     img->b(y, x) *= f;
                 }
             } else {
-                img->r(y, x) = (r / mt + ambient[0]) * 65535.f;
-                img->g(y, x) = (g / mt + ambient[1]) * 65535.f;
-                img->b(y, x) = (b / mt + ambient[2]) * 65535.f;
+                img->r(y, x) = ((r - ambient[0]) / mt + ambient[0]) * 65535.f;
+                img->g(y, x) = ((g - ambient[1]) / mt + ambient[1]) * 65535.f;
+                img->b(y, x) = ((b - ambient[2]) / mt + ambient[2]) * 65535.f;
             }
         }
     }

From a7cc59c91dc2b715203e63bfcf8f5423103a18e6 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Sat, 21 Sep 2019 21:33:05 +0200
Subject: [PATCH 09/31] dehaze: further speedup, stolen from ART, thanks
 @agriggio, #5456

---
 rtengine/ipdehaze.cc | 91 ++++++++++++++++++++++++--------------------
 1 file changed, 49 insertions(+), 42 deletions(-)

diff --git a/rtengine/ipdehaze.cc b/rtengine/ipdehaze.cc
index 81074a15b..6f516d95b 100644
--- a/rtengine/ipdehaze.cc
+++ b/rtengine/ipdehaze.cc
@@ -39,7 +39,7 @@
 #include "rt_math.h"
 #define BENCHMARK
 #include "StopWatch.h"
-
+#include "rescale.h"
 extern Options options;
 
 namespace rtengine {
@@ -83,24 +83,24 @@ int get_dark_channel_downsized(const array2D<float> &R, const array2D<float> &G,
     #pragma omp parallel for if (multithread)
 #endif
     for (int y = 0; y < H; y += patchsize) {
-        int yy = y / patchsize;
         const int pH = min(y + patchsize, H);
-        for (int x = 0, xx = 0; x < W; x += patchsize, ++xx) {
+        for (int x = 0; x < W; x += patchsize) {
             float val = RT_INFINITY_F;
             const int pW = min(x + patchsize, W);
-            for (int xp = x; xp < pW; ++xp) {
-                for (int yp = y; yp < pH; ++yp) {
-                    val = min(val, R[yp][xp], G[yp][xp], B[yp][xp]);
+            for (int xx = x; xx < pW; ++xx) {
+                for (int yy = y; yy < pH; ++yy) {
+                    val = min(val, R[yy][xx], G[yy][xx], B[yy][xx]);
                 }
             }
-            dst[yy][xx] = val;
+            for (int yy = y; yy < pH; ++yy) {
+                std::fill(dst[yy] + x, dst[yy] + pW, val);
+            }
         }
     }
 
     return (W / patchsize + ((W % patchsize) > 0)) *  (H / patchsize + ((H % patchsize) > 0));
 }
 
-
 float estimate_ambient_light(const array2D<float> &R, const array2D<float> &G, const array2D<float> &B, const array2D<float> &dark, int patchsize, int npatches, float ambient[3])
 {
     const int W = R.width();
@@ -109,10 +109,10 @@ float estimate_ambient_light(const array2D<float> &R, const array2D<float> &G, c
     float darklim = RT_INFINITY_F;
     {
         std::vector<float> p;
-        for (int y = 0, yy = 0; y < H; y += patchsize, ++yy) {
-            for (int x = 0, xx = 0; x < W; x += patchsize, ++xx) {
-                if (!OOG(dark[yy][xx], 1.f - 1e-5f)) {
-                    p.push_back(dark[yy][xx]);
+        for (int y = 0; y < H; y += patchsize) {
+            for (int x = 0; x < W; x += patchsize) {
+                if (!OOG(dark[y][x], 1.f - 1e-5f)) {
+                    p.push_back(dark[y][x]);
                 }
             }
         }
@@ -124,9 +124,9 @@ float estimate_ambient_light(const array2D<float> &R, const array2D<float> &G, c
     std::vector<std::pair<int, int>> patches;
     patches.reserve(npatches);
 
-    for (int y = 0, yy = 0; y < H; y += patchsize, ++yy) {
-        for (int x = 0, xx = 0; x < W; x += patchsize, ++xx) {
-            if (dark[yy][xx] >= darklim && !OOG(dark[yy][xx], 1.f)) {
+    for (int y = 0; y < H; y += patchsize) {
+        for (int x = 0; x < W; x += patchsize) {
+            if (dark[y][x] >= darklim && !OOG(dark[y][x], 1.f)) {
                 patches.push_back(std::make_pair(x, y));
             }
         }
@@ -142,9 +142,9 @@ float estimate_ambient_light(const array2D<float> &R, const array2D<float> &G, c
         std::vector<float> l;
         l.reserve(patches.size() * patchsize * patchsize);
         
-        for (const auto &p : patches) {
-            const int pW = min(p.first + patchsize, W);
-            const int pH = min(p.second + patchsize, H);
+        for (auto &p : patches) {
+            const int pW = min(p.first+patchsize, W);
+            const int pH = min(p.second+patchsize, H);
             
             for (int y = p.second; y < pH; ++y) {
                 for (int x = p.first; x < pW; ++x) {
@@ -159,19 +159,15 @@ float estimate_ambient_light(const array2D<float> &R, const array2D<float> &G, c
 
     double rr = 0, gg = 0, bb = 0;
     int n = 0;
-#ifdef _OPENMP
-    #pragma omp parallel for schedule(dynamic) reduction(+:rr,gg,bb,n)
-#endif
-    for (size_t i = 0; i < patches.size(); ++i) {
-        const auto &p = patches[i];
-        const int pW = min(p.first + patchsize, W);
-        const int pH = min(p.second + patchsize, H);
+    for (auto &p : patches) {
+        const int pW = min(p.first+patchsize, W);
+        const int pH = min(p.second+patchsize, H);
             
         for (int y = p.second; y < pH; ++y) {
             for (int x = p.first; x < pW; ++x) {
-                const float r = R[y][x];
-                const float g = G[y][x];
-                const float b = B[y][x];
+                float r = R[y][x];
+                float g = G[y][x];
+                float b = B[y][x];
                 if (r + g + b >= bright_lim) {
                     rr += r;
                     gg += g;
@@ -181,7 +177,6 @@ float estimate_ambient_light(const array2D<float> &R, const array2D<float> &G, c
             }
         }
     }
-
     n = std::max(n, 1);
     ambient[0] = rr / n;
     ambient[1] = gg / n;
@@ -191,7 +186,6 @@ float estimate_ambient_light(const array2D<float> &R, const array2D<float> &G, c
     return darklim > 0 ? -1.125f * std::log(darklim) : std::log(std::numeric_limits<float>::max()) / 2;
 }
 
-
 void extract_channels(Imagefloat *img, array2D<float> &r, array2D<float> &g, array2D<float> &b, int radius, float epsilon, bool multithread)
 {
     const int W = img->getWidth();
@@ -238,11 +232,32 @@ BENCHFUN
         array2D<float> B(W, H);
         extract_channels(img, R, G, B, patchsize, 1e-1, multiThread);
 
-        patchsize = max(max(W, H) / 600, 2);
-        array2D<float> darkDownsized(W / patchsize + 1, H / patchsize + 1);
-        const int npatches = get_dark_channel_downsized(R, G, B, darkDownsized, patchsize, multiThread);
+        {
+            constexpr int sizecap = 200;
+            float r = float(W)/float(H);
+            const int hh = r >= 1.f ? sizecap : sizecap / r;
+            const int ww = r >= 1.f ? sizecap * r : sizecap;
 
-        max_t = estimate_ambient_light(R, G, B, darkDownsized, patchsize, npatches, ambient);
+            if (W <= ww && H <= hh) {
+                // don't rescale small thumbs
+                array2D<float> D(W, H);
+                int npatches = get_dark_channel_downsized(R, G, B, D, 2, multiThread);
+                max_t = estimate_ambient_light(R, G, B, D, patchsize, npatches, ambient);
+            } else {
+                array2D<float> RR(ww, hh);
+                array2D<float> GG(ww, hh);
+                array2D<float> BB(ww, hh);
+                rescaleNearest(R, RR, multiThread);
+                rescaleNearest(G, GG, multiThread);
+                rescaleNearest(B, BB, multiThread);
+                array2D<float> D(ww, hh);
+
+                int npatches = get_dark_channel_downsized(RR, GG, BB, D, 2, multiThread);
+                max_t = estimate_ambient_light(RR, GG, BB, D, patchsize, npatches, ambient);
+            }
+        }
+
+        patchsize = max(max(W, H) / 600, 2);
 
         if (options.rtSettings.verbose) {
             std::cout << "dehaze: ambient light is "
@@ -250,14 +265,6 @@ BENCHFUN
                       << std::endl;
         }
 
-        if (min(ambient[0], ambient[1], ambient[2]) < 0.01f) {
-            if (options.rtSettings.verbose) {
-                std::cout << "dehaze: no haze detected" << std::endl;
-            }
-            img->normalizeFloatTo65535();
-            return; // probably no haze at all
-        }
-
         get_dark_channel(R, G, B, dark, patchsize, ambient, true, multiThread, strength);
     }
 

From cab84aed38456964f34f1bbe68b4ca3104753cd7 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Sun, 22 Sep 2019 13:21:34 +0200
Subject: [PATCH 10/31] dehaze: fix bug in luminance mode, #5456

---
 rtengine/ipdehaze.cc | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/rtengine/ipdehaze.cc b/rtengine/ipdehaze.cc
index 6f516d95b..125fa4d60 100644
--- a/rtengine/ipdehaze.cc
+++ b/rtengine/ipdehaze.cc
@@ -283,7 +283,7 @@ BENCHFUN
     const float teps = 1e-3f;
 
     const bool luminance = params->dehaze.luminance;
-    TMatrix ws = ICCStore::getInstance()->workingSpaceMatrix(params->icm.workingProfile);
+    const TMatrix ws = ICCStore::getInstance()->workingSpaceMatrix(params->icm.workingProfile);
 #ifdef __SSE2__
     const vfloat wsv[3] = {F2V(ws[1][0]), F2V(ws[1][1]),F2V(ws[1][2])};
 #endif
@@ -342,13 +342,11 @@ BENCHFUN
                 img->r(y, x) = img->g(y, x) = img->b(y, x) = LIM01(1.f - mt) * 65535.f;
             } else if (luminance) {
                 const float Y = Color::rgbLuminance(img->r(y, x), img->g(y, x), img->b(y, x), ws);
-                if (Y > 1e-5f) {
-                    const float YY = (Y - ambientY) / mt + ambientY;
-                    const float f = 65535.f * YY / Y;
-                    img->r(y, x) *= f;
-                    img->g(y, x) *= f;
-                    img->b(y, x) *= f;
-                }
+                const float YY = (Y - ambientY) / mt + ambientY;
+                const float f = Y > 1e-5f ? 65535.f * YY / Y : 65535.f;
+                img->r(y, x) *= f;
+                img->g(y, x) *= f;
+                img->b(y, x) *= f;
             } else {
                 img->r(y, x) = ((r - ambient[0]) / mt + ambient[0]) * 65535.f;
                 img->g(y, x) = ((g - ambient[1]) / mt + ambient[1]) * 65535.f;

From 189f474e033c908ed06bfcb32580ab63427b9ad0 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Sun, 22 Sep 2019 20:53:03 +0200
Subject: [PATCH 11/31] dehaze: add accidentally removed early exit in case there
 is no haze detected, #5456

---
 rtengine/ipdehaze.cc | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/rtengine/ipdehaze.cc b/rtengine/ipdehaze.cc
index 125fa4d60..00236d1de 100644
--- a/rtengine/ipdehaze.cc
+++ b/rtengine/ipdehaze.cc
@@ -257,6 +257,13 @@ BENCHFUN
             }
         }
 
+        if (min(ambient[0], ambient[1], ambient[2]) < 0.01f) {
+            if (options.rtSettings.verbose) {
+                std::cout << "dehaze: no haze detected" << std::endl;
+            }
+            img->normalizeFloatTo65535();
+            return; // probably no haze at all
+        }
         patchsize = max(max(W, H) / 600, 2);
 
         if (options.rtSettings.verbose) {

From ca162e8ffc7ecd1ad924a0735c9e364c0aff3779 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Sun, 22 Sep 2019 23:45:19 +0200
Subject: [PATCH 12/31] dehaze: (experimental) input normalization to improve
 handling of overexposed pictures, thanks @agriggio, #5456

---
 rtengine/ipdehaze.cc | 79 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 62 insertions(+), 17 deletions(-)

diff --git a/rtengine/ipdehaze.cc b/rtengine/ipdehaze.cc
index 00236d1de..8ba1b935a 100644
--- a/rtengine/ipdehaze.cc
+++ b/rtengine/ipdehaze.cc
@@ -28,24 +28,69 @@
  *
  */  
 
+#include <algorithm>
 #include <iostream>
-#include <queue>
+#include <vector>
 
 #include "guidedfilter.h"
 #include "improcfun.h"
 #include "procparams.h"
-#include "rt_algo.h"
-#include "rt_algo.h"
+#include "rescale.h"
 #include "rt_math.h"
 #define BENCHMARK
 #include "StopWatch.h"
-#include "rescale.h"
+
 extern Options options;
 
 namespace rtengine {
 
 namespace {
 
+float normalize(Imagefloat *rgb, bool multithread)
+{
+    float maxval = 0.f;
+    const int W = rgb->getWidth();
+    const int H = rgb->getHeight();
+#ifdef _OPENMP
+#   pragma omp parallel for reduction(max:maxval) if (multithread)
+#endif
+    for (int y = 0; y < H; ++y) {
+        for (int x = 0; x < W; ++x) {
+            maxval = max(maxval, rgb->r(y, x), rgb->g(y, x), rgb->b(y, x));
+        }
+    }
+    maxval = max(maxval * 2.f, 65535.f);
+#ifdef _OPENMP
+#   pragma omp parallel for if (multithread)
+#endif
+    for (int y = 0; y < H; ++y) {
+        for (int x = 0; x < W; ++x) {
+            rgb->r(y, x) /= maxval;
+            rgb->g(y, x) /= maxval;
+            rgb->b(y, x) /= maxval;
+        }
+    }
+    return maxval;
+}
+
+void restore(Imagefloat *rgb, float maxval, bool multithread)
+{
+    const int W = rgb->getWidth();
+    const int H = rgb->getHeight();
+    if (maxval > 0.f && maxval != 1.f) {
+#ifdef _OPENMP
+#       pragma omp parallel for if (multithread)
+#endif
+        for (int y = 0; y < H; ++y) {
+            for (int x = 0; x < W; ++x) {
+                rgb->r(y, x) *= maxval;
+                rgb->g(y, x) *= maxval;
+                rgb->b(y, x) *= maxval;
+            }
+        }
+    }
+}
+
 int get_dark_channel(const array2D<float> &R, const array2D<float> &G, const array2D<float> &B, array2D<float> &dst, int patchsize, const float ambient[3], bool clip, bool multithread, float strength)
 {
     const int W = R.width();
@@ -210,7 +255,7 @@ void ImProcFunctions::dehaze(Imagefloat *img)
         return;
     }
 BENCHFUN
-    img->normalizeFloatTo1();
+    const float maxChannel = normalize(img, multiThread);
 
     const int W = img->getWidth();
     const int H = img->getHeight();
@@ -261,7 +306,7 @@ BENCHFUN
             if (options.rtSettings.verbose) {
                 std::cout << "dehaze: no haze detected" << std::endl;
             }
-            img->normalizeFloatTo65535();
+            restore(img, maxChannel, multiThread);
             return; // probably no haze at all
         }
         patchsize = max(max(W, H) / 600, 2);
@@ -309,7 +354,7 @@ BENCHFUN
         const vfloat epsYv = F2V(1e-5f);
         const vfloat t0v = F2V(t0);
         const vfloat tepsv = F2V(teps);
-        const vfloat c65535v = F2V(65535.f);
+        const vfloat cmaxChannelv = F2V(maxChannel);
         for (; x < W - 3; x += 4) {
             // ensure that the transmission is such that to avoid clipping...
             const vfloat r = LVFU(img->r(y, x));
@@ -319,21 +364,21 @@ BENCHFUN
             const vfloat tlv = onev - vminf(r / ambient0v, vminf(g / ambient1v, b / ambient2v));
             const vfloat mtv = vmaxf(LVFU(dark[y][x]), vmaxf(tlv + tepsv, t0v));
             if (params->dehaze.showDepthMap) {
-                const vfloat valv = vclampf(onev - mtv, ZEROV, onev) * c65535v;
+                const vfloat valv = vclampf(onev - mtv, ZEROV, onev) * cmaxChannelv;
                 STVFU(img->r(y, x), valv);
                 STVFU(img->g(y, x), valv);
                 STVFU(img->b(y, x), valv);
             } else if (luminance) {
                 const vfloat Yv = Color::rgbLuminance(r, g, b, wsv);
                 const vfloat YYv = (Yv - ambientYv) / mtv + ambientYv;
-                const vfloat fv = vself(vmaskf_gt(Yv, epsYv), c65535v * YYv / Yv, c65535v);
+                const vfloat fv = vself(vmaskf_gt(Yv, epsYv), cmaxChannelv * YYv / Yv, cmaxChannelv);
                 STVFU(img->r(y, x), r * fv);
                 STVFU(img->g(y, x), g * fv);
                 STVFU(img->b(y, x), b * fv);
             } else {
-                STVFU(img->r(y, x), ((r - ambient0v) / mtv + ambient0v) * c65535v);
-                STVFU(img->g(y, x), ((g - ambient1v) / mtv + ambient1v) * c65535v);
-                STVFU(img->b(y, x), ((b - ambient2v) / mtv + ambient2v) * c65535v);
+                STVFU(img->r(y, x), ((r - ambient0v) / mtv + ambient0v) * cmaxChannelv);
+                STVFU(img->g(y, x), ((g - ambient1v) / mtv + ambient1v) * cmaxChannelv);
+                STVFU(img->b(y, x), ((b - ambient2v) / mtv + ambient2v) * cmaxChannelv);
             }
         }
 #endif
@@ -346,18 +391,18 @@ BENCHFUN
             const float tl = 1.f - min(r / ambient[0], g / ambient[1], b / ambient[2]);
             const float mt = max(dark[y][x], t0, tl + teps);
             if (params->dehaze.showDepthMap) {
-                img->r(y, x) = img->g(y, x) = img->b(y, x) = LIM01(1.f - mt) * 65535.f;
+                img->r(y, x) = img->g(y, x) = img->b(y, x) = LIM01(1.f - mt) * maxChannel;
             } else if (luminance) {
                 const float Y = Color::rgbLuminance(img->r(y, x), img->g(y, x), img->b(y, x), ws);
                 const float YY = (Y - ambientY) / mt + ambientY;
-                const float f = Y > 1e-5f ? 65535.f * YY / Y : 65535.f;
+                const float f = Y > 1e-5f ? maxChannel * YY / Y : maxChannel;
                 img->r(y, x) *= f;
                 img->g(y, x) *= f;
                 img->b(y, x) *= f;
             } else {
-                img->r(y, x) = ((r - ambient[0]) / mt + ambient[0]) * 65535.f;
-                img->g(y, x) = ((g - ambient[1]) / mt + ambient[1]) * 65535.f;
-                img->b(y, x) = ((b - ambient[2]) / mt + ambient[2]) * 65535.f;
+                img->r(y, x) = ((r - ambient[0]) / mt + ambient[0]) * maxChannel;
+                img->g(y, x) = ((g - ambient[1]) / mt + ambient[1]) * maxChannel;
+                img->b(y, x) = ((b - ambient[2]) / mt + ambient[2]) * maxChannel;
             }
         }
     }

From f03605b73526235c27832f5815359943f4e0441e Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Mon, 23 Sep 2019 13:39:21 +0200
Subject: [PATCH 13/31] boxblur: apply changes requested by @Floessie in code
 review

---
 rtengine/boxblur.h | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/rtengine/boxblur.h b/rtengine/boxblur.h
index 5cc7430e2..27aa9d2fc 100644
--- a/rtengine/boxblur.h
+++ b/rtengine/boxblur.h
@@ -20,6 +20,7 @@
 #define _BOXBLUR_H_
 
 #include <assert.h>
+#include <memory>
 #include <stdlib.h>
 #include <string.h>
 #include <math.h>
@@ -337,9 +338,10 @@ inline void boxblur (float** src, float** dst, int radius, int W, int H, bool mu
     #pragma omp parallel if (multiThread)
 #endif
     {
-        float* const buffer = new float[std::max(W, 8 * H)];
+        std::unique_ptr<float> buffer(new float[std::max(W, 8 * H)]);
+
         //horizontal blur
-        float* const lineBuffer = buffer;
+        float* const lineBuffer = buffer.get();
 #ifdef _OPENMP
         #pragma omp for
 #endif
@@ -356,8 +358,9 @@ inline void boxblur (float** src, float** dst, int radius, int W, int H, bool mu
 
             for (int col = 1; col <= radius; col++) {
                 lineBuffer[col] = src[row][col];
-                dst[row][col] = tempval = (tempval * len + src[row][col + radius]) / (len + 1);
-                len ++;
+                tempval = (tempval * len + src[row][col + radius]) / (len + 1);
+                dst[row][col] = tempval;
+                ++len;
             }
 
             for (int col = radius + 1; col < W - radius; col++) {
@@ -367,15 +370,15 @@ inline void boxblur (float** src, float** dst, int radius, int W, int H, bool mu
 
             for (int col = W - radius; col < W; col++) {
                 dst[row][col] = tempval = (tempval * len - lineBuffer[col - radius - 1]) / (len - 1);
-                len --;
+                --len;
             }
         }
 
         //vertical blur
 #ifdef __SSE2__
-        vfloat (* const rowBuffer)[2] = (vfloat(*)[2]) buffer;
-        vfloat leninitv = F2V(radius + 1);
-        vfloat onev = F2V(1.f);
+        vfloat (* const rowBuffer)[2] = (vfloat(*)[2]) buffer.get();
+        const vfloat leninitv = F2V(radius + 1);
+        const vfloat onev = F2V(1.f);
         vfloat tempv, temp1v, lenv, lenp1v, lenm1v, rlenv;
 
 #ifdef _OPENMP
@@ -432,7 +435,7 @@ inline void boxblur (float** src, float** dst, int radius, int W, int H, bool mu
         }
 
 #else
-        float (* const rowBuffer)[8] = (float(*)[8]) buffer;
+        float (* const rowBuffer)[8] = (float(*)[8]) buffer.get();
 #ifdef _OPENMP
         #pragma omp for nowait
 #endif
@@ -440,12 +443,12 @@ inline void boxblur (float** src, float** dst, int radius, int W, int H, bool mu
         for (int col = 0; col < W - numCols + 1; col += 8) {
             float len = radius + 1;
 
-            for(int k = 0; k < numCols; k++) {
+            for (int k = 0; k < numCols; k++) {
                 rowBuffer[0][k] = dst[0][col + k];
             }
 
             for (int i = 1; i <= radius; i++) {
-                for(int k = 0; k < numCols; k++) {
+                for (int k = 0; k < numCols; k++) {
                     dst[0][col + k] += dst[i][col + k];
                 }
             }
@@ -488,7 +491,7 @@ inline void boxblur (float** src, float** dst, int radius, int W, int H, bool mu
             const int remaining = W % numCols;
 
             if (remaining > 0) {
-                float (* const rowBuffer)[8] = (float(*)[8]) buffer;
+                float (* const rowBuffer)[8] = (float(*)[8]) buffer.get();
                 const int col = W - remaining;
 
                 float len = radius + 1;
@@ -525,7 +528,6 @@ inline void boxblur (float** src, float** dst, int radius, int W, int H, bool mu
                 }
             }
         }
-        delete [] buffer;
     }
 }
 

From 3ca7f09655e0c703a7d23c74414723d8f0d7b8a4 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Mon, 23 Sep 2019 13:39:50 +0200
Subject: [PATCH 14/31] guidedfilter: apply changes requested by @Floessie in
 code review

---
 rtengine/guidedfilter.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/rtengine/guidedfilter.cc b/rtengine/guidedfilter.cc
index 8d19fc7a5..159e89504 100644
--- a/rtengine/guidedfilter.cc
+++ b/rtengine/guidedfilter.cc
@@ -66,7 +66,7 @@ void guidedFilter(const array2D<float> &guide, const array2D<float> &src, array2
     enum Op {MUL, DIVEPSILON, SUBMUL};
 
     const auto apply =
-        [=](Op op, array2D<float> &res, const array2D<float> &a, const array2D<float> &b, const array2D<float> &c=array2D<float>()) -> void
+        [multithread, epsilon](Op op, array2D<float> &res, const array2D<float> &a, const array2D<float> &b, const array2D<float> &c=array2D<float>()) -> void
         {
             const int w = res.width();
             const int h = res.height();
@@ -96,13 +96,13 @@ void guidedFilter(const array2D<float> &guide, const array2D<float> &src, array2
         };
 
     const auto f_subsample =
-        [=](array2D<float> &d, const array2D<float> &s) -> void
+        [multithread](array2D<float> &d, const array2D<float> &s) -> void
         {
             rescaleBilinear(s, d, multithread);
         };
 
     const auto f_mean =
-        [&](array2D<float> &d, array2D<float> &s, int rad) -> void
+        [multithread](array2D<float> &d, array2D<float> &s, int rad) -> void
         {
             rad = LIM(rad, 0, (min(s.width(), s.height()) - 1) / 2 - 1);
             boxblur(s, d, rad, s.width(), s.height(), multithread);

From 7a8225d2745ef86eb71b537a2a852cbdf9882f26 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Mon, 23 Sep 2019 13:40:19 +0200
Subject: [PATCH 15/31] procparams: apply changes requested by @Floessie in
 code review

---
 rtengine/procparams.cc | 1 +
 rtengine/procparams.h  | 1 +
 2 files changed, 2 insertions(+)

diff --git a/rtengine/procparams.cc b/rtengine/procparams.cc
index 734796cd8..8de7e2b55 100644
--- a/rtengine/procparams.cc
+++ b/rtengine/procparams.cc
@@ -3243,6 +3243,7 @@ int ProcParams::save(const Glib::ustring& fname, const Glib::ustring& fname2, bo
         saveToKeyfile(!pedited || pedited->dehaze.showDepthMap, "Dehaze", "ShowDepthMap", dehaze.showDepthMap, keyFile);        
         saveToKeyfile(!pedited || pedited->dehaze.depth, "Dehaze", "Depth", dehaze.depth, keyFile);        
         saveToKeyfile(!pedited || pedited->dehaze.depth, "Dehaze", "Luminance", dehaze.luminance, keyFile);
+
 // Directional pyramid denoising
         saveToKeyfile(!pedited || pedited->dirpyrDenoise.enabled, "Directional Pyramid Denoising", "Enabled", dirpyrDenoise.enabled, keyFile);
         saveToKeyfile(!pedited || pedited->dirpyrDenoise.enhance, "Directional Pyramid Denoising", "Enhance", dirpyrDenoise.enhance, keyFile);
diff --git a/rtengine/procparams.h b/rtengine/procparams.h
index 0ef0f045b..734ca7556 100644
--- a/rtengine/procparams.h
+++ b/rtengine/procparams.h
@@ -1344,6 +1344,7 @@ struct DehazeParams {
     bool showDepthMap;
     int depth;
     bool luminance;
+
     DehazeParams();
 
     bool operator==(const DehazeParams &other) const;

From 204475dd05576f9e2d4b8b8d8e443603f1f3d9aa Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Mon, 23 Sep 2019 13:42:23 +0200
Subject: [PATCH 16/31] sleefsseavx: added horizontal min and max

---
 rtengine/sleefsseavx.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/rtengine/sleefsseavx.c b/rtengine/sleefsseavx.c
index 3000c1c10..cce88df5d 100644
--- a/rtengine/sleefsseavx.c
+++ b/rtengine/sleefsseavx.c
@@ -1390,6 +1390,18 @@ static inline float vhadd( vfloat a ) {
     return _mm_cvtss_f32(_mm_add_ss(a, _mm_shuffle_ps(a, a, 1)));
 }
 
+static inline float vhmin(vfloat a) {
+    // Horizontal minimum: returns min(a[0], a[1], a[2], a[3]) as a scalar float (vminf is presumably the element-wise minimum)
+    a = vminf(a, _mm_movehl_ps(a, a)); // fold upper half onto lower half: lane 0 = min(a[0], a[2]), lane 1 = min(a[1], a[3])
+    return _mm_cvtss_f32(vminf(a, _mm_shuffle_ps(a, a, 1))); // shuffle immediate 1 brings lane 1 into lane 0; combine and extract lane 0
+}
+
+static inline float vhmax(vfloat a) {
+    // Horizontal maximum: returns max(a[0], a[1], a[2], a[3]) as a scalar float (vmaxf is presumably the element-wise maximum)
+    a = vmaxf(a, _mm_movehl_ps(a, a)); // fold upper half onto lower half: lane 0 = max(a[0], a[2]), lane 1 = max(a[1], a[3])
+    return _mm_cvtss_f32(vmaxf(a, _mm_shuffle_ps(a, a, 1))); // shuffle immediate 1 brings lane 1 into lane 0; combine and extract lane 0
+}
+
 static INLINE vfloat vmul2f(vfloat a){
     // fastest way to multiply by 2
 	return a + a;

From 9cff2bca486e36b8bedc2612d26587d938df5c6a Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Mon, 23 Sep 2019 13:43:43 +0200
Subject: [PATCH 17/31] dehaze: speedup and changes requested by @Floessie in
 code review, #5456

---
 rtengine/ipdehaze.cc | 57 +++++++++++++++++++++++++++++++-------------
 1 file changed, 40 insertions(+), 17 deletions(-)

diff --git a/rtengine/ipdehaze.cc b/rtengine/ipdehaze.cc
index 8ba1b935a..fb4f73903 100644
--- a/rtengine/ipdehaze.cc
+++ b/rtengine/ipdehaze.cc
@@ -16,7 +16,7 @@
  *
  *  You should have received a copy of the GNU General Public License
  *  along with RawTherapee.  If not, see <https://www.gnu.org/licenses/>.
- */
+*/
 
 /*
  * Haze removal using the algorithm described in the paper:
@@ -26,7 +26,7 @@
  *
  * using a guided filter for the "soft matting" of the transmission map
  *
- */  
+*/
 
 #include <algorithm>
 #include <iostream>
@@ -52,7 +52,7 @@ float normalize(Imagefloat *rgb, bool multithread)
     const int W = rgb->getWidth();
     const int H = rgb->getHeight();
 #ifdef _OPENMP
-#   pragma omp parallel for reduction(max:maxval) if (multithread)
+    #pragma omp parallel for reduction(max:maxval) schedule(dynamic, 16) if (multithread)
 #endif
     for (int y = 0; y < H; ++y) {
         for (int x = 0; x < W; ++x) {
@@ -61,7 +61,7 @@ float normalize(Imagefloat *rgb, bool multithread)
     }
     maxval = max(maxval * 2.f, 65535.f);
 #ifdef _OPENMP
-#   pragma omp parallel for if (multithread)
+    #pragma omp parallel for schedule(dynamic, 16) if (multithread)
 #endif
     for (int y = 0; y < H; ++y) {
         for (int x = 0; x < W; ++x) {
@@ -102,13 +102,36 @@ int get_dark_channel(const array2D<float> &R, const array2D<float> &G, const arr
     for (int y = 0; y < H; y += patchsize) {
         const int pH = min(y + patchsize, H);
         for (int x = 0; x < W; x += patchsize) {
-            float val = RT_INFINITY_F;
+            float minR = RT_INFINITY_F;
+            float minG = RT_INFINITY_F;
+            float minB = RT_INFINITY_F;
+#ifdef __SSE2__
+            vfloat minRv = F2V(minR);
+            vfloat minGv = F2V(minG);
+            vfloat minBv = F2V(minB);
+#endif
             const int pW = min(x + patchsize, W);
-            for (int xx = x; xx < pW; ++xx) {
-                for (int yy = y; yy < pH; ++yy) {
-                    val = min(val, R[yy][xx] / ambient[0], G[yy][xx] / ambient[1], B[yy][xx] / ambient[2]);
+            for (int yy = y; yy < pH; ++yy) {
+                int xx = x;
+#ifdef __SSE2__
+                for (; xx < pW - 3; xx += 4) {
+                    minRv = vminf(minRv, LVFU(R[yy][xx]));
+                    minGv = vminf(minGv, LVFU(G[yy][xx]));
+                    minBv = vminf(minBv, LVFU(B[yy][xx]));
+                }
+#endif
+                for (; xx < pW; ++xx) {
+                    minR = min(minR, R[yy][xx]);
+                    minG = min(minG, G[yy][xx]);
+                    minB = min(minB, B[yy][xx]);
                 }
             }
+#ifdef __SSE2__
+            minR = min(minR, vhmin(minRv));
+            minG = min(minG, vhmin(minGv));
+            minB = min(minB, vhmin(minBv));
+#endif
+            float val = min(minR / ambient[0], minG / ambient[1], minB / ambient[2]);
             val = 1.f - strength * LIM01(val);
             for (int yy = y; yy < pH; ++yy) {
                 std::fill(dst[yy] + x, dst[yy] + pW, val);
@@ -269,25 +292,25 @@ BENCHFUN
 
     int patchsize = max(int(5 / scale), 2);
     float ambient[3];
-    float max_t = 0.f;
+    float maxDistance = 0.f;
 
     {
-        array2D<float> R(W, H);
+        array2D<float>& R = dark; // R and dark can safely use the same buffer, which is faster and reduces memory allocations/deallocations
         array2D<float> G(W, H);
         array2D<float> B(W, H);
         extract_channels(img, R, G, B, patchsize, 1e-1, multiThread);
 
         {
             constexpr int sizecap = 200;
-            float r = float(W)/float(H);
+            const float r = static_cast<float>(W) / static_cast<float>(H);
             const int hh = r >= 1.f ? sizecap : sizecap / r;
             const int ww = r >= 1.f ? sizecap * r : sizecap;
 
             if (W <= ww && H <= hh) {
                 // don't rescale small thumbs
                 array2D<float> D(W, H);
-                int npatches = get_dark_channel_downsized(R, G, B, D, 2, multiThread);
-                max_t = estimate_ambient_light(R, G, B, D, patchsize, npatches, ambient);
+                const int npatches = get_dark_channel_downsized(R, G, B, D, 2, multiThread);
+                maxDistance = estimate_ambient_light(R, G, B, D, patchsize, npatches, ambient);
             } else {
                 array2D<float> RR(ww, hh);
                 array2D<float> GG(ww, hh);
@@ -297,8 +320,8 @@ BENCHFUN
                 rescaleNearest(B, BB, multiThread);
                 array2D<float> D(ww, hh);
 
-                int npatches = get_dark_channel_downsized(RR, GG, BB, D, 2, multiThread);
-                max_t = estimate_ambient_light(RR, GG, BB, D, patchsize, npatches, ambient);
+                const int npatches = get_dark_channel_downsized(RR, GG, BB, D, 2, multiThread);
+                maxDistance = estimate_ambient_light(RR, GG, BB, D, patchsize, npatches, ambient);
             }
         }
 
@@ -327,11 +350,11 @@ BENCHFUN
     guidedFilter(guideB, dark, dark, radius, epsilon, multiThread);
         
     if (options.rtSettings.verbose) {
-        std::cout << "dehaze: max distance is " << max_t << std::endl;
+        std::cout << "dehaze: max distance is " << maxDistance << std::endl;
     }
 
     const float depth = -float(params->dehaze.depth) / 100.f;
-    const float t0 = max(1e-3f, std::exp(depth * max_t));
+    const float t0 = max(1e-3f, std::exp(depth * maxDistance));
     const float teps = 1e-3f;
 
     const bool luminance = params->dehaze.luminance;

From c83b577dc7b06a8d092c27f60f2e47911e5caba8 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Mon, 23 Sep 2019 14:22:14 +0200
Subject: [PATCH 18/31] hasselblad_load_raw: apply changes requested by
 @Floessie in code review

---
 rtengine/dcraw.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/rtengine/dcraw.cc b/rtengine/dcraw.cc
index 5275c42c1..c9c66a8fb 100644
--- a/rtengine/dcraw.cc
+++ b/rtengine/dcraw.cc
@@ -2440,10 +2440,10 @@ void CLASS hasselblad_load_raw()
         }
         for (int col = 0; col < raw_width; col += 2) {
             for (int s = 0; s < tiff_samples * 2; s += 2) {
-                int len[2];
-                for (int c = 0; c < 2; ++c) {
-                    len[c] = ph1_huff(jh.huff[0]);
-                }
+                const int len[2]= {
+                    ph1_huff(jh.huff[0]),
+                    ph1_huff(jh.huff[0])
+                };
                 for (int c = 0; c < 2; ++c) {
                     diff[s + c] = hb_bits(len[c]);
                     if ((diff[s + c] & (1 << (len[c] - 1))) == 0) {

From 1e41ee62650fc27129a1b602539536434949372d Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Mon, 23 Sep 2019 15:54:11 +0200
Subject: [PATCH 19/31] dehaze: fix two cppcheck style warnings, #5456

---
 rtengine/ipdehaze.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rtengine/ipdehaze.cc b/rtengine/ipdehaze.cc
index fb4f73903..d36c876c2 100644
--- a/rtengine/ipdehaze.cc
+++ b/rtengine/ipdehaze.cc
@@ -91,7 +91,7 @@ void restore(Imagefloat *rgb, float maxval, bool multithread)
     }
 }
 
-int get_dark_channel(const array2D<float> &R, const array2D<float> &G, const array2D<float> &B, array2D<float> &dst, int patchsize, const float ambient[3], bool clip, bool multithread, float strength)
+int get_dark_channel(const array2D<float> &R, const array2D<float> &G, const array2D<float> &B, const array2D<float> &dst, int patchsize, const float ambient[3], bool clip, bool multithread, float strength)
 {
     const int W = R.width();
     const int H = R.height();
@@ -142,7 +142,7 @@ int get_dark_channel(const array2D<float> &R, const array2D<float> &G, const arr
     return (W / patchsize + ((W % patchsize) > 0)) *  (H / patchsize + ((H % patchsize) > 0));
 }
 
-int get_dark_channel_downsized(const array2D<float> &R, const array2D<float> &G, const array2D<float> &B, array2D<float> &dst, int patchsize, bool multithread)
+int get_dark_channel_downsized(const array2D<float> &R, const array2D<float> &G, const array2D<float> &B, const array2D<float> &dst, int patchsize, bool multithread)
 {
     const int W = R.width();
     const int H = R.height();

From d1ccf27780d8825eadab1effc5fb38af96707497 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Mon, 23 Sep 2019 16:32:23 +0200
Subject: [PATCH 20/31] Capture sharpening: add missing history message

---
 rtdata/languages/default | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rtdata/languages/default b/rtdata/languages/default
index be1638e8e..d54536603 100644
--- a/rtdata/languages/default
+++ b/rtdata/languages/default
@@ -770,6 +770,7 @@ HISTORY_MSG_PDSHARPEN_AUTO_RADIUS;CAS - Auto radius
 HISTORY_MSG_PDSHARPEN_GAMMA;CAS - Gamma
 HISTORY_MSG_PDSHARPEN_ITERATIONS;CAS - Iterations
 HISTORY_MSG_PDSHARPEN_RADIUS;CAS - Radius
+HISTORY_MSG_PDSHARPEN_RADIUS_OFFSET;CAS - Radius offset
 HISTORY_MSG_PIXELSHIFT_DEMOSAIC;PS - Demosaic method for motion
 HISTORY_MSG_PREPROCESS_LINEDENOISE_DIRECTION;Line noise filter direction
 HISTORY_MSG_PREPROCESS_PDAFLINESFILTER;PDAF lines filter

From 8d5c999ad84946f127815416f39cfc389f6d30da Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Mon, 23 Sep 2019 16:44:55 +0200
Subject: [PATCH 21/31] hasselblad_load_raw: fix warning

---
 rtengine/dcraw.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rtengine/dcraw.cc b/rtengine/dcraw.cc
index ad634c1e3..ef209118b 100644
--- a/rtengine/dcraw.cc
+++ b/rtengine/dcraw.cc
@@ -2441,8 +2441,8 @@ void CLASS hasselblad_load_raw()
         for (int col = 0; col < raw_width; col += 2) {
             for (int s = 0; s < tiff_samples * 2; s += 2) {
                 const int len[2]= {
-                    ph1_huff(jh.huff[0]),
-                    ph1_huff(jh.huff[0])
+                    static_cast<int>(ph1_huff(jh.huff[0])),
+                    static_cast<int>(ph1_huff(jh.huff[0]))
                 };
                 for (int c = 0; c < 2; ++c) {
                     diff[s + c] = hb_bits(len[c]);

From 57466be795ee871cc04bbbce9b399e1ee05db814 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Mon, 23 Sep 2019 20:33:32 +0200
Subject: [PATCH 22/31] Capture sharpening: Label and tooltip changes, also
 renamed key for favorite tab to capturesharpening

---
 rtdata/languages/default |  5 ++---
 rtgui/pdsharpening.cc    | 10 +++++-----
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/rtdata/languages/default b/rtdata/languages/default
index d54536603..aabff4bbe 100644
--- a/rtdata/languages/default
+++ b/rtdata/languages/default
@@ -770,7 +770,7 @@ HISTORY_MSG_PDSHARPEN_AUTO_RADIUS;CAS - Auto radius
 HISTORY_MSG_PDSHARPEN_GAMMA;CAS - Gamma
 HISTORY_MSG_PDSHARPEN_ITERATIONS;CAS - Iterations
 HISTORY_MSG_PDSHARPEN_RADIUS;CAS - Radius
-HISTORY_MSG_PDSHARPEN_RADIUS_OFFSET;CAS - Radius offset
+HISTORY_MSG_PDSHARPEN_RADIUS_BOOST;CAS - Corner radius boost
 HISTORY_MSG_PIXELSHIFT_DEMOSAIC;PS - Demosaic method for motion
 HISTORY_MSG_PREPROCESS_LINEDENOISE_DIRECTION;Line noise filter direction
 HISTORY_MSG_PREPROCESS_PDAFLINESFILTER;PDAF lines filter
@@ -1802,7 +1802,6 @@ TP_PCVIGNETTE_ROUNDNESS_TOOLTIP;Roundness:\n0 = rectangle,\n50 = fitted ellipse,
 TP_PCVIGNETTE_STRENGTH;Strength
 TP_PCVIGNETTE_STRENGTH_TOOLTIP;Filter strength in stops (reached in corners).
 TP_PDSHARPENING_LABEL;Capture Sharpening
-TP_PDSHARPENING_AUTORADIUS_TOOLTIP;If the checkbox is checked, RawTherapee calculates a value based on the raw data of the image.
 TP_PERSPECTIVE_HORIZONTAL;Horizontal
 TP_PERSPECTIVE_LABEL;Perspective
 TP_PERSPECTIVE_VERTICAL;Vertical
@@ -2038,7 +2037,7 @@ TP_SHARPENING_LABEL;Sharpening
 TP_SHARPENING_METHOD;Method
 TP_SHARPENING_ONLYEDGES;Sharpen only edges
 TP_SHARPENING_RADIUS;Radius
-TP_SHARPENING_RADIUS_OFFSET;Radius corner offset
+TP_SHARPENING_RADIUS_BOOST;Corner radius boost
 TP_SHARPENING_RLD;RL Deconvolution
 TP_SHARPENING_RLD_AMOUNT;Amount
 TP_SHARPENING_RLD_DAMPING;Damping
diff --git a/rtgui/pdsharpening.cc b/rtgui/pdsharpening.cc
index f25e44e69..cd34a466e 100644
--- a/rtgui/pdsharpening.cc
+++ b/rtgui/pdsharpening.cc
@@ -26,14 +26,14 @@
 using namespace rtengine;
 using namespace rtengine::procparams;
 
-PdSharpening::PdSharpening() : FoldableToolPanel(this, "pdsharpening", M("TP_PDSHARPENING_LABEL"), false, true)
+PdSharpening::PdSharpening() : FoldableToolPanel(this, "capturesharpening", M("TP_PDSHARPENING_LABEL"), false, true)
 {
 
     auto m = ProcEventMapper::getInstance();
     EvPdShrContrast = m->newEvent(CAPTURESHARPEN, "HISTORY_MSG_PDSHARPEN_CONTRAST");
     EvPdSharpenGamma = m->newEvent(CAPTURESHARPEN, "HISTORY_MSG_PDSHARPEN_GAMMA");
     EvPdShrDRadius = m->newEvent(CAPTURESHARPEN, "HISTORY_MSG_PDSHARPEN_RADIUS");
-    EvPdShrDRadiusOffset = m->newEvent(CAPTURESHARPEN, "HISTORY_MSG_PDSHARPEN_RADIUS_OFFSET");
+    EvPdShrDRadiusOffset = m->newEvent(CAPTURESHARPEN, "HISTORY_MSG_PDSHARPEN_RADIUS_BOOST");
     EvPdShrDIterations = m->newEvent(CAPTURESHARPEN, "HISTORY_MSG_PDSHARPEN_ITERATIONS");
     EvPdShrAutoContrast = m->newEvent(CAPTURESHARPEN, "HISTORY_MSG_PDSHARPEN_AUTO_CONTRAST");
     EvPdShrAutoRadius = m->newEvent(CAPTURESHARPEN, "HISTORY_MSG_PDSHARPEN_AUTO_RADIUS");
@@ -42,7 +42,7 @@ PdSharpening::PdSharpening() : FoldableToolPanel(this, "pdsharpening", M("TP_PDS
     hb->show();
     contrast = Gtk::manage(new Adjuster(M("TP_SHARPENING_CONTRAST"), 0, 200, 1, 10));
     contrast->setAdjusterListener(this);
-    contrast->addAutoButton(M("TP_RAW_DUALDEMOSAICAUTOCONTRAST_TOOLTIP"));
+    contrast->addAutoButton();
     contrast->setAutoValue(true);
 
     pack_start(*contrast);
@@ -53,9 +53,9 @@ PdSharpening::PdSharpening() : FoldableToolPanel(this, "pdsharpening", M("TP_PDS
     Gtk::VBox* rld = Gtk::manage(new Gtk::VBox());
     gamma = Gtk::manage(new Adjuster(M("TP_SHARPENING_GAMMA"), 0.5, 6.0, 0.05, 1.00));
     dradius = Gtk::manage(new Adjuster(M("TP_SHARPENING_RADIUS"), 0.4, 1.15, 0.01, 0.75));
-    dradius->addAutoButton(M("TP_PDSHARPENING_AUTORADIUS_TOOLTIP"));
+    dradius->addAutoButton();
     dradius->setAutoValue(true);
-    dradiusOffset = Gtk::manage(new Adjuster(M("TP_SHARPENING_RADIUS_OFFSET"), 0.0, 0.5, 0.01, 0.0));
+    dradiusOffset = Gtk::manage(new Adjuster(M("TP_SHARPENING_RADIUS_BOOST"), 0.0, 0.5, 0.01, 0.0));
     diter = Gtk::manage(new Adjuster(M("TP_SHARPENING_RLD_ITERATIONS"), 1, 100, 1, 20));
     rld->pack_start(*gamma);
     rld->pack_start(*dradius);

From e62b004434e72c042bd8d05dabae74417482feee Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Mon, 23 Sep 2019 22:14:52 +0200
Subject: [PATCH 23/31] dehaze: removed benchmark code

---
 rtengine/ipdehaze.cc | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/rtengine/ipdehaze.cc b/rtengine/ipdehaze.cc
index d36c876c2..e7bf71ba6 100644
--- a/rtengine/ipdehaze.cc
+++ b/rtengine/ipdehaze.cc
@@ -37,8 +37,6 @@
 #include "procparams.h"
 #include "rescale.h"
 #include "rt_math.h"
-#define BENCHMARK
-#include "StopWatch.h"
 
 extern Options options;
 
@@ -277,7 +275,7 @@ void ImProcFunctions::dehaze(Imagefloat *img)
     if (!params->dehaze.enabled || params->dehaze.strength == 0.0) {
         return;
     }
-BENCHFUN
+
     const float maxChannel = normalize(img, multiThread);
 
     const int W = img->getWidth();

From 5b72cc0dd3ce0288408ee0a1b0785f7c603a059e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fl=C3=B6ssie?= <floessie.mail@gmail.com>
Date: Tue, 24 Sep 2019 09:59:31 +0200
Subject: [PATCH 24/31] Quote parameters correctly for Linux when spawning
 (#5463)

---
 rtgui/extprog.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/rtgui/extprog.cc b/rtgui/extprog.cc
index a6a9050c0..95c1c937d 100644
--- a/rtgui/extprog.cc
+++ b/rtgui/extprog.cc
@@ -58,7 +58,7 @@ bool ExtProgAction::execute (const std::vector<Glib::ustring>& fileNames) const
     }
 
     for (const auto& fileName : fileNames) {
-        cmdLine += " \"" + fileName + "\"";
+        cmdLine += " " + Glib::shell_quote(fileName);
     }
 
     return ExtProgStore::spawnCommandAsync (cmdLine);
@@ -256,7 +256,7 @@ bool ExtProgStore::openInGimp (const Glib::ustring& fileName)
 
 #else
 
-    auto cmdLine = Glib::ustring("gimp \"") + fileName + Glib::ustring("\"");
+    auto cmdLine = Glib::ustring("gimp ") + Glib::shell_quote(fileName);
     auto success = spawnCommandAsync (cmdLine);
 
 #endif
@@ -291,7 +291,7 @@ bool ExtProgStore::openInGimp (const Glib::ustring& fileName)
 
 #else
 
-    cmdLine = Glib::ustring("gimp-remote \"") + fileName + Glib::ustring("\"");
+    cmdLine = Glib::ustring("gimp-remote ") + Glib::shell_quote(fileName);
     success = ExtProgStore::spawnCommandAsync (cmdLine);
 
 #endif
@@ -312,7 +312,7 @@ bool ExtProgStore::openInPhotoshop (const Glib::ustring& fileName)
 
 #else
 
-    const auto cmdLine = Glib::ustring("\"") + Glib::build_filename(options.psDir, "Photoshop.exe") + Glib::ustring("\" \"") + fileName + Glib::ustring("\"");
+    const auto cmdLine = Glib::ustring("\"") + Glib::build_filename(options.psDir, "Photoshop.exe") + "\" " + Glib::shell_quote(fileName);
 
 #endif
 
@@ -334,7 +334,7 @@ bool ExtProgStore::openInCustomEditor (const Glib::ustring& fileName)
 
 #else
 
-    const auto cmdLine = Glib::ustring("\"") + options.customEditorProg + Glib::ustring("\" \"") + fileName + Glib::ustring("\"");
+    const auto cmdLine = Glib::ustring("\"") + options.customEditorProg + "\" " + Glib::shell_quote(fileName);
     return spawnCommandAsync (cmdLine);
 
 #endif

From 54ca2977c3ee3219d96b84fbdb0a2bbbce4f1af7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fl=C3=B6ssie?= <floessie.mail@gmail.com>
Date: Tue, 24 Sep 2019 14:54:13 +0200
Subject: [PATCH 25/31] Add missing inits and header to `PdSharpening`

---
 rtgui/pdsharpening.cc | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/rtgui/pdsharpening.cc b/rtgui/pdsharpening.cc
index cd34a466e..3134afa1c 100644
--- a/rtgui/pdsharpening.cc
+++ b/rtgui/pdsharpening.cc
@@ -18,17 +18,23 @@
 */
 
 #include <cmath>
-#include "eventmapper.h"
+#include <iomanip>
+
 #include "pdsharpening.h"
+
+#include "eventmapper.h"
 #include "options.h"
+
 #include "../rtengine/procparams.h"
 
 using namespace rtengine;
 using namespace rtengine::procparams;
 
-PdSharpening::PdSharpening() : FoldableToolPanel(this, "capturesharpening", M("TP_PDSHARPENING_LABEL"), false, true)
+PdSharpening::PdSharpening() :
+    FoldableToolPanel(this, "capturesharpening", M("TP_PDSHARPENING_LABEL"), false, true),
+    lastAutoContrast(true),
+    lastAutoRadius(true)
 {
-
     auto m = ProcEventMapper::getInstance();
     EvPdShrContrast = m->newEvent(CAPTURESHARPEN, "HISTORY_MSG_PDSHARPEN_CONTRAST");
     EvPdSharpenGamma = m->newEvent(CAPTURESHARPEN, "HISTORY_MSG_PDSHARPEN_GAMMA");

From 26bfb526bf8abd87b2f52022bb7537f98ab5a8ba Mon Sep 17 00:00:00 2001
From: Morgan Hardwood <bugs@londonlight.org>
Date: Tue, 24 Sep 2019 17:52:12 +0200
Subject: [PATCH 26/31] Deleted obsolete travis.yml file

---
 .travis.yml.fixme | 44 --------------------------------------------
 1 file changed, 44 deletions(-)
 delete mode 100644 .travis.yml.fixme

diff --git a/.travis.yml.fixme b/.travis.yml.fixme
deleted file mode 100644
index 0aa85f3b4..000000000
--- a/.travis.yml.fixme
+++ /dev/null
@@ -1,44 +0,0 @@
-sudo: required
-dist: trusty
-
-language: cpp
-
-compiler:
-  - gcc
-
-os:
-  - linux
-
-#branches:
-#  only:
-#  - master
-
-notifications:
-  irc:
-    channels:
-    - "chat.freenode.net#rawtherapee"
-    skip_join: true
-    template:
-    - "%{repository}/%{branch} (%{commit} - %{author}): %{build_url}: %{message}"
-  email:
-    on_success: change
-    on_failure: always
-
-env:
-  global:
-  - OMP_NUM_THREADS=4
-
-before_install:
-  - sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y
-  - sudo add-apt-repository "deb http://archive.ubuntu.com/ubuntu/ xenial main"
-  - sudo apt-get -qq update
-  - sudo apt-get install gcc-6 g++-6
-  - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-6 60 --slave /usr/bin/g++ g++ /usr/bin/g++-6
-  - sudo apt-get install build-essential cmake curl git libbz2-dev libcanberra-gtk3-dev libexiv2-dev libexpat-dev libfftw3-dev libglibmm-2.4-dev libgtk-3-dev libgtkmm-3.0-dev libiptcdata0-dev libjpeg8-dev liblcms2-dev libpng12-dev libsigc++-2.0-dev libtiff5-dev zlib1g-dev
-
-before_script:
-  - mkdir build
-  - cd build
-  - cmake -DCMAKE_CXX_FLAGS="-Wno-deprecated-declarations" -DWITH_LTO="OFF" -DPROC_TARGET_NUMBER="2" ..
-
-script: make

From 79b3ff8e6e559e85c5eec1a80460f5da4212447e Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Tue, 24 Sep 2019 18:59:02 +0200
Subject: [PATCH 27/31] capture sharpening: allow negative corner boost

---
 rtengine/capturesharpening.cc | 17 ++++++++++-------
 rtgui/pdsharpening.cc         |  2 +-
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/rtengine/capturesharpening.cc b/rtengine/capturesharpening.cc
index e08243713..ef8a55a6e 100644
--- a/rtengine/capturesharpening.cc
+++ b/rtengine/capturesharpening.cc
@@ -581,14 +581,17 @@ BENCHFUN
                         gauss5x5mult(tmpThr, tmpIThr, fullTileSize, fullTileSize, kernel5);
                     }
                 } else {
-                    if (sigmaCornerOffset > 0.0) {
-                        float lkernel7[7][7];
+                    if (sigmaCornerOffset != 0.0) {
                         const float distance = sqrt(rtengine::SQR(i + tileSize / 2 - H / 2) + rtengine::SQR(j + tileSize / 2 - W / 2));
-                        compute7x7kernel(sigma + distanceFactor * distance, lkernel7);
-                        for (int k = 0; k < iterations - 1; ++k) {
-                            // apply 7x7 gaussian blur and divide luminance by result of gaussian blur
-                            gauss7x7div(tmpIThr, tmpThr, lumThr, fullTileSize, fullTileSize, lkernel7);
-                            gauss7x7mult(tmpThr, tmpIThr, fullTileSize, fullTileSize, lkernel7);
+                        const float sigmaTile = sigma + distanceFactor * distance;
+                        if (sigmaTile >= 0.4f) {
+                            float lkernel7[7][7];
+                            compute7x7kernel(sigma + distanceFactor * distance, lkernel7);
+                            for (int k = 0; k < iterations - 1; ++k) {
+                                // apply 7x7 gaussian blur and divide luminance by result of gaussian blur
+                                gauss7x7div(tmpIThr, tmpThr, lumThr, fullTileSize, fullTileSize, lkernel7);
+                                gauss7x7mult(tmpThr, tmpIThr, fullTileSize, fullTileSize, lkernel7);
+                            }
                         }
                     } else {
                         for (int k = 0; k < iterations; ++k) {
diff --git a/rtgui/pdsharpening.cc b/rtgui/pdsharpening.cc
index cd34a466e..759461ba5 100644
--- a/rtgui/pdsharpening.cc
+++ b/rtgui/pdsharpening.cc
@@ -55,7 +55,7 @@ PdSharpening::PdSharpening() : FoldableToolPanel(this, "capturesharpening", M("T
     dradius = Gtk::manage(new Adjuster(M("TP_SHARPENING_RADIUS"), 0.4, 1.15, 0.01, 0.75));
     dradius->addAutoButton();
     dradius->setAutoValue(true);
-    dradiusOffset = Gtk::manage(new Adjuster(M("TP_SHARPENING_RADIUS_BOOST"), 0.0, 0.5, 0.01, 0.0));
+    dradiusOffset = Gtk::manage(new Adjuster(M("TP_SHARPENING_RADIUS_BOOST"), -0.5, 0.5, 0.01, 0.0));
     diter = Gtk::manage(new Adjuster(M("TP_SHARPENING_RLD_ITERATIONS"), 1, 100, 1, 20));
     rld->pack_start(*gamma);
     rld->pack_start(*dradius);

From 851a12e165c50333dcd779f78ea0658c38798972 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Tue, 24 Sep 2019 19:18:26 +0200
Subject: [PATCH 28/31] capture sharpening: more clear variable names

---
 rtengine/capturesharpening.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/rtengine/capturesharpening.cc b/rtengine/capturesharpening.cc
index ef8a55a6e..6720d9abc 100644
--- a/rtengine/capturesharpening.cc
+++ b/rtengine/capturesharpening.cc
@@ -532,9 +532,9 @@ BENCHFUN
     constexpr int tileSize = 194;
     constexpr int border = 5;
     constexpr int fullTileSize = tileSize + 2 * border;
-    const float maxRadius = std::min<float>(1.15f, sigma + sigmaCornerOffset);
-    const float maxDistance = sqrt(rtengine::SQR(W * 0.5f) + rtengine::SQR(H * 0.5f));
-    const float distanceFactor = (maxRadius - sigma) / maxDistance;
+    const float cornerRadius = std::min<float>(1.15f, sigma + sigmaCornerOffset);
+    const float cornerDistance = sqrt(rtengine::SQR(W * 0.5f) + rtengine::SQR(H * 0.5f));
+    const float distanceFactor = (cornerRadius - sigma) / cornerDistance;
 
     double progress = startVal;
     const double progressStep = (endVal - startVal) * rtengine::SQR(tileSize) / (W * H);

From 5a19632475eda7958d93a9c9ce94fa3841fca7b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fl=C3=B6ssie?= <floessie.mail@gmail.com>
Date: Wed, 25 Sep 2019 11:33:56 +0200
Subject: [PATCH 29/31] Fix non-SSE2 build

---
 rtengine/color.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/rtengine/color.h b/rtengine/color.h
index 1031ca150..97835ba10 100644
--- a/rtengine/color.h
+++ b/rtengine/color.h
@@ -210,10 +210,12 @@ public:
         return r * workingspace[1][0] + g * workingspace[1][1] + b * workingspace[1][2];
     }
 
+#ifdef __SSE2__
     static vfloat rgbLuminance(vfloat r, vfloat g, vfloat b, const vfloat workingspace[3])
     {
         return r * workingspace[0] + g * workingspace[1] + b * workingspace[2];
     }
+#endif
 
     /**
     * @brief Convert red/green/blue to L*a*b

From 277c494fefc69a63ddd970af217ef4499616bdf5 Mon Sep 17 00:00:00 2001
From: Morgan Hardwood <bugs@londonlight.org>
Date: Wed, 25 Sep 2019 11:54:30 +0200
Subject: [PATCH 30/31] Downgrade desktop file to Version=1.0, #5470

Version=1.1 caused Travis CI builds to fail.

Version 1.1 was necessary only to standardize the Keywords key,
but this key is a potential pitfall and unnecessary.
---
 rtdata/rawtherapee.desktop.in | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/rtdata/rawtherapee.desktop.in b/rtdata/rawtherapee.desktop.in
index c6c675c4d..107ce2a42 100644
--- a/rtdata/rawtherapee.desktop.in
+++ b/rtdata/rawtherapee.desktop.in
@@ -1,6 +1,6 @@
 [Desktop Entry]
 Type=Application
-Version=1.1
+Version=1.0
 Name=RawTherapee
 GenericName=Raw Photo Editor
 GenericName[cs]=Editor raw obrázků
@@ -16,5 +16,4 @@ Exec=rawtherapee %f
 Terminal=false
 MimeType=image/jpeg;image/png;image/tiff;image/x-adobe-dng;image/x-canon-cr2;image/x-canon-crf;image/x-canon-crw;image/x-fuji-raf;image/x-hasselblad-3fr;image/x-hasselblad-fff;image/x-jpg;image/x-kodak-dcr;image/x-kodak-k25;image/x-kodak-kdc;image/x-leaf-mos;image/x-leica-rwl;image/x-mamiya-mef;image/x-minolta-mrw;image/x-nikon-nef;image/x-nikon-nrw;image/x-olympus-orf;image/x-panasonic-raw;image/x-panasonic-rw2;image/x-pentax-pef;image/x-pentax-raw;image/x-phaseone-iiq;image/x-raw;image/x-rwz;image/x-samsung-srw;image/x-sigma-x3f;image/x-sony-arq;image/x-sony-arw;image/x-sony-sr2;image/x-sony-srf;image/x-tif;
 Categories=Graphics;Photography;2DGraphics;RasterGraphics;GTK;
-Keywords=raw;photo;photography;develop;pp3;graphics;
 StartupWMClass=rawtherapee

From b86b7a4af6f9b7248d9d3af0bc77395475a7721c Mon Sep 17 00:00:00 2001
From: Morgan Hardwood <bugs@londonlight.org>
Date: Wed, 25 Sep 2019 12:08:43 +0200
Subject: [PATCH 31/31] Reverted Keywords key in desktop file, #5470

---
 rtdata/rawtherapee.desktop.in | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rtdata/rawtherapee.desktop.in b/rtdata/rawtherapee.desktop.in
index 107ce2a42..b059e7d6a 100644
--- a/rtdata/rawtherapee.desktop.in
+++ b/rtdata/rawtherapee.desktop.in
@@ -16,4 +16,5 @@ Exec=rawtherapee %f
 Terminal=false
 MimeType=image/jpeg;image/png;image/tiff;image/x-adobe-dng;image/x-canon-cr2;image/x-canon-crf;image/x-canon-crw;image/x-fuji-raf;image/x-hasselblad-3fr;image/x-hasselblad-fff;image/x-jpg;image/x-kodak-dcr;image/x-kodak-k25;image/x-kodak-kdc;image/x-leaf-mos;image/x-leica-rwl;image/x-mamiya-mef;image/x-minolta-mrw;image/x-nikon-nef;image/x-nikon-nrw;image/x-olympus-orf;image/x-panasonic-raw;image/x-panasonic-rw2;image/x-pentax-pef;image/x-pentax-raw;image/x-phaseone-iiq;image/x-raw;image/x-rwz;image/x-samsung-srw;image/x-sigma-x3f;image/x-sony-arq;image/x-sony-arw;image/x-sony-sr2;image/x-sony-srf;image/x-tif;
 Categories=Graphics;Photography;2DGraphics;RasterGraphics;GTK;
+Keywords=raw;photo;photography;develop;pp3;graphics;
 StartupWMClass=rawtherapee