diff --git a/rtdata/languages/default b/rtdata/languages/default
index 44b7aa226..13cbe64f5 100644
--- a/rtdata/languages/default
+++ b/rtdata/languages/default
@@ -1006,6 +1006,7 @@ HISTORY_MSG_COLORTONING_LABREGION_SHOWMASK;CT - region show mask
 HISTORY_MSG_COLORTONING_LABREGION_SLOPE;CT - region slope
 HISTORY_MSG_DEHAZE_DEPTH;Dehaze - Depth
 HISTORY_MSG_DEHAZE_ENABLED;Haze Removal
+HISTORY_MSG_DEHAZE_LUMINANCE;Dehaze - Luminance only
 HISTORY_MSG_DEHAZE_SHOW_DEPTH_MAP;Dehaze - Show depth map
 HISTORY_MSG_DEHAZE_STRENGTH;Dehaze - Strength
 HISTORY_MSG_DUALDEMOSAIC_AUTO_CONTRAST;Dual demosaic - Auto threshold
@@ -1032,7 +1033,7 @@ HISTORY_MSG_PDSHARPEN_AUTO_RADIUS;CAS - Auto radius
 HISTORY_MSG_PDSHARPEN_GAMMA;CAS - Gamma
 HISTORY_MSG_PDSHARPEN_ITERATIONS;CAS - Iterations
 HISTORY_MSG_PDSHARPEN_RADIUS;CAS - Radius
-HISTORY_MSG_PDSHARPEN_RADIUS_OFFSET;CAS - Radius offset
+HISTORY_MSG_PDSHARPEN_RADIUS_BOOST;CAS - Corner radius boost
 HISTORY_MSG_PIXELSHIFT_DEMOSAIC;PS - Demosaic method for motion
 HISTORY_MSG_PREPROCESS_LINEDENOISE_DIRECTION;Line noise filter direction
 HISTORY_MSG_PREPROCESS_PDAFLINESFILTER;PDAF lines filter
@@ -1807,6 +1808,7 @@ TP_DEFRINGE_RADIUS;Radius
 TP_DEFRINGE_THRESHOLD;Threshold
 TP_DEHAZE_DEPTH;Depth
 TP_DEHAZE_LABEL;Haze Removal
+TP_DEHAZE_LUMINANCE;Luminance only
 TP_DEHAZE_SHOW_DEPTH_MAP;Show depth map
 TP_DEHAZE_STRENGTH;Strength
 TP_DIRPYRDENOISE_CHROMINANCE_AMZ;Auto multi-zones
@@ -2202,7 +2204,7 @@ TP_LOCALLAB_RETIFRA;Retinex
 TP_LOCALLAB_RETI;Dehaze - Retinex Strong local contrast
 TP_LOCALLAB_RETI_NEIGH_VART_TOOLTIP;Adapt these values according to images - if misty images and depending on whether you want to act on the front or the background
 TP_LOCALLAB_LC_FFTW_TOOLTIP;FFT improve quality and allow big radius.\nThe treatment time depends on the surface to be treated.\nTo be used preferably for large radius.\n\nDimensions can be reduced by a few pixels to optimize FFTW.
-TP_LOCALLAB_RETI_FFTW_TOOLTIP;FFT improve quality and allow big radius.\nThe treatment time depends on the surface to be treated.\nTo be used preferably for large radius.\n\nDimensions can be reduced by a few pixels to optimize FFTW.\nThis optimization is not possible if the 2 delimiters of the Spot (vertical or horizontal) are outside the image
+TP_LOCALLAB_RETI_FFTW_TOOLTIP;FFT improve quality and allow big radius.\nThe treatment time depends on the surface to be treated.\nThe treatment time depends on the value of scale (be careful with high values).\nTo be used preferably for large radius.\n\nDimensions can be reduced by a few pixels to optimize FFTW.\nThis optimization is not possible if the 2 delimiters of the Spot (vertical or horizontal) are outside the image
 TP_LOCALLAB_TRANSMISSIONGAIN;Transmission gain
 TP_LOCALLAB_STREN;Compression Strength
 TP_LOCALLAB_STRGRID;Strength
@@ -2326,7 +2328,6 @@ TP_PCVIGNETTE_ROUNDNESS_TOOLTIP;Roundness:\n0 = rectangle,\n50 = fitted ellipse,
 TP_PCVIGNETTE_STRENGTH;Strength
 TP_PCVIGNETTE_STRENGTH_TOOLTIP;Filter strength in stops (reached in corners).
 TP_PDSHARPENING_LABEL;Capture Sharpening
-TP_PDSHARPENING_AUTORADIUS_TOOLTIP;If the checkbox is checked, RawTherapee calculates a value based on the raw data of the image.
 TP_PERSPECTIVE_HORIZONTAL;Horizontal
 TP_PERSPECTIVE_LABEL;Perspective
 TP_PERSPECTIVE_VERTICAL;Vertical
@@ -2562,7 +2563,7 @@ TP_SHARPENING_LABEL;Sharpening
 TP_SHARPENING_METHOD;Method
 TP_SHARPENING_ONLYEDGES;Sharpen only edges
 TP_SHARPENING_RADIUS;Radius
-TP_SHARPENING_RADIUS_OFFSET;Radius corner offset
+TP_SHARPENING_RADIUS_BOOST;Corner radius boost
 TP_SHARPENING_RLD;RL Deconvolution
 TP_SHARPENING_RLD_AMOUNT;Amount
 TP_SHARPENING_RLD_DAMPING;Damping
diff --git a/rtengine/boxblur.h b/rtengine/boxblur.h
index da302964b..27aa9d2fc 100644
--- a/rtengine/boxblur.h
+++ b/rtengine/boxblur.h
@@ -20,12 +20,14 @@
 #define _BOXBLUR_H_
 
 #include <assert.h>
+#include <memory>
 #include <stdlib.h>
 #include <string.h>
 #include <math.h>
 #include "alignedbuffer.h"
 #include "rt_math.h"
 #include "opthelper.h"
+#include "StopWatch.h"
 
 
 namespace rtengine
@@ -204,15 +206,15 @@ template<class T, class A> void boxblur (T** src, A** dst, T* buffer, int radx,
 
             tempv = tempv / lenv;
             temp1v = temp1v / lenv;
-            STVFU( dst[0][col], tempv);
-            STVFU( dst[0][col + 4], temp1v);
+            STVFU(dst[0][col], tempv);
+            STVFU(dst[0][col + 4], temp1v);
 
             for (int row = 1; row <= rady; row++) {
                 lenp1v = lenv + onev;
                 tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v;
                 temp1v = (temp1v * lenv + LVFU(temp[(row + rady) * W + col + 4])) / lenp1v;
-                STVFU( dst[row][col], tempv);
-                STVFU( dst[row][col + 4], temp1v);
+                STVFU(dst[row][col], tempv);
+                STVFU(dst[row][col + 4], temp1v);
                 lenv = lenp1v;
             }
 
@@ -221,16 +223,16 @@ template<class T, class A> void boxblur (T** src, A** dst, T* buffer, int radx,
             for (int row = rady + 1; row < H - rady; row++) {
                 tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ;
                 temp1v = temp1v + (LVFU(temp[(row + rady) * W + col + 4]) - LVFU(temp[(row - rady - 1) * W + col + 4])) * rlenv ;
-                STVFU( dst[row][col], tempv);
-                STVFU( dst[row][col + 4], temp1v);
+                STVFU(dst[row][col], tempv);
+                STVFU(dst[row][col + 4], temp1v);
             }
 
             for (int row = H - rady; row < H; row++) {
                 lenm1v = lenv - onev;
                 tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v;
                 temp1v = (temp1v * lenv - LVFU(temp[(row - rady - 1) * W + col + 4])) / lenm1v;
-                STVFU( dst[row][col], tempv);
-                STVFU( dst[row][col + 4], temp1v);
+                STVFU(dst[row][col], tempv);
+                STVFU(dst[row][col + 4], temp1v);
                 lenv = lenm1v;
             }
         }
@@ -312,6 +314,223 @@ template<class T, class A> void boxblur (T** src, A** dst, T* buffer, int radx,
 
 }
 
+inline void boxblur (float** src, float** dst, int radius, int W, int H, bool multiThread)
+{
+    // Box blur (window 2 * radius + 1 per axis, shortened at the borders) using per-thread line/row buffers instead of a full size buffer; src may equal dst
+    // No bounds checks are done here: the indexing below requires 2 * radius < min(W, H), which the caller has to guarantee
+    if (radius == 0) {
+        if (src != dst) {
+#ifdef _OPENMP
+            #pragma omp parallel for if (multiThread)
+#endif
+
+            for (int row = 0; row < H; row++) {
+                for (int col = 0; col < W; col++) {
+                    dst[row][col] = src[row][col];
+                }
+            }
+        }
+        return;
+    }
+
+    constexpr int numCols = 8; // process numCols columns at once for better usage of L1 cpu cache
+#ifdef _OPENMP
+    #pragma omp parallel if (multiThread)
+#endif
+    {
+        std::unique_ptr<float[]> buffer(new float[std::max(W, 8 * H)]); // array form so that delete[] is used; big enough for one image line (W) or 8 columns (8 * H)
+
+        //horizontal blur
+        float* const lineBuffer = buffer.get(); // keeps the original values of the current line, needed when src == dst
+#ifdef _OPENMP
+        #pragma omp for
+#endif
+        for (int row = 0; row < H; row++) {
+            float len = radius + 1;
+            float tempval = src[row][0];
+            lineBuffer[0] = tempval;
+            for (int j = 1; j <= radius; j++) {
+                tempval += src[row][j];
+            }
+
+            tempval /= len;
+            dst[row][0] = tempval;
+
+            for (int col = 1; col <= radius; col++) {
+                lineBuffer[col] = src[row][col];
+                tempval = (tempval * len + src[row][col + radius]) / (len + 1);
+                dst[row][col] = tempval;
+                ++len;
+            }
+
+            for (int col = radius + 1; col < W - radius; col++) {
+                lineBuffer[col] = src[row][col];
+                dst[row][col] = tempval = tempval + (src[row][col + radius] - lineBuffer[col - radius - 1]) / len;
+            }
+
+            for (int col = W - radius; col < W; col++) {
+                dst[row][col] = tempval = (tempval * len - lineBuffer[col - radius - 1]) / (len - 1);
+                --len;
+            }
+        }
+
+        //vertical blur, in place on dst
+#ifdef __SSE2__
+        vfloat (* const rowBuffer)[2] = (vfloat(*)[2]) buffer.get(); // same buffer viewed as one pair of vfloat (8 floats) per image row
+        const vfloat leninitv = F2V(radius + 1);
+        const vfloat onev = F2V(1.f);
+        vfloat tempv, temp1v, lenv, lenp1v, lenm1v, rlenv;
+
+#ifdef _OPENMP
+        #pragma omp for nowait
+#endif
+
+        for (int col = 0; col < W - 7; col += 8) {
+            lenv = leninitv;
+            tempv = LVFU(dst[0][col]);
+            temp1v = LVFU(dst[0][col + 4]);
+            rowBuffer[0][0] = tempv;
+            rowBuffer[0][1] = temp1v;
+
+            for (int i = 1; i <= radius; i++) {
+                tempv = tempv + LVFU(dst[i][col]);
+                temp1v = temp1v + LVFU(dst[i][col + 4]);
+            }
+
+            tempv = tempv / lenv;
+            temp1v = temp1v / lenv;
+            STVFU(dst[0][col], tempv);
+            STVFU(dst[0][col + 4], temp1v);
+
+            for (int row = 1; row <= radius; row++) {
+                rowBuffer[row][0] = LVFU(dst[row][col]);
+                rowBuffer[row][1] = LVFU(dst[row][col + 4]);
+                lenp1v = lenv + onev;
+                tempv = (tempv * lenv + LVFU(dst[row + radius][col])) / lenp1v;
+                temp1v = (temp1v * lenv + LVFU(dst[row + radius][col + 4])) / lenp1v;
+                STVFU(dst[row][col], tempv);
+                STVFU(dst[row][col + 4], temp1v);
+                lenv = lenp1v;
+            }
+
+            rlenv = onev / lenv;
+
+            for (int row = radius + 1; row < H - radius; row++) {
+                rowBuffer[row][0] = LVFU(dst[row][col]);
+                rowBuffer[row][1] = LVFU(dst[row][col + 4]);
+                tempv = tempv + (LVFU(dst[row + radius][col]) - rowBuffer[row - radius - 1][0]) * rlenv ;
+                temp1v = temp1v + (LVFU(dst[row + radius][col + 4]) - rowBuffer[row - radius - 1][1]) * rlenv ;
+                STVFU(dst[row][col], tempv);
+                STVFU(dst[row][col + 4], temp1v);
+            }
+
+            for (int row = H - radius; row < H; row++) {
+                lenm1v = lenv - onev;
+                tempv = (tempv * lenv - rowBuffer[row - radius - 1][0]) / lenm1v;
+                temp1v = (temp1v * lenv - rowBuffer[row - radius - 1][1]) / lenm1v;
+                STVFU(dst[row][col], tempv);
+                STVFU(dst[row][col + 4], temp1v);
+                lenv = lenm1v;
+            }
+        }
+
+#else
+        float (* const rowBuffer)[8] = (float(*)[8]) buffer.get();
+#ifdef _OPENMP
+        #pragma omp for nowait
+#endif
+
+        for (int col = 0; col < W - numCols + 1; col += 8) {
+            float len = radius + 1;
+
+            for (int k = 0; k < numCols; k++) {
+                rowBuffer[0][k] = dst[0][col + k];
+            }
+
+            for (int i = 1; i <= radius; i++) {
+                for (int k = 0; k < numCols; k++) {
+                    dst[0][col + k] += dst[i][col + k];
+                }
+            }
+
+            for(int k = 0; k < numCols; k++) {
+                dst[0][col + k] /= len;
+            }
+
+            for (int row = 1; row <= radius; row++) {
+                for(int k = 0; k < numCols; k++) {
+                    rowBuffer[row][k] = dst[row][col + k];
+                    dst[row][col + k] = (dst[row - 1][col + k] * len + dst[row + radius][col + k]) / (len + 1);
+                }
+
+                len ++;
+            }
+
+            for (int row = radius + 1; row < H - radius; row++) {
+                for(int k = 0; k < numCols; k++) {
+                    rowBuffer[row][k] = dst[row][col + k];
+                    dst[row][col + k] = dst[row - 1][col + k] + (dst[row + radius][col + k] - rowBuffer[row - radius - 1][k]) / len;
+                }
+            }
+
+            for (int row = H - radius; row < H; row++) {
+                for(int k = 0; k < numCols; k++) {
+                    dst[row][col + k] = (dst[row - 1][col + k] * len - rowBuffer[row - radius - 1][k]) / (len - 1);
+                }
+
+                len --;
+            }
+        }
+
+#endif
+        //vertical blur, remaining W % numCols columns (processed by a single thread)
+#ifdef _OPENMP
+        #pragma omp single
+#endif
+        {
+            const int remaining = W % numCols;
+
+            if (remaining > 0) {
+                float (* const rowBuffer)[8] = (float(*)[8]) buffer.get();
+                const int col = W - remaining;
+
+                float len = radius + 1;
+                for(int k = 0; k < remaining; ++k) {
+                    rowBuffer[0][k] = dst[0][col + k];
+                }
+                for (int row = 1; row <= radius; ++row) {
+                    for(int k = 0; k < remaining; ++k) {
+                        dst[0][col + k] += dst[row][col + k];
+                    }
+                }
+                for(int k = 0; k < remaining; ++k) {
+                    dst[0][col + k] /= len;
+                }
+                for (int row = 1; row <= radius; ++row) {
+                    for(int k = 0; k < remaining; ++k) {
+                        rowBuffer[row][k] = dst[row][col + k];
+                        dst[row][col + k] = (dst[row - 1][col + k] * len + dst[row + radius][col + k]) / (len + 1);
+                    }
+                    len ++;
+                }
+                const float rlen = 1.f / len;
+                for (int row = radius + 1; row < H - radius; ++row) {
+                    for(int k = 0; k < remaining; ++k) {
+                        rowBuffer[row][k] = dst[row][col + k];
+                        dst[row][col + k] = dst[row - 1][col + k] + (dst[row + radius][col + k] - rowBuffer[row - radius - 1][k]) * rlen;
+                    }
+                }
+                for (int row = H - radius; row < H; ++row) {
+                    for(int k = 0; k < remaining; ++k) {
+                        dst[row][col + k] = (dst[(row - 1)][col + k] * len - rowBuffer[row - radius - 1][k]) / (len - 1);
+                    }
+                    len --;
+                }
+            }
+        }
+    }
+}
+
 template<class T, class A> void boxblur (T* src, A* dst, A* buffer, int radx, int rady, int W, int H)
 {
     //box blur image; box range = (radx,rady) i.e. box size is (2*radx+1)x(2*rady+1)
@@ -382,15 +601,15 @@ template<class T, class A> void boxblur (T* src, A* dst, A* buffer, int radx, in
 
             tempv = tempv / lenv;
             temp1v = temp1v / lenv;
-            STVFU( dst[0 * W + col], tempv);
-            STVFU( dst[0 * W + col + 4], temp1v);
+            STVFU(dst[0 * W + col], tempv);
+            STVFU(dst[0 * W + col + 4], temp1v);
 
             for (int row = 1; row <= rady; row++) {
                 lenp1v = lenv + onev;
                 tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v;
                 temp1v = (temp1v * lenv + LVFU(temp[(row + rady) * W + col + 4])) / lenp1v;
-                STVFU( dst[row * W + col], tempv);
-                STVFU( dst[row * W + col + 4], temp1v);
+                STVFU(dst[row * W + col], tempv);
+                STVFU(dst[row * W + col + 4], temp1v);
                 lenv = lenp1v;
             }
 
@@ -399,16 +618,16 @@ template<class T, class A> void boxblur (T* src, A* dst, A* buffer, int radx, in
             for (int row = rady + 1; row < H - rady; row++) {
                 tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ;
                 temp1v = temp1v + (LVFU(temp[(row + rady) * W + col + 4]) - LVFU(temp[(row - rady - 1) * W + col + 4])) * rlenv ;
-                STVFU( dst[row * W + col], tempv);
-                STVFU( dst[row * W + col + 4], temp1v);
+                STVFU(dst[row * W + col], tempv);
+                STVFU(dst[row * W + col + 4], temp1v);
             }
 
             for (int row = H - rady; row < H; row++) {
                 lenm1v = lenv - onev;
                 tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v;
                 temp1v = (temp1v * lenv - LVFU(temp[(row - rady - 1) * W + col + 4])) / lenm1v;
-                STVFU( dst[row * W + col], tempv);
-                STVFU( dst[row * W + col + 4], temp1v);
+                STVFU(dst[row * W + col], tempv);
+                STVFU(dst[row * W + col + 4], temp1v);
                 lenv = lenm1v;
             }
         }
@@ -422,12 +641,12 @@ template<class T, class A> void boxblur (T* src, A* dst, A* buffer, int radx, in
             }
 
             tempv = tempv / lenv;
-            STVFU( dst[0 * W + col], tempv);
+            STVFU(dst[0 * W + col], tempv);
 
             for (int row = 1; row <= rady; row++) {
                 lenp1v = lenv + onev;
                 tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v;
-                STVFU( dst[row * W + col], tempv);
+                STVFU(dst[row * W + col], tempv);
                 lenv = lenp1v;
             }
 
@@ -435,13 +654,13 @@ template<class T, class A> void boxblur (T* src, A* dst, A* buffer, int radx, in
 
             for (int row = rady + 1; row < H - rady; row++) {
                 tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ;
-                STVFU( dst[row * W + col], tempv);
+                STVFU(dst[row * W + col], tempv);
             }
 
             for (int row = H - rady; row < H; row++) {
                 lenm1v = lenv - onev;
                 tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v;
-                STVFU( dst[row * W + col], tempv);
+                STVFU(dst[row * W + col], tempv);
                 lenv = lenm1v;
             }
         }
diff --git a/rtengine/color.h b/rtengine/color.h
index a84c023ab..58c7a1d5f 100644
--- a/rtengine/color.h
+++ b/rtengine/color.h
@@ -213,6 +213,11 @@ public:
         return r * workingspace[1][0] + g * workingspace[1][1] + b * workingspace[1][2];
     }
 
+    static vfloat rgbLuminance(vfloat r, vfloat g, vfloat b, const vfloat workingspace[3])
+    {   // vfloat overload: weighted sum of r, g, b with the three given coefficients; presumably the luminance row of the working space matrix loaded into vfloats - confirm with callers
+        return r * workingspace[0] + g * workingspace[1] + b * workingspace[2];
+    }
+
     /**
     * @brief Convert red/green/blue to L*a*b
     * @brief Convert red/green/blue to hue/saturation/luminance
diff --git a/rtengine/guidedfilter.cc b/rtengine/guidedfilter.cc
index 0ebe6c172..159e89504 100644
--- a/rtengine/guidedfilter.cc
+++ b/rtengine/guidedfilter.cc
@@ -3,6 +3,7 @@
  *  This file is part of RawTherapee.
  *
  *  Copyright (c) 2018 Alberto Griggio <alberto.griggio@gmail.com>
+ *  Optimized 2019 Ingo Weyrich <heckflosse67@gmx.de>
  *
  *  RawTherapee is free software: you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -16,9 +17,9 @@
  *
  *  You should have received a copy of the GNU General Public License
  *  along with RawTherapee.  If not, see <https://www.gnu.org/licenses/>.
- */
+*/
 
-/**
+/*
  * This is a Fast Guided Filter implementation, derived directly from the
  * pseudo-code of the paper:
  *
@@ -26,32 +27,16 @@
  * by Kaiming He, Jian Sun
  *
  * available at https://arxiv.org/abs/1505.00996
- */
+*/
 
 #include "guidedfilter.h"
 #include "boxblur.h"
 #include "rescale.h"
 #include "imagefloat.h"
-
+#define BENCHMARK
+#include "StopWatch.h"
 namespace rtengine {
 
-#if 0
-#  define DEBUG_DUMP(arr)                                                 \
-    do {                                                                \
-        Imagefloat im(arr.width(), arr.height());                      \
-        const char *out = "/tmp/" #arr ".tif";                     \
-        for (int y = 0; y < im.getHeight(); ++y) {                      \
-            for (int x = 0; x < im.getWidth(); ++x) {                   \
-                im.r(y, x) = im.g(y, x) = im.b(y, x) = arr[y][x] * 65535.f; \
-            }                                                           \
-        }                                                               \
-        im.saveTIFF(out, 16);                                           \
-    } while (false)
-#else
-#  define DEBUG_DUMP(arr)
-#endif
-
-
 namespace {
 
 int calculate_subsampling(int w, int h, int r)
@@ -78,18 +63,10 @@ int calculate_subsampling(int w, int h, int r)
 
 void guidedFilter(const array2D<float> &guide, const array2D<float> &src, array2D<float> &dst, int r, float epsilon, bool multithread, int subsampling)
 {
-
-    const int W = src.width();
-    const int H = src.height();
-
-    if (subsampling <= 0) {
-        subsampling = calculate_subsampling(W, H, r);
-    }
-
-    enum Op { MUL, DIVEPSILON, ADD, SUB, ADDMUL, SUBMUL };
+    enum Op {MUL, DIVEPSILON, SUBMUL};
 
     const auto apply =
-        [=](Op op, array2D<float> &res, const array2D<float> &a, const array2D<float> &b, const array2D<float> &c=array2D<float>()) -> void
+        [multithread, epsilon](Op op, array2D<float> &res, const array2D<float> &a, const array2D<float> &b, const array2D<float> &c=array2D<float>()) -> void
         {
             const int w = res.width();
             const int h = res.height();
@@ -99,137 +76,109 @@ void guidedFilter(const array2D<float> &guide, const array2D<float> &src, array2
 #endif
             for (int y = 0; y < h; ++y) {
                 for (int x = 0; x < w; ++x) {
-                    float r;
-                    float aa = a[y][x];
-                    float bb = b[y][x];
                     switch (op) {
-                    case MUL:
-                        r = aa * bb;
-                        break;
-                    case DIVEPSILON:
-                        r = aa / (bb + epsilon);
-                        break;
-                    case ADD:
-                        r = aa + bb;
-                        break;
-                    case SUB:
-                        r = aa - bb;
-                        break;
-                    case ADDMUL:
-                        r = aa * bb + c[y][x];
-                        break;
-                    case SUBMUL:
-                        r = c[y][x] - (aa * bb);
-                        break;
-                    default:
-                        assert(false);
-                        r = 0;
-                        break;
+                        case MUL:
+                            res[y][x] = a[y][x] * b[y][x];
+                            break;
+                        case DIVEPSILON:
+                            res[y][x] = a[y][x] / (b[y][x] + epsilon); // note: the value of epsilon intentionally has an impact on the result. It is not only to avoid divisions by zero
+                            break;
+                        case SUBMUL:
+                            res[y][x] = c[y][x] - (a[y][x] * b[y][x]);
+                            break;
+                        default:
+                            assert(false);
+                            res[y][x] = 0;
+                            break;
                     }
-                    res[y][x] = r;
                 }
             }
         };
 
-    // use the terminology of the paper (Algorithm 2)
-    const array2D<float> &I = guide;
-    const array2D<float> &p = src;
-    array2D<float> &q = dst;
-
     const auto f_subsample =
-        [=](array2D<float> &d, const array2D<float> &s) -> void
+        [multithread](array2D<float> &d, const array2D<float> &s) -> void
         {
             rescaleBilinear(s, d, multithread);
         };
 
-    const auto f_upsample = f_subsample;
-    
-    const size_t w = W / subsampling;
-    const size_t h = H / subsampling;
-
-    AlignedBuffer<float> blur_buf(w * h);
     const auto f_mean =
-        [&](array2D<float> &d, array2D<float> &s, int rad) -> void
+        [multithread](array2D<float> &d, array2D<float> &s, int rad) -> void
         {
             rad = LIM(rad, 0, (min(s.width(), s.height()) - 1) / 2 - 1);
-            float **src = s;
-            float **dst = d;
-#ifdef _OPENMP
-            #pragma omp parallel if (multithread)
-#endif
-            boxblur<float, float>(src, dst, blur_buf.data, rad, rad, s.width(), s.height());
+            boxblur(s, d, rad, s.width(), s.height(), multithread);
         };
 
+    const int W = src.width();
+    const int H = src.height();
+
+    if (subsampling <= 0) {
+        subsampling = calculate_subsampling(W, H, r);
+    }
+
+    const size_t w = W / subsampling;
+    const size_t h = H / subsampling;
+    const float r1 = float(r) / subsampling;
+
     array2D<float> I1(w, h);
     array2D<float> p1(w, h);
 
-    f_subsample(I1, I);
-    f_subsample(p1, p);
+    f_subsample(I1, guide);
 
-    DEBUG_DUMP(I);
-    DEBUG_DUMP(p);
-    DEBUG_DUMP(I1);
-    DEBUG_DUMP(p1);
+    if (&guide == &src) {
+        f_mean(p1, I1, r1);
 
-    float r1 = float(r) / subsampling;
+        apply(MUL, I1, I1, I1);        // I1 = I1 * I1
 
-    array2D<float> meanI(w, h);
-    f_mean(meanI, I1, r1);
-    DEBUG_DUMP(meanI);
+        f_mean(I1, I1, r1);
 
-    array2D<float> meanp(w, h);
-    f_mean(meanp, p1, r1);
-    DEBUG_DUMP(meanp);
+        apply(SUBMUL, I1, p1, p1, I1); // I1 = I1 - p1 * p1
+        apply(DIVEPSILON, I1, I1, I1); // I1 = I1 / (I1 + epsilon)
+        apply(SUBMUL, p1, I1, p1, p1); // p1 = p1 - I1 * p1
 
-    array2D<float> &corrIp = p1;
-    apply(MUL, corrIp, I1, p1);
-    f_mean(corrIp, corrIp, r1);
-    DEBUG_DUMP(corrIp);
+    } else {
+        f_subsample(p1, src);
 
-    array2D<float> &corrI = I1;
-    apply(MUL, corrI, I1, I1);
-    f_mean(corrI, corrI, r1);
-    DEBUG_DUMP(corrI);
+        array2D<float> meanI(w, h);
+        f_mean(meanI, I1, r1);
 
-    array2D<float> &varI = corrI;
-    apply(SUBMUL, varI, meanI, meanI, corrI);
-    DEBUG_DUMP(varI);
+        array2D<float> meanp(w, h);
+        f_mean(meanp, p1, r1);
 
-    array2D<float> &covIp = corrIp;
-    apply(SUBMUL, covIp, meanI, meanp, corrIp);
-    DEBUG_DUMP(covIp);
+        apply(MUL, p1, I1, p1);
 
-    array2D<float> &a = varI;
-    apply(DIVEPSILON, a, covIp, varI);
-    DEBUG_DUMP(a);
+        f_mean(p1, p1, r1);
 
-    array2D<float> &b = covIp;
-    apply(SUBMUL, b, a, meanI, meanp);
-    DEBUG_DUMP(b);
+        apply(MUL, I1, I1, I1);
 
-    meanI.free(); // frees w * h * 4 byte
-    meanp.free(); // frees w * h * 4 byte
+        f_mean(I1, I1, r1);
 
-    array2D<float> &meana = a;
-    f_mean(meana, a, r1);
-    DEBUG_DUMP(meana);
+        apply(SUBMUL, I1, meanI, meanI, I1);
+        apply(SUBMUL, p1, meanI, meanp, p1);
+        apply(DIVEPSILON, I1, p1, I1);
+        apply(SUBMUL, p1, I1, meanI, meanp);
+    }
 
-    array2D<float> &meanb = b;
-    f_mean(meanb, b, r1);
-    DEBUG_DUMP(meanb);
+    f_mean(I1, I1, r1);
+    f_mean(p1, p1, r1);
 
-    blur_buf.resize(0); // frees w * h * 4 byte
+    const int Ws = I1.width();
+    const int Hs = I1.height();
+    const int Wd = dst.width();
+    const int Hd = dst.height();
 
-    array2D<float> meanA(W, H);
-    f_upsample(meanA, meana);
-    DEBUG_DUMP(meanA);
+    const float col_scale = static_cast<float>(Ws) / static_cast<float>(Wd);
+    const float row_scale = static_cast<float>(Hs) / static_cast<float>(Hd);
 
-    array2D<float> &meanB = q;
-    f_upsample(meanB, meanb);
-    DEBUG_DUMP(meanB);
+#ifdef _OPENMP
+    #pragma omp parallel for if (multithread)
+#endif
 
-    apply(ADDMUL, q, meanA, I, meanB);
-    DEBUG_DUMP(q);
+    for (int y = 0; y < Hd; ++y) {
+        const float ymrs = y * row_scale;
+        for (int x = 0; x < Wd; ++x) {
+            dst[y][x] = getBilinearValue(I1, x * col_scale, ymrs) * guide[y][x] + getBilinearValue(p1, x * col_scale, ymrs);
+        }
+    }
 }
 
 } // namespace rtengine
diff --git a/rtengine/ipdehaze.cc b/rtengine/ipdehaze.cc
index 100c302a9..bc8f8be74 100644
--- a/rtengine/ipdehaze.cc
+++ b/rtengine/ipdehaze.cc
@@ -16,7 +16,7 @@
  *
  *  You should have received a copy of the GNU General Public License
  *  along with RawTherapee.  If not, see <https://www.gnu.org/licenses/>.
- */
+*/
 
 /*
  * Haze removal using the algorithm described in the paper:
@@ -26,15 +26,16 @@
  *
  * using a guided filter for the "soft matting" of the transmission map
  *
- */  
+*/
 
+#include <algorithm>
 #include <iostream>
-#include <queue>
+#include <vector>
 
 #include "guidedfilter.h"
 #include "improcfun.h"
 #include "procparams.h"
-#include "rt_algo.h"
+#include "rescale.h"
 #include "rt_math.h"
 
 extern Options options;
@@ -43,24 +44,103 @@ namespace rtengine {
 
 namespace {
 
-#if 0
-#  define DEBUG_DUMP(arr)                                                 \
-    do {                                                                \
-        Imagefloat im(arr.width(), arr.height());                      \
-        const char *out = "/tmp/" #arr ".tif";                     \
-        for (int y = 0; y < im.getHeight(); ++y) {                      \
-            for (int x = 0; x < im.getWidth(); ++x) {                   \
-                im.r(y, x) = im.g(y, x) = im.b(y, x) = arr[y][x] * 65535.f; \
-            }                                                           \
-        }                                                               \
-        im.saveTIFF(out, 16);                                           \
-    } while (false)
-#else
-#  define DEBUG_DUMP(arr)
+float normalize(Imagefloat *rgb, bool multithread)
+{   // Divides the r, g and b channels of rgb in place by a common divisor and returns that divisor
+    float maxval = 0.f;
+    const int W = rgb->getWidth();
+    const int H = rgb->getHeight();
+#ifdef _OPENMP
+    #pragma omp parallel for reduction(max:maxval) schedule(dynamic, 16) if (multithread)
 #endif
+    for (int y = 0; y < H; ++y) {
+        for (int x = 0; x < W; ++x) {
+            maxval = max(maxval, rgb->r(y, x), rgb->g(y, x), rgb->b(y, x)); // largest value over all three channels (starting from 0)
+        }
+    }
+    maxval = max(maxval * 2.f, 65535.f); // divisor: twice the largest channel value, but never less than 65535
+#ifdef _OPENMP
+    #pragma omp parallel for schedule(dynamic, 16) if (multithread)
+#endif
+    for (int y = 0; y < H; ++y) {
+        for (int x = 0; x < W; ++x) {
+            rgb->r(y, x) /= maxval;
+            rgb->g(y, x) /= maxval;
+            rgb->b(y, x) /= maxval;
+        }
+    }
+    return maxval;
+}
 
+void restore(Imagefloat *rgb, float maxval, bool multithread)
+{   // Multiplies the r, g and b channels of rgb in place by maxval
+    const int W = rgb->getWidth();
+    const int H = rgb->getHeight();
+    if (maxval > 0.f && maxval != 1.f) { // image is left untouched for a non-positive factor or a factor of exactly 1
+#ifdef _OPENMP
+#       pragma omp parallel for if (multithread)
+#endif
+        for (int y = 0; y < H; ++y) {
+            for (int x = 0; x < W; ++x) {
+                rgb->r(y, x) *= maxval;
+                rgb->g(y, x) *= maxval;
+                rgb->b(y, x) *= maxval;
+            }
+        }
+    }
+}
 
-int get_dark_channel(const array2D<float> &R, const array2D<float> &G, const array2D<float> &B, array2D<float> &dst, int patchsize, const float ambient[3], bool clip, bool multithread)
+int get_dark_channel(const array2D<float> &R, const array2D<float> &G, const array2D<float> &B, const array2D<float> &dst, int patchsize, const float ambient[3], bool clip, bool multithread, float strength)
+{   // Fills every patchsize x patchsize tile of dst with one value derived from the tile's per-channel minima; returns the number of tiles. NOTE(review): 'clip' is not used in this body
+    const int W = R.width();
+    const int H = R.height();
+
+#ifdef _OPENMP
+    #pragma omp parallel for if (multithread)
+#endif
+    for (int y = 0; y < H; y += patchsize) {
+        const int pH = min(y + patchsize, H); // tiles at the bottom/right border are cut off at the image size
+        for (int x = 0; x < W; x += patchsize) {
+            float minR = RT_INFINITY_F;
+            float minG = RT_INFINITY_F;
+            float minB = RT_INFINITY_F;
+#ifdef __SSE2__
+            vfloat minRv = F2V(minR);
+            vfloat minGv = F2V(minG);
+            vfloat minBv = F2V(minB);
+#endif
+            const int pW = min(x + patchsize, W);
+            for (int yy = y; yy < pH; ++yy) {
+                int xx = x;
+#ifdef __SSE2__
+                for (; xx < pW - 3; xx += 4) { // 4 pixels per iteration; the scalar loop below handles the rest of the line
+                    minRv = vminf(minRv, LVFU(R[yy][xx]));
+                    minGv = vminf(minGv, LVFU(G[yy][xx]));
+                    minBv = vminf(minBv, LVFU(B[yy][xx]));
+                }
+#endif
+                for (; xx < pW; ++xx) {
+                    minR = min(minR, R[yy][xx]);
+                    minG = min(minG, G[yy][xx]);
+                    minB = min(minB, B[yy][xx]);
+                }
+            }
+#ifdef __SSE2__
+            minR = min(minR, vhmin(minRv)); // merge the vector minima into the scalar ones
+            minG = min(minG, vhmin(minGv));
+            minB = min(minB, vhmin(minBv));
+#endif
+            float val = min(minR / ambient[0], minG / ambient[1], minB / ambient[2]); // smallest channel minimum after division by the matching ambient component
+            val = 1.f - strength * LIM01(val); // stored value is 1 - strength * LIM01(val); LIM01 presumably clamps to [0, 1] - confirm
+            for (int yy = y; yy < pH; ++yy) {
+                std::fill(dst[yy] + x, dst[yy] + pW, val); // note: writes through dst although it is passed as const reference
+            }
+        }
+    }
+
+    return (W / patchsize + ((W % patchsize) > 0)) *  (H / patchsize + ((H % patchsize) > 0)); // tile count: both dimensions divided by patchsize, rounded up
+}
+
+int get_dark_channel_downsized(const array2D<float> &R, const array2D<float> &G, const array2D<float> &B, const array2D<float> &dst, int patchsize, bool multithread)
 {
     const int W = R.width();
     const int H = R.height();
@@ -73,22 +153,11 @@ int get_dark_channel(const array2D<float> &R, const array2D<float> &G, const arr
         for (int x = 0; x < W; x += patchsize) {
             float val = RT_INFINITY_F;
             const int pW = min(x + patchsize, W);
-            for (int yy = y; yy < pH; ++yy) {
-                for (int xx = x; xx < pW; ++xx) {
-                    float r = R[yy][xx];
-                    float g = G[yy][xx];
-                    float b = B[yy][xx];
-                    if (ambient) {
-                        r /= ambient[0];
-                        g /= ambient[1];
-                        b /= ambient[2];
-                    }
-                    val = min(val, r, g, b);
+            for (int xx = x; xx < pW; ++xx) {
+                for (int yy = y; yy < pH; ++yy) {
+                    val = min(val, R[yy][xx], G[yy][xx], B[yy][xx]);
                 }
             }
-            if (clip) {
-                val = LIM01(val);
-            }
             for (int yy = y; yy < pH; ++yy) {
                 std::fill(dst[yy] + x, dst[yy] + pW, val);
             }
@@ -98,33 +167,24 @@ int get_dark_channel(const array2D<float> &R, const array2D<float> &G, const arr
     return (W / patchsize + ((W % patchsize) > 0)) *  (H / patchsize + ((H % patchsize) > 0));
 }
 
-
 float estimate_ambient_light(const array2D<float> &R, const array2D<float> &G, const array2D<float> &B, const array2D<float> &dark, int patchsize, int npatches, float ambient[3])
 {
     const int W = R.width();
     const int H = R.height();
 
-    const auto get_percentile =
-        [](std::priority_queue<float> &q, float prcnt) -> float
-        {
-            size_t n = LIM<size_t>(q.size() * prcnt, 1, q.size());
-            while (q.size() > n) {
-                q.pop();
-            }
-            return q.top();
-        };
-    
     float darklim = RT_INFINITY_F;
     {
-        std::priority_queue<float> p;
+        std::vector<float> p;
         for (int y = 0; y < H; y += patchsize) {
             for (int x = 0; x < W; x += patchsize) {
                 if (!OOG(dark[y][x], 1.f - 1e-5f)) {
-                    p.push(dark[y][x]);
+                    p.push_back(dark[y][x]);
                 }
             }
         }
-        darklim = get_percentile(p, 0.95);
+        const int pos = p.size() * 0.95;
+        std::nth_element(p.begin(), p.begin() + pos, p.end());
+        darklim = p[pos];
     }
 
     std::vector<std::pair<int, int>> patches;
@@ -145,7 +205,8 @@ float estimate_ambient_light(const array2D<float> &R, const array2D<float> &G, c
 
     float bright_lim = RT_INFINITY_F;
     {
-        std::priority_queue<float> l;
+        std::vector<float> l;
+        l.reserve(patches.size() * patchsize * patchsize);
         
         for (auto &p : patches) {
             const int pW = min(p.first+patchsize, W);
@@ -153,12 +214,13 @@ float estimate_ambient_light(const array2D<float> &R, const array2D<float> &G, c
             
             for (int y = p.second; y < pH; ++y) {
                 for (int x = p.first; x < pW; ++x) {
-                    l.push(R[y][x] + G[y][x] + B[y][x]);
+                    l.push_back(R[y][x] + G[y][x] + B[y][x]);
                 }
             }
         }
-
-        bright_lim = get_percentile(l, 0.95);
+        const int pos = l.size() * 0.95;
+        std::nth_element(l.begin(), l.begin() + pos, l.end());
+        bright_lim = l[pos];
     }
 
     double rr = 0, gg = 0, bb = 0;
@@ -190,7 +252,6 @@ float estimate_ambient_light(const array2D<float> &R, const array2D<float> &G, c
     return darklim > 0 ? -1.125f * std::log(darklim) : std::log(std::numeric_limits<float>::max()) / 2;
 }
 
-
 void extract_channels(Imagefloat *img, array2D<float> &r, array2D<float> &g, array2D<float> &b, int radius, float epsilon, bool multithread)
 {
     const int W = img->getWidth();
@@ -211,12 +272,12 @@ void extract_channels(Imagefloat *img, array2D<float> &r, array2D<float> &g, arr
 
 void ImProcFunctions::dehaze(Imagefloat *img, const DehazeParams &dehazeParams)
 {
-    if (!dehazeParams.enabled) {
+    if (!dehazeParams.enabled || dehazeParams.strength == 0.0) {
         return;
     }
 
-    img->normalizeFloatTo1();
-    
+    const float maxChannel = normalize(img, multiThread);
+
     const int W = img->getWidth();
     const int H = img->getHeight();
     const float strength = LIM01(float(dehazeParams.strength) / 100.f * 0.9f);
@@ -229,21 +290,47 @@ void ImProcFunctions::dehaze(Imagefloat *img, const DehazeParams &dehazeParams)
 
     int patchsize = max(int(5 / scale), 2);
     float ambient[3];
-    array2D<float> &t_tilde = dark;
-    float max_t = 0.f;
+    float maxDistance = 0.f;
 
     {
-        int npatches = 0;
-        array2D<float> R(W, H);
+        array2D<float>& R = dark; // R and dark can safely use the same buffer, which is faster and reduces memory allocations/deallocations
         array2D<float> G(W, H);
         array2D<float> B(W, H);
         extract_channels(img, R, G, B, patchsize, 1e-1, multiThread);
-    
-        patchsize = max(max(W, H) / 600, 2);
-        npatches = get_dark_channel(R, G, B, dark, patchsize, nullptr, false, multiThread);
-        DEBUG_DUMP(dark);
 
-        max_t = estimate_ambient_light(R, G, B, dark, patchsize, npatches, ambient);
+        {
+            constexpr int sizecap = 200;
+            const float r = static_cast<float>(W) / static_cast<float>(H);
+            const int hh = r >= 1.f ? sizecap : sizecap / r;
+            const int ww = r >= 1.f ? sizecap * r : sizecap;
+
+            if (W <= ww && H <= hh) {
+                // don't rescale small thumbs
+                array2D<float> D(W, H);
+                const int npatches = get_dark_channel_downsized(R, G, B, D, 2, multiThread);
+                maxDistance = estimate_ambient_light(R, G, B, D, patchsize, npatches, ambient);
+            } else {
+                array2D<float> RR(ww, hh);
+                array2D<float> GG(ww, hh);
+                array2D<float> BB(ww, hh);
+                rescaleNearest(R, RR, multiThread);
+                rescaleNearest(G, GG, multiThread);
+                rescaleNearest(B, BB, multiThread);
+                array2D<float> D(ww, hh);
+
+                const int npatches = get_dark_channel_downsized(RR, GG, BB, D, 2, multiThread);
+                maxDistance = estimate_ambient_light(RR, GG, BB, D, patchsize, npatches, ambient);
+            }
+        }
+
+        if (min(ambient[0], ambient[1], ambient[2]) < 0.01f) {
+            if (options.rtSettings.verbose) {
+                std::cout << "dehaze: no haze detected" << std::endl;
+            }
+            restore(img, maxChannel, multiThread);
+            return; // probably no haze at all
+        }
+        patchsize = max(max(W, H) / 600, 2);
 
         if (options.rtSettings.verbose) {
             std::cout << "dehaze: ambient light is "
@@ -251,78 +338,96 @@ void ImProcFunctions::dehaze(Imagefloat *img, const DehazeParams &dehazeParams)
                       << std::endl;
         }
 
-        get_dark_channel(R, G, B, dark, patchsize, ambient, true, multiThread);
-    }
-
-    if (min(ambient[0], ambient[1], ambient[2]) < 0.01f) {
-        if (options.rtSettings.verbose) {
-            std::cout << "dehaze: no haze detected" << std::endl;
-        }
-        img->normalizeFloatTo65535();
-        return; // probably no haze at all
-    }
-
-    DEBUG_DUMP(t_tilde);
-
-#ifdef _OPENMP
-    #pragma omp parallel for if (multiThread)
-#endif
-    for (int y = 0; y < H; ++y) {
-        for (int x = 0; x < W; ++x) {
-            dark[y][x] = 1.f - strength * dark[y][x];
-        }
+        get_dark_channel(R, G, B, dark, patchsize, ambient, true, multiThread, strength);
     }
 
     const int radius = patchsize * 4;
-    const float epsilon = 1e-5;
-    array2D<float> &t = t_tilde;
+    constexpr float epsilon = 1e-5f;
 
-    {
-        array2D<float> guideB(W, H, img->b.ptrs, ARRAY2D_BYREFERENCE);
-        guidedFilter(guideB, t_tilde, t, radius, epsilon, multiThread);
-    }
+    array2D<float> guideB(W, H, img->b.ptrs, ARRAY2D_BYREFERENCE);
+    guidedFilter(guideB, dark, dark, radius, epsilon, multiThread);
         
-    DEBUG_DUMP(t);
-
     if (options.rtSettings.verbose) {
-        std::cout << "dehaze: max distance is " << max_t << std::endl;
+        std::cout << "dehaze: max distance is " << maxDistance << std::endl;
     }
 
-    float depth = -float(dehazeParams.depth) / 100.f;
-    const float t0 = max(1e-3f, std::exp(depth * max_t));
+    const float depth = -float(dehazeParams.depth) / 100.f;
+    const float t0 = max(1e-3f, std::exp(depth * maxDistance));
     const float teps = 1e-3f;
+
+    const bool luminance = dehazeParams.luminance;
+    const TMatrix ws = ICCStore::getInstance()->workingSpaceMatrix(params->icm.workingProfile);
+#ifdef __SSE2__
+    const vfloat wsv[3] = {F2V(ws[1][0]), F2V(ws[1][1]),F2V(ws[1][2])};
+#endif
+    const float ambientY = Color::rgbLuminance(ambient[0], ambient[1], ambient[2], ws);
 #ifdef _OPENMP
     #pragma omp parallel for if (multiThread)
 #endif
     for (int y = 0; y < H; ++y) {
-        for (int x = 0; x < W; ++x) {
+        int x = 0;
+#ifdef __SSE2__
+        const vfloat onev = F2V(1.f);
+        const vfloat ambient0v = F2V(ambient[0]);
+        const vfloat ambient1v = F2V(ambient[1]);
+        const vfloat ambient2v = F2V(ambient[2]);
+        const vfloat ambientYv = F2V(ambientY);
+        const vfloat epsYv = F2V(1e-5f);
+        const vfloat t0v = F2V(t0);
+        const vfloat tepsv = F2V(teps);
+        const vfloat cmaxChannelv = F2V(maxChannel);
+        for (; x < W - 3; x += 4) {
             // ensure that the transmission is such that to avoid clipping...
-            float rgb[3] = { img->r(y, x), img->g(y, x), img->b(y, x) };
+            const vfloat r = LVFU(img->r(y, x));
+            const vfloat g = LVFU(img->g(y, x));
+            const vfloat b = LVFU(img->b(y, x));
             // ... t >= tl to avoid negative values
-            float tl = 1.f - min(rgb[0]/ambient[0], rgb[1]/ambient[1], rgb[2]/ambient[2]);
-            // ... t >= tu to avoid values > 1
-            float tu = t0 - teps;
-            for (int c = 0; c < 3; ++c) {
-                if (ambient[c] < 1) {
-                    tu = max(tu, (rgb[c] - ambient[c])/(1.f - ambient[c]));
-                }
-            }
-            float mt = max(t[y][x], t0, tl + teps, tu + teps);
+            const vfloat tlv = onev - vminf(r / ambient0v, vminf(g / ambient1v, b / ambient2v));
+            const vfloat mtv = vmaxf(LVFU(dark[y][x]), vmaxf(tlv + tepsv, t0v));
             if (dehazeParams.showDepthMap) {
-                img->r(y, x) = img->g(y, x) = img->b(y, x) = LIM01(1.f - mt);
+                const vfloat valv = vclampf(onev - mtv, ZEROV, onev) * cmaxChannelv;
+                STVFU(img->r(y, x), valv);
+                STVFU(img->g(y, x), valv);
+                STVFU(img->b(y, x), valv);
+            } else if (luminance) {
+                const vfloat Yv = Color::rgbLuminance(r, g, b, wsv);
+                const vfloat YYv = (Yv - ambientYv) / mtv + ambientYv;
+                const vfloat fv = vself(vmaskf_gt(Yv, epsYv), cmaxChannelv * YYv / Yv, cmaxChannelv);
+                STVFU(img->r(y, x), r * fv);
+                STVFU(img->g(y, x), g * fv);
+                STVFU(img->b(y, x), b * fv);
             } else {
-                float r = (rgb[0] - ambient[0]) / mt + ambient[0];
-                float g = (rgb[1] - ambient[1]) / mt + ambient[1];
-                float b = (rgb[2] - ambient[2]) / mt + ambient[2];
-
-                img->r(y, x) = r;
-                img->g(y, x) = g;
-                img->b(y, x) = b;
+                STVFU(img->r(y, x), ((r - ambient0v) / mtv + ambient0v) * cmaxChannelv);
+                STVFU(img->g(y, x), ((g - ambient1v) / mtv + ambient1v) * cmaxChannelv);
+                STVFU(img->b(y, x), ((b - ambient2v) / mtv + ambient2v) * cmaxChannelv);
+            }
+        }
+#endif
+        for (; x < W; ++x) {
+            // ensure that the transmission is such that to avoid clipping...
+            const float r = img->r(y, x);
+            const float g = img->g(y, x);
+            const float b = img->b(y, x);
+            // ... t >= tl to avoid negative values
+            const float tl = 1.f - min(r / ambient[0], g / ambient[1], b / ambient[2]);
+            const float mt = max(dark[y][x], t0, tl + teps);
+            if (dehazeParams.showDepthMap) {
+                img->r(y, x) = img->g(y, x) = img->b(y, x) = LIM01(1.f - mt) * maxChannel;
+            } else if (luminance) {
+                const float Y = Color::rgbLuminance(img->r(y, x), img->g(y, x), img->b(y, x), ws);
+                const float YY = (Y - ambientY) / mt + ambientY;
+                const float f = Y > 1e-5f ? maxChannel * YY / Y : maxChannel;
+                img->r(y, x) *= f;
+                img->g(y, x) *= f;
+                img->b(y, x) *= f;
+            } else {
+                img->r(y, x) = ((r - ambient[0]) / mt + ambient[0]) * maxChannel;
+                img->g(y, x) = ((g - ambient[1]) / mt + ambient[1]) * maxChannel;
+                img->b(y, x) = ((b - ambient[2]) / mt + ambient[2]) * maxChannel;
             }
         }
     }
-
-    img->normalizeFloatTo65535();
 }
 
+
 } // namespace rtengine
diff --git a/rtengine/iplocallab.cc b/rtengine/iplocallab.cc
index caba2147c..7aa5d7cf6 100644
--- a/rtengine/iplocallab.cc
+++ b/rtengine/iplocallab.cc
@@ -8678,6 +8678,7 @@ void ImProcFunctions::Lab_Local(int call, int sp, float** shbuffer, LabImage * o
                 dehazeParams.strength = lp.dehaze;
                 dehazeParams.showDepthMap = false;
                 dehazeParams.depth = lp.depth;
+                dehazeParams.luminance = params->locallab.spots.at(sp).lumonly;
                 tmpImage = new Imagefloat(bfw, bfh);
                 lab2rgb(*bufexpfin, *tmpImage, params->icm.workingProfile);
                 dehaze(tmpImage, dehazeParams);
@@ -8726,10 +8727,10 @@ void ImProcFunctions::Lab_Local(int call, int sp, float** shbuffer, LabImage * o
                 for (int jr = 0; jr < bfw; jr++) {
                     buflight[ir][jr] /= coef;
                     bufl_ab[ir][jr] /= coefC;
-                    if(params->locallab.spots.at(sp).lumonly) {
+//                    if(params->locallab.spots.at(sp).lumonly) {
                   //  if (lp.str >= 0.1f) {
-                        bufl_ab[ir][jr] = 0.f;
-                    }
+//                        bufl_ab[ir][jr] = 0.f;
+//                    }
                 }
             }
 
@@ -8923,6 +8924,7 @@ void ImProcFunctions::Lab_Local(int call, int sp, float** shbuffer, LabImage * o
                         dehazeParams.strength = 0.9f * lp.dehaze + 0.3f * lp.str;
                         dehazeParams.showDepthMap = false;
                         dehazeParams.depth = LIM(depthcombi, 0.f, 100.f);
+                        dehazeParams.luminance = params->locallab.spots.at(sp).lumonly;
 
                         tmpImage = new Imagefloat(Wd, Hd);
                         lab2rgb(*original, *tmpImage, params->icm.workingProfile);
@@ -8962,7 +8964,7 @@ void ImProcFunctions::Lab_Local(int call, int sp, float** shbuffer, LabImage * o
 
                 float minCD, maxCD, mini, maxi, Tmean, Tsigma, Tmin, Tmax;
                 bool fftw = lp.ftwreti;
-                fftw = false;
+               // fftw = false;
                 //for Retinex Mask are incorporated in MSR
                 ImProcFunctions::MSRLocal(sp, fftw, 1, bufreti, bufmask, buforig, buforigmas, orig, tmpl->L, orig1, Wd, Hd, params->locallab, sk, locRETgainCcurve, 0, 4, 1.f, minCD, maxCD, mini, maxi, Tmean, Tsigma, Tmin, Tmax,
                                           locccmasretiCurve, lcmasretiutili, locllmasretiCurve, llmasretiutili, lochhmasretiCurve, lhmasretiutili, llretiMask, transformed, lp.enaretiMasktmap, lp.enaretiMask);
@@ -9375,6 +9377,7 @@ void ImProcFunctions::Lab_Local(int call, int sp, float** shbuffer, LabImage * o
                         dehazeParams.strength = 0.9f * lp.dehaze + 0.3f * lp.str;
                         dehazeParams.showDepthMap = false;
                         dehazeParams.depth = LIM(depthcombi, 0.f, 100.f);
+                        dehazeParams.luminance = params->locallab.spots.at(sp).lumonly;
 
                         tmpImage = new Imagefloat(Wd, Hd);
                         lab2rgb(*original, *tmpImage, params->icm.workingProfile);
diff --git a/rtengine/procparams.cc b/rtengine/procparams.cc
index 84fdf9ee3..5d6d3f2ba 100644
--- a/rtengine/procparams.cc
+++ b/rtengine/procparams.cc
@@ -2601,7 +2601,7 @@ LocallabParams::LocallabSpot::LocallabSpot() :
     chromaskreti(0.0),
     gammaskreti(1.0),
     slomaskreti(0.0),
-    scalereti(3.0),
+    scalereti(2.0),
     darkness(2.0),
     lightnessreti(1.0),
     limd(8.0),
@@ -3086,7 +3086,8 @@ DehazeParams::DehazeParams() :
     enabled(false),
     strength(50),
     showDepthMap(false),
-    depth(25)
+    depth(25),
+    luminance(false)
 {
 }
 
@@ -3096,7 +3097,8 @@ bool DehazeParams::operator ==(const DehazeParams& other) const
         enabled == other.enabled
         && strength == other.strength
         && showDepthMap == other.showDepthMap
-        && depth == other.depth;
+        && depth == other.depth
+        && luminance == other.luminance;
 }
 
 bool DehazeParams::operator !=(const DehazeParams& other) const
@@ -3807,6 +3809,7 @@ int ProcParams::save(const Glib::ustring& fname, const Glib::ustring& fname2, bo
         saveToKeyfile(!pedited || pedited->dehaze.strength, "Dehaze", "Strength", dehaze.strength, keyFile);        
         saveToKeyfile(!pedited || pedited->dehaze.showDepthMap, "Dehaze", "ShowDepthMap", dehaze.showDepthMap, keyFile);        
         saveToKeyfile(!pedited || pedited->dehaze.depth, "Dehaze", "Depth", dehaze.depth, keyFile);        
+        saveToKeyfile(!pedited || pedited->dehaze.luminance, "Dehaze", "Luminance", dehaze.luminance, keyFile); // gated on its own edited flag, not on depth's
 
 // Directional pyramid denoising
         saveToKeyfile(!pedited || pedited->dirpyrDenoise.enabled, "Directional Pyramid Denoising", "Enabled", dirpyrDenoise.enabled, keyFile);
@@ -6027,6 +6030,7 @@ int ProcParams::load(const Glib::ustring& fname, ParamsEdited* pedited)
             assignFromKeyfile(keyFile, "Dehaze", "Strength", pedited, dehaze.strength, pedited->dehaze.strength);
             assignFromKeyfile(keyFile, "Dehaze", "ShowDepthMap", pedited, dehaze.showDepthMap, pedited->dehaze.showDepthMap);
             assignFromKeyfile(keyFile, "Dehaze", "Depth", pedited, dehaze.depth, pedited->dehaze.depth);
+            assignFromKeyfile(keyFile, "Dehaze", "Luminance", pedited, dehaze.luminance, pedited->dehaze.luminance);
         }
         
         if (keyFile.has_group("Film Simulation")) {
diff --git a/rtengine/procparams.h b/rtengine/procparams.h
index efa49424f..9619262bd 100644
--- a/rtengine/procparams.h
+++ b/rtengine/procparams.h
@@ -1648,6 +1648,7 @@ struct DehazeParams {
     int strength;
     bool showDepthMap;
     int depth;
+    bool luminance;
 
     DehazeParams();
 
diff --git a/rtengine/sleefsseavx.c b/rtengine/sleefsseavx.c
index 3000c1c10..cce88df5d 100644
--- a/rtengine/sleefsseavx.c
+++ b/rtengine/sleefsseavx.c
@@ -1390,6 +1390,18 @@ static inline float vhadd( vfloat a ) {
     return _mm_cvtss_f32(_mm_add_ss(a, _mm_shuffle_ps(a, a, 1)));
 }
 
+static inline float vhmin(vfloat a) {
+    // Horizontal minimum over the four lanes: min(a[0], a[1], a[2], a[3]).
+    const vfloat pairMin = vminf(a, _mm_movehl_ps(a, a)); // lane 0 = min(a[0], a[2]), lane 1 = min(a[1], a[3])
+    return _mm_cvtss_f32(vminf(pairMin, _mm_shuffle_ps(pairMin, pairMin, 1))); // fold lane 1 onto lane 0 and extract it
+}
+
+static inline float vhmax(vfloat a) {
+    // Horizontal maximum over the four lanes: max(a[0], a[1], a[2], a[3]).
+    const vfloat pairMax = vmaxf(a, _mm_movehl_ps(a, a)); // lane 0 = max(a[0], a[2]), lane 1 = max(a[1], a[3])
+    return _mm_cvtss_f32(vmaxf(pairMax, _mm_shuffle_ps(pairMax, pairMax, 1))); // fold lane 1 onto lane 0 and extract it
+}
+
 static INLINE vfloat vmul2f(vfloat a){
     // fastest way to multiply by 2
 	return a + a;
diff --git a/rtgui/dehaze.cc b/rtgui/dehaze.cc
index 6f60d08d6..6b7fcd64f 100644
--- a/rtgui/dehaze.cc
+++ b/rtgui/dehaze.cc
@@ -36,6 +36,7 @@ Dehaze::Dehaze(): FoldableToolPanel(this, "dehaze", M("TP_DEHAZE_LABEL"), false,
     EvDehazeStrength = m->newEvent(HDR, "HISTORY_MSG_DEHAZE_STRENGTH");
     EvDehazeShowDepthMap = m->newEvent(HDR, "HISTORY_MSG_DEHAZE_SHOW_DEPTH_MAP");
     EvDehazeDepth = m->newEvent(HDR, "HISTORY_MSG_DEHAZE_DEPTH");
+    EvDehazeLuminance = m->newEvent(HDR, "HISTORY_MSG_DEHAZE_LUMINANCE");
     
     strength = Gtk::manage(new Adjuster(M("TP_DEHAZE_STRENGTH"), 0., 100., 1., 50.));
     strength->setAdjusterListener(this);
@@ -45,12 +46,17 @@ Dehaze::Dehaze(): FoldableToolPanel(this, "dehaze", M("TP_DEHAZE_LABEL"), false,
     depth->setAdjusterListener(this);
     depth->show();
 
+    luminance = Gtk::manage(new Gtk::CheckButton(M("TP_DEHAZE_LUMINANCE")));
+    luminance->signal_toggled().connect(sigc::mem_fun(*this, &Dehaze::luminanceChanged));
+    luminance->show();
+
     showDepthMap = Gtk::manage(new Gtk::CheckButton(M("TP_DEHAZE_SHOW_DEPTH_MAP")));
     showDepthMap->signal_toggled().connect(sigc::mem_fun(*this, &Dehaze::showDepthMapChanged));
     showDepthMap->show();
     
     pack_start(*strength);
     pack_start(*depth);
+    pack_start(*luminance);
     pack_start(*showDepthMap);
 }
 
@@ -64,12 +70,14 @@ void Dehaze::read(const ProcParams *pp, const ParamsEdited *pedited)
         depth->setEditedState(pedited->dehaze.depth ? Edited : UnEdited);
         set_inconsistent(multiImage && !pedited->dehaze.enabled);
         showDepthMap->set_inconsistent(!pedited->dehaze.showDepthMap);
+        luminance->set_inconsistent(!pedited->dehaze.luminance);
     }
 
     setEnabled(pp->dehaze.enabled);
     strength->setValue(pp->dehaze.strength);
     depth->setValue(pp->dehaze.depth);
     showDepthMap->set_active(pp->dehaze.showDepthMap);
+    luminance->set_active(pp->dehaze.luminance);
 
     enableListener();
 }
@@ -81,12 +89,14 @@ void Dehaze::write(ProcParams *pp, ParamsEdited *pedited)
     pp->dehaze.depth = depth->getValue();
     pp->dehaze.enabled = getEnabled();
     pp->dehaze.showDepthMap = showDepthMap->get_active();
+    pp->dehaze.luminance = luminance->get_active();
 
     if (pedited) {
         pedited->dehaze.strength = strength->getEditedState();
         pedited->dehaze.depth = depth->getEditedState();
         pedited->dehaze.enabled = !get_inconsistent();
         pedited->dehaze.showDepthMap = !showDepthMap->get_inconsistent();
+        pedited->dehaze.luminance = !luminance->get_inconsistent();
     }
 }
 
@@ -138,6 +148,12 @@ void Dehaze::showDepthMapChanged()
     }
 }
 
+void Dehaze::luminanceChanged()
+{
+    if (!listener) { return; } // nobody to notify
+    const auto state = luminance->get_active() ? M("GENERAL_ENABLED") : M("GENERAL_DISABLED");
+    listener->panelChanged(EvDehazeLuminance, state); // report the toggle together with the checkbox's new state
+}
 
 void Dehaze::setBatchMode(bool batchMode)
 {
diff --git a/rtgui/dehaze.h b/rtgui/dehaze.h
index 3120dfc91..6a9d31cd1 100644
--- a/rtgui/dehaze.h
+++ b/rtgui/dehaze.h
@@ -28,12 +28,14 @@ class Dehaze: public ToolParamBlock, public AdjusterListener, public FoldableToo
 private:
     Adjuster *strength;
     Adjuster *depth;
-    Gtk::CheckButton *showDepthMap;    
+    Gtk::CheckButton *showDepthMap;
+    Gtk::CheckButton *luminance;
 
     rtengine::ProcEvent EvDehazeEnabled;
     rtengine::ProcEvent EvDehazeStrength;
     rtengine::ProcEvent EvDehazeDepth;
     rtengine::ProcEvent EvDehazeShowDepthMap;
+    rtengine::ProcEvent EvDehazeLuminance;
     
 public:
 
@@ -47,6 +49,7 @@ public:
     void adjusterChanged(Adjuster *a, double newval) override;
     void enabledChanged() override;
     void showDepthMapChanged();
+    void luminanceChanged();
     void setAdjusterBehavior(bool strengthAdd);
 };
 
diff --git a/rtgui/locallab.cc b/rtgui/locallab.cc
index c9a2b611c..7584d1a58 100644
--- a/rtgui/locallab.cc
+++ b/rtgui/locallab.cc
@@ -357,7 +357,7 @@ Locallab::Locallab():
     lumonly(Gtk::manage(new Gtk::CheckButton(M("TP_LOCALLAB_LUMONLY")))),
     enaretiMask(Gtk::manage(new Gtk::CheckButton(M("TP_LOCALLAB_ENABLE_MASK")))),
     enaretiMasktmap(Gtk::manage(new Gtk::CheckButton(M("TP_LOCALLAB_TM_MASK")))),
-    fftwreti(Gtk::manage(new Gtk::CheckButton(M("TP_LOCALLAB_FFTW2")))),
+    fftwreti(Gtk::manage(new Gtk::CheckButton(M("TP_LOCALLAB_FFTW")))),
     // Sharpening
     inverssha(Gtk::manage(new Gtk::CheckButton(M("TP_LOCALLAB_INVERS")))),
     // Local contrast
diff --git a/rtgui/paramsedited.cc b/rtgui/paramsedited.cc
index 878ca6c8b..094c7074c 100644
--- a/rtgui/paramsedited.cc
+++ b/rtgui/paramsedited.cc
@@ -598,6 +598,7 @@ void ParamsEdited::set(bool v)
     dehaze.strength = v;
     dehaze.showDepthMap = v;
     dehaze.depth = v;
+    dehaze.luminance = v;
     metadata.mode = v;
     filmNegative.enabled = v;
     filmNegative.redRatio = v;
@@ -1455,6 +1456,7 @@ void ParamsEdited::initFrom(const std::vector<rtengine::procparams::ProcParams>&
         dehaze.strength = dehaze.strength && p.dehaze.strength == other.dehaze.strength;
         dehaze.showDepthMap = dehaze.showDepthMap && p.dehaze.showDepthMap == other.dehaze.showDepthMap;
         dehaze.depth = dehaze.depth && p.dehaze.depth == other.dehaze.depth;
+        dehaze.luminance = dehaze.luminance && p.dehaze.luminance == other.dehaze.luminance;
         metadata.mode = metadata.mode && p.metadata.mode == other.metadata.mode;
         filmNegative.enabled = filmNegative.enabled && p.filmNegative.enabled == other.filmNegative.enabled;
         filmNegative.redRatio = filmNegative.redRatio && p.filmNegative.redRatio == other.filmNegative.redRatio;
@@ -4551,6 +4553,10 @@ void ParamsEdited::combine(rtengine::procparams::ProcParams& toEdit, const rteng
         toEdit.dehaze.showDepthMap = mods.dehaze.showDepthMap;
     }
 
+    if (dehaze.luminance) {
+        toEdit.dehaze.luminance = mods.dehaze.luminance;
+    }
+
     if (metadata.mode) {
         toEdit.metadata.mode = mods.metadata.mode;
     }
diff --git a/rtgui/paramsedited.h b/rtgui/paramsedited.h
index b47f99cd5..3b2bb05b6 100644
--- a/rtgui/paramsedited.h
+++ b/rtgui/paramsedited.h
@@ -871,6 +871,7 @@ struct DehazeParamsEdited {
     bool strength;
     bool showDepthMap;
     bool depth;
+    bool luminance;
 };
 
 struct RAWParamsEdited {
diff --git a/rtgui/pdsharpening.cc b/rtgui/pdsharpening.cc
index f25e44e69..cd34a466e 100644
--- a/rtgui/pdsharpening.cc
+++ b/rtgui/pdsharpening.cc
@@ -26,14 +26,14 @@
 using namespace rtengine;
 using namespace rtengine::procparams;
 
-PdSharpening::PdSharpening() : FoldableToolPanel(this, "pdsharpening", M("TP_PDSHARPENING_LABEL"), false, true)
+PdSharpening::PdSharpening() : FoldableToolPanel(this, "capturesharpening", M("TP_PDSHARPENING_LABEL"), false, true)
 {
 
     auto m = ProcEventMapper::getInstance();
     EvPdShrContrast = m->newEvent(CAPTURESHARPEN, "HISTORY_MSG_PDSHARPEN_CONTRAST");
     EvPdSharpenGamma = m->newEvent(CAPTURESHARPEN, "HISTORY_MSG_PDSHARPEN_GAMMA");
     EvPdShrDRadius = m->newEvent(CAPTURESHARPEN, "HISTORY_MSG_PDSHARPEN_RADIUS");
-    EvPdShrDRadiusOffset = m->newEvent(CAPTURESHARPEN, "HISTORY_MSG_PDSHARPEN_RADIUS_OFFSET");
+    EvPdShrDRadiusOffset = m->newEvent(CAPTURESHARPEN, "HISTORY_MSG_PDSHARPEN_RADIUS_BOOST");
     EvPdShrDIterations = m->newEvent(CAPTURESHARPEN, "HISTORY_MSG_PDSHARPEN_ITERATIONS");
     EvPdShrAutoContrast = m->newEvent(CAPTURESHARPEN, "HISTORY_MSG_PDSHARPEN_AUTO_CONTRAST");
     EvPdShrAutoRadius = m->newEvent(CAPTURESHARPEN, "HISTORY_MSG_PDSHARPEN_AUTO_RADIUS");
@@ -42,7 +42,7 @@ PdSharpening::PdSharpening() : FoldableToolPanel(this, "pdsharpening", M("TP_PDS
     hb->show();
     contrast = Gtk::manage(new Adjuster(M("TP_SHARPENING_CONTRAST"), 0, 200, 1, 10));
     contrast->setAdjusterListener(this);
-    contrast->addAutoButton(M("TP_RAW_DUALDEMOSAICAUTOCONTRAST_TOOLTIP"));
+    contrast->addAutoButton();
     contrast->setAutoValue(true);
 
     pack_start(*contrast);
@@ -53,9 +53,9 @@ PdSharpening::PdSharpening() : FoldableToolPanel(this, "pdsharpening", M("TP_PDS
     Gtk::VBox* rld = Gtk::manage(new Gtk::VBox());
     gamma = Gtk::manage(new Adjuster(M("TP_SHARPENING_GAMMA"), 0.5, 6.0, 0.05, 1.00));
     dradius = Gtk::manage(new Adjuster(M("TP_SHARPENING_RADIUS"), 0.4, 1.15, 0.01, 0.75));
-    dradius->addAutoButton(M("TP_PDSHARPENING_AUTORADIUS_TOOLTIP"));
+    dradius->addAutoButton();
     dradius->setAutoValue(true);
-    dradiusOffset = Gtk::manage(new Adjuster(M("TP_SHARPENING_RADIUS_OFFSET"), 0.0, 0.5, 0.01, 0.0));
+    dradiusOffset = Gtk::manage(new Adjuster(M("TP_SHARPENING_RADIUS_BOOST"), 0.0, 0.5, 0.01, 0.0));
     diter = Gtk::manage(new Adjuster(M("TP_SHARPENING_RLD_ITERATIONS"), 1, 100, 1, 20));
     rld->pack_start(*gamma);
     rld->pack_start(*dradius);