From 6bebc19f02f23c5c6af98e44d807ae4c7f9dfee0 Mon Sep 17 00:00:00 2001
From: Ingo Weyrich <heckflosse67@gmx.de>
Date: Thu, 26 Sep 2019 15:03:09 +0200
Subject: [PATCH] reviewed boxblur code and usage

---
 rtengine/CMakeLists.txt    |    1 +
 rtengine/FTblockDN.cc      |  212 +++-----
 rtengine/boxblur.cc        |  420 +++++++++++++++
 rtengine/boxblur.h         |  872 +-----------------------------
 rtengine/gauss.cc          |   12 +-
 rtengine/gauss.h           |    2 +-
 rtengine/guidedfilter.cc   |    2 +-
 rtengine/improcfun.h       |   20 +-
 rtengine/ipretinex.cc      | 1044 +++++++++++++++++-------------------
 rtengine/ipsharpen.cc      |    4 +-
 rtengine/ipwavelet.cc      |    1 -
 rtengine/rawimagesource.cc |    4 +-
 rtengine/rawimagesource.h  |    2 +-
 rtengine/rescale.h         |    1 +
 rtengine/shmap.cc          |   13 +-
 15 files changed, 1015 insertions(+), 1595 deletions(-)
 create mode 100644 rtengine/boxblur.cc

diff --git a/rtengine/CMakeLists.txt b/rtengine/CMakeLists.txt
index f52cfa256..add2e51c0 100644
--- a/rtengine/CMakeLists.txt
+++ b/rtengine/CMakeLists.txt
@@ -31,6 +31,7 @@ set(CAMCONSTSFILE "camconst.json")
 
 set(RTENGINESOURCEFILES
     badpixels.cc
+    boxblur.cc
     CA_correct_RT.cc
     capturesharpening.cc
     EdgePreservingDecomposition.cc
diff --git a/rtengine/FTblockDN.cc b/rtengine/FTblockDN.cc
index d1e659114..8fd0ba29e 100644
--- a/rtengine/FTblockDN.cc
+++ b/rtengine/FTblockDN.cc
@@ -479,6 +479,7 @@ enum nrquality {QUALITY_STANDARD, QUALITY_HIGH};
 void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagefloat * dst, Imagefloat * calclum, float * ch_M, float *max_r, float *max_b, bool isRAW, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, const NoiseCurve & noiseLCurve, const NoiseCurve & noiseCCurve, float &nresi, float &highresi)
 {
 BENCHFUN
+
 //#ifdef _DEBUG
     MyTime t1e, t2e;
     t1e.set();
@@ -687,8 +688,8 @@ BENCHFUN
             }
         }
 
-        int tilesize;
-        int overlap;
+        int tilesize = 0;
+        int overlap = 0;
 
         if (settings->leveldnti == 0) {
             tilesize = 1024;
@@ -1341,8 +1342,6 @@ BENCHFUN
 
 #ifdef _OPENMP
                                 int masterThread = omp_get_thread_num();
-#endif
-#ifdef _OPENMP
                                 #pragma omp parallel num_threads(denoiseNestedLevels) if (denoiseNestedLevels>1)
 #endif
                                 {
@@ -1351,11 +1350,9 @@ BENCHFUN
 #else
                                     int subThread = 0;
 #endif
-                                    float blurbuffer[TS * TS] ALIGNED64;
                                     float *Lblox = LbloxArray[subThread];
                                     float *fLblox = fLbloxArray[subThread];
                                     float pBuf[width + TS + 2 * blkrad * offset] ALIGNED16;
-                                    float nbrwt[TS * TS] ALIGNED64;
 #ifdef _OPENMP
                                     #pragma omp for
 #endif
@@ -1430,7 +1427,7 @@ BENCHFUN
 
 
                                         for (int hblk = 0; hblk < numblox_W; ++hblk) {
-                                            RGBtile_denoise(fLblox, hblk, noisevar_Ldetail, nbrwt, blurbuffer);
+                                            RGBtile_denoise(fLblox, hblk, noisevar_Ldetail);
                                         }//end of horizontal block loop
 
                                         //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -1447,14 +1444,8 @@ BENCHFUN
                                         //add row of blocks to output image tile
                                         RGBoutput_tile_row(Lblox, Ldetail, tilemask_out, height, width, topproc);
 
-                                        //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
                                     }//end of vertical block loop
-
-                                    //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
                                 }
-                                //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
 #ifdef _OPENMP
                                 #pragma omp parallel for num_threads(denoiseNestedLevels) if (denoiseNestedLevels>1)
@@ -2041,26 +2032,20 @@ BENCHFUN
 
 }//end of main RGB_denoise
 
-
-//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-
-void ImProcFunctions::RGBtile_denoise(float * fLblox, int hblproc, float noisevar_Ldetail, float * nbrwt, float * blurbuffer)  //for DCT
+void ImProcFunctions::RGBtile_denoise(float* fLblox, int hblproc, float noisevar_Ldetail)  //for DCT
 {
-    int blkstart = hblproc * TS * TS;
+    float nbrwt[TS * TS] ALIGNED64;
+    const int blkstart = hblproc * TS * TS;
 
-    boxabsblur(fLblox + blkstart, nbrwt, 3, 3, TS, TS, blurbuffer); //blur neighbor weights for more robust estimation //for DCT
+    boxabsblur(fLblox + blkstart, nbrwt, 3, TS, TS, false); //blur neighbor weights for more robust estimation //for DCT
 
 #ifdef __SSE2__
-    __m128  tempv;
-    __m128  noisevar_Ldetailv = _mm_set1_ps(noisevar_Ldetail);
-    __m128  onev = _mm_set1_ps(1.0f);
+    const vfloat noisevar_Ldetailv = F2V(-1.f / noisevar_Ldetail);
+    const vfloat onev = F2V(1.f);
 
     for (int n = 0; n < TS * TS; n += 4) { //for DCT
-        tempv  = onev - xexpf(-SQRV(LVF(nbrwt[n])) / noisevar_Ldetailv);
-        _mm_storeu_ps(&fLblox[blkstart + n], LVFU(fLblox[blkstart + n]) * tempv);
+        const vfloat tempv  = onev - xexpf(SQRV(LVF(nbrwt[n])) * noisevar_Ldetailv);
+        STVF(fLblox[blkstart + n], LVF(fLblox[blkstart + n]) * tempv);
     }//output neighbor averaged result
 
 #else
@@ -2071,14 +2056,7 @@ void ImProcFunctions::RGBtile_denoise(float * fLblox, int hblproc, float noiseva
 
 #endif
 
-    //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-    //printf("vblk=%d  hlk=%d  wsqave=%f   ||   ",vblproc,hblproc,wsqave);
-
-}//end of function tile_denoise
-
-
-//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+}
 
 void ImProcFunctions::RGBoutput_tile_row(float *bloxrow_L, float ** Ldetail, float ** tilemask_out, int height, int width, int top)
 {
@@ -2207,7 +2185,7 @@ void ImProcFunctions::Noise_residualAB(const wavelet_decomposition &WaveletCoeff
     chmaxresid = maxresid;
 }
 
-bool ImProcFunctions::WaveletDenoiseAll_BiShrinkL(wavelet_decomposition &WaveletCoeffs_L, float *noisevarlum, float madL[8][3])
+bool ImProcFunctions::WaveletDenoiseAll_BiShrinkL(const wavelet_decomposition &WaveletCoeffs_L, float *noisevarlum, float madL[8][3])
 {
     int maxlvl = min(WaveletCoeffs_L.maxlevel(), 5);
     const float eps = 0.01f;
@@ -2258,23 +2236,22 @@ bool ImProcFunctions::WaveletDenoiseAll_BiShrinkL(wavelet_decomposition &Wavelet
                         //simple wavelet shrinkage
                         float * sfave = buffer[0] + 32;
                         float * sfaved = buffer[2] + 96;
-                        float * blurBuffer = buffer[1] + 64;
 
                         float mad_Lr = madL[lvl][dir - 1];
 
                         float levelFactor = mad_Lr * 5.f / (lvl + 1);
 #ifdef __SSE2__
-                        __m128 mad_Lv;
-                        __m128 ninev = _mm_set1_ps(9.0f);
-                        __m128 epsv = _mm_set1_ps(eps);
-                        __m128 mag_Lv;
-                        __m128 levelFactorv = _mm_set1_ps(levelFactor);
+                        vfloat mad_Lv;
+                        vfloat ninev = F2V(9.0f);
+                        vfloat epsv = F2V(eps);
+                        vfloat mag_Lv;
+                        vfloat levelFactorv = F2V(levelFactor);
                         int coeffloc_L;
 
                         for (coeffloc_L = 0; coeffloc_L < Hlvl_L * Wlvl_L - 3; coeffloc_L += 4) {
                             mad_Lv = LVFU(noisevarlum[coeffloc_L]) * levelFactorv;
                             mag_Lv = SQRV(LVFU(WavCoeffs_L[dir][coeffloc_L]));
-                            _mm_storeu_ps(&sfave[coeffloc_L], mag_Lv / (mag_Lv + mad_Lv * xexpf(-mag_Lv / (mad_Lv * ninev)) + epsv));
+                            STVFU(sfave[coeffloc_L], mag_Lv / (mag_Lv + mad_Lv * xexpf(-mag_Lv / (mad_Lv * ninev)) + epsv));
                         }
 
                         for (; coeffloc_L < Hlvl_L * Wlvl_L; ++coeffloc_L) {
@@ -2294,15 +2271,15 @@ bool ImProcFunctions::WaveletDenoiseAll_BiShrinkL(wavelet_decomposition &Wavelet
                         }
 
 #endif
-                        boxblur(sfave, sfaved, blurBuffer, lvl + 2, lvl + 2, Wlvl_L, Hlvl_L); //increase smoothness by locally averaging shrinkage
+                        boxblur(sfave, sfaved, lvl + 2, Wlvl_L, Hlvl_L, false); //increase smoothness by locally averaging shrinkage
 #ifdef __SSE2__
-                        __m128 sfavev;
-                        __m128 sf_Lv;
+                        vfloat sfavev;
+                        vfloat sf_Lv;
 
                         for (coeffloc_L = 0; coeffloc_L < Hlvl_L * Wlvl_L - 3; coeffloc_L += 4) {
                             sfavev = LVFU(sfaved[coeffloc_L]);
                             sf_Lv = LVFU(sfave[coeffloc_L]);
-                            _mm_storeu_ps(&WavCoeffs_L[dir][coeffloc_L], LVFU(WavCoeffs_L[dir][coeffloc_L]) * (SQRV(sfavev) + SQRV(sf_Lv)) / (sfavev + sf_Lv + epsv));
+                            STVFU(WavCoeffs_L[dir][coeffloc_L], LVFU(WavCoeffs_L[dir][coeffloc_L]) * (SQRV(sfavev) + SQRV(sf_Lv)) / (sfavev + sf_Lv + epsv));
                             //use smoothed shrinkage unless local shrinkage is much less
                         }
 
@@ -2340,7 +2317,7 @@ bool ImProcFunctions::WaveletDenoiseAll_BiShrinkL(wavelet_decomposition &Wavelet
     return (!memoryAllocationFailed);
 }
 
-bool ImProcFunctions::WaveletDenoiseAll_BiShrinkAB(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_ab,
+bool ImProcFunctions::WaveletDenoiseAll_BiShrinkAB(const wavelet_decomposition &WaveletCoeffs_L, const wavelet_decomposition &WaveletCoeffs_ab,
         float *noisevarchrom, float madL[8][3], float noisevar_ab, const bool useNoiseCCurve, bool autoch, bool denoiseMethodRgb)
 {
     int maxlvl = WaveletCoeffs_L.maxlevel();
@@ -2422,12 +2399,12 @@ bool ImProcFunctions::WaveletDenoiseAll_BiShrinkAB(wavelet_decomposition &Wavele
                         if (noisevar_ab > 0.001f) {
 
 #ifdef __SSE2__
-                            __m128 onev = _mm_set1_ps(1.f);
-                            __m128 mad_abrv = _mm_set1_ps(mad_abr);
-                            __m128 rmad_Lm9v = onev / _mm_set1_ps(mad_Lr * 9.f);
-                            __m128 mad_abv;
-                            __m128 mag_Lv, mag_abv;
-                            __m128 tempabv;
+                            vfloat onev = F2V(1.f);
+                            vfloat mad_abrv = F2V(mad_abr);
+                            vfloat rmad_Lm9v = onev / F2V(mad_Lr * 9.f);
+                            vfloat mad_abv;
+                            vfloat mag_Lv, mag_abv;
+                            vfloat tempabv;
                             int coeffloc_ab;
 
                             for (coeffloc_ab = 0; coeffloc_ab < Hlvl_ab * Wlvl_ab - 3; coeffloc_ab += 4) {
@@ -2437,7 +2414,7 @@ bool ImProcFunctions::WaveletDenoiseAll_BiShrinkAB(wavelet_decomposition &Wavele
                                 mag_Lv = LVFU(WavCoeffs_L[dir][coeffloc_ab]);
                                 mag_abv = SQRV(tempabv);
                                 mag_Lv = SQRV(mag_Lv) * rmad_Lm9v;
-                                _mm_storeu_ps(&WavCoeffs_ab[dir][coeffloc_ab], tempabv * SQRV((onev - xexpf(-(mag_abv / mad_abv) - (mag_Lv)))));
+                                STVFU(WavCoeffs_ab[dir][coeffloc_ab], tempabv * SQRV((onev - xexpf(-(mag_abv / mad_abv) - (mag_Lv)))));
                             }
 
                             // few remaining pixels
@@ -2470,9 +2447,7 @@ bool ImProcFunctions::WaveletDenoiseAll_BiShrinkAB(wavelet_decomposition &Wavele
         }
 
         for (int i = 2; i >= 0; i--) {
-            if (buffer[i] != nullptr) {
-                delete[] buffer[i];
-            }
+            delete[] buffer[i];
         }
 
     }
@@ -2480,7 +2455,7 @@ bool ImProcFunctions::WaveletDenoiseAll_BiShrinkAB(wavelet_decomposition &Wavele
 }
 
 
-bool ImProcFunctions::WaveletDenoiseAllL(wavelet_decomposition &WaveletCoeffs_L, float *noisevarlum, float madL[8][3], float * vari, int edge)//mod JD
+bool ImProcFunctions::WaveletDenoiseAllL(const wavelet_decomposition &WaveletCoeffs_L, float *noisevarlum, float madL[8][3], float * vari, int edge)//mod JD
 
 {
 
@@ -2530,16 +2505,14 @@ bool ImProcFunctions::WaveletDenoiseAllL(wavelet_decomposition &WaveletCoeffs_L,
         }
 
         for (int i = 3; i >= 0; i--) {
-            if (buffer[i] != nullptr) {
-                delete[] buffer[i];
-            }
+            delete[] buffer[i];
         }
     }
     return (!memoryAllocationFailed);
 }
 
 
-bool ImProcFunctions::WaveletDenoiseAllAB(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_ab,
+bool ImProcFunctions::WaveletDenoiseAllAB(const wavelet_decomposition &WaveletCoeffs_L, const wavelet_decomposition &WaveletCoeffs_ab,
         float *noisevarchrom, float madL[8][3], float noisevar_ab, const bool useNoiseCCurve, bool autoch, bool denoiseMethodRgb)//mod JD
 
 {
@@ -2596,7 +2569,7 @@ bool ImProcFunctions::WaveletDenoiseAllAB(wavelet_decomposition &WaveletCoeffs_L
 //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
 
-void ImProcFunctions::ShrinkAllL(wavelet_decomposition &WaveletCoeffs_L, float **buffer, int level, int dir,
+void ImProcFunctions::ShrinkAllL(const wavelet_decomposition &WaveletCoeffs_L, float **buffer, int level, int dir,
         float *noisevarlum, float * madL, float * vari, int edge)
 
 {
@@ -2607,12 +2580,12 @@ void ImProcFunctions::ShrinkAllL(wavelet_decomposition &WaveletCoeffs_L, float *
     float * sfaved = buffer[1] + 64;
     float * blurBuffer = buffer[2] + 96;
 
-    int W_L = WaveletCoeffs_L.level_W(level);
-    int H_L = WaveletCoeffs_L.level_H(level);
+    const int W_L = WaveletCoeffs_L.level_W(level);
+    const int H_L = WaveletCoeffs_L.level_H(level);
 
     float ** WavCoeffs_L = WaveletCoeffs_L.level_coeffs(level);
-//      printf("OK lev=%d\n",level);
-    float mad_L = madL[dir - 1] ;
+    const float mad_L = madL[dir - 1] ;
+    const float levelFactor = mad_L * 5.f / static_cast<float>(level + 1);
 
     if (edge == 1 && vari) {
         noisevarlum = blurBuffer;       // we need one buffer, but fortunately we don't have to allocate a new one because we can use blurBuffer
@@ -2622,71 +2595,45 @@ void ImProcFunctions::ShrinkAllL(wavelet_decomposition &WaveletCoeffs_L, float *
         }
     }
 
-    float levelFactor = mad_L * 5.f / static_cast<float>(level + 1);
+    int i = 0;
 #ifdef __SSE2__
-    __m128  magv;
-    __m128 levelFactorv = _mm_set1_ps(levelFactor);
-    __m128  mad_Lv;
-    __m128  ninev = _mm_set1_ps(9.0f);
-    __m128  epsv = _mm_set1_ps(eps);
-    int i;
+    const vfloat levelFactorv = F2V(levelFactor);
+    const vfloat ninev = F2V(9.f);
+    const vfloat epsv = F2V(eps);
 
-    for (i = 0; i < W_L * H_L - 3; i += 4) {
-        mad_Lv = LVFU(noisevarlum[i]) * levelFactorv;
-        magv = SQRV(LVFU(WavCoeffs_L[dir][i]));
-        _mm_storeu_ps(&sfave[i], magv / (magv + mad_Lv * xexpf(-magv / (ninev * mad_Lv)) + epsv));
+    for (; i < W_L * H_L - 3; i += 4) {
+        const vfloat mad_Lv = LVFU(noisevarlum[i]) * levelFactorv;
+        const vfloat magv = SQRV(LVFU(WavCoeffs_L[dir][i]));
+        STVFU(sfave[i], magv / (magv + mad_Lv * xexpf(-magv / (ninev * mad_Lv)) + epsv));
     }
-
+#endif
     // few remaining pixels
     for (; i < W_L * H_L; ++i) {
-        float mag = SQR(WavCoeffs_L[dir][i]);
+        const float mag = SQR(WavCoeffs_L[dir][i]);
         sfave[i] = mag / (mag + levelFactor * noisevarlum[i] * xexpf(-mag / (9 * levelFactor * noisevarlum[i])) + eps);
     }
 
-#else
-
-    for (int i = 0; i < W_L * H_L; ++i) {
-
-        float mag = SQR(WavCoeffs_L[dir][i]);
-        float shrinkfactor = mag / (mag + levelFactor * noisevarlum[i] * xexpf(-mag / (9 * levelFactor * noisevarlum[i])) + eps);
-        sfave[i] = shrinkfactor;
-    }
-
-#endif
-    boxblur(sfave, sfaved, blurBuffer, level + 2, level + 2, W_L, H_L); //increase smoothness by locally averaging shrinkage
+    boxblur(sfave, sfaved, level + 2, W_L, H_L, false); //increase smoothness by locally averaging shrinkage
 
+    i = 0;
 #ifdef __SSE2__
-    __m128  sfv;
 
-    for (i = 0; i < W_L * H_L - 3; i += 4) {
-        sfv = LVFU(sfave[i]);
+    for (; i < W_L * H_L - 3; i += 4) {
+        const vfloat sfv = LVFU(sfave[i]);
         //use smoothed shrinkage unless local shrinkage is much less
-        _mm_storeu_ps(&WavCoeffs_L[dir][i], _mm_loadu_ps(&WavCoeffs_L[dir][i]) * (SQRV(LVFU(sfaved[i])) + SQRV(sfv)) / (LVFU(sfaved[i]) + sfv + epsv));
+        STVFU(WavCoeffs_L[dir][i], LVFU(WavCoeffs_L[dir][i]) * (SQRV(LVFU(sfaved[i])) + SQRV(sfv)) / (LVFU(sfaved[i]) + sfv + epsv));
     }
-
+#endif
     // few remaining pixels
     for (; i < W_L * H_L; ++i) {
-        float sf = sfave[i];
-
+        const float sf = sfave[i];
         //use smoothed shrinkage unless local shrinkage is much less
         WavCoeffs_L[dir][i] *= (SQR(sfaved[i]) + SQR(sf)) / (sfaved[i] + sf + eps);
     }//now luminance coefficients are denoised
-
-#else
-
-    for (int i = 0; i < W_L * H_L; ++i) {
-        float sf = sfave[i];
-
-        //use smoothed shrinkage unless local shrinkage is much less
-        WavCoeffs_L[dir][i] *= (SQR(sfaved[i]) + SQR(sf)) / (sfaved[i] + sf + eps);
-
-    }//now luminance coefficients are denoised
-
-#endif
 }
 
 
-void ImProcFunctions::ShrinkAllAB(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_ab, float **buffer, int level, int dir,
+void ImProcFunctions::ShrinkAllAB(const wavelet_decomposition &WaveletCoeffs_L, const wavelet_decomposition &WaveletCoeffs_ab, float **buffer, int level, int dir,
         float *noisevarchrom, float noisevar_ab, const bool useNoiseCCurve, bool autoch,
         bool denoiseMethodRgb, float * madL, float * madaab,  bool madCalculated)
 
@@ -2700,7 +2647,6 @@ void ImProcFunctions::ShrinkAllAB(wavelet_decomposition &WaveletCoeffs_L, wavele
 
     float * sfaveab = buffer[0] + 32;
     float * sfaveabd = buffer[1] + 64;
-    float * blurBuffer = buffer[2] + 96;
 
     int W_ab = WaveletCoeffs_ab.level_W(level);
     int H_ab = WaveletCoeffs_ab.level_H(level);
@@ -2724,12 +2670,12 @@ void ImProcFunctions::ShrinkAllAB(wavelet_decomposition &WaveletCoeffs_L, wavele
     if (noisevar_ab > 0.001f) {
         madab = useNoiseCCurve ? madab : madab * noisevar_ab;
 #ifdef __SSE2__
-        __m128 onev = _mm_set1_ps(1.f);
-        __m128 mad_abrv = _mm_set1_ps(madab);
+        vfloat onev = F2V(1.f);
+        vfloat mad_abrv = F2V(madab);
 
-        __m128 rmadLm9v = onev / _mm_set1_ps(mad_L * 9.f);
-        __m128 mad_abv ;
-        __m128 mag_Lv, mag_abv;
+        vfloat rmadLm9v = onev / F2V(mad_L * 9.f);
+        vfloat mad_abv ;
+        vfloat mag_Lv, mag_abv;
         int coeffloc_ab;
 
         for (coeffloc_ab = 0; coeffloc_ab < H_ab * W_ab - 3; coeffloc_ab += 4) {
@@ -2738,7 +2684,7 @@ void ImProcFunctions::ShrinkAllAB(wavelet_decomposition &WaveletCoeffs_L, wavele
             mag_Lv = LVFU(WavCoeffs_L[dir][coeffloc_ab]);
             mag_abv = SQRV(LVFU(WavCoeffs_ab[dir][coeffloc_ab]));
             mag_Lv = (SQRV(mag_Lv)) * rmadLm9v;
-            _mm_storeu_ps(&sfaveab[coeffloc_ab], (onev - xexpf(-(mag_abv / mad_abv) - (mag_Lv))));
+            STVFU(sfaveab[coeffloc_ab], (onev - xexpf(-(mag_abv / mad_abv) - (mag_Lv))));
         }
 
         // few remaining pixels
@@ -2761,18 +2707,18 @@ void ImProcFunctions::ShrinkAllAB(wavelet_decomposition &WaveletCoeffs_L, wavele
 
 #endif
 
-        boxblur(sfaveab, sfaveabd, blurBuffer, level + 2, level + 2, W_ab, H_ab); //increase smoothness by locally averaging shrinkage
+        boxblur(sfaveab, sfaveabd, level + 2, W_ab, H_ab, false); //increase smoothness by locally averaging shrinkage
 #ifdef __SSE2__
-        __m128 epsv = _mm_set1_ps(eps);
-        __m128 sfabv;
-        __m128 sfaveabv;
+        vfloat epsv = F2V(eps);
+        vfloat sfabv;
+        vfloat sfaveabv;
 
         for (coeffloc_ab = 0; coeffloc_ab < H_ab * W_ab - 3; coeffloc_ab += 4) {
             sfabv = LVFU(sfaveab[coeffloc_ab]);
             sfaveabv = LVFU(sfaveabd[coeffloc_ab]);
 
             //use smoothed shrinkage unless local shrinkage is much less
-            _mm_storeu_ps(&WavCoeffs_ab[dir][coeffloc_ab], LVFU(WavCoeffs_ab[dir][coeffloc_ab]) * (SQRV(sfaveabv) + SQRV(sfabv)) / (sfaveabv + sfabv + epsv));
+            STVFU(WavCoeffs_ab[dir][coeffloc_ab], LVFU(WavCoeffs_ab[dir][coeffloc_ab]) * (SQRV(sfaveabv) + SQRV(sfabv)) / (sfaveabv + sfabv + epsv));
         }
 
         // few remaining pixels
@@ -2919,8 +2865,8 @@ void ImProcFunctions::ShrinkAll_info(float ** WavCoeffs_a, float ** WavCoeffs_b,
 }
 
 
-void ImProcFunctions::WaveletDenoiseAll_info(int levwav, wavelet_decomposition &WaveletCoeffs_a,
-        wavelet_decomposition &WaveletCoeffs_b, float **noisevarlum, float **noisevarchrom, float **noisevarhue, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &minredaut, float &minblueaut, int schoice,
+void ImProcFunctions::WaveletDenoiseAll_info(int levwav, const wavelet_decomposition &WaveletCoeffs_a,
+        const wavelet_decomposition &WaveletCoeffs_b, float **noisevarlum, float **noisevarchrom, float **noisevarhue, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &minredaut, float &minblueaut, int schoice,
         float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel, float &skinc, float &nsknc, float &maxchred, float &maxchblue, float &minchred, float &minchblue, int &nb, float &chau, float &chred, float &chblue, bool denoiseMethodRgb)
 {
 
@@ -3106,7 +3052,7 @@ void ImProcFunctions::calcautodn_info(float &chaut, float &delta, int Nb, int le
 
 }
 
-void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat * provicalc, const bool isRAW, LUTf &gamcurve, float gam, float gamthresh, float gamslope, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, float &chaut, int &Nb,  float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &minredaut, float &minblueaut, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel, float &skinc, float &nsknc, bool multiThread)
+void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat * provicalc, const bool isRAW, const LUTf &gamcurve, float gam, float gamthresh, float gamslope, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, float &chaut, int &Nb,  float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &minredaut, float &minblueaut, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel, float &skinc, float &nsknc, bool multiThread)
 {
     if ((settings->leveldnautsimpl == 1 && dnparams.Cmethod == "MAN") || (settings->leveldnautsimpl == 0 && dnparams.C2method == "MANU")) {
         //nothing to do
@@ -3173,8 +3119,8 @@ void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat * provicalc,
 
     const float gain = pow(2.0f, float(expcomp));
 
-    int tilesize;
-    int overlap;
+    int tilesize = 0;
+    int overlap = 0;
 
     if (settings->leveldnti == 0) {
         tilesize = 1024;
@@ -3275,16 +3221,16 @@ void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat * provicalc,
                 for (int i = tiletop; i < tilebottom; i += 2) {
                     int i1 = i - tiletop;
 #ifdef __SSE2__
-                    __m128 aNv, bNv;
-                    __m128 c100v = _mm_set1_ps(100.f);
+                    vfloat aNv, bNv;
+                    vfloat c100v = F2V(100.f);
                     int j;
 
                     for (j = tileleft; j < tileright - 7; j += 8) {
                         int j1 = j - tileleft;
                         aNv = LVFU(acalc[i >> 1][j >> 1]);
                         bNv = LVFU(bcalc[i >> 1][j >> 1]);
-                        _mm_storeu_ps(&noisevarhue[i1 >> 1][j1 >> 1], xatan2f(bNv, aNv));
-                        _mm_storeu_ps(&noisevarchrom[i1 >> 1][j1 >> 1], vmaxf(vsqrtf(SQRV(aNv) + SQRV(bNv)),c100v));
+                        STVFU(noisevarhue[i1 >> 1][j1 >> 1], xatan2f(bNv, aNv));
+                        STVFU(noisevarchrom[i1 >> 1][j1 >> 1], vmaxf(vsqrtf(SQRV(aNv) + SQRV(bNv)),c100v));
                     }
 
                     for (; j < tileright; j += 2) {
diff --git a/rtengine/boxblur.cc b/rtengine/boxblur.cc
new file mode 100644
index 000000000..491ffae14
--- /dev/null
+++ b/rtengine/boxblur.cc
@@ -0,0 +1,420 @@
+/*
+ *  This file is part of RawTherapee.
+ *
+ *  Copyright (C) 2010 Emil Martinec <ejmartin@uchicago.edu>
+ *  Copyright (C) 2019 Ingo Weyrich <heckflosse67@gmx.de>
+ *
+ *  RawTherapee is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  RawTherapee is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with RawTherapee.  If not, see <https://www.gnu.org/licenses/>.
+*/
+
+#include <memory>
+#include <cmath>
+
+#include "boxblur.h"
+
+#include "rt_math.h"
+#include "opthelper.h"
+
+namespace rtengine
+{
+
+void boxblur(float** src, float** dst, int radius, int W, int H, bool multiThread)
+{
+    //box blur using rowbuffers and linebuffers instead of a full size buffer
+
+    if (radius == 0) {
+        if (src != dst) {
+#ifdef _OPENMP
+            #pragma omp parallel for if (multiThread)
+#endif
+
+            for (int row = 0; row < H; ++row) {
+                for (int col = 0; col < W; ++col) {
+                    dst[row][col] = src[row][col];
+                }
+            }
+        }
+        return;
+    }
+
+    constexpr int numCols = 8; // process numCols columns at once for better usage of L1 cpu cache
+#ifdef _OPENMP
+    #pragma omp parallel if (multiThread)
+#endif
+    {
+        std::unique_ptr<float[]> buffer(new float[numCols * (radius + 1)]);
+
+        //horizontal blur
+        float* const lineBuffer = buffer.get();
+#ifdef _OPENMP
+        #pragma omp for
+#endif
+        for (int row = 0; row < H; ++row) {
+            float len = radius + 1;
+            float tempval = src[row][0];
+            lineBuffer[0] = tempval;
+            for (int j = 1; j <= radius; j++) {
+                tempval += src[row][j];
+            }
+
+            tempval /= len;
+            dst[row][0] = tempval;
+
+            for (int col = 1; col <= radius; ++col) {
+                lineBuffer[col] = src[row][col];
+                tempval = (tempval * len + src[row][col + radius]) / (len + 1);
+                dst[row][col] = tempval;
+                ++len;
+            }
+            int pos = 0;
+            for (int col = radius + 1; col < W - radius; ++col) {
+                const float oldVal = lineBuffer[pos];
+                lineBuffer[pos] = src[row][col];
+                tempval = tempval + (src[row][col + radius] - oldVal) / len;
+                dst[row][col] = tempval;
+                ++pos;
+                pos = pos <= radius ? pos : 0;
+            }
+
+            for (int col = W - radius; col < W; ++col) {
+                tempval = (tempval * len - lineBuffer[pos]) / (len - 1);
+                dst[row][col] = tempval;
+                --len;
+                ++pos;
+                pos = pos <= radius ? pos : 0;
+            }
+        }
+
+        //vertical blur
+#ifdef __SSE2__
+        vfloat (* const rowBuffer)[2] = (vfloat(*)[2]) buffer.get();
+        const vfloat leninitv = F2V(radius + 1);
+        const vfloat onev = F2V(1.f);
+        vfloat tempv, temp1v, lenv, lenp1v, lenm1v, rlenv;
+
+#ifdef _OPENMP
+        #pragma omp for nowait
+#endif
+
+        for (int col = 0; col < W - 7; col += 8) {
+            lenv = leninitv;
+            tempv = LVFU(dst[0][col]);
+            temp1v = LVFU(dst[0][col + 4]);
+            rowBuffer[0][0] = tempv;
+            rowBuffer[0][1] = temp1v;
+
+            for (int i = 1; i <= radius; ++i) {
+                tempv = tempv + LVFU(dst[i][col]);
+                temp1v = temp1v + LVFU(dst[i][col + 4]);
+            }
+
+            tempv = tempv / lenv;
+            temp1v = temp1v / lenv;
+            STVFU(dst[0][col], tempv);
+            STVFU(dst[0][col + 4], temp1v);
+
+            for (int row = 1; row <= radius; ++row) {
+                rowBuffer[row][0] = LVFU(dst[row][col]);
+                rowBuffer[row][1] = LVFU(dst[row][col + 4]);
+                lenp1v = lenv + onev;
+                tempv = (tempv * lenv + LVFU(dst[row + radius][col])) / lenp1v;
+                temp1v = (temp1v * lenv + LVFU(dst[row + radius][col + 4])) / lenp1v;
+                STVFU(dst[row][col], tempv);
+                STVFU(dst[row][col + 4], temp1v);
+                lenv = lenp1v;
+            }
+
+            rlenv = onev / lenv;
+            int pos = 0;
+            for (int row = radius + 1; row < H - radius; ++row) {
+                vfloat oldVal0 = rowBuffer[pos][0];
+                vfloat oldVal1 = rowBuffer[pos][1];
+                rowBuffer[pos][0] = LVFU(dst[row][col]);
+                rowBuffer[pos][1] = LVFU(dst[row][col + 4]);
+                tempv = tempv + (LVFU(dst[row + radius][col]) - oldVal0) * rlenv ;
+                temp1v = temp1v + (LVFU(dst[row + radius][col + 4]) - oldVal1) * rlenv ;
+                STVFU(dst[row][col], tempv);
+                STVFU(dst[row][col + 4], temp1v);
+                ++pos;
+                pos = pos <= radius ? pos : 0;
+            }
+
+            for (int row = H - radius; row < H; ++row) {
+                lenm1v = lenv - onev;
+                tempv = (tempv * lenv - rowBuffer[pos][0]) / lenm1v;
+                temp1v = (temp1v * lenv - rowBuffer[pos][1]) / lenm1v;
+                STVFU(dst[row][col], tempv);
+                STVFU(dst[row][col + 4], temp1v);
+                lenv = lenm1v;
+                ++pos;
+                pos = pos <= radius ? pos : 0;
+            }
+        }
+
+#else
+        float (* const rowBuffer)[8] = (float(*)[8]) buffer.get();
+#ifdef _OPENMP
+        #pragma omp for nowait
+#endif
+
+        for (int col = 0; col < W - numCols + 1; col += 8) {
+            float len = radius + 1;
+
+            for (int k = 0; k < numCols; ++k) {
+                rowBuffer[0][k] = dst[0][col + k];
+            }
+
+            for (int i = 1; i <= radius; ++i) {
+                for (int k = 0; k < numCols; ++k) {
+                    dst[0][col + k] += dst[i][col + k];
+                }
+            }
+
+            for(int k = 0; k < numCols; ++k) {
+                dst[0][col + k] /= len;
+            }
+
+            for (int row = 1; row <= radius; ++row) {
+                for(int k = 0; k < numCols; ++k) {
+                    rowBuffer[row][k] = dst[row][col + k];
+                    dst[row][col + k] = (dst[row - 1][col + k] * len + dst[row + radius][col + k]) / (len + 1);
+                }
+
+                len ++;
+            }
+
+            int pos = 0;
+            for (int row = radius + 1; row < H - radius; ++row) {
+                for(int k = 0; k < numCols; ++k) {
+                    float oldVal = rowBuffer[pos][k];
+                    rowBuffer[pos][k] = dst[row][col + k];
+                    dst[row][col + k] = dst[row - 1][col + k] + (dst[row + radius][col + k] - oldVal) / len;
+                }
+                ++pos;
+                pos = pos <= radius ? pos : 0;
+            }
+
+            for (int row = H - radius; row < H; ++row) {
+                for(int k = 0; k < numCols; ++k) {
+                    dst[row][col + k] = (dst[row - 1][col + k] * len - rowBuffer[pos][k]) / (len - 1);
+                }
+                len --;
+                ++pos;
+                pos = pos <= radius ? pos : 0;
+            }
+        }
+
+#endif
+        //vertical blur, remaining columns
+#ifdef _OPENMP
+        #pragma omp single
+#endif
+        {
+            const int remaining = W % numCols;
+
+            if (remaining > 0) {
+                float (* const rowBuffer)[8] = (float(*)[8]) buffer.get();
+                const int col = W - remaining;
+
+                float len = radius + 1;
+                for(int k = 0; k < remaining; ++k) {
+                    rowBuffer[0][k] = dst[0][col + k];
+                }
+                for (int row = 1; row <= radius; ++row) {
+                    for(int k = 0; k < remaining; ++k) {
+                        dst[0][col + k] += dst[row][col + k];
+                    }
+                }
+                for(int k = 0; k < remaining; ++k) {
+                    dst[0][col + k] /= len;
+                }
+                for (int row = 1; row <= radius; ++row) {
+                    for(int k = 0; k < remaining; ++k) {
+                        rowBuffer[row][k] = dst[row][col + k];
+                        dst[row][col + k] = (dst[row - 1][col + k] * len + dst[row + radius][col + k]) / (len + 1);
+                    }
+                    len ++;
+                }
+                const float rlen = 1.f / len;
+                int pos = 0;
+                for (int row = radius + 1; row < H - radius; ++row) {
+                    for(int k = 0; k < remaining; ++k) {
+                        float oldVal = rowBuffer[pos][k];
+                        rowBuffer[pos][k] = dst[row][col + k];
+                        dst[row][col + k] = dst[row - 1][col + k] + (dst[row + radius][col + k] - oldVal) * rlen;
+                    }
+                    ++pos;
+                    pos = pos <= radius ? pos : 0;
+                }
+                for (int row = H - radius; row < H; ++row) {
+                    for(int k = 0; k < remaining; ++k) {
+                        dst[row][col + k] = (dst[(row - 1)][col + k] * len - rowBuffer[pos][k]) / (len - 1);
+                    }
+                    len --;
+                    ++pos;
+                    pos = pos <= radius ? pos : 0;
+                }
+            }
+        }
+    }
+}
+
+void boxabsblur(float** src, float** dst, int radius, int W, int H, bool multiThread)
+{
+    //abs box blur using rowbuffers and linebuffers instead of a full size buffer, W should be a multiple of 16
+
+    if (radius == 0) {
+        if (src != dst) {
+#ifdef _OPENMP
+            #pragma omp parallel for if (multiThread)
+#endif
+
+            for (int row = 0; row < H; ++row) {
+                for (int col = 0; col < W; ++col) {
+                    dst[row][col] = std::fabs(src[row][col]);
+                }
+            }
+        }
+        return;
+    }
+
+    constexpr int numCols = 16; // process numCols columns at once for better usage of L1 cpu cache
+#ifdef _OPENMP
+    #pragma omp parallel if (multiThread)
+#endif
+    {
+        float buffer[numCols * (radius + 1)] ALIGNED64;
+
+        //horizontal blur
+        float* const lineBuffer = buffer;
+#ifdef _OPENMP
+        #pragma omp for
+#endif
+        for (int row = 0; row < H; ++row) {
+            float len = radius + 1;
+            float tempval = std::fabs(src[row][0]);
+            lineBuffer[0] = tempval;
+            for (int j = 1; j <= radius; j++) {
+                tempval += std::fabs(src[row][j]);
+            }
+
+            tempval /= len;
+            dst[row][0] = tempval;
+
+            for (int col = 1; col <= radius; ++col) {
+                lineBuffer[col] = std::fabs(src[row][col]);
+                tempval = (tempval * len + std::fabs(src[row][col + radius])) / (len + 1);
+                dst[row][col] = tempval;
+                ++len;
+            }
+
+            const float rlen = 1.f / len;
+            int pos = 0;
+            for (int col = radius + 1; col < W - radius; ++col) {
+                const float oldVal = lineBuffer[pos];
+                lineBuffer[pos] = std::fabs(src[row][col]);
+                tempval = tempval + (std::fabs(src[row][col + radius]) - oldVal) * rlen;
+                dst[row][col] = tempval;
+                ++pos;
+                pos = pos <= radius ? pos : 0;
+            }
+
+            for (int col = W - radius; col < W; ++col) {
+                tempval = (tempval * len - lineBuffer[pos]) / (len - 1);
+                dst[row][col] = tempval;
+                --len;
+                ++pos;
+                pos = pos <= radius ? pos : 0;
+            }
+        }
+
+        //vertical blur
+        float (* const rowBuffer)[numCols] = (float(*)[numCols]) buffer;
+#ifdef _OPENMP
+        #pragma omp for
+#endif
+
+        for (int col = 0; col < W; col += numCols) {
+            float len = radius + 1;
+
+            for (int k = 0; k < numCols; ++k) {
+                rowBuffer[0][k] = dst[0][col + k];
+            }
+
+            for (int i = 1; i <= radius; ++i) {
+                for (int k = 0; k < numCols; ++k) {
+                    dst[0][col + k] += dst[i][col + k];
+                }
+            }
+
+            for(int k = 0; k < numCols; ++k) {
+                dst[0][col + k] /= len;
+            }
+
+            for (int row = 1; row <= radius; ++row) {
+                for(int k = 0; k < numCols; ++k) {
+                    rowBuffer[row][k] = dst[row][col + k];
+                    dst[row][col + k] = (dst[row - 1][col + k] * len + dst[row + radius][col + k]) / (len + 1);
+                }
+
+                ++len;
+            }
+
+            const float rlen = 1.f / len;
+            int pos = 0;
+            for (int row = radius + 1; row < H - radius; ++row) {
+                for(int k = 0; k < numCols; ++k) {
+                    float oldVal = rowBuffer[pos][k];
+                    rowBuffer[pos][k] = dst[row][col + k];
+                    dst[row][col + k] = dst[row - 1][col + k] + (dst[row + radius][col + k] - oldVal) * rlen;
+                }
+                ++pos;
+                pos = pos <= radius ? pos : 0;
+            }
+
+            for (int row = H - radius; row < H; ++row) {
+                for(int k = 0; k < numCols; ++k) {
+                    dst[row][col + k] = (dst[row - 1][col + k] * len - rowBuffer[pos][k]) / (len - 1);
+                }
+                --len;
+                ++pos;
+                pos = pos <= radius ? pos : 0;
+            }
+        }
+    }
+}
+
+void boxblur(float* src, float* dst, int radius, int W, int H, bool multiThread)
+{
+    float* srcp[H];
+    float* dstp[H];
+    for (int i = 0; i < H; ++i) {
+        srcp[i] = src + i * W;
+        dstp[i] = dst + i * W;
+    }
+    boxblur(srcp, dstp, radius, W, H, multiThread);
+}
+
+void boxabsblur(float* src, float* dst, int radius, int W, int H, bool multiThread)
+{
+    float* srcp[H];
+    float* dstp[H];
+    for (int i = 0; i < H; ++i) {
+        srcp[i] = src + i * W;
+        dstp[i] = dst + i * W;
+    }
+    boxabsblur(srcp, dstp, radius, W, H, multiThread);
+}
+
+}
diff --git a/rtengine/boxblur.h b/rtengine/boxblur.h
index 1689c5ed1..57fba9119 100644
--- a/rtengine/boxblur.h
+++ b/rtengine/boxblur.h
@@ -1,7 +1,7 @@
 /*
  *  This file is part of RawTherapee.
  *
- *  Copyright (C) 2010 Emil Martinec <ejmartin@uchicago.edu>
+ *  Copyright (C) 2019 Ingo Weyrich <heckflosse67@gmx.de>
  *
  *  RawTherapee is free software: you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -15,873 +15,15 @@
  *
  *  You should have received a copy of the GNU General Public License
  *  along with RawTherapee.  If not, see <https://www.gnu.org/licenses/>.
- */
-#ifndef _BOXBLUR_H_
-#define _BOXBLUR_H_
-
-#include <assert.h>
-#include <memory>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include "alignedbuffer.h"
-#include "rt_math.h"
-#include "opthelper.h"
+*/
+#pragma once
 
 namespace rtengine
 {
 
-// classical filtering if the support window is small:
-
-template<class T, class A> void boxblur (T** src, A** dst, int radx, int rady, int W, int H)
-{
-    //box blur image; box range = (radx,rady)
-    assert(2*radx+1 < W);
-    assert(2*rady+1 < H);
-
-    AlignedBuffer<float>* buffer = new AlignedBuffer<float> (W * H);
-    float* temp = buffer->data;
-
-    if (radx == 0) {
-#ifdef _OPENMP
-        #pragma omp parallel for
-#endif
-
-        for (int row = 0; row < H; row++)
-            for (int col = 0; col < W; col++) {
-                temp[row * W + col] = (float)src[row][col];
-            }
-    } else {
-        //horizontal blur
-#ifdef _OPENMP
-        #pragma omp parallel for
-#endif
-
-        for (int row = 0; row < H; row++) {
-            int len = radx + 1;
-            temp[row * W + 0] = (float)src[row][0] / len;
-
-            for (int j = 1; j <= radx; j++) {
-                temp[row * W + 0] += (float)src[row][j] / len;
-            }
-
-            for (int col = 1; col <= radx; col++) {
-                temp[row * W + col] = (temp[row * W + col - 1] * len + (float)src[row][col + radx]) / (len + 1);
-                len ++;
-            }
-
-            for (int col = radx + 1; col < W - radx; col++) {
-                temp[row * W + col] = temp[row * W + col - 1] + ((float)(src[row][col + radx] - src[row][col - radx - 1])) / len;
-            }
-
-            for (int col = W - radx; col < W; col++) {
-                temp[row * W + col] = (temp[row * W + col - 1] * len - src[row][col - radx - 1]) / (len - 1);
-                len --;
-            }
-        }
-    }
-
-    if (rady == 0) {
-#ifdef _OPENMP
-        #pragma omp parallel for
-#endif
-
-        for (int row = 0; row < H; row++)
-            for (int col = 0; col < W; col++) {
-                dst[row][col] = temp[row * W + col];
-            }
-    } else {
-        //vertical blur
-#ifdef _OPENMP
-        #pragma omp parallel for
-#endif
-
-        for (int col = 0; col < W; col++) {
-            int len = rady + 1;
-            dst[0][col] = temp[0 * W + col] / len;
-
-            for (int i = 1; i <= rady; i++) {
-                dst[0][col] += temp[i * W + col] / len;
-            }
-
-            for (int row = 1; row <= rady; row++) {
-                dst[row][col] = (dst[(row - 1)][col] * len + temp[(row + rady) * W + col]) / (len + 1);
-                len ++;
-            }
-
-            for (int row = rady + 1; row < H - rady; row++) {
-                dst[row][col] = dst[(row - 1)][col] + (temp[(row + rady) * W + col] - temp[(row - rady - 1) * W + col]) / len;
-            }
-
-            for (int row = H - rady; row < H; row++) {
-                dst[row][col] = (dst[(row - 1)][col] * len - temp[(row - rady - 1) * W + col]) / (len - 1);
-                len --;
-            }
-        }
-    }
-
-    delete buffer;
+void boxblur(float** src, float** dst, int radius, int W, int H, bool multiThread);
+void boxblur(float* src, float* dst, int radius, int W, int H, bool multiThread);
+void boxabsblur(float** src, float** dst, int radius, int W, int H, bool multiThread);
+void boxabsblur(float* src, float* dst, int radius, int W, int H, bool multiThread);
 
 }
-
-template<class T, class A> void boxblur (T** src, A** dst, T* buffer, int radx, int rady, int W, int H)
-{
-    //box blur image; box range = (radx,rady)
-
-    float* temp = buffer;
-
-    if (radx == 0) {
-#ifdef _OPENMP
-        #pragma omp for
-#endif
-
-        for (int row = 0; row < H; row++)
-            for (int col = 0; col < W; col++) {
-                temp[row * W + col] = (float)src[row][col];
-            }
-    } else {
-        //horizontal blur
-#ifdef _OPENMP
-        #pragma omp for
-#endif
-
-        for (int row = 0; row < H; row++) {
-            float len = radx + 1;
-            float tempval = (float)src[row][0];
-
-            for (int j = 1; j <= radx; j++) {
-                tempval += (float)src[row][j];
-            }
-
-            tempval /= len;
-            temp[row * W + 0] = tempval;
-
-            for (int col = 1; col <= radx; col++) {
-                temp[row * W + col] = tempval = (tempval * len + (float)src[row][col + radx]) / (len + 1);
-                len ++;
-            }
-
-            for (int col = radx + 1; col < W - radx; col++) {
-                temp[row * W + col] = tempval = tempval + ((float)(src[row][col + radx] - src[row][col - radx - 1])) / len;
-            }
-
-            for (int col = W - radx; col < W; col++) {
-                temp[row * W + col] = tempval = (tempval * len - src[row][col - radx - 1]) / (len - 1);
-                len --;
-            }
-        }
-    }
-
-    if (rady == 0) {
-#ifdef _OPENMP
-        #pragma omp for
-#endif
-
-        for (int row = 0; row < H; row++)
-            for (int col = 0; col < W; col++) {
-                dst[row][col] = temp[row * W + col];
-            }
-    } else {
-        const int numCols = 8; // process numCols columns at once for better usage of L1 cpu cache
-#ifdef __SSE2__
-        vfloat  leninitv = F2V( (float)(rady + 1));
-        vfloat  onev = F2V( 1.f );
-        vfloat  tempv, temp1v, lenv, lenp1v, lenm1v, rlenv;
-
-#ifdef _OPENMP
-        #pragma omp for
-#endif
-
-        for (int col = 0; col < W - 7; col += 8) {
-            lenv = leninitv;
-            tempv = LVFU(temp[0 * W + col]);
-            temp1v = LVFU(temp[0 * W + col + 4]);
-
-            for (int i = 1; i <= rady; i++) {
-                tempv = tempv + LVFU(temp[i * W + col]);
-                temp1v = temp1v + LVFU(temp[i * W + col + 4]);
-            }
-
-            tempv = tempv / lenv;
-            temp1v = temp1v / lenv;
-            STVFU(dst[0][col], tempv);
-            STVFU(dst[0][col + 4], temp1v);
-
-            for (int row = 1; row <= rady; row++) {
-                lenp1v = lenv + onev;
-                tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v;
-                temp1v = (temp1v * lenv + LVFU(temp[(row + rady) * W + col + 4])) / lenp1v;
-                STVFU(dst[row][col], tempv);
-                STVFU(dst[row][col + 4], temp1v);
-                lenv = lenp1v;
-            }
-
-            rlenv = onev / lenv;
-
-            for (int row = rady + 1; row < H - rady; row++) {
-                tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ;
-                temp1v = temp1v + (LVFU(temp[(row + rady) * W + col + 4]) - LVFU(temp[(row - rady - 1) * W + col + 4])) * rlenv ;
-                STVFU(dst[row][col], tempv);
-                STVFU(dst[row][col + 4], temp1v);
-            }
-
-            for (int row = H - rady; row < H; row++) {
-                lenm1v = lenv - onev;
-                tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v;
-                temp1v = (temp1v * lenv - LVFU(temp[(row - rady - 1) * W + col + 4])) / lenm1v;
-                STVFU(dst[row][col], tempv);
-                STVFU(dst[row][col + 4], temp1v);
-                lenv = lenm1v;
-            }
-        }
-
-#else
-        //vertical blur
-#ifdef _OPENMP
-        #pragma omp for
-#endif
-
-        for (int col = 0; col < W - numCols + 1; col += 8) {
-            float len = rady + 1;
-
-            for(int k = 0; k < numCols; k++) {
-                dst[0][col + k] = temp[0 * W + col + k];
-            }
-
-            for (int i = 1; i <= rady; i++) {
-                for(int k = 0; k < numCols; k++) {
-                    dst[0][col + k] += temp[i * W + col + k];
-                }
-            }
-
-            for(int k = 0; k < numCols; k++) {
-                dst[0][col + k] /= len;
-            }
-
-            for (int row = 1; row <= rady; row++) {
-                for(int k = 0; k < numCols; k++) {
-                    dst[row][col + k] = (dst[(row - 1)][col + k] * len + temp[(row + rady) * W + col + k]) / (len + 1);
-                }
-
-                len ++;
-            }
-
-            for (int row = rady + 1; row < H - rady; row++) {
-                for(int k = 0; k < numCols; k++) {
-                    dst[row][col + k] = dst[(row - 1)][col + k] + (temp[(row + rady) * W + col + k] - temp[(row - rady - 1) * W + col + k]) / len;
-                }
-            }
-
-            for (int row = H - rady; row < H; row++) {
-                for(int k = 0; k < numCols; k++) {
-                    dst[row][col + k] = (dst[(row - 1)][col + k] * len - temp[(row - rady - 1) * W + col + k]) / (len - 1);
-                }
-
-                len --;
-            }
-        }
-
-#endif
-#ifdef _OPENMP
-        #pragma omp single
-#endif
-
-        for (int col = W - (W % numCols); col < W; col++) {
-            float len = rady + 1;
-            dst[0][col] = temp[0 * W + col] / len;
-
-            for (int i = 1; i <= rady; i++) {
-                dst[0][col] += temp[i * W + col] / len;
-            }
-
-            for (int row = 1; row <= rady; row++) {
-                dst[row][col] = (dst[(row - 1)][col] * len + temp[(row + rady) * W + col]) / (len + 1);
-                len ++;
-            }
-
-            for (int row = rady + 1; row < H - rady; row++) {
-                dst[row][col] = dst[(row - 1)][col] + (temp[(row + rady) * W + col] - temp[(row - rady - 1) * W + col]) / len;
-            }
-
-            for (int row = H - rady; row < H; row++) {
-                dst[row][col] = (dst[(row - 1)][col] * len - temp[(row - rady - 1) * W + col]) / (len - 1);
-                len --;
-            }
-        }
-    }
-
-}
-
-inline void boxblur (float** src, float** dst, int radius, int W, int H, bool multiThread)
-{
-    //box blur using rowbuffers and linebuffers instead of a full size buffer
-
-    if (radius == 0) {
-        if (src != dst) {
-#ifdef _OPENMP
-            #pragma omp parallel for if (multiThread)
-#endif
-
-            for (int row = 0; row < H; row++) {
-                for (int col = 0; col < W; col++) {
-                    dst[row][col] = src[row][col];
-                }
-            }
-        }
-        return;
-    }
-
-    constexpr int numCols = 8; // process numCols columns at once for better usage of L1 cpu cache
-#ifdef _OPENMP
-    #pragma omp parallel if (multiThread)
-#endif
-    {
-        std::unique_ptr<float[]> buffer(new float[numCols * (radius + 1)]);
-
-        //horizontal blur
-        float* const lineBuffer = buffer.get();
-#ifdef _OPENMP
-        #pragma omp for
-#endif
-        for (int row = 0; row < H; row++) {
-            float len = radius + 1;
-            float tempval = src[row][0];
-            lineBuffer[0] = tempval;
-            for (int j = 1; j <= radius; j++) {
-                tempval += src[row][j];
-            }
-
-            tempval /= len;
-            dst[row][0] = tempval;
-
-            for (int col = 1; col <= radius; col++) {
-                lineBuffer[col] = src[row][col];
-                tempval = (tempval * len + src[row][col + radius]) / (len + 1);
-                dst[row][col] = tempval;
-                ++len;
-            }
-            int pos = 0;
-            for (int col = radius + 1; col < W - radius; col++) {
-                const float oldVal = lineBuffer[pos];
-                lineBuffer[pos] = src[row][col];
-                tempval = tempval + (src[row][col + radius] - oldVal) / len;
-                dst[row][col] = tempval;
-                ++pos;
-                pos = pos <= radius ? pos : 0;
-            }
-
-            for (int col = W - radius; col < W; col++) {
-                tempval = (tempval * len - lineBuffer[pos]) / (len - 1);
-                dst[row][col] = tempval;
-                --len;
-                ++pos;
-                pos = pos <= radius ? pos : 0;
-            }
-        }
-
-        //vertical blur
-#ifdef __SSE2__
-        vfloat (* const rowBuffer)[2] = (vfloat(*)[2]) buffer.get();
-        const vfloat leninitv = F2V(radius + 1);
-        const vfloat onev = F2V(1.f);
-        vfloat tempv, temp1v, lenv, lenp1v, lenm1v, rlenv;
-
-#ifdef _OPENMP
-        #pragma omp for nowait
-#endif
-
-        for (int col = 0; col < W - 7; col += 8) {
-            lenv = leninitv;
-            tempv = LVFU(dst[0][col]);
-            temp1v = LVFU(dst[0][col + 4]);
-            rowBuffer[0][0] = tempv;
-            rowBuffer[0][1] = temp1v;
-
-            for (int i = 1; i <= radius; i++) {
-                tempv = tempv + LVFU(dst[i][col]);
-                temp1v = temp1v + LVFU(dst[i][col + 4]);
-            }
-
-            tempv = tempv / lenv;
-            temp1v = temp1v / lenv;
-            STVFU(dst[0][col], tempv);
-            STVFU(dst[0][col + 4], temp1v);
-
-            for (int row = 1; row <= radius; row++) {
-                rowBuffer[row][0] = LVFU(dst[row][col]);
-                rowBuffer[row][1] = LVFU(dst[row][col + 4]);
-                lenp1v = lenv + onev;
-                tempv = (tempv * lenv + LVFU(dst[row + radius][col])) / lenp1v;
-                temp1v = (temp1v * lenv + LVFU(dst[row + radius][col + 4])) / lenp1v;
-                STVFU(dst[row][col], tempv);
-                STVFU(dst[row][col + 4], temp1v);
-                lenv = lenp1v;
-            }
-
-            rlenv = onev / lenv;
-            int pos = 0;
-            for (int row = radius + 1; row < H - radius; row++) {
-                vfloat oldVal0 = rowBuffer[pos][0];
-                vfloat oldVal1 = rowBuffer[pos][1];
-                rowBuffer[pos][0] = LVFU(dst[row][col]);
-                rowBuffer[pos][1] = LVFU(dst[row][col + 4]);
-                tempv = tempv + (LVFU(dst[row + radius][col]) - oldVal0) * rlenv ;
-                temp1v = temp1v + (LVFU(dst[row + radius][col + 4]) - oldVal1) * rlenv ;
-                STVFU(dst[row][col], tempv);
-                STVFU(dst[row][col + 4], temp1v);
-                ++pos;
-                pos = pos <= radius ? pos : 0;
-            }
-
-            for (int row = H - radius; row < H; row++) {
-                lenm1v = lenv - onev;
-                tempv = (tempv * lenv - rowBuffer[pos][0]) / lenm1v;
-                temp1v = (temp1v * lenv - rowBuffer[pos][1]) / lenm1v;
-                STVFU(dst[row][col], tempv);
-                STVFU(dst[row][col + 4], temp1v);
-                lenv = lenm1v;
-                ++pos;
-                pos = pos <= radius ? pos : 0;
-            }
-        }
-
-#else
-        float (* const rowBuffer)[8] = (float(*)[8]) buffer.get();
-#ifdef _OPENMP
-        #pragma omp for nowait
-#endif
-
-        for (int col = 0; col < W - numCols + 1; col += 8) {
-            float len = radius + 1;
-
-            for (int k = 0; k < numCols; k++) {
-                rowBuffer[0][k] = dst[0][col + k];
-            }
-
-            for (int i = 1; i <= radius; i++) {
-                for (int k = 0; k < numCols; k++) {
-                    dst[0][col + k] += dst[i][col + k];
-                }
-            }
-
-            for(int k = 0; k < numCols; k++) {
-                dst[0][col + k] /= len;
-            }
-
-            for (int row = 1; row <= radius; row++) {
-                for(int k = 0; k < numCols; k++) {
-                    rowBuffer[row][k] = dst[row][col + k];
-                    dst[row][col + k] = (dst[row - 1][col + k] * len + dst[row + radius][col + k]) / (len + 1);
-                }
-
-                len ++;
-            }
-
-            int pos = 0;
-            for (int row = radius + 1; row < H - radius; row++) {
-                for(int k = 0; k < numCols; k++) {
-                    float oldVal = rowBuffer[pos][k];
-                    rowBuffer[pos][k] = dst[row][col + k];
-                    dst[row][col + k] = dst[row - 1][col + k] + (dst[row + radius][col + k] - oldVal) / len;
-                }
-                ++pos;
-                pos = pos <= radius ? pos : 0;
-            }
-
-            for (int row = H - radius; row < H; row++) {
-                for(int k = 0; k < numCols; k++) {
-                    dst[row][col + k] = (dst[row - 1][col + k] * len - rowBuffer[pos][k]) / (len - 1);
-                }
-                len --;
-                ++pos;
-                pos = pos <= radius ? pos : 0;
-            }
-        }
-
-#endif
-        //vertical blur, remaining columns
-#ifdef _OPENMP
-        #pragma omp single
-#endif
-        {
-            const int remaining = W % numCols;
-
-            if (remaining > 0) {
-                float (* const rowBuffer)[8] = (float(*)[8]) buffer.get();
-                const int col = W - remaining;
-
-                float len = radius + 1;
-                for(int k = 0; k < remaining; ++k) {
-                    rowBuffer[0][k] = dst[0][col + k];
-                }
-                for (int row = 1; row <= radius; ++row) {
-                    for(int k = 0; k < remaining; ++k) {
-                        dst[0][col + k] += dst[row][col + k];
-                    }
-                }
-                for(int k = 0; k < remaining; ++k) {
-                    dst[0][col + k] /= len;
-                }
-                for (int row = 1; row <= radius; ++row) {
-                    for(int k = 0; k < remaining; ++k) {
-                        rowBuffer[row][k] = dst[row][col + k];
-                        dst[row][col + k] = (dst[row - 1][col + k] * len + dst[row + radius][col + k]) / (len + 1);
-                    }
-                    len ++;
-                }
-                const float rlen = 1.f / len;
-                int pos = 0;
-                for (int row = radius + 1; row < H - radius; ++row) {
-                    for(int k = 0; k < remaining; ++k) {
-                        float oldVal = rowBuffer[pos][k];
-                        rowBuffer[pos][k] = dst[row][col + k];
-                        dst[row][col + k] = dst[row - 1][col + k] + (dst[row + radius][col + k] - oldVal) * rlen;
-                    }
-                    ++pos;
-                    pos = pos <= radius ? pos : 0;
-                }
-                for (int row = H - radius; row < H; ++row) {
-                    for(int k = 0; k < remaining; ++k) {
-                        dst[row][col + k] = (dst[(row - 1)][col + k] * len - rowBuffer[pos][k]) / (len - 1);
-                    }
-                    len --;
-                    ++pos;
-                    pos = pos <= radius ? pos : 0;
-                }
-            }
-        }
-    }
-}
-
-template<class T, class A> void boxblur (T* src, A* dst, A* buffer, int radx, int rady, int W, int H)
-{
-    //box blur image; box range = (radx,rady) i.e. box size is (2*radx+1)x(2*rady+1)
-
-    float* temp = buffer;
-
-    if (radx == 0) {
-        for (int row = 0; row < H; row++)
-            for (int col = 0; col < W; col++) {
-                temp[row * W + col] = src[row * W + col];
-            }
-    } else {
-        //horizontal blur
-        for (int row = H - 1; row >= 0; row--) {
-            int len = radx + 1;
-            float tempval = (float)src[row * W];
-
-            for (int j = 1; j <= radx; j++) {
-                tempval += (float)src[row * W + j];
-            }
-
-            tempval = tempval / len;
-            temp[row * W] = tempval;
-
-            for (int col = 1; col <= radx; col++) {
-                tempval = (tempval * len + src[row * W + col + radx]) / (len + 1);
-                temp[row * W + col] = tempval;
-                len ++;
-            }
-
-            float reclen = 1.f / len;
-
-            for (int col = radx + 1; col < W - radx; col++) {
-                tempval = tempval + ((float)(src[row * W + col + radx] - src[row * W + col - radx - 1])) * reclen;
-                temp[row * W + col] = tempval;
-            }
-
-            for (int col = W - radx; col < W; col++) {
-                tempval = (tempval * len - src[row * W + col - radx - 1]) / (len - 1);
-                temp[row * W + col] = tempval;
-                len --;
-            }
-        }
-    }
-
-    if (rady == 0) {
-        for (int row = 0; row < H; row++)
-            for (int col = 0; col < W; col++) {
-                dst[row * W + col] = temp[row * W + col];
-            }
-    } else {
-        //vertical blur
-#ifdef __SSE2__
-        vfloat  leninitv = F2V( (float)(rady + 1));
-        vfloat  onev = F2V( 1.f );
-        vfloat  tempv, temp1v, lenv, lenp1v, lenm1v, rlenv;
-        int col;
-
-        for (col = 0; col < W - 7; col += 8) {
-            lenv = leninitv;
-            tempv = LVFU(temp[0 * W + col]);
-            temp1v = LVFU(temp[0 * W + col + 4]);
-
-            for (int i = 1; i <= rady; i++) {
-                tempv = tempv + LVFU(temp[i * W + col]);
-                temp1v = temp1v + LVFU(temp[i * W + col + 4]);
-            }
-
-            tempv = tempv / lenv;
-            temp1v = temp1v / lenv;
-            STVFU(dst[0 * W + col], tempv);
-            STVFU(dst[0 * W + col + 4], temp1v);
-
-            for (int row = 1; row <= rady; row++) {
-                lenp1v = lenv + onev;
-                tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v;
-                temp1v = (temp1v * lenv + LVFU(temp[(row + rady) * W + col + 4])) / lenp1v;
-                STVFU(dst[row * W + col], tempv);
-                STVFU(dst[row * W + col + 4], temp1v);
-                lenv = lenp1v;
-            }
-
-            rlenv = onev / lenv;
-
-            for (int row = rady + 1; row < H - rady; row++) {
-                tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ;
-                temp1v = temp1v + (LVFU(temp[(row + rady) * W + col + 4]) - LVFU(temp[(row - rady - 1) * W + col + 4])) * rlenv ;
-                STVFU(dst[row * W + col], tempv);
-                STVFU(dst[row * W + col + 4], temp1v);
-            }
-
-            for (int row = H - rady; row < H; row++) {
-                lenm1v = lenv - onev;
-                tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v;
-                temp1v = (temp1v * lenv - LVFU(temp[(row - rady - 1) * W + col + 4])) / lenm1v;
-                STVFU(dst[row * W + col], tempv);
-                STVFU(dst[row * W + col + 4], temp1v);
-                lenv = lenm1v;
-            }
-        }
-
-        for (; col < W - 3; col += 4) {
-            lenv = leninitv;
-            tempv = LVFU(temp[0 * W + col]);
-
-            for (int i = 1; i <= rady; i++) {
-                tempv = tempv + LVFU(temp[i * W + col]);
-            }
-
-            tempv = tempv / lenv;
-            STVFU(dst[0 * W + col], tempv);
-
-            for (int row = 1; row <= rady; row++) {
-                lenp1v = lenv + onev;
-                tempv = (tempv * lenv + LVFU(temp[(row + rady) * W + col])) / lenp1v;
-                STVFU(dst[row * W + col], tempv);
-                lenv = lenp1v;
-            }
-
-            rlenv = onev / lenv;
-
-            for (int row = rady + 1; row < H - rady; row++) {
-                tempv = tempv + (LVFU(temp[(row + rady) * W + col]) - LVFU(temp[(row - rady - 1) * W + col])) * rlenv ;
-                STVFU(dst[row * W + col], tempv);
-            }
-
-            for (int row = H - rady; row < H; row++) {
-                lenm1v = lenv - onev;
-                tempv = (tempv * lenv - LVFU(temp[(row - rady - 1) * W + col])) / lenm1v;
-                STVFU(dst[row * W + col], tempv);
-                lenv = lenm1v;
-            }
-        }
-
-        for (; col < W; col++) {
-            int len = rady + 1;
-            dst[0 * W + col] = temp[0 * W + col] / len;
-
-            for (int i = 1; i <= rady; i++) {
-                dst[0 * W + col] += temp[i * W + col] / len;
-            }
-
-            for (int row = 1; row <= rady; row++) {
-                dst[row * W + col] = (dst[(row - 1) * W + col] * len + temp[(row + rady) * W + col]) / (len + 1);
-                len ++;
-            }
-
-            for (int row = rady + 1; row < H - rady; row++) {
-                dst[row * W + col] = dst[(row - 1) * W + col] + (temp[(row + rady) * W + col] - temp[(row - rady - 1) * W + col]) / len;
-            }
-
-            for (int row = H - rady; row < H; row++) {
-                dst[row * W + col] = (dst[(row - 1) * W + col] * len - temp[(row - rady - 1) * W + col]) / (len - 1);
-                len --;
-            }
-        }
-
-#else
-
-        for (int col = 0; col < W; col++) {
-            int len = rady + 1;
-            dst[0 * W + col] = temp[0 * W + col] / len;
-
-            for (int i = 1; i <= rady; i++) {
-                dst[0 * W + col] += temp[i * W + col] / len;
-            }
-
-            for (int row = 1; row <= rady; row++) {
-                dst[row * W + col] = (dst[(row - 1) * W + col] * len + temp[(row + rady) * W + col]) / (len + 1);
-                len ++;
-            }
-
-            for (int row = rady + 1; row < H - rady; row++) {
-                dst[row * W + col] = dst[(row - 1) * W + col] + (temp[(row + rady) * W + col] - temp[(row - rady - 1) * W + col]) / len;
-            }
-
-            for (int row = H - rady; row < H; row++) {
-                dst[row * W + col] = (dst[(row - 1) * W + col] * len - temp[(row - rady - 1) * W + col]) / (len - 1);
-                len --;
-            }
-        }
-
-#endif
-    }
-
-}
-
-template<class T, class A> void boxabsblur (T* src, A* dst, int radx, int rady, int W, int H, float * temp)
-{
-
-    //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-    //box blur image; box range = (radx,rady) i.e. box size is (2*radx+1)x(2*rady+1)
-
-    if (radx == 0) {
-        for (int row = 0; row < H; row++)
-            for (int col = 0; col < W; col++) {
-                temp[row * W + col] = fabs(src[row * W + col]);
-            }
-    } else {
-        //horizontal blur
-        for (int row = 0; row < H; row++) {
-            int len = radx + 1;
-            float tempval = fabsf((float)src[row * W + 0]);
-
-            for (int j = 1; j <= radx; j++) {
-                tempval += fabsf((float)src[row * W + j]);
-            }
-
-            tempval /= len;
-            temp[row * W + 0] = tempval;
-
-            for (int col = 1; col <= radx; col++) {
-                tempval = (tempval * len + fabsf(src[row * W + col + radx])) / (len + 1);
-                temp[row * W + col] = tempval;
-                len ++;
-            }
-
-            float rlen = 1.f / (float)len;
-
-            for (int col = radx + 1; col < W - radx; col++) {
-                tempval = tempval + ((float)(fabsf(src[row * W + col + radx]) - fabsf(src[row * W + col - radx - 1]))) * rlen;
-                temp[row * W + col] = tempval;
-            }
-
-            for (int col = W - radx; col < W; col++) {
-                tempval = (tempval * len - fabsf(src[row * W + col - radx - 1])) / (len - 1);
-                temp[row * W + col] = tempval;
-                len --;
-            }
-        }
-    }
-
-    if (rady == 0) {
-        for (int row = 0; row < H; row++)
-            for (int col = 0; col < W; col++) {
-                dst[row * W + col] = temp[row * W + col];
-            }
-    } else {
-        //vertical blur
-#ifdef __SSE2__
-        vfloat  leninitv = F2V( (float)(rady + 1));
-        vfloat  onev = F2V( 1.f );
-        vfloat  tempv, lenv, lenp1v, lenm1v, rlenv;
-
-        for (int col = 0; col < W - 3; col += 4) {
-            lenv = leninitv;
-            tempv = LVF(temp[0 * W + col]);
-
-            for (int i = 1; i <= rady; i++) {
-                tempv = tempv + LVF(temp[i * W + col]);
-            }
-
-            tempv = tempv / lenv;
-            STVF(dst[0 * W + col], tempv);
-
-            for (int row = 1; row <= rady; row++) {
-                lenp1v = lenv + onev;
-                tempv = (tempv * lenv + LVF(temp[(row + rady) * W + col])) / lenp1v;
-                STVF(dst[row * W + col], tempv);
-                lenv = lenp1v;
-            }
-
-            rlenv = onev / lenv;
-
-            for (int row = rady + 1; row < H - rady; row++) {
-                tempv = tempv + (LVF(temp[(row + rady) * W + col]) - LVF(temp[(row - rady - 1) * W + col])) * rlenv;
-                STVF(dst[row * W + col], tempv);
-            }
-
-            for (int row = H - rady; row < H; row++) {
-                lenm1v = lenv - onev;
-                tempv = (tempv * lenv - LVF(temp[(row - rady - 1) * W + col])) / lenm1v;
-                STVF(dst[row * W + col], tempv);
-                lenv = lenm1v;
-            }
-        }
-
-        for (int col = W - (W % 4); col < W; col++) {
-            int len = rady + 1;
-            dst[0 * W + col] = temp[0 * W + col] / len;
-
-            for (int i = 1; i <= rady; i++) {
-                dst[0 * W + col] += temp[i * W + col] / len;
-            }
-
-            for (int row = 1; row <= rady; row++) {
-                dst[row * W + col] = (dst[(row - 1) * W + col] * len + temp[(row + rady) * W + col]) / (len + 1);
-                len ++;
-            }
-
-            for (int row = rady + 1; row < H - rady; row++) {
-                dst[row * W + col] = dst[(row - 1) * W + col] + (temp[(row + rady) * W + col] - temp[(row - rady - 1) * W + col]) / len;
-            }
-
-            for (int row = H - rady; row < H; row++) {
-                dst[row * W + col] = (dst[(row - 1) * W + col] * len - temp[(row - rady - 1) * W + col]) / (len - 1);
-                len --;
-            }
-        }
-
-#else
-
-        for (int col = 0; col < W; col++) {
-            int len = rady + 1;
-            dst[0 * W + col] = temp[0 * W + col] / len;
-
-            for (int i = 1; i <= rady; i++) {
-                dst[0 * W + col] += temp[i * W + col] / len;
-            }
-
-            for (int row = 1; row <= rady; row++) {
-                dst[row * W + col] = (dst[(row - 1) * W + col] * len + temp[(row + rady) * W + col]) / (len + 1);
-                len ++;
-            }
-
-            for (int row = rady + 1; row < H - rady; row++) {
-                dst[row * W + col] = dst[(row - 1) * W + col] + (temp[(row + rady) * W + col] - temp[(row - rady - 1) * W + col]) / len;
-            }
-
-            for (int row = H - rady; row < H; row++) {
-                dst[row * W + col] = (dst[(row - 1) * W + col] * len - temp[(row - rady - 1) * W + col]) / (len - 1);
-                len --;
-            }
-        }
-
-#endif
-    }
-
-}
-
-}
-#endif /* _BOXBLUR_H_ */
diff --git a/rtengine/gauss.cc b/rtengine/gauss.cc
index ca330f9b9..dad1c4954 100644
--- a/rtengine/gauss.cc
+++ b/rtengine/gauss.cc
@@ -1349,7 +1349,7 @@ template<class T> void gaussVerticalmult (T** src, T** dst, const int W, const i
 }
 #endif
 
-template<class T> void gaussianBlurImpl(T** src, T** dst, const int W, const int H, const double sigma, T *buffer = nullptr, eGaussType gausstype = GAUSS_STANDARD, T** buffer2 = nullptr)
+template<class T> void gaussianBlurImpl(T** src, T** dst, const int W, const int H, const double sigma, bool useBoxBlur, eGaussType gausstype = GAUSS_STANDARD, T** buffer2 = nullptr)
 {
     static constexpr auto GAUSS_SKIP = 0.25;
     static constexpr auto GAUSS_3X3_LIMIT = 0.6;
@@ -1357,7 +1357,7 @@ template<class T> void gaussianBlurImpl(T** src, T** dst, const int W, const int
     static constexpr auto GAUSS_7X7_LIMIT = 1.15;
     static constexpr auto GAUSS_DOUBLE = 25.0;
 
-    if(buffer) {
+    if (useBoxBlur) {
         // special variant for very large sigma, currently only used by retinex algorithm
         // use iterated boxblur to approximate gaussian blur
         // Compute ideal averaging filter width and number of iterations
@@ -1393,10 +1393,10 @@ template<class T> void gaussianBlurImpl(T** src, T** dst, const int W, const int
             sizes[i] = ((i < m ? wl : wu) - 1) / 2;
         }
 
-        rtengine::boxblur(src, dst, buffer, sizes[0], sizes[0], W, H);
+        rtengine::boxblur(src, dst, sizes[0], W, H, true);
 
         for(int i = 1; i < n; i++) {
-            rtengine::boxblur(dst, dst, buffer, sizes[i], sizes[i], W, H);
+            rtengine::boxblur(dst, dst, sizes[i], W, H, true);
         }
     } else {
         if (sigma < GAUSS_SKIP) {
@@ -1532,8 +1532,8 @@ template<class T> void gaussianBlurImpl(T** src, T** dst, const int W, const int
 }
 }
 
-void gaussianBlur(float** src, float** dst, const int W, const int H, const double sigma, float *buffer, eGaussType gausstype, float** buffer2)
+void gaussianBlur(float** src, float** dst, const int W, const int H, const double sigma, bool useBoxBlur, eGaussType gausstype, float** buffer2)
 {
-    gaussianBlurImpl<float>(src, dst, W, H, sigma, buffer, gausstype, buffer2);
+    gaussianBlurImpl<float>(src, dst, W, H, sigma, useBoxBlur, gausstype, buffer2);
 }
 
diff --git a/rtengine/gauss.h b/rtengine/gauss.h
index b63301d2b..f78762df3 100644
--- a/rtengine/gauss.h
+++ b/rtengine/gauss.h
@@ -21,6 +21,6 @@
 
 enum eGaussType {GAUSS_STANDARD, GAUSS_MULT, GAUSS_DIV};
 
-void gaussianBlur(float** src, float** dst, const int W, const int H, const double sigma, float *buffer = nullptr, eGaussType gausstype = GAUSS_STANDARD, float** buffer2 = nullptr);
+void gaussianBlur(float** src, float** dst, const int W, const int H, const double sigma, bool useBoxBlur = false, eGaussType gausstype = GAUSS_STANDARD, float** buffer2 = nullptr);
 
 #endif
\ No newline at end of file
diff --git a/rtengine/guidedfilter.cc b/rtengine/guidedfilter.cc
index 159e89504..feb108198 100644
--- a/rtengine/guidedfilter.cc
+++ b/rtengine/guidedfilter.cc
@@ -105,7 +105,7 @@ void guidedFilter(const array2D<float> &guide, const array2D<float> &src, array2
         [multithread](array2D<float> &d, array2D<float> &s, int rad) -> void
         {
             rad = LIM(rad, 0, (min(s.width(), s.height()) - 1) / 2 - 1);
-            boxblur(s, d, rad, s.width(), s.height(), multithread);
+            boxblur(static_cast<float**>(s), static_cast<float**>(d), rad, s.width(), s.height(), multithread);
         };
 
     const int W = src.width();
diff --git a/rtengine/improcfun.h b/rtengine/improcfun.h
index 84da1cacc..f88108422 100644
--- a/rtengine/improcfun.h
+++ b/rtengine/improcfun.h
@@ -198,20 +198,20 @@ public:
     void Median_Denoise(float **src, float **dst, float upperBound, int width, int height, Median medianType, int iterations, int numThreads, float **buffer = nullptr);
     void RGB_denoise(int kall, Imagefloat * src, Imagefloat * dst, Imagefloat * calclum, float * ch_M, float *max_r, float *max_b, bool isRAW, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, const NoiseCurve & noiseLCurve, const NoiseCurve & noiseCCurve, float &nresi, float &highresi);
     void RGB_denoise_infoGamCurve(const procparams::DirPyrDenoiseParams & dnparams, const bool isRAW, LUTf &gamcurve, float &gam, float &gamthresh, float &gamslope);
-    void RGB_denoise_info(Imagefloat * src, Imagefloat * provicalc, bool isRAW, LUTf &gamcurve, float gam, float gamthresh, float gamslope, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float & maxblueaut, float &minredaut, float & minblueaut, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel, float &skinc, float &nsknc, bool multiThread = false);
-    void RGBtile_denoise(float * fLblox, int hblproc, float noisevar_Ldetail, float * nbrwt, float * blurbuffer);     //for DCT
+    void RGB_denoise_info(Imagefloat * src, Imagefloat * provicalc, bool isRAW, const LUTf &gamcurve, float gam, float gamthresh, float gamslope, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float & maxblueaut, float &minredaut, float & minblueaut, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel, float &skinc, float &nsknc, bool multiThread = false);
+    void RGBtile_denoise(float * fLblox, int hblproc, float noisevar_Ldetail);     //for DCT
     void RGBoutput_tile_row(float *bloxrow_L, float ** Ldetail, float ** tilemask_out, int height, int width, int top);
-    bool WaveletDenoiseAllL(wavelet_decomposition &WaveletCoeffs_L, float *noisevarlum, float madL[8][3], float * vari, int edge);
-    bool WaveletDenoiseAllAB(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_ab, float *noisevarchrom, float madL[8][3], float noisevar_ab, const bool useNoiseCCurve, bool autoch, bool denoiseMethodRgb);
-    void WaveletDenoiseAll_info(int levwav, wavelet_decomposition &WaveletCoeffs_a,
-                                wavelet_decomposition &WaveletCoeffs_b, float **noisevarlum, float **noisevarchrom, float **noisevarhue, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &minredaut, float & minblueaut, int schoice, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel, float &skinc, float &nsknc,
+    bool WaveletDenoiseAllL(const wavelet_decomposition &WaveletCoeffs_L, float *noisevarlum, float madL[8][3], float * vari, int edge);
+    bool WaveletDenoiseAllAB(const wavelet_decomposition &WaveletCoeffs_L, const wavelet_decomposition &WaveletCoeffs_ab, float *noisevarchrom, float madL[8][3], float noisevar_ab, const bool useNoiseCCurve, bool autoch, bool denoiseMethodRgb);
+    void WaveletDenoiseAll_info(int levwav, const wavelet_decomposition &WaveletCoeffs_a,
+                                const wavelet_decomposition &WaveletCoeffs_b, float **noisevarlum, float **noisevarchrom, float **noisevarhue, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &minredaut, float & minblueaut, int schoice, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel, float &skinc, float &nsknc,
                                 float &maxchred, float &maxchblue, float &minchred, float &minchblue, int &nb, float &chau, float &chred, float &chblue, bool denoiseMethodRgb);
 
-    bool WaveletDenoiseAll_BiShrinkL(wavelet_decomposition &WaveletCoeffs_L, float *noisevarlum, float madL[8][3]);
-    bool WaveletDenoiseAll_BiShrinkAB(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_ab, float *noisevarchrom, float madL[8][3], float noisevar_ab,
+    bool WaveletDenoiseAll_BiShrinkL(const wavelet_decomposition &WaveletCoeffs_L, float *noisevarlum, float madL[8][3]);
+    bool WaveletDenoiseAll_BiShrinkAB(const wavelet_decomposition &WaveletCoeffs_L, const wavelet_decomposition &WaveletCoeffs_ab, float *noisevarchrom, float madL[8][3], float noisevar_ab,
                                       const bool useNoiseCCurve,  bool autoch, bool denoiseMethodRgb);
-    void ShrinkAllL(wavelet_decomposition &WaveletCoeffs_L, float **buffer, int level, int dir, float *noisevarlum, float * madL, float * vari, int edge);
-    void ShrinkAllAB(wavelet_decomposition &WaveletCoeffs_L, wavelet_decomposition &WaveletCoeffs_ab, float **buffer, int level, int dir,
+    void ShrinkAllL(const wavelet_decomposition &WaveletCoeffs_L, float **buffer, int level, int dir, float *noisevarlum, float * madL, float * vari, int edge);
+    void ShrinkAllAB(const wavelet_decomposition &WaveletCoeffs_L, const wavelet_decomposition &WaveletCoeffs_ab, float **buffer, int level, int dir,
                      float *noisevarchrom, float noisevar_ab, const bool useNoiseCCurve, bool autoch, bool denoiseMethodRgb, float * madL, float * madaab = nullptr, bool madCalculated = false);
     void ShrinkAll_info(float ** WavCoeffs_a, float ** WavCoeffs_b,
                         int W_ab, int H_ab, float **noisevarlum, float **noisevarchrom, float **noisevarhue, float &chaut, int &Nb, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &minredaut, float &minblueaut, int schoice, int lvl, float &chromina, float &sigma, float &lumema, float &sigma_L, float &redyel, float &skinc, float &nsknc,
diff --git a/rtengine/ipretinex.cc b/rtengine/ipretinex.cc
index 7117c9f2a..de6f4960a 100644
--- a/rtengine/ipretinex.cc
+++ b/rtengine/ipretinex.cc
@@ -43,6 +43,7 @@
 
 #include "gauss.h"
 #include "improcfun.h"
+#include "jaggedarray.h"
 #include "median.h"
 #include "opthelper.h"
 #include "procparams.h"
@@ -50,8 +51,6 @@
 #include "rtengine.h"
 #include "StopWatch.h"
 
-#define clipretinex( val, minv, maxv )    (( val = (val < minv ? minv : val ) ) > maxv ? maxv : val )
-
 namespace
 {
 void retinex_scales( float* scales, int nscales, int mode, int s, float high)
@@ -138,360 +137,422 @@ namespace rtengine
 
 extern const Settings* settings;
 
-void RawImageSource::MSR(float** luminance, float** originalLuminance, float **exLuminance,  LUTf & mapcurve, bool &mapcontlutili, int width, int height, const RetinexParams &deh, const RetinextransmissionCurve & dehatransmissionCurve, const RetinexgaintransmissionCurve & dehagaintransmissionCurve, float &minCD, float &maxCD, float &mini, float &maxi, float &Tmean, float &Tsigma, float &Tmin, float &Tmax)
+void RawImageSource::MSR(float** luminance, float** originalLuminance, float **exLuminance, const LUTf& mapcurve, bool mapcontlutili, int width, int height, const RetinexParams &deh, const RetinextransmissionCurve & dehatransmissionCurve, const RetinexgaintransmissionCurve & dehagaintransmissionCurve, float &minCD, float &maxCD, float &mini, float &maxi, float &Tmean, float &Tsigma, float &Tmin, float &Tmax)
 {
+BENCHFUN
+    if (!deh.enabled) {
+        return;
+    }
 
-    if (deh.enabled) {//enabled
-        float maxtr, mintr;
-        constexpr float eps = 2.f;
-        bool useHsl = deh.retinexcolorspace == "HSLLOG";
-        bool useHslLin = deh.retinexcolorspace == "HSLLIN";
-        float offse = (float) deh.offs; //def = 0  not use
-        int iter = deh.iter;
-        int gradient = deh.scal;
-        int scal = 3;//disabled scal
-        int nei = (int) (2.8f * deh.neigh); //def = 220
-        float vart = (float)deh.vart / 100.f;//variance
-        float gradvart = (float)deh.grad;
-        float gradstr = (float)deh.grads;
-        float strength = (float) deh.str / 100.f; // Blend with original L channel data
-        float limD = (float) deh.limd;
-        limD = pow(limD, 1.7f);//about 2500 enough
-        limD *= useHslLin ? 10.f : 1.f;
-        float ilimD = 1.f / limD;
-        float hig = ((float) deh.highl) / 100.f;
-        scal = deh.skal;
+    constexpr float eps = 2.f;
+    const bool useHsl = deh.retinexcolorspace == "HSLLOG";
+    const bool useHslLin = deh.retinexcolorspace == "HSLLIN";
+    const float offse = deh.offs; //def = 0  not use
+    const int iter = deh.iter;
+    const int gradient = deh.scal;
+    int scal = deh.skal;
+    const int nei = 2.8f * deh.neigh; //def = 220
+    const float vart = deh.vart / 100.f;//variance
+    const float gradvart = deh.grad;
+    const float gradstr = deh.grads;
+    const float strength = deh.str / 100.f; // Blend with original L channel data
+    float limD = deh.limd;
+    limD = pow(limD, 1.7f);//about 2500 enough
+    limD *= useHslLin ? 10.f : 1.f;
+    const float ilimD = 1.f / limD;
+    const float hig = deh.highl / 100.f;
 
-        int H_L = height;
-        int W_L = width;
+    const int H_L = height;
+    const int W_L = width;
 
-        float *tran[H_L] ALIGNED16;
-        float *tranBuffer = nullptr;
+    constexpr float elogt = 2.71828f;
+    bool lhutili = false;
 
-        constexpr float elogt = 2.71828f;
-        bool lhutili = false;
+    FlatCurve* shcurve = new FlatCurve(deh.lhcurve); //curve L=f(H)
 
-        FlatCurve* shcurve = new FlatCurve(deh.lhcurve); //curve L=f(H)
-
-        if (!shcurve || shcurve->isIdentity()) {
-            if (shcurve) {
-                delete shcurve;
-                shcurve = nullptr;
-            }
-        } else {
-            lhutili = true;
+    if (!shcurve || shcurve->isIdentity()) {
+        if (shcurve) {
+            delete shcurve;
+            shcurve = nullptr;
         }
+    } else {
+        lhutili = true;
+    }
 
 
-        bool higplus = false ;
-        int moderetinex = 2; // default to 2 ( deh.retinexMethod == "high" )
+    bool higplus = false ;
+    int moderetinex = 2; // default to 2 ( deh.retinexMethod == "high" )
 
-        if(deh.retinexMethod == "highliplus") {
-            higplus = true;
-            moderetinex = 3;
-        } else if (deh.retinexMethod == "uni") {
-            moderetinex = 0;
-        } else if (deh.retinexMethod == "low") {
-            moderetinex = 1;
-        } else { /*if (deh.retinexMethod == "highli") */
-            moderetinex = 3;
-        }
+    if(deh.retinexMethod == "highliplus") {
+        higplus = true;
+        moderetinex = 3;
+    } else if (deh.retinexMethod == "uni") {
+        moderetinex = 0;
+    } else if (deh.retinexMethod == "low") {
+        moderetinex = 1;
+    } else { /*if (deh.retinexMethod == "highli") */
+        moderetinex = 3;
+    }
 
-        constexpr float aahi = 49.f / 99.f; ////reduce sensibility 50%
-        constexpr float bbhi = 1.f - aahi;
+    constexpr float aahi = 49.f / 99.f; ////reduce sensibility 50%
+    constexpr float bbhi = 1.f - aahi;
+    float high = bbhi + aahi * (float) deh.highl;
 
-        for(int it = 1; it < iter + 1; it++) { //iter nb max of iterations
-            float high = bbhi + aahi * (float) deh.highl;
+    for (int it = 1; it < iter + 1; it++) { //iter nb max of iterations
+        float grad = 1.f;
+        float sc = scal;
 
-            float grad = 1.f;
-            float sc = scal;
-
-            if(gradient == 0) {
-                grad = 1.f;
-                sc = 3.f;
-            } else if(gradient == 1) {
-                grad = 0.25f * it + 0.75f;
-                sc = -0.5f * it + 4.5f;
-            } else if(gradient == 2) {
-                grad = 0.5f * it + 0.5f;
-                sc = -0.75f * it + 5.75f;
-            } else if(gradient == 3) {
-                grad = 0.666f * it + 0.333f;
-                sc = -0.75f * it + 5.75f;
-            } else if(gradient == 4) {
-                grad = 0.8f * it + 0.2f;
-                sc = -0.75f * it + 5.75f;
-            } else if(gradient == 5) {
-                if(moderetinex != 3) {
-                    grad = 2.5f * it - 1.5f;
-                } else {
-                    float aa = (11.f * high - 1.f) / 4.f;
-                    float bb = 1.f - aa;
-                    grad = aa * it + bb;
-                }
-
-                sc = -0.75f * it + 5.75f;
-            } else if(gradient == 6) {
-                if(moderetinex != 3) {
-                    grad = 5.f * it - 4.f;
-                } else {
-                    float aa = (21.f * high - 1.f) / 4.f;
-                    float bb = 1.f - aa;
-                    grad = aa * it + bb;
-                }
-
-                sc = -0.75f * it + 5.75f;
-            }
-
-            else if(gradient == -1) {
-                grad = -0.125f * it + 1.125f;
-                sc = 3.f;
-            }
-
-            if(iter == 1) {
-                sc = scal;
+        if (gradient == 0) {
+            grad = 1.f;
+            sc = 3.f;
+        } else if (gradient == 1) {
+            grad = 0.25f * it + 0.75f;
+            sc = -0.5f * it + 4.5f;
+        } else if (gradient == 2) {
+            grad = 0.5f * it + 0.5f;
+            sc = -0.75f * it + 5.75f;
+        } else if (gradient == 3) {
+            grad = 0.666f * it + 0.333f;
+            sc = -0.75f * it + 5.75f;
+        } else if (gradient == 4) {
+            grad = 0.8f * it + 0.2f;
+            sc = -0.75f * it + 5.75f;
+        } else if (gradient == 5) {
+            if (moderetinex != 3) {
+                grad = 2.5f * it - 1.5f;
             } else {
-                //adjust sc in function of choice of scale by user if iterations
-                if(scal < 3) {
-                    sc -= 1;
+                float aa = (11.f * high - 1.f) / 4.f;
+                float bb = 1.f - aa;
+                grad = aa * it + bb;
+            }
 
-                    if(sc < 1.f) {//avoid 0
-                        sc = 1.f;
-                    }
+            sc = -0.75f * it + 5.75f;
+        } else if (gradient == 6) {
+            if (moderetinex != 3) {
+                grad = 5.f * it - 4.f;
+            } else {
+                float aa = (21.f * high - 1.f) / 4.f;
+                float bb = 1.f - aa;
+                grad = aa * it + bb;
+            }
+
+            sc = -0.75f * it + 5.75f;
+        } else if (gradient == -1) {
+            grad = -0.125f * it + 1.125f;
+            sc = 3.f;
+        }
+
+        if (iter == 1) {
+            sc = scal;
+        } else {
+            //adjust sc in function of choice of scale by user if iterations
+            if (scal < 3) {
+                sc -= 1;
+                if (sc < 1.f) {//avoid 0
+                    sc = 1.f;
                 }
+            } else if (scal > 4) {
+                sc += 1;
+            }
+        }
 
-                if(scal > 4) {
-                    sc += 1;
+        float varx = vart;
+        float limdx = limD;
+        float ilimdx = ilimD;
+
+        if (gradvart != 0) {
+            if (gradvart == 1) {
+                varx = vart * (-0.125f * it + 1.125f);
+                limdx = limD * (-0.125f * it + 1.125f);
+                ilimdx = 1.f / limdx;
+            } else if (gradvart == 2) {
+                varx = vart * (-0.2f * it + 1.2f);
+                limdx = limD * (-0.2f * it + 1.2f);
+                ilimdx = 1.f / limdx;
+            } else if (gradvart == -1) {
+                varx = vart * (0.125f * it + 0.875f);
+                limdx = limD * (0.125f * it + 0.875f);
+                ilimdx = 1.f / limdx;
+            } else if (gradvart == -2) {
+                varx = vart * (0.4f * it + 0.6f);
+                limdx = limD * (0.4f * it + 0.6f);
+                ilimdx = 1.f / limdx;
+            }
+        }
+
+        scal = round(sc);
+        float ks = 1.f;
+
+        if (gradstr != 0) {
+            if (gradstr == 1) {
+                if (it <= 3) {
+                    ks = -0.3f * it + 1.6f;
+                } else {
+                    ks = 0.5f;
+                }
+            } else if (gradstr == 2) {
+                if (it <= 3) {
+                    ks = -0.6f * it + 2.2f;
+                } else {
+                    ks = 0.3f;
+                }
+            } else if (gradstr == -1) {
+                if (it <= 3) {
+                    ks = 0.2f * it + 0.6f;
+                } else {
+                    ks = 1.2f;
+                }
+            } else if (gradstr == -2) {
+                if (it <= 3) {
+                    ks = 0.4f * it + 0.2f;
+                } else {
+                    ks = 1.5f;
                 }
             }
+        }
 
-            float varx = vart;
-            float limdx = limD;
-            float ilimdx = ilimD;
+        const float strengthx = ks * strength;
 
-            if(gradvart != 0) {
-                if(gradvart == 1) {
-                    varx = vart * (-0.125f * it + 1.125f);
-                    limdx = limD * (-0.125f * it + 1.125f);
-                    ilimdx = 1.f / limdx;
-                } else if(gradvart == 2) {
-                    varx = vart * (-0.2f * it + 1.2f);
-                    limdx = limD * (-0.2f * it + 1.2f);
-                    ilimdx = 1.f / limdx;
-                } else if(gradvart == -1) {
-                    varx = vart * (0.125f * it + 0.875f);
-                    limdx = limD * (0.125f * it + 0.875f);
-                    ilimdx = 1.f / limdx;
-                } else if(gradvart == -2) {
-                    varx = vart * (0.4f * it + 0.6f);
-                    limdx = limD * (0.4f * it + 0.6f);
-                    ilimdx = 1.f / limdx;
-                }
-            }
+        constexpr auto maxRetinexScales = 8;
+        float RetinexScales[maxRetinexScales];
 
-            scal = round(sc);
-            float ks = 1.f;
+        retinex_scales(RetinexScales, scal, moderetinex, nei / grad, high);
 
-            if(gradstr != 0) {
-                if(gradstr == 1) {
-                    if(it <= 3) {
-                        ks = -0.3f * it + 1.6f;
-                    } else {
-                        ks = 0.5f;
-                    }
-                } else if(gradstr == 2) {
-                    if(it <= 3) {
-                        ks = -0.6f * it + 2.2f;
-                    } else {
-                        ks = 0.3f;
-                    }
-                } else if(gradstr == -1) {
-                    if(it <= 3) {
-                        ks = 0.2f * it + 0.6f;
-                    } else {
-                        ks = 1.2f;
-                    }
-                } else if(gradstr == -2) {
-                    if(it <= 3) {
-                        ks = 0.4f * it + 0.2f;
-                    } else {
-                        ks = 1.5f;
-                    }
-                }
-            }
+        const int shHighlights = deh.highlights;
+        const int shShadows = deh.shadows;
 
-            float strengthx = ks * strength;
+        int mapmet = 0;
 
-            constexpr auto maxRetinexScales = 8;
-            float RetinexScales[maxRetinexScales];
+        if(deh.mapMethod == "map") {
+            mapmet = 2;
+        } else if(deh.mapMethod == "mapT") {
+            mapmet = 3;
+        } else if(deh.mapMethod == "gaus") {
+            mapmet = 4;
+        }
 
-            retinex_scales( RetinexScales, scal, moderetinex, nei / grad, high );
+        const double shradius = mapmet == 4 ? (double) deh.radius : 40.;
 
-            float *src[H_L] ALIGNED16;
-            float *srcBuffer = new float[H_L * W_L];
+        int viewmet = 0;
 
-            for (int i = 0; i < H_L; i++) {
-                src[i] = &srcBuffer[i * W_L];
-            }
+        if(deh.viewMethod == "mask") {
+            viewmet = 1;
+        } else if(deh.viewMethod == "tran") {
+            viewmet = 2;
+        } else if(deh.viewMethod == "tran2") {
+            viewmet = 3;
+        } else if(deh.viewMethod == "unsharp") {
+            viewmet = 4;
+        }
 
-            int h_th = 0, s_th = 0;
-
-            int shHighlights = deh.highlights;
-            int shShadows = deh.shadows;
-
-            int mapmet = 0;
-
-            if(deh.mapMethod == "map") {
-                mapmet = 2;
-            } else if(deh.mapMethod == "mapT") {
-                mapmet = 3;
-            } else if(deh.mapMethod == "gaus") {
-                mapmet = 4;
-            }
-
-            const double shradius = mapmet == 4 ? (double) deh.radius : 40.;
-
-            int viewmet = 0;
-
-            if(deh.viewMethod == "mask") {
-                viewmet = 1;
-            } else if(deh.viewMethod == "tran") {
-                viewmet = 2;
-            } else if(deh.viewMethod == "tran2") {
-                viewmet = 3;
-            } else if(deh.viewMethod == "unsharp") {
-                viewmet = 4;
-            }
+        std::unique_ptr<JaggedArray<float>> srcBuffer(new JaggedArray<float>(W_L, H_L));
+        float** src = *(srcBuffer.get());
 
 #ifdef _OPENMP
-            #pragma omp parallel for
+        #pragma omp parallel for
 #endif
 
-            for (int i = 0; i < H_L; i++)
-                for (int j = 0; j < W_L; j++) {
-                    src[i][j] = luminance[i][j] + eps;
-                    luminance[i][j] = 0.f;
-                }
-
-            float *out[H_L] ALIGNED16;
-            float *outBuffer = new float[H_L * W_L];
-
-            for (int i = 0; i < H_L; i++) {
-                out[i] = &outBuffer[i * W_L];
+        for (int i = 0; i < H_L; i++)
+            for (int j = 0; j < W_L; j++) {
+                src[i][j] = luminance[i][j] + eps;
+                luminance[i][j] = 0.f;
             }
 
-            if(viewmet == 3  || viewmet == 2) {
-                tranBuffer = new float[H_L * W_L];
+        JaggedArray<float> out(W_L, H_L);
+        JaggedArray<float>& tran = out; // tran and out can safely use the same buffer
 
-                for (int i = 0; i < H_L; i++) {
-                    tran[i] = &tranBuffer[i * W_L];
-                }
-            }
+        const float logBetaGain = xlogf(16384.f);
+        float pond = logBetaGain / (float) scal;
 
-            const float logBetaGain = xlogf(16384.f);
-            float pond = logBetaGain / (float) scal;
+        if(!useHslLin) {
+            pond /= log(elogt);
+        }
 
-            if(!useHslLin) {
-                pond /= log(elogt);
-            }
+        std::unique_ptr<SHMap> shmap;
+        if (((mapmet == 2 || mapmet == 3 || mapmet == 4) && it == 1)) {
+            shmap.reset(new SHMap(W_L, H_L));
+        }
 
-            auto shmap = ((mapmet == 2 || mapmet == 3 || mapmet == 4) && it == 1) ? new SHMap (W_L, H_L) : nullptr;
+        std::unique_ptr<float[]> buffer;
+        if (mapmet > 0) {
+            buffer.reset(new float[W_L * H_L]);
+        }
 
-            float *buffer = new float[W_L * H_L];;
-
-            for ( int scale = scal - 1; scale >= 0; scale-- ) {
-#ifdef _OPENMP
-                #pragma omp parallel
-#endif
-                {
-                    if(scale == scal - 1)
-                    {
-                        gaussianBlur (src, out, W_L, H_L, RetinexScales[scale], buffer);
-                    } else { // reuse result of last iteration
-                        // out was modified in last iteration => restore it
-                        if((((mapmet == 2 && scale > 1) || mapmet == 3 || mapmet == 4) || (mapmet > 0 && mapcontlutili)) && it == 1)
-                        {
-#ifdef _OPENMP
-                            #pragma omp for
-#endif
-
-                            for (int i = 0; i < H_L; i++) {
-                                for (int j = 0; j < W_L; j++) {
-                                    out[i][j] = buffer[i * W_L + j];
-                                }
-                            }
-                        }
-
-                        gaussianBlur (out, out, W_L, H_L, sqrtf(SQR(RetinexScales[scale]) - SQR(RetinexScales[scale + 1])), buffer);
-                    }
-                    if((((mapmet == 2 && scale > 2) || mapmet == 3 || mapmet == 4) || (mapmet > 0 && mapcontlutili)) && it == 1 && scale > 0)
-                    {
-                        // out will be modified => store it for use in next iteration. We even don't need a new buffer because 'buffer' is free after gaussianBlur :)
-#ifdef _OPENMP
-                        #pragma omp for
-#endif
-
-                        for (int i = 0; i < H_L; i++) {
-                            for (int j = 0; j < W_L; j++) {
-                                buffer[i * W_L + j] = out[i][j];
-                            }
-                        }
-                    }
-                }
-
-                if(((mapmet == 2 && scale > 2) || mapmet == 3 || mapmet == 4) && it == 1) {
-                    shmap->updateL (out, shradius, true, 1);
-                    h_th = shmap->max_f - deh.htonalwidth * (shmap->max_f - shmap->avg) / 100;
-                    s_th = deh.stonalwidth * (shmap->avg - shmap->min_f) / 100;
-                }
-
-#ifdef __SSE2__
-                vfloat pondv = F2V(pond);
-                vfloat limMinv = F2V(ilimdx);
-                vfloat limMaxv = F2V(limdx);
-
-#endif
-
-                if(mapmet > 0 && mapcontlutili && it == 1) {
-                    // TODO: When rgbcurvespeedup branch is merged into master we can simplify the code by
-                    // 1) in rawimagesource.retinexPrepareCurves() insert
-                    //    mapcurve *= 0.5f;
-                    //    after
-                    //    CurveFactory::mapcurve (mapcontlutili, retinexParams.mapcurve, mapcurve, 1, lhist16RETI, histLRETI);
-                    // 2) remove the division by 2.f from the code 7 lines below this line
+        for (int scale = scal - 1; scale >= 0; --scale) {
+            if (scale == scal - 1) {
+                gaussianBlur(src, out, W_L, H_L, RetinexScales[scale], true);
+            } else { // reuse result of last iteration
+                // out was modified in last iteration => restore it
+                if((((mapmet == 2 && scale > 1) || mapmet == 3 || mapmet == 4) || (mapmet > 0 && mapcontlutili)) && it == 1) {
 #ifdef _OPENMP
                     #pragma omp parallel for
 #endif
 
                     for (int i = 0; i < H_L; i++) {
                         for (int j = 0; j < W_L; j++) {
-                            out[i][j] = mapcurve[2.f * out[i][j]] / 2.f;
+                            out[i][j] = buffer[i * W_L + j];
                         }
                     }
-
                 }
 
-                if(((mapmet == 2 && scale > 2) || mapmet == 3 || mapmet == 4) && it == 1) {
-                    float hWeight = (100.f - shHighlights) / 100.f;
-                    float sWeight = (100.f - shShadows) / 100.f;
+                gaussianBlur(out, out, W_L, H_L, sqrtf(SQR(RetinexScales[scale]) - SQR(RetinexScales[scale + 1])), true);
+            }
+
+            if ((((mapmet == 2 && scale > 2) || mapmet == 3 || mapmet == 4) || (mapmet > 0 && mapcontlutili)) && it == 1 && scale > 0) {
+                // out will be modified => store it for use in next iteration.
 #ifdef _OPENMP
-                    #pragma omp parallel for schedule(dynamic,16)
+                #pragma omp parallel for
 #endif
 
-                    for (int i = 0; i < H_L; i++) {
-                        for (int j = 0; j < W_L; j++) {
-                            float mapval = 1.f + shmap->map[i][j];
-                            float factor = 1.f;
+                for (int i = 0; i < H_L; i++) {
+                    for (int j = 0; j < W_L; j++) {
+                        buffer[i * W_L + j] = out[i][j];
+                    }
+                }
+            }
+            int h_th = 0;
+            int s_th = 0;
+            if (((mapmet == 2 && scale > 2) || mapmet == 3 || mapmet == 4) && it == 1) {
+                shmap->updateL(out, shradius, true, 1);
+                h_th = shmap->max_f - deh.htonalwidth * (shmap->max_f - shmap->avg) / 100;
+                s_th = deh.stonalwidth * (shmap->avg - shmap->min_f) / 100;
+            }
 
-                            if (mapval > h_th) {
-                                factor = (h_th + hWeight * (mapval - h_th)) / mapval;
-                            } else if (mapval < s_th) {
-                                factor = (s_th - sWeight * (s_th - mapval)) / mapval;
-                            }
+            if (mapmet > 0 && mapcontlutili && it == 1) {
+#ifdef _OPENMP
+                #pragma omp parallel for
+#endif
 
-                            out[i][j] *= factor;
+                for (int i = 0; i < H_L; i++) {
+                    for (int j = 0; j < W_L; j++) {
+                        out[i][j] = mapcurve[2.f * out[i][j]];
+                    }
+                }
 
+            }
+
+            if (((mapmet == 2 && scale > 2) || mapmet == 3 || mapmet == 4) && it == 1) {
+                const float hWeight = (100.f - shHighlights) / 100.f;
+                const float sWeight = (100.f - shShadows) / 100.f;
+#ifdef _OPENMP
+                #pragma omp parallel for schedule(dynamic,16)
+#endif
+
+                for (int i = 0; i < H_L; i++) {
+                    for (int j = 0; j < W_L; j++) {
+                        const float mapval = 1.f + shmap->map[i][j];
+                        float factor;
+
+                        if (mapval > h_th) {
+                            factor = (h_th + hWeight * (mapval - h_th)) / mapval;
+                        } else if (mapval < s_th) {
+                            factor = (s_th - sWeight * (s_th - mapval)) / mapval;
+                        } else {
+                            factor = 1.f;
                         }
+
+                        out[i][j] *= factor;
+                    }
+                }
+            }
+
+#ifdef _OPENMP
+            #pragma omp parallel for
+#endif
+
+            for (int i = 0; i < H_L; i++) {
+                int j = 0;
+
+#ifdef __SSE2__
+                const vfloat pondv = F2V(pond);
+                const vfloat limMinv = F2V(ilimdx);
+                const vfloat limMaxv = F2V(limdx);
+                if( useHslLin) {
+                    for (; j < W_L - 3; j += 4) {
+                        STVFU(luminance[i][j], LVFU(luminance[i][j]) + pondv * vclampf(LVFU(src[i][j]) / LVFU(out[i][j]), limMinv, limMaxv));
+                    }
+                } else {
+                    for (; j < W_L - 3; j += 4) {
+                        STVFU(luminance[i][j], LVFU(luminance[i][j]) + pondv * xlogf(vclampf(LVFU(src[i][j]) / LVFU(out[i][j]), limMinv, limMaxv)));
+                    }
+                }
+
+#endif
+
+                if(useHslLin) {
+                    for (; j < W_L; j++) {
+                        luminance[i][j] +=  pond * LIM(src[i][j] / out[i][j], ilimdx, limdx);
+                    }
+                } else {
+                    for (; j < W_L; j++) {
+                        luminance[i][j] +=  pond * xlogf(LIM(src[i][j] / out[i][j], ilimdx, limdx)); //  /logt ?
+                    }
+                }
+            }
+        }
+
+        srcBuffer.reset();
+
+        float mean = 0.f;
+        float stddv = 0.f;
+        // I call mean_stddv2 instead of mean_stddv ==> logBetaGain
+
+        float maxtr, mintr;
+        mean_stddv2(luminance, mean, stddv, W_L, H_L, maxtr, mintr);
+        //printf("mean=%f std=%f delta=%f maxtr=%f mintr=%f\n", mean, stddv, delta, maxtr, mintr);
+
+        //mean_stddv( luminance, mean, stddv, W_L, H_L, logBetaGain, maxtr, mintr);
+        if (dehatransmissionCurve && mean != 0.f && stddv != 0.f) { //if curve
+            float asig = 0.166666f / stddv;
+            float bsig = 0.5f - asig * mean;
+            float amax = 0.333333f / (maxtr - mean - stddv);
+            float bmax = 1.f - amax * maxtr;
+            float amin = 0.333333f / (mean - stddv - mintr);
+            float bmin = -amin * mintr;
+
+            asig *= 500.f;
+            bsig *= 500.f;
+            amax *= 500.f;
+            bmax *= 500.f;
+            amin *= 500.f;
+            bmin *= 500.f;
+
+#ifdef _OPENMP
+            #pragma omp parallel for schedule(dynamic,16)
+#endif
+
+            for (int i = 0; i < H_L; i++ ) {
+                for (int j = 0; j < W_L; j++) { //for mintr to maxtr evalate absciss in function of original transmission
+                    float absciss;
+                    if (LIKELY(fabsf(luminance[i][j] - mean) < stddv)) {
+                        absciss = asig * luminance[i][j] + bsig;
+                    } else if (luminance[i][j] >= mean) {
+                        absciss = amax * luminance[i][j] + bmax;
+                    } else { /*if(luminance[i][j] <= mean - stddv)*/
+                        absciss = amin * luminance[i][j] + bmin;
+                    }
+
+                    //TODO : move multiplication by 4.f and subtraction of 1.f inside the curve
+                    luminance[i][j] *= (-1.f + 4.f * dehatransmissionCurve[absciss]); //new transmission
+
+                    if(viewmet == 3 || viewmet == 2) {
+                        tran[i][j] = luminance[i][j];
+                    }
+                }
+            }
+
+            // median filter on transmission  ==> reduce artifacts
+            if (deh.medianmap && it == 1) { //only one time
+                JaggedArray<float> tmL(W_L, H_L);
+                constexpr int borderL = 1;
+
+#ifdef _OPENMP
+                #pragma omp parallel for
+#endif
+
+                for (int i = borderL; i < H_L - borderL; i++) {
+                    for (int j = borderL; j < W_L - borderL; j++) {
+                        tmL[i][j] = median(luminance[i][j], luminance[i - 1][j], luminance[i + 1][j], luminance[i][j + 1], luminance[i][j - 1], luminance[i - 1][j - 1], luminance[i - 1][j + 1], luminance[i + 1][j - 1], luminance[i + 1][j + 1]); //3x3
                     }
                 }
 
@@ -499,307 +560,164 @@ void RawImageSource::MSR(float** luminance, float** originalLuminance, float **e
                 #pragma omp parallel for
 #endif
 
-                for (int i = 0; i < H_L; i++) {
-                    int j = 0;
-
-#ifdef __SSE2__
-
-                    if(useHslLin) {
-                        for (; j < W_L - 3; j += 4) {
-                            _mm_storeu_ps(&luminance[i][j], LVFU(luminance[i][j]) + pondv *  (vclampf(LVFU(src[i][j]) / LVFU(out[i][j]), limMinv, limMaxv) ));
-                        }
-                    } else {
-                        for (; j < W_L - 3; j += 4) {
-                            _mm_storeu_ps(&luminance[i][j], LVFU(luminance[i][j]) + pondv *  xlogf(vclampf(LVFU(src[i][j]) / LVFU(out[i][j]), limMinv, limMaxv) ));
-                        }
-                    }
-
-#endif
-
-                    if(useHslLin) {
-                        for (; j < W_L; j++) {
-                            luminance[i][j] +=  pond * (LIM(src[i][j] / out[i][j], ilimdx, limdx));
-                        }
-                    } else {
-                        for (; j < W_L; j++) {
-                            luminance[i][j] +=  pond * xlogf(LIM(src[i][j] / out[i][j], ilimdx, limdx)); //  /logt ?
-                        }
+                for (int i = borderL; i < H_L - borderL; i++ ) {
+                    for (int j = borderL; j < W_L - borderL; j++) {
+                        luminance[i][j] = tmL[i][j];
                     }
                 }
             }
 
-            if(mapmet > 1) {
-                if(shmap) {
-                    delete shmap;
-                }
-            }
-
-            shmap = nullptr;
-
-            delete [] buffer;
-            delete [] srcBuffer;
-
-            float mean = 0.f;
-            float stddv = 0.f;
             // I call mean_stddv2 instead of mean_stddv ==> logBetaGain
+            //mean_stddv( luminance, mean, stddv, W_L, H_L, 1.f, maxtr, mintr);
+            mean_stddv2(luminance, mean, stddv, W_L, H_L, maxtr, mintr);
+        }
 
-            mean_stddv2( luminance, mean, stddv, W_L, H_L, maxtr, mintr);
-            //printf("mean=%f std=%f delta=%f maxtr=%f mintr=%f\n", mean, stddv, delta, maxtr, mintr);
+        constexpr float epsil = 0.1f;
 
-            //mean_stddv( luminance, mean, stddv, W_L, H_L, logBetaGain, maxtr, mintr);
-            if (dehatransmissionCurve && mean != 0.f && stddv != 0.f) { //if curve
-                float asig = 0.166666f / stddv;
-                float bsig = 0.5f - asig * mean;
-                float amax = 0.333333f / (maxtr - mean - stddv);
-                float bmax = 1.f - amax * maxtr;
-                float amin = 0.333333f / (mean - stddv - mintr);
-                float bmin = -amin * mintr;
+        mini = mean - varx * stddv;
 
-                asig *= 500.f;
-                bsig *= 500.f;
-                amax *= 500.f;
-                bmax *= 500.f;
-                amin *= 500.f;
-                bmin *= 500.f;
+        if (mini < mintr) {
+            mini = mintr + epsil;
+        }
 
-#ifdef _OPENMP
-                #pragma omp parallel
-#endif
-                {
-                    float absciss;
-#ifdef _OPENMP
-                    #pragma omp for schedule(dynamic,16)
-#endif
+        maxi = mean + varx * stddv;
 
-                    for (int i = 0; i < H_L; i++ )
-                        for (int j = 0; j < W_L; j++) { //for mintr to maxtr evalate absciss in function of original transmission
-                            if (LIKELY(fabsf(luminance[i][j] - mean) < stddv)) {
-                                absciss = asig * luminance[i][j] + bsig;
-                            } else if (luminance[i][j] >= mean) {
-                                absciss = amax * luminance[i][j] + bmax;
-                            } else { /*if(luminance[i][j] <= mean - stddv)*/
-                                absciss = amin * luminance[i][j] + bmin;
-                            }
+        if (maxi > maxtr) {
+            maxi = maxtr - epsil;
+        }
 
-                            //TODO : move multiplication by 4.f and subtraction of 1.f inside the curve
-                            luminance[i][j] *= (-1.f + 4.f * dehatransmissionCurve[absciss]); //new transmission
+        float delta = maxi - mini;
+        //printf("maxi=%f mini=%f mean=%f std=%f delta=%f maxtr=%f mintr=%f\n", maxi, mini, mean, stddv, delta, maxtr, mintr);
 
-                            if(viewmet == 3 || viewmet == 2) {
-                                tran[i][j] = luminance[i][j];
-                            }
-                        }
-                }
+        if ( !delta ) {
+            delta = 1.0f;
+        }
 
-                // median filter on transmission  ==> reduce artifacts
-                if (deh.medianmap && it == 1) { //only one time
-                    int wid = W_L;
-                    int hei = H_L;
-                    float *tmL[hei] ALIGNED16;
-                    float *tmLBuffer = new float[wid * hei];
-                    int borderL = 1;
-
-                    for (int i = 0; i < hei; i++) {
-                        tmL[i] = &tmLBuffer[i * wid];
-                    }
-
-#ifdef _OPENMP
-                    #pragma omp parallel for
-#endif
-
-                    for (int i = borderL; i < hei - borderL; i++) {
-                        for (int j = borderL; j < wid - borderL; j++) {
-                            tmL[i][j] = median(luminance[i][j], luminance[i - 1][j], luminance[i + 1][j], luminance[i][j + 1], luminance[i][j - 1], luminance[i - 1][j - 1], luminance[i - 1][j + 1], luminance[i + 1][j - 1], luminance[i + 1][j + 1]); //3x3
-                        }
-                    }
-
-#ifdef _OPENMP
-                    #pragma omp parallel for
-#endif
-
-                    for (int i = borderL; i < hei - borderL; i++ ) {
-                        for (int j = borderL; j < wid - borderL; j++) {
-                            luminance[i][j] = tmL[i][j];
-                        }
-                    }
-
-                    delete [] tmLBuffer;
-
-                }
-
-                // I call mean_stddv2 instead of mean_stddv ==> logBetaGain
-                //mean_stddv( luminance, mean, stddv, W_L, H_L, 1.f, maxtr, mintr);
-                mean_stddv2( luminance, mean, stddv, W_L, H_L, maxtr, mintr);
-
-            }
-
-            float epsil = 0.1f;
-
-            mini = mean - varx * stddv;
-
-            if (mini < mintr) {
-                mini = mintr + epsil;
-            }
-
-            maxi = mean + varx * stddv;
-
-            if (maxi > maxtr) {
-                maxi = maxtr - epsil;
-            }
-
-            float delta = maxi - mini;
-            //printf("maxi=%f mini=%f mean=%f std=%f delta=%f maxtr=%f mintr=%f\n", maxi, mini, mean, stddv, delta, maxtr, mintr);
-
-            if ( !delta ) {
-                delta = 1.0f;
-            }
-
-            float cdfactor = 32768.f / delta;
-            maxCD = -9999999.f;
-            minCD = 9999999.f;
-            // coeff for auto    "transmission" with 2 sigma #95% data
-            float aza = 16300.f / (2.f * stddv);
-            float azb = -aza * (mean - 2.f * stddv);
-            float bza = 16300.f / (2.f * stddv);
-            float bzb = 16300.f - bza * (mean);
+        // coeff for auto    "transmission" with 2 sigma #95% data
+        const float aza = 16300.f / (2.f * stddv);
+        const float azb = -aza * (mean - 2.f * stddv);
+        const float bza = 16300.f / (2.f * stddv);
+        const float bzb = 16300.f - bza * (mean);
 
 //prepare work for curve gain
 #ifdef _OPENMP
-            #pragma omp parallel for
+        #pragma omp parallel for
 #endif
 
-            for (int i = 0; i < H_L; i++) {
-                for (int j = 0; j < W_L; j++) {
-                    luminance[i][j] = luminance[i][j] - mini;
-                }
+        for (int i = 0; i < H_L; i++) {
+            for (int j = 0; j < W_L; j++) {
+                luminance[i][j] = luminance[i][j] - mini;
             }
+        }
 
-            mean = 0.f;
-            stddv = 0.f;
-            // I call mean_stddv2 instead of mean_stddv ==> logBetaGain
+        mean = 0.f;
+        stddv = 0.f;
+        // I call mean_stddv2 instead of mean_stddv ==> logBetaGain
 
-            mean_stddv2( luminance, mean, stddv, W_L, H_L, maxtr, mintr);
-            float asig = 0.f, bsig = 0.f, amax = 0.f, bmax = 0.f, amin = 0.f, bmin = 0.f;
+        mean_stddv2(luminance, mean, stddv, W_L, H_L, maxtr, mintr);
+        float asig = 0.f, bsig = 0.f, amax = 0.f, bmax = 0.f, amin = 0.f, bmin = 0.f;
 
-            if (dehagaintransmissionCurve && mean != 0.f && stddv != 0.f) { //if curve
-                asig = 0.166666f / stddv;
-                bsig = 0.5f - asig * mean;
-                amax = 0.333333f / (maxtr - mean - stddv);
-                bmax = 1.f - amax * maxtr;
-                amin = 0.333333f / (mean - stddv - mintr);
-                bmin = -amin * mintr;
+        if (dehagaintransmissionCurve && mean != 0.f && stddv != 0.f) { //if curve
+            asig = 0.166666f / stddv;
+            bsig = 0.5f - asig * mean;
+            amax = 0.333333f / (maxtr - mean - stddv);
+            bmax = 1.f - amax * maxtr;
+            amin = 0.333333f / (mean - stddv - mintr);
+            bmin = -amin * mintr;
 
-                asig *= 500.f;
-                bsig *= 500.f;
-                amax *= 500.f;
-                bmax *= 500.f;
-                amin *= 500.f;
-                bmin *= 500.f;
-            }
+            asig *= 500.f;
+            bsig *= 500.f;
+            amax *= 500.f;
+            bmax *= 500.f;
+            amin *= 500.f;
+            bmin *= 500.f;
+        }
+
+        const float cdfactor = 32768.f / delta;
+        maxCD = -9999999.f;
+        minCD = 9999999.f;
 
 #ifdef _OPENMP
-            #pragma omp parallel
-#endif
-            {
-                float cdmax = -999999.f, cdmin = 999999.f;
-#ifdef _OPENMP
-                #pragma omp for schedule(dynamic,16) nowait
+        #pragma omp parallel for reduction(max:maxCD) reduction(min:minCD) schedule(dynamic, 16)
 #endif
 
-                for ( int i = 0; i < H_L; i ++ )
-                    for (int j = 0; j < W_L; j++) {
-                        float gan;
+        for ( int i = 0; i < H_L; i ++ ) {
+            for (int j = 0; j < W_L; j++) {
+                float gan;
 
-                        if (dehagaintransmissionCurve && mean != 0.f && stddv != 0.f) {
-                            float absciss;
-
-                            if (LIKELY(fabsf(luminance[i][j] - mean) < stddv)) {
-                                absciss = asig * luminance[i][j] + bsig;
-                            } else if (luminance[i][j] >= mean) {
-                                absciss = amax * luminance[i][j] + bmax;
-                            } else { /*if(luminance[i][j] <= mean - stddv)*/
-                                absciss = amin * luminance[i][j] + bmin;
-                            }
-
-
-                            //    float cd = cdfactor * ( luminance[i][j]  - mini ) + offse;
-                            // TODO : move multiplication by 2.f inside the curve
-                            gan = 2.f * (dehagaintransmissionCurve[absciss]); //new gain function transmission
-                        } else {
-                            gan = 0.5f;
-                        }
-
-                        float cd = gan * cdfactor * ( luminance[i][j] ) + offse;
-
-                        cdmax = cd > cdmax ? cd : cdmax;
-                        cdmin = cd < cdmin ? cd : cdmin;
-
-                        float str = strengthx;
-
-                        if(lhutili && it == 1) { // S=f(H)
-                            {
-                                float HH = exLuminance[i][j];
-                                float valparam;
-
-                                if(useHsl || useHslLin) {
-                                    valparam = float((shcurve->getVal(HH) - 0.5f));
-                                } else {
-                                    valparam = float((shcurve->getVal(Color::huelab_to_huehsv2(HH)) - 0.5f));
-                                }
-
-                                str *= (1.f + 2.f * valparam);
-                            }
-                        }
-
-                        if(higplus && exLuminance[i][j] > 65535.f * hig) {
-                            str *= hig;
-                        }
-
-                        if(viewmet == 0) {
-                            luminance[i][j] = intp(str, clipretinex( cd, 0.f, 32768.f ), originalLuminance[i][j]);
-                        } else if(viewmet == 1) {
-                            luminance[i][j] = out[i][j];
-                        } else if(viewmet == 4) {
-                            luminance[i][j] = originalLuminance[i][j] + str * (originalLuminance[i][j] - out[i][j]);//unsharp
-                        } else if(viewmet == 2) {
-                            if(tran[i][j] <= mean) {
-                                luminance[i][j] = azb + aza * tran[i][j];    //auto values
-                            } else {
-                                luminance[i][j] = bzb + bza * tran[i][j];
-                            }
-                        } else { /*if(viewmet == 3) */
-                            luminance[i][j] = 1000.f + tran[i][j] * 700.f;    //arbitrary values to help display log values which are between -20 to + 30 - usage values -4 + 5
-                        }
+                if (dehagaintransmissionCurve && mean != 0.f && stddv != 0.f) {
+                    float absciss;
 
+                    if (LIKELY(fabsf(luminance[i][j] - mean) < stddv)) {
+                        absciss = asig * luminance[i][j] + bsig;
+                    } else if (luminance[i][j] >= mean) {
+                        absciss = amax * luminance[i][j] + bmax;
+                    } else { /*if(luminance[i][j] <= mean - stddv)*/
+                        absciss = amin * luminance[i][j] + bmin;
                     }
 
-#ifdef _OPENMP
-                #pragma omp critical
-#endif
-                {
-                    maxCD = maxCD > cdmax ? maxCD : cdmax;
-                    minCD = minCD < cdmin ? minCD : cdmin;
+
+                    //    float cd = cdfactor * ( luminance[i][j]  - mini ) + offse;
+                    // TODO : move multiplication by 2.f inside the curve
+                    gan = 2.f * dehagaintransmissionCurve[absciss]; //new gain function transmission
+                } else {
+                    gan = 0.5f;
+                }
+
+                const float cd = gan * cdfactor * luminance[i][j] + offse;
+
+                maxCD = cd > maxCD ? cd : maxCD;
+                minCD = cd < minCD ? cd : minCD;
+
+                float str = strengthx;
+
+                if (lhutili && it == 1) { // S=f(H)
+                    {
+                        const float HH = exLuminance[i][j];
+                        float valparam;
+
+                        if(useHsl || useHslLin) {
+                            valparam = shcurve->getVal(HH) - 0.5f;
+                        } else {
+                            valparam = shcurve->getVal(Color::huelab_to_huehsv2(HH)) - 0.5f;
+                        }
+
+                        str *= (1.f + 2.f * valparam);
+                    }
+                }
+
+                if (higplus && exLuminance[i][j] > 65535.f * hig) {
+                    str *= hig;
+                }
+
+                if (viewmet == 0) {
+                    luminance[i][j] = intp(str, LIM(cd, 0.f, 32768.f), originalLuminance[i][j]);
+                } else if (viewmet == 1) {
+                    luminance[i][j] = out[i][j];
+                } else if (viewmet == 4) {
+                    luminance[i][j] = originalLuminance[i][j] + str * (originalLuminance[i][j] - out[i][j]);//unsharp
+                } else if (viewmet == 2) {
+                    if(tran[i][j] <= mean) {
+                        luminance[i][j] = azb + aza * tran[i][j];    //auto values
+                    } else {
+                        luminance[i][j] = bzb + bza * tran[i][j];
+                    }
+                } else { /*if (viewmet == 3) */
+                    luminance[i][j] = 1000.f + tran[i][j] * 700.f;    //arbitrary values to help display log values which are between -20 to + 30 - usage values -4 + 5
                 }
 
             }
-
-            delete [] outBuffer;
-            outBuffer = nullptr;
-            //printf("cdmin=%f cdmax=%f\n",minCD, maxCD);
-            Tmean = mean;
-            Tsigma = stddv;
-            Tmin = mintr;
-            Tmax = maxtr;
-
-            if (shcurve) {
-                delete shcurve;
-                shcurve = nullptr;
-            }
         }
 
-        if(tranBuffer) {
-            delete [] tranBuffer;
-        }
+        Tmean = mean;
+        Tsigma = stddv;
+        Tmin = mintr;
+        Tmax = maxtr;
 
+        if (shcurve) {
+            delete shcurve;
+            shcurve = nullptr;
+        }
     }
 }
 
diff --git a/rtengine/ipsharpen.cc b/rtengine/ipsharpen.cc
index bbd1de155..0b1332ec9 100644
--- a/rtengine/ipsharpen.cc
+++ b/rtengine/ipsharpen.cc
@@ -207,13 +207,13 @@ BENCHFUN
         for (int k = 0; k < sharpenParam.deconviter; k++) {
             if (!needdamp) {
                 // apply gaussian blur and divide luminance by result of gaussian blur
-                gaussianBlur(tmpI, tmp, W, H, sigma, nullptr, GAUSS_DIV, luminance);
+                gaussianBlur(tmpI, tmp, W, H, sigma, false, GAUSS_DIV, luminance);
             } else {
                 // apply gaussian blur + damping
                 gaussianBlur(tmpI, tmp, W, H, sigma);
                 dcdamping(tmp, luminance, damping, W, H);
             }
-            gaussianBlur(tmp, tmpI, W, H, sigma, nullptr, GAUSS_MULT);
+            gaussianBlur(tmp, tmpI, W, H, sigma, false, GAUSS_MULT);
         } // end for
 
 #ifdef _OPENMP
diff --git a/rtengine/ipwavelet.cc b/rtengine/ipwavelet.cc
index 07e9da85b..6631aae32 100644
--- a/rtengine/ipwavelet.cc
+++ b/rtengine/ipwavelet.cc
@@ -32,7 +32,6 @@
 #include "improcfun.h"
 #include "LUT.h"
 #include "array2D.h"
-#include "boxblur.h"
 #include "rt_math.h"
 #include "mytime.h"
 #include "sleef.c"
diff --git a/rtengine/rawimagesource.cc b/rtengine/rawimagesource.cc
index 5f1d13c79..ed1e45394 100644
--- a/rtengine/rawimagesource.cc
+++ b/rtengine/rawimagesource.cc
@@ -1950,8 +1950,8 @@ void RawImageSource::retinexPrepareCurves(const RetinexParams &retinexParams, LU
         CurveFactory::curveDehaContL (retinexcontlutili, retinexParams.cdcurve, cdcurve, 1, lhist16RETI, histLRETI);
     }
 
-    CurveFactory::mapcurve (mapcontlutili, retinexParams.mapcurve, mapcurve, 1, lhist16RETI, histLRETI);
-
+    CurveFactory::mapcurve(mapcontlutili, retinexParams.mapcurve, mapcurve, 1, lhist16RETI, histLRETI);
+    mapcurve *= 0.5f;
     retinexParams.getCurves(retinextransmissionCurve, retinexgaintransmissionCurve);
 }
 
diff --git a/rtengine/rawimagesource.h b/rtengine/rawimagesource.h
index e52adea18..37d927445 100644
--- a/rtengine/rawimagesource.h
+++ b/rtengine/rawimagesource.h
@@ -201,7 +201,7 @@ public:
     }
     static void inverse33 (const double (*coeff)[3], double (*icoeff)[3]);
 
-    void MSR(float** luminance, float **originalLuminance, float **exLuminance,  LUTf & mapcurve, bool &mapcontlutili, int width, int height, const RetinexParams &deh, const RetinextransmissionCurve & dehatransmissionCurve, const RetinexgaintransmissionCurve & dehagaintransmissionCurve, float &minCD, float &maxCD, float &mini, float &maxi, float &Tmean, float &Tsigma, float &Tmin, float &Tmax);
+    void MSR(float** luminance, float **originalLuminance, float **exLuminance, const LUTf& mapcurve, bool mapcontlutili, int width, int height, const RetinexParams &deh, const RetinextransmissionCurve & dehatransmissionCurve, const RetinexgaintransmissionCurve & dehagaintransmissionCurve, float &minCD, float &maxCD, float &mini, float &maxi, float &Tmean, float &Tsigma, float &Tmin, float &Tmax);
     void HLRecovery_inpaint (float** red, float** green, float** blue) override;
     static void HLRecovery_Luminance (float* rin, float* gin, float* bin, float* rout, float* gout, float* bout, int width, float maxval);
     static void HLRecovery_CIELab (float* rin, float* gin, float* bin, float* rout, float* gout, float* bout, int width, float maxval, double cam[3][3], double icam[3][3]);
diff --git a/rtengine/rescale.h b/rtengine/rescale.h
index e9d476fea..8e1f99271 100644
--- a/rtengine/rescale.h
+++ b/rtengine/rescale.h
@@ -21,6 +21,7 @@
 #pragma once
 
 #include "array2D.h"
+#include "rt_math.h"
 
 namespace rtengine {
 
diff --git a/rtengine/shmap.cc b/rtengine/shmap.cc
index 6b59c4b40..368442f2b 100644
--- a/rtengine/shmap.cc
+++ b/rtengine/shmap.cc
@@ -85,21 +85,14 @@ void SHMap::update (Imagefloat* img, double radius, double lumi[3], bool hq, int
     if (!hq) {
         fillLuminance( img, map, lumi);
 
-        float *buffer = nullptr;
-
-        if(radius > 40.) {
-            // When we pass another buffer to gaussianBlur, it will use iterated boxblur which is less prone to artifacts
-            buffer = new float[W * H];
-        }
+        const bool useBoxBlur = radius > 40.0; // boxblur is less prone to artifacts for large radi
 
 #ifdef _OPENMP
-        #pragma omp parallel
+        #pragma omp parallel if (!useBoxBlur)
 #endif
         {
-            gaussianBlur (map, map, W, H, radius, buffer);
+            gaussianBlur (map, map, W, H, radius, useBoxBlur);
         }
-
-        delete [] buffer;
     }
 
     else {