From 0f21a0de6c1339fa7e8364d736af11dc1838b6f5 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Wed, 7 Sep 2016 16:44:16 +0200
Subject: [PATCH 1/5] Speedups for denoise, fixes #3418

---
 rtengine/FTblockDN.cc         | 97 ++++++++++++++++++-----------------
 rtengine/color.cc             | 61 ++++++++++++++++++++++
 rtengine/color.h              | 18 +++++--
 rtengine/dcrop.cc             |  4 +-
 rtengine/improccoordinator.cc |  1 +
 rtengine/improccoordinator.h  |  8 +++
 rtengine/sleefsseavx.c        | 37 +++++++++++++
 7 files changed, 174 insertions(+), 52 deletions(-)
diff --git a/rtengine/FTblockDN.cc b/rtengine/FTblockDN.cc
index 69e846f3c..4ccc63c3a 100644
--- a/rtengine/FTblockDN.cc
+++ b/rtengine/FTblockDN.cc
@@ -37,7 +37,8 @@
 #include "opthelper.h"
 #include "cplx_wavelet_dec.h"
 #include "median.h"
-
+#define BENCHMARK
+#include "StopWatch.h"
 #ifdef _OPENMP
 #include <omp.h>
 #endif
@@ -426,6 +427,7 @@ enum nrquality {QUALITY_STANDARD, QUALITY_HIGH};
 
 SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagefloat * dst, Imagefloat * calclum, float * ch_M, float *max_r, float *max_b, bool isRAW, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, const NoiseCurve & noiseLCurve, const NoiseCurve & noiseCCurve, float &chaut, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &nresi, float &highresi)
 {
+BENCHFUN
 //#ifdef _DEBUG
     MyTime t1e, t2e;
     t1e.set();
@@ -588,19 +590,19 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef
             }
         }
 
-        float gamslope = exp(log(static_cast<double>(gamthresh)) / gam) / gamthresh;
 
         LUTf gamcurve(65536, LUT_CLIP_BELOW);
+        float gamslope = exp(log(static_cast<double>(gamthresh)) / gam) / gamthresh;
 
+        MyTime t1e, t2e;
+        t1e.set();
         if (denoiseMethodRgb) {
-            for (int i = 0; i < 65536; ++i) {
-                gamcurve[i] = (Color::gamma(static_cast<double>(i) / 65535.0, gam, gamthresh, gamslope, 1.0, 0.0)) * 32768.0f;
-            }
+            Color::gammaf2lut(gamcurve, gam, gamthresh, gamslope, 65535.f, 32768.f);
         } else {
-            for (int i = 0; i < 65536; ++i) {
-                gamcurve[i] = (Color::gamman(static_cast<double>(i) / 65535.0, gam)) * 32768.0f;
-            }
+            Color::gammanf2lut(gamcurve, gam, 65535.f, 32768.f);
         }
+        t2e.set();
+        printf("gamcurve performed in %d usec:\n", t2e.etime(t1e));
 
         // inverse gamma transform for output data
         float igam = 1.f / gam;
@@ -609,15 +611,16 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef
 
         LUTf igamcurve(65536, LUT_CLIP_BELOW);
 
+        MyTime t11e, t21e;
+        t11e.set();
+
         if (denoiseMethodRgb) {
-            for (int i = 0; i < 65536; ++i) {
-                igamcurve[i] = (Color::gamma(static_cast<float>(i) / 32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f);
-            }
+            Color::gammaf2lut(igamcurve, igam, igamthresh, igamslope, 32768.f, 65535.f);
         } else {
-            for (int i = 0; i < 65536; ++i) {
-                igamcurve[i] = (Color::gamman(static_cast<float>(i) / 32768.0f, igam) * 65535.0f);
-            }
+            Color::gammanf2lut(igamcurve, igam, 32768.f, 65535.f);
         }
+        t21e.set();
+        printf("igamcurve performed in %d usec:\n", t21e.etime(t11e));
 
         const float gain = pow (2.0f, float(expcomp));
         float noisevar_Ldetail = SQR(static_cast<float>(SQR(100. - dnparams.Ldetail) + 50.*(100. - dnparams.Ldetail)) * TS * 0.5f);
@@ -922,9 +925,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef
                                         B_ = (*denoiseigamtab)[B_];
 
                                         //apply gamma noise standard (slider)
-                                        R_ = R_ < 65535.0f ? gamcurve[R_] : (Color::gammanf(R_ / 65535.f, gam) * 32768.0f);
-                                        G_ = G_ < 65535.0f ? gamcurve[G_] : (Color::gammanf(G_ / 65535.f, gam) * 32768.0f);
-                                        B_ = B_ < 65535.0f ? gamcurve[B_] : (Color::gammanf(B_ / 65535.f, gam) * 32768.0f);
+                                        R_ = R_ < 65535.f ? gamcurve[R_] : (Color::gammanf(R_ / 65535.f, gam) * 32768.f);
+                                        G_ = G_ < 65535.f ? gamcurve[G_] : (Color::gammanf(G_ / 65535.f, gam) * 32768.f);
+                                        B_ = B_ < 65535.f ? gamcurve[B_] : (Color::gammanf(B_ / 65535.f, gam) * 32768.f);
 
                                         //true conversion xyz=>Lab
                                         float X, Y, Z;
@@ -966,9 +969,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef
                                         float Y = gain * src->g(i, j);
                                         float Z = gain * src->b(i, j);
                                         //conversion colorspace to determine luminance with no gamma
-                                        X = X < 65535.0f ? gamcurve[X] : (Color::gamma(static_cast<double>(X) / 65535.0, gam, gamthresh, gamslope, 1.0, 0.0) * 32768.0f);
-                                        Y = Y < 65535.0f ? gamcurve[Y] : (Color::gamma(static_cast<double>(Y) / 65535.0, gam, gamthresh, gamslope, 1.0, 0.0) * 32768.0f);
-                                        Z = Z < 65535.0f ? gamcurve[Z] : (Color::gamma(static_cast<double>(Z) / 65535.0, gam, gamthresh, gamslope, 1.0, 0.0) * 32768.0f);
+                                        X = X < 65535.f ? gamcurve[X] : (Color::gammaf(X / 65535.f, gam, gamthresh, gamslope) * 32768.f);
+                                        Y = Y < 65535.f ? gamcurve[Y] : (Color::gammaf(Y / 65535.f, gam, gamthresh, gamslope) * 32768.f);
+                                        Z = Z < 65535.f ? gamcurve[Z] : (Color::gammaf(Z / 65535.f, gam, gamthresh, gamslope) * 32768.f);
                                         //end chroma
                                         labdn->L[i1][j1] = Y;
                                         labdn->a[i1][j1] = (X - Y);
@@ -1009,9 +1012,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef
                                     float btmp = Color::igammatab_srgb[ src->b(i, j) ];
                                     //modification Jacques feb 2013
                                     // gamma slider different from raw
-                                    rtmp = rtmp < 65535.0f ? gamcurve[rtmp] : (Color::gamman(static_cast<double>(rtmp) / 65535.0, gam) * 32768.0f);
-                                    gtmp = gtmp < 65535.0f ? gamcurve[gtmp] : (Color::gamman(static_cast<double>(gtmp) / 65535.0, gam) * 32768.0f);
-                                    btmp = btmp < 65535.0f ? gamcurve[btmp] : (Color::gamman(static_cast<double>(btmp) / 65535.0, gam) * 32768.0f);
+                                    rtmp = rtmp < 65535.f ? gamcurve[rtmp] : (Color::gammanf(rtmp / 65535.f, gam) * 32768.f);
+                                    gtmp = gtmp < 65535.f ? gamcurve[gtmp] : (Color::gammanf(gtmp / 65535.f, gam) * 32768.f);
+                                    btmp = btmp < 65535.f ? gamcurve[btmp] : (Color::gammanf(btmp / 65535.f, gam) * 32768.f);
 
                                     float X, Y, Z;
                                     Color::rgbxyz(rtmp, gtmp, btmp, X, Y, Z, wp);
@@ -1650,9 +1653,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef
                                             float Z = Y - (labdn->b[i1][j1]);
 
 
-                                            X = X < 32768.0f ? igamcurve[X] : (Color::gamma(X / 32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f);
-                                            Y = Y < 32768.0f ? igamcurve[Y] : (Color::gamma(Y / 32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f);
-                                            Z = Z < 32768.0f ? igamcurve[Z] : (Color::gamma(Z / 32768.0f, igam, igamthresh, igamslope, 1.0, 0.0) * 65535.0f);
+                                            X = X < 32768.f ? igamcurve[X] : (Color::gammaf(X / 32768.f, igam, igamthresh, igamslope) * 65535.f);
+                                            Y = Y < 32768.f ? igamcurve[Y] : (Color::gammaf(Y / 32768.f, igam, igamthresh, igamslope) * 65535.f);
+                                            Z = Z < 32768.f ? igamcurve[Z] : (Color::gammaf(Z / 32768.f, igam, igamthresh, igamslope) * 65535.f);
 
                                             if (numtiles == 1) {
                                                 dsttmp->r(i, j) = newGain * X;
@@ -1695,9 +1698,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef
                                         float r_, g_, b_;
                                         Color::xyz2rgb(X, Y, Z, r_, g_, b_, wip);
                                         //gamma slider is different from Raw
-                                        r_ = r_ < 32768.0f ? igamcurve[r_] : (Color::gamman(r_ / 32768.0f, igam) * 65535.0f);
-                                        g_ = g_ < 32768.0f ? igamcurve[g_] : (Color::gamman(g_ / 32768.0f, igam) * 65535.0f);
-                                        b_ = b_ < 32768.0f ? igamcurve[b_] : (Color::gamman(b_ / 32768.0f, igam) * 65535.0f);
+                                        r_ = r_ < 32768.f ? igamcurve[r_] : (Color::gammanf(r_ / 32768.f, igam) * 65535.f);
+                                        g_ = g_ < 32768.f ? igamcurve[g_] : (Color::gammanf(g_ / 32768.f, igam) * 65535.f);
+                                        b_ = b_ < 32768.f ? igamcurve[b_] : (Color::gammanf(b_ / 32768.f, igam) * 65535.f);
 
                                         if (numtiles == 1) {
                                             dsttmp->r(i, j) = newGain * r_;
@@ -2982,7 +2985,7 @@ void ImProcFunctions::WaveletDenoiseAll_info(int levwav, wavelet_decomposition &
     }
 }
 
-void ImProcFunctions::RGB_denoise_infoGamCurve(const procparams::DirPyrDenoiseParams & dnparams, bool isRAW, LUTf &gamcurve, float &gam, float &gamthresh, float &gamslope)
+SSEFUNCTION void ImProcFunctions::RGB_denoise_infoGamCurve(const procparams::DirPyrDenoiseParams & dnparams, bool isRAW, LUTf &gamcurve, float &gam, float &gamthresh, float &gamslope)
 {
     gam = dnparams.gamma;
     gamthresh = 0.001f;
@@ -2995,18 +2998,20 @@ void ImProcFunctions::RGB_denoise_infoGamCurve(const procparams::DirPyrDenoisePa
         }
     }
 
-    gamslope = exp(log(static_cast<double>(gamthresh)) / gam) / gamthresh;
     bool denoiseMethodRgb = (dnparams.dmethod == "RGB");
 
+    MyTime t1e, t2e;
+    t1e.set();
+
     if (denoiseMethodRgb) {
-        for (int i = 0; i < 65536; ++i) {
-            gamcurve[i] = (Color::gamma(static_cast<double>(i) / 65535.0, gam, gamthresh, gamslope, 1.0, 0.0)) * 32768.0f;
-        }
+        gamslope = exp(log(static_cast<double>(gamthresh)) / gam) / gamthresh;
+        Color::gammaf2lut(gamcurve, gam, gamthresh, gamslope, 65535.f, 32768.f);
     } else {
-        for (int i = 0; i < 65536; ++i) {
-            gamcurve[i] = (Color::gamman(static_cast<double>(i) / 65535.0, gam)) * 32768.0f;
-        }
+        Color::gammanf2lut(gamcurve, gam, 65535.f, 32768.f);
     }
+    t2e.set();
+    printf("gamcurve in RGB_denoise_infoGamCurve performed in %d usec:\n", t2e.etime(t1e));
+
 }
 
 void ImProcFunctions::calcautodn_info (float &chaut, float &delta, int Nb, int levaut, float maxmax, float lumema, float chromina, int mode, int lissage, float redyel, float skinc, float nsknc)
@@ -3424,9 +3429,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat
                             B_ = (*denoiseigamtab)[B_];
 
                             //apply gamma noise standard (slider)
-                            R_ = R_ < 65535.0f ? gamcurve[R_] : (Color::gamman(static_cast<double>(R_) / 65535.0, gam) * 32768.0f);
-                            G_ = G_ < 65535.0f ? gamcurve[G_] : (Color::gamman(static_cast<double>(G_) / 65535.0, gam) * 32768.0f);
-                            B_ = B_ < 65535.0f ? gamcurve[B_] : (Color::gamman(static_cast<double>(B_) / 65535.0, gam) * 32768.0f);
+                            R_ = R_ < 65535.f ? gamcurve[R_] : (Color::gammanf(R_ / 65535.f, gam) * 32768.f);
+                            G_ = G_ < 65535.f ? gamcurve[G_] : (Color::gammanf(G_ / 65535.f, gam) * 32768.f);
+                            B_ = B_ < 65535.f ? gamcurve[B_] : (Color::gammanf(B_ / 65535.f, gam) * 32768.f);
                             //true conversion xyz=>Lab
                             float X, Y, Z;
                             Color::rgbxyz(R_, G_, B_, X, Y, Z, wp);
@@ -3451,9 +3456,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat
                             float Y = gain * src->g(i, j);
                             float Z = gain * src->b(i, j);
 
-                            X = X < 65535.0f ? gamcurve[X] : (Color::gamma(static_cast<double>(X) / 65535.0, gam, gamthresh, gamslope, 1.0, 0.0) * 32768.0f);
-                            Y = Y < 65535.0f ? gamcurve[Y] : (Color::gamma(static_cast<double>(Y) / 65535.0, gam, gamthresh, gamslope, 1.0, 0.0) * 32768.0f);
-                            Z = Z < 65535.0f ? gamcurve[Z] : (Color::gamma(static_cast<double>(Z) / 65535.0, gam, gamthresh, gamslope, 1.0, 0.0) * 32768.0f);
+                            X = X < 65535.f ? gamcurve[X] : (Color::gammaf(X / 65535.f, gam, gamthresh, gamslope) * 32768.f);
+                            Y = Y < 65535.f ? gamcurve[Y] : (Color::gammaf(Y / 65535.f, gam, gamthresh, gamslope) * 32768.f);
+                            Z = Z < 65535.f ? gamcurve[Z] : (Color::gammaf(Z / 65535.f, gam, gamthresh, gamslope) * 32768.f);
 
                             labdn->a[i1][j1] = (X - Y);
                             labdn->b[i1][j1] = (Y - Z);
@@ -3480,9 +3485,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat
                         float btmp = Color::igammatab_srgb[ src->b(i, j) ];
                         //modification Jacques feb 2013
                         // gamma slider different from raw
-                        rtmp = rtmp < 65535.0f ? gamcurve[rtmp] : (Color::gamman(static_cast<double>(rtmp) / 65535.0, gam) * 32768.0f);
-                        gtmp = gtmp < 65535.0f ? gamcurve[gtmp] : (Color::gamman(static_cast<double>(gtmp) / 65535.0, gam) * 32768.0f);
-                        btmp = btmp < 65535.0f ? gamcurve[btmp] : (Color::gamman(static_cast<double>(btmp) / 65535.0, gam) * 32768.0f);
+                        rtmp = rtmp < 65535.f ? gamcurve[rtmp] : (Color::gammanf(rtmp / 65535.f, gam) * 32768.f);
+                        gtmp = gtmp < 65535.f ? gamcurve[gtmp] : (Color::gammanf(gtmp / 65535.f, gam) * 32768.f);
+                        btmp = btmp < 65535.f ? gamcurve[btmp] : (Color::gammanf(btmp / 65535.f, gam) * 32768.f);
 
                         float X, Y, Z;
                         Color::rgbxyz(rtmp, gtmp, btmp, X, Y, Z, wp);
diff --git a/rtengine/color.cc b/rtengine/color.cc
index 872f47522..fa4a79840 100644
--- a/rtengine/color.cc
+++ b/rtengine/color.cc
@@ -1478,6 +1478,67 @@ void Color::calcGamma (double pwr, double ts, int mode, int imax, double &gamma0
         return;
     }
 }
+void Color::gammaf2lut (LUTf &gammacurve, float gamma, float start, float slope, float divisor, float factor)
+{
+#ifdef __SSE2__
+    // SSE2 version is more than 6 times faster than scalar version
+    vfloat iv = _mm_set_ps(3.f,2.f,1.f,0.f);
+    vfloat fourv = F2V(4.f);
+    vfloat gammav = F2V(1.f / gamma);
+    vfloat slopev = F2V((slope / divisor) * factor);
+    vfloat divisorv = F2V(xlogf(divisor));
+    vfloat factorv = F2V(factor);
+    vfloat comparev = F2V(start * divisor);
+    int border = start * divisor;
+    int border1 = border - (border & 3);
+    int border2 = border1 + 4;
+    int i = 0;
+    for(; i < border1; i += 4) {
+        vfloat resultv = iv * slopev;
+        _mm_storeu_ps(&gammacurve[i], resultv);
+        iv += fourv;
+    }
+    for(; i < border2; i += 4) {
+        vfloat result0v = iv * slopev;
+        vfloat result1v = xexpf((xlogf(iv) - divisorv) * gammav) * factorv;
+        _mm_storeu_ps(&gammacurve[i], vself(vmaskf_le(iv, comparev), result0v, result1v));
+        iv += fourv;
+    }
+    for(; i < 65536; i += 4) {
+        vfloat resultv = xexpfNoCheck((xlogfNoCheck(iv) - divisorv) * gammav) * factorv;
+        _mm_storeu_ps(&gammacurve[i], resultv);
+        iv += fourv;
+    }
+#else
+    for (int i = 0; i < 65536; ++i) {
+        gammacurve[i] = gammaf(static_cast<float>(i) / divisor, gamma, start, slope) * factor;
+    }
+#endif
+}
+
+void Color::gammanf2lut (LUTf &gammacurve, float gamma, float divisor, float factor)           //standard gamma without slope...
+{
+#ifdef __SSE2__
+    // SSE2 version is more than 6 times faster than scalar version
+    vfloat iv = _mm_set_ps(3.f,2.f,1.f,0.f);
+    vfloat fourv = F2V(4.f);
+    vfloat gammav = F2V(1.f / gamma);
+    vfloat divisorv = F2V(xlogf(divisor));
+    vfloat factorv = F2V(factor);
+    vfloat resultv = xexpf((xlogf(iv) - divisorv) * gammav) * factorv;
+    _mm_storeu_ps(&gammacurve[0], resultv);
+    iv += fourv;
+    for(int i=4; i < 65536; i += 4) {
+        resultv = xexpfNoCheck((xlogfNoCheck(iv) - divisorv) * gammav) * factorv;
+        _mm_storeu_ps(&gammacurve[i], resultv);
+        iv += fourv;
+    }
+#else
+    for (int i = 0; i < 65536; ++i) {
+        gammacurve[i] = Color::gammanf(static_cast<float>(i) / divisor, gamma) * factor;
+    }
+#endif
+}
 
 void Color::Lab2XYZ(float L, float a, float b, float &x, float &y, float &z)
 {
diff --git a/rtengine/color.h b/rtengine/color.h
index 1ae721e5a..c7699e9c5 100644
--- a/rtengine/color.h
+++ b/rtengine/color.h
@@ -1104,6 +1104,15 @@ public:
     {
         return (x <= start ? x*slope : exp(log(x) / gamma) * mul - add);
     }
+
+    static inline float gammaf      (float x, float gamma, float start, float slope)
+    {
+        return x <= start ? x*slope : xexpf(xlogf(x) / gamma);
+    }
+
+    //fills a LUT of size 65536 using gamma with slope...
+    static void gammaf2lut (LUTf &gammacurve, float gamma, float start, float slope, float divisor, float factor);
+
     static inline double igamma     (double x, double gamma, double start, double slope, double mul, double add)
     {
         return (x <= start * slope ? x / slope : exp(log((x + add) / mul) * gamma) );
@@ -1118,7 +1127,7 @@ public:
     */
     static inline double gamman      (double x, double gamma)           //standard gamma without slope...
     {
-        return (x = exp(log(x) / gamma));
+        return exp(log(x) / gamma);
     }
 
     /**
@@ -1129,9 +1138,10 @@ public:
     */
     static inline float gammanf      (float x, float gamma)           //standard gamma without slope...
     {
-        return (x = xexpf(xlogf(x) / gamma));
+        return xexpf(xlogf(x) / gamma);
     }
-
+    //fills a LUT of size 65536 using gamma without slope...
+    static void gammanf2lut (LUTf &gammacurve, float gamma, float divisor, float factor);
 
     /**
     * @brief Very simply inverse gamma
@@ -1141,7 +1151,7 @@ public:
     */
     static inline double igamman     (double x, double gamma)           //standard inverse gamma without slope...
     {
-        return (x = exp(log(x) * gamma) );
+        return exp(log(x) * gamma);
     }
 
 
diff --git a/rtengine/dcrop.cc b/rtengine/dcrop.cc
index 93aca0f6d..6bf3f2273 100644
--- a/rtengine/dcrop.cc
+++ b/rtengine/dcrop.cc
@@ -380,7 +380,7 @@ void Crop::update (int todo)
             }
         }
 
-        if(skip == 1 && params.dirpyrDenoise.enabled && ((settings->leveldnautsimpl == 1 && params.dirpyrDenoise.Cmethod == "AUT")  || (settings->leveldnautsimpl == 0 && params.dirpyrDenoise.C2method == "AUTO"))) {
+        if(skip == 1 && params.dirpyrDenoise.enabled && !parent->denoiseInfoStore.valid && ((settings->leveldnautsimpl == 1 && params.dirpyrDenoise.Cmethod == "AUT")  || (settings->leveldnautsimpl == 0 && params.dirpyrDenoise.C2method == "AUTO"))) {
             MyTime t1aue, t2aue;
             t1aue.set();
 
@@ -587,7 +587,7 @@ void Crop::update (int todo)
             params.dirpyrDenoise.chroma = chM / (autoNR * multip * adjustr);
             params.dirpyrDenoise.redchro = maxr;
             params.dirpyrDenoise.bluechro = maxb;
-
+            parent->denoiseInfoStore.valid = true;
             if(parent->adnListener) {
                 parent->adnListener->chromaChanged(params.dirpyrDenoise.chroma, params.dirpyrDenoise.redchro, params.dirpyrDenoise.bluechro);
             }
diff --git a/rtengine/improccoordinator.cc b/rtengine/improccoordinator.cc
index a6354bd56..8925b29e0 100644
--- a/rtengine/improccoordinator.cc
+++ b/rtengine/improccoordinator.cc
@@ -320,6 +320,7 @@ void ImProcCoordinator::updatePreviewImage (int todo, Crop* cropCall)
         ipf.setScale (scale);
 
         imgsrc->getImage (currWB, tr, orig_prev, pp, params.toneCurve, params.icm, params.raw);
+        denoiseInfoStore.valid = false;
         //ColorTemp::CAT02 (orig_prev, &params) ;
         //   printf("orig_prevW=%d\n  scale=%d",orig_prev->width, scale);
         /* Issue 2785, disabled some 1:1 tools
diff --git a/rtengine/improccoordinator.h b/rtengine/improccoordinator.h
index 5cc03cb72..a1124ff01 100644
--- a/rtengine/improccoordinator.h
+++ b/rtengine/improccoordinator.h
@@ -325,6 +325,14 @@ public:
     {
         return imgsrc;
     }
+
+    class denoiseinfostore {
+        public:
+            bool valid;
+
+        denoiseinfostore() : valid(false) {};
+    } denoiseInfoStore;
+
 };
 }
 #endif
diff --git a/rtengine/sleefsseavx.c b/rtengine/sleefsseavx.c
index a9f49f143..1b0c9a0ae 100644
--- a/rtengine/sleefsseavx.c
+++ b/rtengine/sleefsseavx.c
@@ -1275,6 +1275,25 @@ static INLINE vfloat xlogf0(vfloat d) {
   return x;
 }
 
+static INLINE vfloat xlogfNoCheck(vfloat d) { // this version does not check input values. Use it only when you know the input values are > 0 e.g. when filling a lookup table
+  vfloat x, x2, t, m;
+  vint2 e;
+
+  e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
+  m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
+
+  x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
+  x2 = vmulf(x, x);
+
+  t = vcast_vf_f(0.2371599674224853515625f);
+  t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
+  t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
+  t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
+  t = vmlaf(t, x2, vcast_vf_f(2.0f));
+
+  return vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
+
+}
 
 static INLINE vfloat xexpf(vfloat d) {
   vint2 q = vrint_vi2_vf(vmulf(d, vcast_vf_f(R_LN2f)));
@@ -1299,6 +1318,24 @@ static INLINE vfloat xexpf(vfloat d) {
   return u;
 }
 
+static INLINE vfloat xexpfNoCheck(vfloat d) { // this version does not check input values. Use it only when you know the input values are > -104.f e.g. when filling a lookup table
+  vint2 q = vrint_vi2_vf(vmulf(d, vcast_vf_f(R_LN2f)));
+  vfloat s, u;
+
+  s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf),d);
+  s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf),s);
+
+  u = vcast_vf_f(0.00136324646882712841033936f);
+  u = vmlaf(u, s, vcast_vf_f(0.00836596917361021041870117f));
+  u = vmlaf(u, s, vcast_vf_f(0.0416710823774337768554688f));
+  u = vmlaf(u, s, vcast_vf_f(0.166665524244308471679688f));
+  u = vmlaf(u, s, vcast_vf_f(0.499999850988388061523438f));
+
+  u = vaddf(vcast_vf_f(1.0f), vmlaf(vmulf(s, s), u, s));
+
+  return vldexpf(u, q);
+}
+
 static INLINE vfloat xcbrtf(vfloat d) {
   vfloat x, y, q = vcast_vf_f(1.0), t;
   vint2 e, qu, re;

From ef83e6b591a211afe8d4471c6933b3953ae2fd4e Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Wed, 7 Sep 2016 23:34:06 +0200
Subject: [PATCH 2/5] Removed StopWatches and corrected some bugs from latest
 commit

---
 rtengine/FTblockDN.cc        | 19 -------------------
 rtengine/dcrop.cc            | 26 ++++++++++----------------
 rtengine/improccoordinator.h | 10 ++++++----
 3 files changed, 16 insertions(+), 39 deletions(-)

diff --git a/rtengine/FTblockDN.cc b/rtengine/FTblockDN.cc
index 4ccc63c3a..8229417e4 100644
--- a/rtengine/FTblockDN.cc
+++ b/rtengine/FTblockDN.cc
@@ -37,8 +37,6 @@
 #include "opthelper.h"
 #include "cplx_wavelet_dec.h"
 #include "median.h"
-#define BENCHMARK
-#include "StopWatch.h"
 #ifdef _OPENMP
 #include <omp.h>
 #endif
@@ -427,7 +425,6 @@ enum nrquality {QUALITY_STANDARD, QUALITY_HIGH};
 
 SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagefloat * dst, Imagefloat * calclum, float * ch_M, float *max_r, float *max_b, bool isRAW, const procparams::DirPyrDenoiseParams & dnparams, const double expcomp, const NoiseCurve & noiseLCurve, const NoiseCurve & noiseCCurve, float &chaut, float &redaut, float &blueaut, float &maxredaut, float &maxblueaut, float &nresi, float &highresi)
 {
-BENCHFUN
 //#ifdef _DEBUG
     MyTime t1e, t2e;
     t1e.set();
@@ -594,15 +591,11 @@ BENCHFUN
         LUTf gamcurve(65536, LUT_CLIP_BELOW);
         float gamslope = exp(log(static_cast<double>(gamthresh)) / gam) / gamthresh;
 
-        MyTime t1e, t2e;
-        t1e.set();
         if (denoiseMethodRgb) {
             Color::gammaf2lut(gamcurve, gam, gamthresh, gamslope, 65535.f, 32768.f);
         } else {
             Color::gammanf2lut(gamcurve, gam, 65535.f, 32768.f);
         }
-        t2e.set();
-        printf("gamcurve performed in %d usec:\n", t2e.etime(t1e));
 
         // inverse gamma transform for output data
         float igam = 1.f / gam;
@@ -611,16 +604,11 @@ BENCHFUN
 
         LUTf igamcurve(65536, LUT_CLIP_BELOW);
 
-        MyTime t11e, t21e;
-        t11e.set();
-
         if (denoiseMethodRgb) {
             Color::gammaf2lut(igamcurve, igam, igamthresh, igamslope, 32768.f, 65535.f);
         } else {
             Color::gammanf2lut(igamcurve, igam, 32768.f, 65535.f);
         }
-        t21e.set();
-        printf("igamcurve performed in %d usec:\n", t21e.etime(t11e));
 
         const float gain = pow (2.0f, float(expcomp));
         float noisevar_Ldetail = SQR(static_cast<float>(SQR(100. - dnparams.Ldetail) + 50.*(100. - dnparams.Ldetail)) * TS * 0.5f);
@@ -821,7 +809,6 @@ BENCHFUN
                 {static_cast<float>(wprof[2][0]), static_cast<float>(wprof[2][1]), static_cast<float>(wprof[2][2])}
             };
 
-
             // begin tile processing of image
 #ifdef _OPENMP
             #pragma omp parallel num_threads(numthreads) if (numthreads>1)
@@ -3000,18 +2987,12 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise_infoGamCurve(const procparams::Dir
 
     bool denoiseMethodRgb = (dnparams.dmethod == "RGB");
 
-    MyTime t1e, t2e;
-    t1e.set();
-
     if (denoiseMethodRgb) {
         gamslope = exp(log(static_cast<double>(gamthresh)) / gam) / gamthresh;
         Color::gammaf2lut(gamcurve, gam, gamthresh, gamslope, 65535.f, 32768.f);
     } else {
         Color::gammanf2lut(gamcurve, gam, 65535.f, 32768.f);
     }
-    t2e.set();
-    printf("gamcurve in RGB_denoise_infoGamCurve performed in %d usec:\n", t2e.etime(t1e));
-
 }
 
 void ImProcFunctions::calcautodn_info (float &chaut, float &delta, int Nb, int levaut, float maxmax, float lumema, float chromina, int mode, int lissage, float redyel, float skinc, float nsknc)
diff --git a/rtengine/dcrop.cc b/rtengine/dcrop.cc
index 6bf3f2273..1e5aba77c 100644
--- a/rtengine/dcrop.cc
+++ b/rtengine/dcrop.cc
@@ -191,9 +191,6 @@ void Crop::update (int todo)
         parent->ipf.Tile_calc (tilesize, overlap, kall, widIm, heiIm, numtiles_W, numtiles_H, tilewidth, tileheight, tileWskip, tileHskip);
         kall = 0;
 
-        float *ch_M = new float [9];//allocate memory
-        float *max_r = new float [9];
-        float *max_b = new float [9];
         float *min_b = new float [9];
         float *min_r = new float [9];
         float *lumL = new float [9];
@@ -462,9 +459,9 @@ void Crop::update (int todo)
 
                         //printf("DCROP skip=%d cha=%f red=%f bl=%f redM=%f bluM=%f chrom=%f sigm=%f lum=%f\n",skip, chaut,redaut,blueaut, maxredaut, maxblueaut, chromina, sigma, lumema);
                         Nb[hcr * 3 + wcr] = nb;
-                        ch_M[hcr * 3 + wcr] = pondcorrec * chaut;
-                        max_r[hcr * 3 + wcr] = pondcorrec * maxredaut;
-                        max_b[hcr * 3 + wcr] = pondcorrec * maxblueaut;
+                        parent->denoiseInfoStore.ch_M[hcr * 3 + wcr] = pondcorrec * chaut;
+                        parent->denoiseInfoStore.max_r[hcr * 3 + wcr] = pondcorrec * maxredaut;
+                        parent->denoiseInfoStore.max_b[hcr * 3 + wcr] = pondcorrec * maxblueaut;
                         min_r[hcr * 3 + wcr] = pondcorrec * minredaut;
                         min_b[hcr * 3 + wcr] = pondcorrec * minblueaut;
                         lumL[hcr * 3 + wcr] = lumema;
@@ -524,20 +521,20 @@ void Crop::update (int todo)
             int lissage = settings->leveldnliss;
 
             for (int k = 0; k < 9; k++) {
-                float maxmax = max(max_r[k], max_b[k]);
-                parent->ipf.calcautodn_info (ch_M[k], delta[k], Nb[k], levaut, maxmax, lumL[k], chromC[k], mode, lissage, ry[k], sk[k], pcsk[k]);
+                float maxmax = max(parent->denoiseInfoStore.max_r[k], parent->denoiseInfoStore.max_b[k]);
+                parent->ipf.calcautodn_info (parent->denoiseInfoStore.ch_M[k], delta[k], Nb[k], levaut, maxmax, lumL[k], chromC[k], mode, lissage, ry[k], sk[k], pcsk[k]);
                 //  printf("ch_M=%f delta=%f\n",ch_M[k], delta[k]);
             }
 
             for (int k = 0; k < 9; k++) {
-                if(max_r[k] > max_b[k]) {
+                if(parent->denoiseInfoStore.max_r[k] > parent->denoiseInfoStore.max_b[k]) {
                     Max_R[k] = (delta[k]) / ((autoNRmax * multip * adjustr * lowdenoise) / 2.f);
-                    Min_B[k] = -(ch_M[k] - min_b[k]) / (autoNRmax * multip * adjustr * lowdenoise);
+                    Min_B[k] = -(parent->denoiseInfoStore.ch_M[k] - min_b[k]) / (autoNRmax * multip * adjustr * lowdenoise);
                     Max_B[k] = 0.f;
                     Min_R[k] = 0.f;
                 } else {
                     Max_B[k] = (delta[k]) / ((autoNRmax * multip * adjustr * lowdenoise) / 2.f);
-                    Min_R[k] = - (ch_M[k] - min_r[k])   / (autoNRmax * multip * adjustr * lowdenoise);
+                    Min_R[k] = - (parent->denoiseInfoStore.ch_M[k] - min_r[k])   / (autoNRmax * multip * adjustr * lowdenoise);
                     Min_B[k] = 0.f;
                     Max_R[k] = 0.f;
                 }
@@ -545,7 +542,7 @@ void Crop::update (int todo)
 
             for (int k = 0; k < 9; k++) {
                 //  printf("ch_M= %f Max_R=%f Max_B=%f min_r=%f min_b=%f\n",ch_M[k],Max_R[k], Max_B[k],Min_R[k], Min_B[k]);
-                chM += ch_M[k];
+                chM += parent->denoiseInfoStore.ch_M[k];
                 MaxBMoy += Max_B[k];
                 MaxRMoy += Max_R[k];
                 MinRMoy += Min_R[k];
@@ -644,7 +641,7 @@ void Crop::update (int todo)
                 int kall = 0;
 
                 float chaut, redaut, blueaut, maxredaut, maxblueaut, nresi, highresi;
-                parent->ipf.RGB_denoise(kall, origCrop, origCrop, calclum, ch_M, max_r, max_b, parent->imgsrc->isRAW(), /*Roffset,*/ denoiseParams, parent->imgsrc->getDirPyrDenoiseExpComp(), noiseLCurve, noiseCCurve, chaut, redaut, blueaut, maxredaut, maxblueaut, nresi, highresi);
+                parent->ipf.RGB_denoise(kall, origCrop, origCrop, calclum, parent->denoiseInfoStore.ch_M, parent->denoiseInfoStore.max_r, parent->denoiseInfoStore.max_b, parent->imgsrc->isRAW(), /*Roffset,*/ denoiseParams, parent->imgsrc->getDirPyrDenoiseExpComp(), noiseLCurve, noiseCCurve, chaut, redaut, blueaut, maxredaut, maxblueaut, nresi, highresi);
 
                 if (parent->adnListener) {
                     parent->adnListener->noiseChanged(nresi, highresi);
@@ -665,9 +662,6 @@ void Crop::update (int todo)
 
         parent->imgsrc->convertColorSpace(origCrop, params.icm, parent->currWB);
 
-        delete [] ch_M;
-        delete [] max_r;
-        delete [] max_b;
         delete [] min_r;
         delete [] min_b;
         delete [] lumL;
diff --git a/rtengine/improccoordinator.h b/rtengine/improccoordinator.h
index a1124ff01..776484791 100644
--- a/rtengine/improccoordinator.h
+++ b/rtengine/improccoordinator.h
@@ -326,11 +326,13 @@ public:
         return imgsrc;
     }
 
-    class denoiseinfostore {
-        public:
-            bool valid;
+    struct {
+        float chM;
+        float max_r[9];
+        float max_b[9];
+        float ch_M[9];
+        bool valid = false;
 
-        denoiseinfostore() : valid(false) {};
     } denoiseInfoStore;
 
 };

From 5bb20c413e889e397d005357c45003822989f606 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Fri, 9 Sep 2016 16:04:43 +0200
Subject: [PATCH 3/5] Cleaned code, also reduced base memory usage of RT by 1
 MB

---
 rtengine/FTblockDN.cc |  66 +++-----------------
 rtengine/color.cc     | 138 +++++++++++++++++++++++++-----------------
 rtengine/color.h      |  11 ++--
 3 files changed, 95 insertions(+), 120 deletions(-)

diff --git a/rtengine/FTblockDN.cc b/rtengine/FTblockDN.cc
index 8229417e4..8ff22864c 100644
--- a/rtengine/FTblockDN.cc
+++ b/rtengine/FTblockDN.cc
@@ -613,36 +613,6 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef
         const float gain = pow (2.0f, float(expcomp));
         float noisevar_Ldetail = SQR(static_cast<float>(SQR(100. - dnparams.Ldetail) + 50.*(100. - dnparams.Ldetail)) * TS * 0.5f);
 
-        if (settings->verbose) {
-            printf("Denoise Lab=%i\n", settings->denoiselabgamma);
-        }
-
-        // To avoid branches in loops we access the gammatabs by pointers
-        // modify arbitrary data for Lab..I have test : nothing, gamma 2.6 11 - gamma 4 5 - gamma 5.5 10
-        // we can put other as gamma g=2.6 slope=11, etc.
-        // but noting to do with real gamma !!!: it's only for data Lab # data RGB
-        // finally I opted fot gamma55 and with options we can change
-
-        LUTf *denoisegamtab;
-        LUTf *denoiseigamtab;
-
-        switch(settings->denoiselabgamma) {
-            case 0:
-                denoisegamtab = &(Color::gammatab_26_11);
-                denoiseigamtab = &(Color::igammatab_26_11);
-                break;
-
-            case 1:
-                denoisegamtab = &(Color::gammatab_4);
-                denoiseigamtab = &(Color::igammatab_4);
-                break;
-
-            default:
-                denoisegamtab = &(Color::gammatab_55);
-                denoiseigamtab = &(Color::igammatab_55);
-                break;
-        }
-
         array2D<float> tilemask_in(TS, TS);
         array2D<float> tilemask_out(TS, TS);
 
@@ -907,9 +877,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef
                                         float G_ = gain * src->g(i, j);
                                         float B_ = gain * src->b(i, j);
 
-                                        R_ = (*denoiseigamtab)[R_];
-                                        G_ = (*denoiseigamtab)[G_];
-                                        B_ = (*denoiseigamtab)[B_];
+                                        R_ = Color::denoiseIGammaTab[R_];
+                                        G_ = Color::denoiseIGammaTab[G_];
+                                        B_ = Color::denoiseIGammaTab[B_];
 
                                         //apply gamma noise standard (slider)
                                         R_ = R_ < 65535.f ? gamcurve[R_] : (Color::gammanf(R_ / 65535.f, gam) * 32768.f);
@@ -1602,9 +1572,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise(int kall, Imagefloat * src, Imagef
                                             b_ = b_ < 32768.f ? igamcurve[b_] : (Color::gammanf(b_ / 32768.f, igam) * 65535.f);
 
                                             //readapt arbitrary gamma (inverse from beginning)
-                                            r_ = (*denoisegamtab)[r_];
-                                            g_ = (*denoisegamtab)[g_];
-                                            b_ = (*denoisegamtab)[b_];
+                                            r_ = Color::denoiseGammaTab[r_];
+                                            g_ = Color::denoiseGammaTab[g_];
+                                            b_ = Color::denoiseGammaTab[b_];
 
                                             if (numtiles == 1) {
                                                 dsttmp->r(i, j) = newGain * r_;
@@ -3242,24 +3212,6 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat
     int nb = 0;
     int comptlevel = 0;
 
-    // To avoid branches in loops we access the gammatabs by pointers
-    LUTf *denoiseigamtab;
-
-    switch(settings->denoiselabgamma) {
-        case 0:
-            denoiseigamtab = &(Color::igammatab_26_11);
-            break;
-
-        case 1:
-            denoiseigamtab = &(Color::igammatab_4);
-            break;
-
-        default:
-            denoiseigamtab = &(Color::igammatab_55);
-            break;
-    }
-
-
     for (int tiletop = 0; tiletop < imheight; tiletop += tileHskip) {
         for (int tileleft = 0; tileleft < imwidth; tileleft += tileWskip) {
 
@@ -3405,9 +3357,9 @@ SSEFUNCTION void ImProcFunctions::RGB_denoise_info(Imagefloat * src, Imagefloat
                             float G_ = gain * src->g(i, j);
                             float B_ = gain * src->b(i, j);
 
-                            R_ = (*denoiseigamtab)[R_];
-                            G_ = (*denoiseigamtab)[G_];
-                            B_ = (*denoiseigamtab)[B_];
+                            R_ = Color::denoiseIGammaTab[R_];
+                            G_ = Color::denoiseIGammaTab[G_];
+                            B_ = Color::denoiseIGammaTab[B_];
 
                             //apply gamma noise standard (slider)
                             R_ = R_ < 65535.f ? gamcurve[R_] : (Color::gammanf(R_ / 65535.f, gam) * 32768.f);
diff --git a/rtengine/color.cc b/rtengine/color.cc
index fa4a79840..e19cfb616 100644
--- a/rtengine/color.cc
+++ b/rtengine/color.cc
@@ -41,15 +41,10 @@ LUTf Color::igammatab_srgb;
 LUTf Color::igammatab_srgb1;
 LUTf Color::gammatab_srgb;
 LUTf Color::gammatab_srgb1;
-//  LUTf Color::igammatab_709;
-//  LUTf Color::gammatab_709;
-LUTf Color::igammatab_55;
-LUTf Color::gammatab_55;
-LUTf Color::igammatab_4;
-LUTf Color::gammatab_4;
 
-LUTf Color::igammatab_26_11;
-LUTf Color::gammatab_26_11;
+LUTf Color::denoiseGammaTab;
+LUTf Color::denoiseIGammaTab;
+
 LUTf Color::igammatab_24_17;
 LUTf Color::gammatab_24_17a;
 LUTf Color::gammatab_13_2;
@@ -148,13 +143,10 @@ void Color::init ()
     igammatab_srgb1(maxindex, 0);
     gammatab_srgb(maxindex, 0);
     gammatab_srgb1(maxindex, 0);
-    igammatab_55(maxindex, 0);
-    gammatab_55(maxindex, 0);
-    igammatab_4(maxindex, 0);
-    gammatab_4(maxindex, 0);
 
-    igammatab_26_11(maxindex, 0);
-    gammatab_26_11(maxindex, 0);
+    denoiseGammaTab(maxindex, 0);
+    denoiseIGammaTab(maxindex, 0);
+
     igammatab_24_17(maxindex, 0);
     gammatab_24_17a(maxindex, LUT_CLIP_ABOVE | LUT_CLIP_BELOW);
     gammatab_13_2(maxindex, 0);
@@ -193,6 +185,7 @@ void Color::init ()
             {
                 gammatab_srgb[i] = gammatab_srgb1[i] = gamma2(i / 65535.0);
             }
+
             gammatab_srgb *= 65535.f;
             gamma2curve.share(gammatab_srgb, LUT_CLIP_BELOW | LUT_CLIP_ABOVE); // shares the buffer with gammatab_srgb but has different clip flags
         }
@@ -200,9 +193,11 @@ void Color::init ()
         #pragma omp section
 #endif
         {
-            for (int i = 0; i < maxindex; i++) {
+            for (int i = 0; i < maxindex; i++)
+            {
                 igammatab_srgb[i] = igammatab_srgb1[i] = igamma2 (i / 65535.0);
             }
+
             igammatab_srgb *= 65535.f;
         }
 #ifdef _OPENMP
@@ -211,42 +206,74 @@ void Color::init ()
         {
             double rsRGBGamma = 1.0 / sRGBGamma;
 
-            for (int i = 0; i < maxindex; i++) {
+            for (int i = 0; i < maxindex; i++)
+            {
                 double val = pow (i / 65535.0, rsRGBGamma);
                 gammatab[i] = 65535.0 * val;
                 gammatabThumb[i] = (unsigned char)(255.0 * val);
             }
         }
+
 #ifdef _OPENMP
         #pragma omp section
 #endif
+        // Arbitrary gamma applied to the data for the Lab denoise step. Tested variants: none, gamma 2.6 / 11, gamma 4 / 5, gamma 5.5 / 10.
+        // Other pairs (e.g. gamma g=2.6 with slope=11) could be added here.
+        // Note: this has nothing to do with a real gamma; it only concerns the Lab data as opposed to the RGB data.
+        // gamma55 is the default case; the variant is selected through settings->denoiselabgamma.
 
-        for (int i = 0; i < maxindex; i++) {
-            gammatab_55[i] = 65535.0 * gamma55 (i / 65535.0);
+        switch(settings->denoiselabgamma) {
+            case 0:
+                for (int i = 0; i < maxindex; i++) {
+                    denoiseGammaTab[i] = 65535.0 * gamma26_11 (i / 65535.0);
+                }
+
+                break;
+
+            case 1:
+                for (int i = 0; i < maxindex; i++) {
+                    denoiseGammaTab[i] = 65535.0 * gamma4 (i / 65535.0);
+                }
+
+                break;
+
+            default:
+                for (int i = 0; i < maxindex; i++) {
+                    denoiseGammaTab[i] = 65535.0 * gamma55 (i / 65535.0);
+                }
+
+                break;
         }
 
 #ifdef _OPENMP
         #pragma omp section
 #endif
+        // Inverse of the arbitrary denoise gamma. Tested variants: none, gamma 2.6 / 11, gamma 4 / 5, gamma 5.5 / 10.
+        // Other pairs (e.g. gamma g=2.6 with slope=11) could be added here.
+        // Note: this has nothing to do with a real gamma; it only concerns the Lab data as opposed to the RGB data.
+        // igamma55 is the default case; the variant is selected through settings->denoiselabgamma.
 
-        for (int i = 0; i < maxindex; i++) {
-            igammatab_55[i] = 65535.0 * igamma55 (i / 65535.0);
-        }
+        switch(settings->denoiselabgamma) {
+            case 0:
+                for (int i = 0; i < maxindex; i++) {
+                    denoiseIGammaTab[i] = 65535.0 * igamma26_11 (i / 65535.0);
+                }
 
-#ifdef _OPENMP
-        #pragma omp section
-#endif
+                break;
 
-        for (int i = 0; i < maxindex; i++) {
-            gammatab_4[i] = 65535.0 * gamma4 (i / 65535.0);
-        }
+            case 1:
+                for (int i = 0; i < maxindex; i++) {
+                    denoiseIGammaTab[i] = 65535.0 * igamma4 (i / 65535.0);
+                }
 
-#ifdef _OPENMP
-        #pragma omp section
-#endif
+                break;
 
-        for (int i = 0; i < maxindex; i++) {
-            igammatab_4[i] = 65535.0 * igamma4 (i / 65535.0);
+            default:
+                for (int i = 0; i < maxindex; i++) {
+                    denoiseIGammaTab[i] = 65535.0 * igamma55 (i / 65535.0);
+                }
+
+                break;
         }
 
 #ifdef _OPENMP
@@ -297,22 +324,6 @@ void Color::init ()
             igammatab_145_3[i] = 65535.0 * igamma145_3 (i / 65535.0);
         }
 
-#ifdef _OPENMP
-        #pragma omp section
-#endif
-
-        for (int i = 0; i < maxindex; i++) {
-            gammatab_26_11[i] = 65535.0 * gamma26_11 (i / 65535.0);
-        }
-
-#ifdef _OPENMP
-        #pragma omp section
-#endif
-
-        for (int i = 0; i < maxindex; i++) {
-            igammatab_26_11[i] = 65535.0 * igamma26_11 (i / 65535.0);
-        }
-
 #ifdef _OPENMP
         #pragma omp section
 #endif
@@ -1482,7 +1493,7 @@ void Color::gammaf2lut (LUTf &gammacurve, float gamma, float start, float slope,
 {
 #ifdef __SSE2__
     // SSE2 version is more than 6 times faster than scalar version
-    vfloat iv = _mm_set_ps(3.f,2.f,1.f,0.f);
+    vfloat iv = _mm_set_ps(3.f, 2.f, 1.f, 0.f);
     vfloat fourv = F2V(4.f);
     vfloat gammav = F2V(1.f / gamma);
     vfloat slopev = F2V((slope / divisor) * factor);
@@ -1493,26 +1504,32 @@ void Color::gammaf2lut (LUTf &gammacurve, float gamma, float start, float slope,
     int border1 = border - (border & 3);
     int border2 = border1 + 4;
     int i = 0;
+
     for(; i < border1; i += 4) {
         vfloat resultv = iv * slopev;
-        _mm_storeu_ps(&gammacurve[i], resultv);
+        STVFU(gammacurve[i], resultv);
         iv += fourv;
     }
+
     for(; i < border2; i += 4) {
         vfloat result0v = iv * slopev;
         vfloat result1v = xexpf((xlogf(iv) - divisorv) * gammav) * factorv;
-        _mm_storeu_ps(&gammacurve[i], vself(vmaskf_le(iv, comparev), result0v, result1v));
+        STVFU(gammacurve[i], vself(vmaskf_le(iv, comparev), result0v, result1v));
         iv += fourv;
     }
+
     for(; i < 65536; i += 4) {
         vfloat resultv = xexpfNoCheck((xlogfNoCheck(iv) - divisorv) * gammav) * factorv;
-        _mm_storeu_ps(&gammacurve[i], resultv);
+        STVFU(gammacurve[i], resultv);
         iv += fourv;
     }
+
 #else
+
     for (int i = 0; i < 65536; ++i) {
         gammacurve[i] = gammaf(static_cast<float>(i) / divisor, gamma, start, slope) * factor;
     }
+
 #endif
 }
 
@@ -1520,23 +1537,30 @@ void Color::gammanf2lut (LUTf &gammacurve, float gamma, float divisor, float fac
 {
 #ifdef __SSE2__
     // SSE2 version is more than 6 times faster than scalar version
-    vfloat iv = _mm_set_ps(3.f,2.f,1.f,0.f);
+    vfloat iv = _mm_set_ps(3.f, 2.f, 1.f, 0.f);
     vfloat fourv = F2V(4.f);
     vfloat gammav = F2V(1.f / gamma);
     vfloat divisorv = F2V(xlogf(divisor));
     vfloat factorv = F2V(factor);
+
+    // first input value is zero => we have to use the xlogf function which checks this
     vfloat resultv = xexpf((xlogf(iv) - divisorv) * gammav) * factorv;
-    _mm_storeu_ps(&gammacurve[0], resultv);
+    STVFU(gammacurve[0], resultv);
     iv += fourv;
-    for(int i=4; i < 65536; i += 4) {
+
+    // inside the loop we can use xlogfNoCheck and xexpfNoCheck because we know about the input values
+    for(int i = 4; i < 65536; i += 4) {
         resultv = xexpfNoCheck((xlogfNoCheck(iv) - divisorv) * gammav) * factorv;
-        _mm_storeu_ps(&gammacurve[i], resultv);
+        STVFU(gammacurve[i], resultv);
         iv += fourv;
     }
+
 #else
+
     for (int i = 0; i < 65536; ++i) {
         gammacurve[i] = Color::gammanf(static_cast<float>(i) / divisor, gamma) * factor;
     }
+
 #endif
 }
 
@@ -2204,6 +2228,7 @@ void Color::gamutLchonly (float HH, float2 sincosval, float &Lprov1, float &Chpr
     neg = false, more_rgb = false;
 #endif
     float ChprovSave = Chprov1;
+
     do {
         inGamut = true;
 
@@ -2226,6 +2251,7 @@ void Color::gamutLchonly (float HH, float2 sincosval, float &Lprov1, float &Chpr
 #ifdef _DEBUG
             neg = true;
 #endif
+
             if (isnan(HH)) {
                 float atemp = ChprovSave * sincosval.y * 327.68;
                 float btemp = ChprovSave * sincosval.x * 327.68;
diff --git a/rtengine/color.h b/rtengine/color.h
index c7699e9c5..0f6b24797 100644
--- a/rtengine/color.h
+++ b/rtengine/color.h
@@ -132,13 +132,10 @@ public:
     static LUTf igammatab_srgb1;
     static LUTf gammatab_srgb;
     static LUTf gammatab_srgb1;
-    static LUTf igammatab_55;
-    static LUTf gammatab_55;
-    static LUTf igammatab_4;
-    static LUTf gammatab_4;
 
-    static LUTf igammatab_26_11;
-    static LUTf gammatab_26_11;
+    static LUTf denoiseGammaTab;
+    static LUTf denoiseIGammaTab;
+
     static LUTf igammatab_24_17;
     static LUTf gammatab_24_17a;
     static LUTf gammatab_13_2;
@@ -1107,7 +1104,7 @@ public:
 
     static inline float gammaf      (float x, float gamma, float start, float slope)
     {
-        return x <= start ? x*slope : xexpf(xlogf(x) / gamma);
+        return x <= start ? x * slope : xexpf(xlogf(x) / gamma);
     }
 
     //fills a LUT of size 65536 using gamma with slope...

From d564d17425e2677bf1f24b77088cb7e61957fc09 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Fri, 9 Sep 2016 16:36:18 +0200
Subject: [PATCH 4/5] mentioned the author of the sleef library in the header
 of the sleef source files, removed unused sleef.h

---
 rtengine/helperavx.h   |  6 +++++
 rtengine/helpersse2.h  |  9 ++++++++
 rtengine/sleef.c       |  9 ++++++++
 rtengine/sleef.h       | 51 ------------------------------------------
 rtengine/sleefsseavx.c | 10 +++++++++
 5 files changed, 34 insertions(+), 51 deletions(-)
 delete mode 100644 rtengine/sleef.h

diff --git a/rtengine/helperavx.h b/rtengine/helperavx.h
index eb32277c3..528760a92 100644
--- a/rtengine/helperavx.h
+++ b/rtengine/helperavx.h
@@ -1,3 +1,9 @@
+////////////////////////////////////////////////////////////////
+//
+//  this code was taken from http://shibatch.sourceforge.net/
+//  Many thanks to the author: Naoki Shibata
+//
+////////////////////////////////////////////////////////////////
 #ifndef __AVX__
 #error Please specify -mavx.
 #endif
diff --git a/rtengine/helpersse2.h b/rtengine/helpersse2.h
index 0f1fc5759..23dd016fa 100644
--- a/rtengine/helpersse2.h
+++ b/rtengine/helpersse2.h
@@ -1,3 +1,12 @@
+////////////////////////////////////////////////////////////////
+//
+//  this code was taken from http://shibatch.sourceforge.net/
+//  Many thanks to the author of original version: Naoki Shibata
+//
+//  This version contains modifications made by Ingo Weyrich
+//
+////////////////////////////////////////////////////////////////
+
 #ifndef __SSE2__
 #error Please specify -msse2.
 #endif
diff --git a/rtengine/sleef.c b/rtengine/sleef.c
index 2377aea79..bc38a3cfb 100644
--- a/rtengine/sleef.c
+++ b/rtengine/sleef.c
@@ -1,3 +1,12 @@
+////////////////////////////////////////////////////////////////
+//
+//  this code was taken from http://shibatch.sourceforge.net/
+//  Many thanks to the author of original version: Naoki Shibata
+//
+//  This version contains modifications made by Ingo Weyrich
+//
+////////////////////////////////////////////////////////////////
+
 #ifndef _SLEEFC_
 #define _SLEEFC_
 
diff --git a/rtengine/sleef.h b/rtengine/sleef.h
deleted file mode 100644
index 101a4faff..000000000
--- a/rtengine/sleef.h
+++ /dev/null
@@ -1,51 +0,0 @@
-typedef struct {
-    double x, y;
-} double2;
-
-typedef struct {
-    float x, y;
-} float2;
-
-double xsin(double d);
-double xcos(double d);
-double2 xsincos(double d);
-double xtan(double d);
-double xasin(double s);
-double xacos(double s);
-double xatan(double s);
-double xatan2(double y, double x);
-double xlog(double d);
-double xexp(double d);
-double xpow(double x, double y);
-
-double xsinh(double x);
-double xcosh(double x);
-double xtanh(double x);
-double xasinh(double x);
-double xacosh(double x);
-double xatanh(double x);
-double xldexp(double x, int q);
-int xilogb(double d);
-
-double xfma(double x, double y, double z);
-double xsqrt(double d);
-double xcbrt(double d);
-
-double xexp2(double a);
-double xexp10(double a);
-double xexpm1(double a);
-double xlog10(double a);
-double xlog1p(double a);
-
-float xsinf(float d);
-float xcosf(float d);
-float2 xsincosf(float d);
-float xtanf(float d);
-float xasinf(float s);
-float xacosf(float s);
-float xatanf(float s);
-float xatan2f(float y, float x);
-float xlogf(float d);
-float xexpf(float d);
-float xpowf(float x, float y);
-float xcbrtf(float d);
diff --git a/rtengine/sleefsseavx.c b/rtengine/sleefsseavx.c
index 1b0c9a0ae..a55fcf897 100644
--- a/rtengine/sleefsseavx.c
+++ b/rtengine/sleefsseavx.c
@@ -1,3 +1,13 @@
+////////////////////////////////////////////////////////////////
+//
+//  this code was taken from http://shibatch.sourceforge.net/
+//  Many thanks to the author of original version: Naoki Shibata
+//
+//  This version contains modifications made by Ingo Weyrich
+//
+////////////////////////////////////////////////////////////////
+
+
 #ifndef SLEEFSSEAVX
 #define SLEEFSSEAVX
 

From 3ffe983140ecb59966a3b929f8fbb196d452c2a1 Mon Sep 17 00:00:00 2001
From: heckflosse <heckflosse67@gmx.de>
Date: Fri, 9 Sep 2016 21:54:51 +0200
Subject: [PATCH 5/5] Small code cleanups

---
 rtengine/FTblockDN.cc        | 4 ++--
 rtengine/improccoordinator.h | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/rtengine/FTblockDN.cc b/rtengine/FTblockDN.cc
index 8ff22864c..6c608cfc9 100644
--- a/rtengine/FTblockDN.cc
+++ b/rtengine/FTblockDN.cc
@@ -2106,7 +2106,7 @@ float ImProcFunctions::MadMax(float * DataList, int & max, int datalen)
 
 float ImProcFunctions::Mad(float * DataList, const int datalen)
 {
-    if(datalen <= 0) { // Avoid possible buffer underrun
+    if(datalen <= 1) { // Avoid possible buffer underrun
         return 0;
     }
 
@@ -2135,7 +2135,7 @@ float ImProcFunctions::Mad(float * DataList, const int datalen)
 
 float ImProcFunctions::MadRgb(float * DataList, const int datalen)
 {
-    if(datalen <= 0) { // Avoid possible buffer underrun
+    if(datalen <= 1) { // Avoid possible buffer underrun
         return 0;
     }
 
diff --git a/rtengine/improccoordinator.h b/rtengine/improccoordinator.h
index 776484791..0fb0041f3 100644
--- a/rtengine/improccoordinator.h
+++ b/rtengine/improccoordinator.h
@@ -326,12 +326,13 @@ public:
         return imgsrc;
     }
 
-    struct {
+    struct DenoiseInfoStore {
+        DenoiseInfoStore () : valid(false) {}
         float chM;
         float max_r[9];
         float max_b[9];
         float ch_M[9];
-        bool valid = false;
+        bool valid;
 
     } denoiseInfoStore;