Merge branch 'master' into dehaze

2015-09-08 09:40:23 +02:00
parent 60b3341f3a 18fb107f82
commit 5d6f7080b7
76 changed files with 2999 additions and 2247 deletions
--- a/rtengine/CMakeLists.txt
+++ b/rtengine/CMakeLists.txt
@@ -11,7 +11,7 @@ set (RTENGINESOURCEFILES safegtk.cc colortemp.cc curves.cc flatcurves.cc diagona
    dfmanager.cc ffmanager.cc rawimage.cc image8.cc image16.cc imagefloat.cc imagedata.cc imageio.cc improcfun.cc init.cc dcrop.cc
    loadinitial.cc procparams.cc rawimagesource.cc demosaic_algos.cc shmap.cc simpleprocess.cc refreshmap.cc
    fast_demo.cc amaze_demosaic_RT.cc CA_correct_RT.cc cfa_linedn_RT.cc green_equil_RT.cc hilite_recon.cc expo_before_b.cc
-    stdimagesource.cc myfile.cc iccjpeg.cc hlmultipliers.cc improccoordinator.cc editbuffer.cc
+    stdimagesource.cc myfile.cc iccjpeg.cc hlmultipliers.cc improccoordinator.cc editbuffer.cc coord.cc
    processingjob.cc rtthumbnail.cc utils.cc labimage.cc slicer.cc cieimage.cc
    iplab2rgb.cc ipsharpen.cc iptransform.cc ipresize.cc ipvibrance.cc
    imagedimensions.cc jpeg_memsrc.cc jdatasrc.cc iimage.cc
--- a/rtengine/ciecam02.cc
+++ b/rtengine/ciecam02.cc
@@ -462,7 +462,6 @@ void Ciecam02::cat02_to_xyz( double &x, double &y, double &z, double r, double g
    }
 }

-#ifndef __SSE2__
 void Ciecam02::cat02_to_xyzfloat( float &x, float &y, float &z, float r, float g, float b, int gamu )
 {
    gamu = 1;
@@ -480,7 +479,7 @@ void Ciecam02::cat02_to_xyzfloat( float &x, float &y, float &z, float r, float g
        z = ( 0.000000f * r) - (0.000000f * g) + (1.000000f * b);
    }
 }
-#else
+#ifdef __SSE2__
 void Ciecam02::cat02_to_xyzfloat( vfloat &x, vfloat &y, vfloat &z, vfloat r, vfloat g, vfloat b )
 {
    //gamut correction M.H.Brill S.Susstrunk
@@ -497,14 +496,14 @@ void Ciecam02::hpe_to_xyz( double &x, double &y, double &z, double r, double g,
    z = b;
 }

-#ifndef __SSE2__
+
 void Ciecam02::hpe_to_xyzfloat( float &x, float &y, float &z, float r, float g, float b )
 {
    x = (1.910197f * r) - (1.112124f * g) + (0.201908f * b);
    y = (0.370950f * r) + (0.629054f * g) - (0.000008f * b);
    z = b;
 }
-#else
+#ifdef __SSE2__
 void Ciecam02::hpe_to_xyzfloat( vfloat &x, vfloat &y, vfloat &z, vfloat r, vfloat g, vfloat b )
 {
    x = (F2V(1.910197f) * r) - (F2V(1.112124f) * g) + (F2V(0.201908f) * b);
@@ -565,7 +564,6 @@ void Ciecam02::Aab_to_rgb( double &r, double &g, double &b, double A, double aa,
    b = (0.32787 * x) - (0.15681 * aa) - (4.49038 * bb);
 }

-#ifndef __SSE2__
 void Ciecam02::Aab_to_rgbfloat( float &r, float &g, float &b, float A, float aa, float bb, float nbb )
 {
    float x = (A / nbb) + 0.305f;
@@ -577,7 +575,7 @@ void Ciecam02::Aab_to_rgbfloat( float &r, float &g, float &b, float A, float aa,
    /*       c1              c6               c7       */
    b = (0.32787f * x) - (0.15681f * aa) - (4.49038f * bb);
 }
-#else
+#ifdef __SSE2__
 void Ciecam02::Aab_to_rgbfloat( vfloat &r, vfloat &g, vfloat &b, vfloat A, vfloat aa, vfloat bb, vfloat nbb )
 {
    vfloat c1 = F2V(0.32787f) * ((A / nbb) + F2V(0.305f));
@@ -619,7 +617,6 @@ void Ciecam02::calculate_ab( double &aa, double &bb, double h, double e, double
        bb = (aa * sinh) / cosh;
    }
 }
-#ifndef __SSE2__
 void Ciecam02::calculate_abfloat( float &aa, float &bb, float h, float e, float t, float nbb, float a )
 {
    float2 sincosval = xsincosf((h * M_PI) / 180.0f);
@@ -657,7 +654,7 @@ void Ciecam02::calculate_abfloat( float &aa, float &bb, float h, float e, float
        std::swap(aa, bb);
    }
 }
-#else
+#ifdef __SSE2__
 void Ciecam02::calculate_abfloat( vfloat &aa, vfloat &bb, vfloat h, vfloat e, vfloat t, vfloat nbb, vfloat a )
 {
    vfloat2 sincosval = xsincosf((h * F2V(M_PI)) / F2V(180.0f));
@@ -862,7 +859,7 @@ void Ciecam02::xyz2jchqms_ciecam02( double &J, double &C, double &h, double &Q,

 void Ciecam02::xyz2jchqms_ciecam02float( float &J, float &C, float &h, float &Q, float &M, float &s, float &aw, float &fl, float &wh,
        float x, float y, float z, float xw, float yw, float zw,
-        float yb, float la, float f, float c, float nc, float pilotd, int gamu, float pow1, float nbb, float ncb, float pfl, float cz, float d)
+        float c, float nc, int gamu, float pow1, float nbb, float ncb, float pfl, float cz, float d)

 {
    float r, g, b;
@@ -876,9 +873,9 @@ void Ciecam02::xyz2jchqms_ciecam02float( float &J, float &C, float &h, float &Q,
    gamu = 1;
    xyz_to_cat02float( r, g, b, x, y, z, gamu );
    xyz_to_cat02float( rw, gw, bw, xw, yw, zw, gamu );
-    rc = r * (((yw * d) / rw) + (1.0 - d));
-    gc = g * (((yw * d) / gw) + (1.0 - d));
-    bc = b * (((yw * d) / bw) + (1.0 - d));
+    rc = r * (((yw * d) / rw) + (1.f - d));
+    gc = g * (((yw * d) / gw) + (1.f - d));
+    bc = b * (((yw * d) / bw) + (1.f - d));

    cat02_to_hpefloat( rp, gp, bp, rc, gc, bc, gamu );

@@ -924,7 +921,7 @@ void Ciecam02::xyz2jchqms_ciecam02float( float &J, float &C, float &h, float &Q,
 #ifdef __SSE2__
 void Ciecam02::xyz2jchqms_ciecam02float( vfloat &J, vfloat &C, vfloat &h, vfloat &Q, vfloat &M, vfloat &s, vfloat aw, vfloat fl, vfloat wh,
        vfloat x, vfloat y, vfloat z, vfloat xw, vfloat yw, vfloat zw,
-        vfloat yb, vfloat la, vfloat f, vfloat c, vfloat nc, vfloat pow1, vfloat nbb, vfloat ncb, vfloat pfl, vfloat cz, vfloat d)
+        vfloat c, vfloat nc, vfloat pow1, vfloat nbb, vfloat ncb, vfloat pfl, vfloat cz, vfloat d)

 {
    vfloat r, g, b;
@@ -979,6 +976,65 @@ void Ciecam02::xyz2jchqms_ciecam02float( vfloat &J, vfloat &C, vfloat &h, vfloat
 }
 #endif

+void Ciecam02::xyz2jch_ciecam02float( float &J, float &C, float &h, float aw, float fl,
+                                      float x, float y, float z, float xw, float yw, float zw,
+                                      float c, float nc, float pow1, float nbb, float ncb, float cz, float d)
+// Computes J, C and h (h in degrees) for (x, y, z) under the white (xw, yw, zw); every other parameter is an input used as passed in.
+{
+    float r, g, b;
+    float rw, gw, bw;
+    float rc, gc, bc;
+    float rp, gp, bp;
+    float rpa, gpa, bpa;
+    float a, ca, cb;
+    float e, t;
+    float myh;
+    int gamu = 1; // fixed to 1 and never reassigned here, so both gamu == 1 branches below are always taken
+    xyz_to_cat02float( r, g, b, x, y, z, gamu );
+    xyz_to_cat02float( rw, gw, bw, xw, yw, zw, gamu );
+    rc = r * (((yw * d) / rw) + (1.f - d)); // per channel: blend of yw / <white channel> (weight d) and 1 (weight 1 - d)
+    gc = g * (((yw * d) / gw) + (1.f - d));
+    bc = b * (((yw * d) / bw) + (1.f - d));
+
+    cat02_to_hpefloat( rp, gp, bp, rc, gc, bc, gamu );
+
+    if (gamu == 1) { //gamut correction M.H.Brill S.Susstrunk
+        rp = MAXR(rp, 0.0f);
+        gp = MAXR(gp, 0.0f);
+        bp = MAXR(bp, 0.0f);
+    }
+
+    rpa = nonlinear_adaptationfloat( rp, fl );
+    gpa = nonlinear_adaptationfloat( gp, fl );
+    bpa = nonlinear_adaptationfloat( bp, fl );
+
+    ca = rpa - ((12.0f * gpa) - bpa) / 11.0f;
+    cb = (0.11111111f) * (rpa + gpa - (2.0f * bpa));
+
+    myh = xatan2f( cb, ca ); // hue angle, kept in radians until the final conversion
+
+    if ( myh < 0.0f ) {
+        myh += (2.f * M_PI); // shift negative angles up by one full turn
+    }
+
+    a = ((2.0f * rpa) + gpa + (0.05f * bpa) - 0.305f) * nbb;
+
+    if (gamu == 1) {
+        a = MAXR(a, 0.0f); //gamut correction M.H.Brill S.Susstrunk
+    }
+
+    J = pow_F( a / aw, c * cz * 0.5f); // half the final exponent: this is sqrt(J / 100) until J is squared below
+
+    e = ((961.53846f) * nc * ncb) * (xcosf( myh + 2.0f ) + 3.8f);
+    t = (e * sqrtf( (ca * ca) + (cb * cb) )) / (rpa + gpa + (1.05f * bpa));
+
+    C = pow_F( t, 0.9f ) * J * pow1; // must run before the next statement: it needs the not-yet-squared J
+
+    J *= J * 100.0f; // J = 100 * (a / aw)^(c * cz)
+    h = (myh * 180.f) / (float)M_PI; // radians -> degrees
+}
+
+
 void Ciecam02::jch2xyz_ciecam02( double &x, double &y, double &z, double J, double C, double h,
                                 double xw, double yw, double zw, double yb, double la,
                                 double f, double c, double nc , int gamu, double n, double nbb, double ncb, double fl, double cz, double d, double aw )
@@ -1012,9 +1068,9 @@ void Ciecam02::jch2xyz_ciecam02( double &x, double &y, double &z, double J, doub

    cat02_to_xyz( x, y, z, r, g, b, gamu );
 }
-#ifndef __SSE2__
+
 void Ciecam02::jch2xyz_ciecam02float( float &x, float &y, float &z, float J, float C, float h,
-                                      float xw, float yw, float zw, float yb, float la,
+                                      float xw, float yw, float zw,
                                      float f, float c, float nc , int gamu, float pow1, float nbb, float ncb, float fl, float cz, float d, float aw)
 {
    float r, g, b;
@@ -1047,9 +1103,9 @@ void Ciecam02::jch2xyz_ciecam02float( float &x, float &y, float &z, float J, flo
    cat02_to_xyzfloat( x, y, z, r, g, b, gamu );
 }

-#else
+#ifdef __SSE2__
 void Ciecam02::jch2xyz_ciecam02float( vfloat &x, vfloat &y, vfloat &z, vfloat J, vfloat C, vfloat h,
-                                      vfloat xw, vfloat yw, vfloat zw, vfloat yb, vfloat la,
+                                      vfloat xw, vfloat yw, vfloat zw,
                                      vfloat f, vfloat nc, vfloat pow1, vfloat nbb, vfloat ncb, vfloat fl, vfloat d, vfloat aw, vfloat reccmcz)
 {
    vfloat r, g, b;
@@ -1135,7 +1191,6 @@ double Ciecam02::inverse_nonlinear_adaptation( double c, double fl )
    return c1 * (100.0 / fl) * pow( (27.13 * fabs( c - 0.1 )) / (400.0 - fabs( c - 0.1 )), 1.0 / 0.42 );
 }

-#ifndef __SSE2__
 float Ciecam02::inverse_nonlinear_adaptationfloat( float c, float fl )
 {
    c -= 0.1f;
@@ -1153,7 +1208,7 @@ float Ciecam02::inverse_nonlinear_adaptationfloat( float c, float fl )
    return (100.0f / fl) * pow_F( (27.13f * fabsf( c )) / (400.0f - fabsf( c )), 2.38095238f );
 }

-#else
+#ifdef __SSE2__
 vfloat Ciecam02::inverse_nonlinear_adaptationfloat( vfloat c, vfloat fl )
 {
    c -= F2V(0.1f);
--- a/rtengine/ciecam02.h
+++ b/rtengine/ciecam02.h
@@ -55,13 +55,13 @@ private:
    static float nonlinear_adaptationfloat( float c, float fl );
    static double inverse_nonlinear_adaptation( double c, double fl );

-#ifndef __SSE2__
+
    static float inverse_nonlinear_adaptationfloat( float c, float fl );
    static void calculate_abfloat( float &aa, float &bb, float h, float e, float t, float nbb, float a );
    static void Aab_to_rgbfloat( float &r, float &g, float &b, float A, float aa, float bb, float nbb );
    static void hpe_to_xyzfloat   ( float &x,  float &y,  float &z,  float r, float g, float b );
    static void cat02_to_xyzfloat ( float &x,  float &y,  float &z,  float r, float g, float b, int gamu );
-#else
+#ifdef __SSE2__
    static vfloat inverse_nonlinear_adaptationfloat( vfloat c, vfloat fl );
    static void calculate_abfloat( vfloat &aa, vfloat &bb, vfloat h, vfloat e, vfloat t, vfloat nbb, vfloat a );
    static void Aab_to_rgbfloat( vfloat &r, vfloat &g, vfloat &b, vfloat A, vfloat aa, vfloat bb, vfloat nbb );
@@ -85,17 +85,15 @@ public:
                                  double yb, double la,
                                  double f, double c, double nc, int gamu, double n, double nbb, double ncb, double fl, double cz, double d, double aw);

-#ifndef __SSE2__
+
    static void jch2xyz_ciecam02float( float &x, float &y, float &z,
                                       float J, float C, float h,
                                       float xw, float yw, float zw,
-                                       float yb, float la,
                                       float f, float c, float nc, int gamu, float n, float nbb, float ncb, float fl, float cz, float d, float aw );
-#else
+#ifdef __SSE2__
    static void jch2xyz_ciecam02float( vfloat &x, vfloat &y, vfloat &z,
                                       vfloat J, vfloat C, vfloat h,
                                       vfloat xw, vfloat yw, vfloat zw,
-                                       vfloat yb, vfloat la,
                                       vfloat f, vfloat nc, vfloat n, vfloat nbb, vfloat ncb, vfloat fl, vfloat d, vfloat aw, vfloat reccmcz );
 #endif
    /**
@@ -120,20 +118,24 @@ public:
                                     double yb, double la,
                                     double f, double c, double nc,  double pilotd, int gamu , double n, double nbb, double ncb, double pfl, double cz, double d );

+    static void xyz2jch_ciecam02float( float &J, float &C, float &h,
+                                       float aw, float fl,
+                                       float x, float y, float z,
+                                       float xw, float yw, float zw,
+                                       float c, float nc, float n, float nbb, float ncb, float cz, float d  );
+
    static void xyz2jchqms_ciecam02float( float &J, float &C, float &h,
                                          float &Q, float &M, float &s, float &aw, float &fl, float &wh,
                                          float x, float y, float z,
                                          float xw, float yw, float zw,
-                                          float yb, float la,
-                                          float f, float c, float nc,  float pilotd, int gamu, float n, float nbb, float ncb, float pfl, float cz, float d  );
+                                          float c, float nc, int gamu, float n, float nbb, float ncb, float pfl, float cz, float d  );

 #ifdef __SSE2__
    static void xyz2jchqms_ciecam02float( vfloat &J, vfloat &C, vfloat &h,
                                          vfloat &Q, vfloat &M, vfloat &s, vfloat aw, vfloat fl, vfloat wh,
                                          vfloat x, vfloat y, vfloat z,
                                          vfloat xw, vfloat yw, vfloat zw,
-                                          vfloat yb, vfloat la,
-                                          vfloat f, vfloat c, vfloat nc, vfloat n, vfloat nbb, vfloat ncb, vfloat pfl, vfloat cz, vfloat d  );
+                                          vfloat c, vfloat nc, vfloat n, vfloat nbb, vfloat ncb, vfloat pfl, vfloat cz, vfloat d  );


 #endif
--- a/rtengine/coord.cc
+++ b/rtengine/coord.cc
@@ -0,0 +1,39 @@
+/*
+ *  This file is part of RawTherapee.
+ *
+ *  Copyright (c) 2004-2010 Gabor Horvath <hgabor@rawtherapee.com>
+ *
+ *  RawTherapee is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  RawTherapee is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "coord.h"
+
+namespace rtengine
+{
+
+void Coord::setFromPolar(PolarCoord polar)
+{
+    // Sets x/y from a radius and an angle in degrees; the double results are converted to int.
+    // One fmod() wraps the angle: stepping by 360 in a loop never ends for very large angles.
+    double degrees = fmod(polar.angle, 360.);
+
+    if (degrees < 0.) {
+        degrees += 360.;
+    }
+
+    x = polar.radius * cos(degrees / 180. * M_PI);
+    y = polar.radius * sin(degrees / 180. * M_PI);
+}
+
+}
--- a/rtengine/coord.h
+++ b/rtengine/coord.h
@@ -0,0 +1,221 @@
+/*
+ *  This file is part of RawTherapee.
+ *
+ *  Copyright (c) 2004-2010 Gabor Horvath <hgabor@rawtherapee.com>
+ *
+ *  RawTherapee is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  RawTherapee is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with RawTherapee.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef __COORD__
+#define __COORD__
+
+#include "rt_math.h"
+
+namespace rtengine
+{
+
+class PolarCoord;
+
+// Do not confuse with rtengine::Coord2D, this one is for the GUI
+class Coord // integer 2D point; default-constructed as (-1, -1)
+{
+public:
+    int x;
+    int y;
+
+    Coord() : x(-1), y(-1) {}
+    Coord(int x, int y) : x(x), y(y) {}
+
+    void set (int x, int y)
+    {
+        this->x = x;
+        this->y = y;
+    }
+
+    void setFromPolar(PolarCoord polar); // declaration only; PolarCoord is still an incomplete type here
+
+    /// @brief Clip the coord to stay in the width x height bounds
+    /// @return true if the x or y coordinate has changed
+    bool clip(int width, int height)
+    {
+        int trimmedX = rtengine::LIM<int>(x, 0, width); // NOTE(review): limit passed is width, not width - 1 - confirm intended
+        int trimmedY = rtengine::LIM<int>(y, 0, height);
+        bool retval = trimmedX != x || trimmedY != y;
+        x = trimmedX;
+        y = trimmedY;
+        return retval;
+    }
+
+    bool operator== (const Coord& other) const
+    {
+        return other.x == x && other.y == y;
+    }
+
+    bool operator!= (const Coord& other) const
+    {
+        return other.x != x || other.y != y;
+    }
+
+    void operator+=(const Coord & rhs)
+    {
+        x += rhs.x;
+        y += rhs.y;
+    }
+    void operator-=(const Coord & rhs)
+    {
+        x -= rhs.x;
+        y -= rhs.y;
+    }
+    void operator*=(double scale)
+    {
+        x *= scale; // product is evaluated in double, then converted back to int
+        y *= scale;
+    }
+    Coord operator+(const Coord & rhs) const // const on both sides: const objects and temporaries may be operands
+    {
+        Coord result(x + rhs.x, y + rhs.y);
+        return result;
+    }
+    Coord operator-(const Coord & rhs) const
+    {
+        Coord result(x - rhs.x, y - rhs.y);
+        return result;
+    }
+    Coord operator*(double scale) const
+    {
+        Coord result(x * scale, y * scale); // double products converted to int by the constructor call
+        return result;
+    }
+};
+
+class PolarCoord // vector given by a radius and an angle in degrees; defaults to radius 1, angle 0
+{
+public:
+    double radius;
+    double angle; // degree
+
+    PolarCoord() : radius(1.), angle(0.) {}
+    PolarCoord(double radius, double angle) : radius(radius), angle(angle) {}
+    PolarCoord(Coord start, Coord end) : radius(1.), angle(0.)
+    {
+        setFromCartesian(start, end);
+    }
+    PolarCoord(Coord delta) : radius(1.), angle(0.) // not explicit: a Coord converts to PolarCoord implicitly
+    {
+        setFromCartesian(delta);
+    }
+
+    void set (double radius, double angle)
+    {
+        this->radius = radius;
+        this->angle = angle;
+    }
+
+    void setFromCartesian(Coord start, Coord end) // polar form of the vector going from start to end
+    {
+        Coord delta(end.x - start.x, end.y - start.y);
+        setFromCartesian(delta);
+    }
+
+    void setFromCartesian(Coord delta) // the angle produced lies in [0, 360)
+    {
+        if (!delta.x && !delta.y) {
+            // null vector, we set to a default value
+            radius = 1.;
+            angle = 0.;
+            return;
+        }
+
+        double x_ = double(delta.x);
+        double y_ = double(delta.y);
+        radius = sqrt(x_ * x_ + y_ * y_);
+
+        if (delta.x > 0) {
+            if (delta.y >= 0) {
+                angle = atan(y_ / x_) / (2 * M_PI) * 360.; // atan already in [0, 90) degrees
+            } else {
+                angle = (atan(y_ / x_) + 2 * M_PI) / (2 * M_PI) * 360.; // atan is negative here: add a full turn
+            }
+        } else if (delta.x < 0) {
+            angle = (atan(y_ / x_) + M_PI) / (2 * M_PI) * 360.; // left half-plane: add half a turn
+        } else {
+            if (delta.y > 0) { // x == 0 and y != 0: straight along +y or -y
+                angle = 90.;
+            } else {
+                angle = 270.;
+            }
+        }
+    }
+
+    bool operator== (const PolarCoord& other) const // exact comparison of the double members
+    {
+        return other.radius == radius && other.angle == angle;
+    }
+
+    bool operator!= (const PolarCoord& other) const
+    {
+        return other.radius != radius || other.angle != angle;
+    }
+
+    void operator+=(const PolarCoord & rhs) // summed through integer Coord values, so fractional parts are dropped
+    {
+        Coord thisCoord, rhsCoord;
+        thisCoord.setFromPolar(*this);
+        rhsCoord.setFromPolar(rhs);
+        thisCoord += rhsCoord;
+        setFromCartesian(thisCoord);
+    }
+    void operator-=(const PolarCoord & rhs)
+    {
+        Coord thisCoord, rhsCoord;
+        thisCoord.setFromPolar(*this);
+        rhsCoord.setFromPolar(rhs);
+        thisCoord -= rhsCoord;
+        setFromCartesian(thisCoord);
+    }
+    void operator*=(double scale)
+    {
+        radius *= scale;
+    }
+    PolarCoord operator+(const PolarCoord & rhs) const // const on both sides: const objects and temporaries may be operands
+    {
+        Coord thisCoord, rhsCoord;
+        thisCoord.setFromPolar(*this);
+        rhsCoord.setFromPolar(rhs);
+        thisCoord += rhsCoord;
+        PolarCoord result;
+        result.setFromCartesian(thisCoord);
+        return result;
+    }
+    PolarCoord operator-(const PolarCoord & rhs) const
+    {
+        Coord thisCoord, rhsCoord;
+        thisCoord.setFromPolar(*this);
+        rhsCoord.setFromPolar(rhs);
+        thisCoord -= rhsCoord;
+        PolarCoord result;
+        result.setFromCartesian(thisCoord);
+        return result;
+    }
+    Coord operator*(double scale) const // scaled vector, returned in Cartesian form (return type kept as Coord)
+    {
+        Coord result; // radius/angle must be converted; copying them into x/y would store the angle as y
+        result.setFromPolar(PolarCoord(radius * scale, angle));
+        return result;
+    }
+};
+
+
+}
+
+#endif
--- a/rtengine/curves.cc
+++ b/rtengine/curves.cc
@@ -33,6 +33,7 @@
 #include "LUT.h"
 #include "curves.h"
 #include "opthelper.h"
+#include "ciecam02.h"
 #undef CLIPD
 #define CLIPD(a) ((a)>0.0f?((a)<1.0f?(a):1.0f):0.0f)

@@ -2139,13 +2140,6 @@ float PerceptualToneCurve::calculateToneCurveContrastValue(void) const
 void PerceptualToneCurve::Apply(float &r, float &g, float &b, PerceptualToneCurveState & state) const
 {
    float x, y, z;
-    cmsCIEXYZ XYZ;
-    cmsJCh JCh;
-
-    int thread_idx = 0;
-#ifdef _OPENMP
-    thread_idx = omp_get_thread_num();
-#endif

    if (!state.isProphoto) {
        // convert to prophoto space to make sure the same result is had regardless of working color space
@@ -2190,12 +2184,16 @@ void PerceptualToneCurve::Apply(float &r, float &g, float &b, PerceptualToneCurv

    // move to JCh so we can modulate chroma based on the global contrast-related chroma scaling factor
    Color::Prophotoxyz(r, g, b, x, y, z);
-    XYZ.X = x * 100.0f / 65535;
-    XYZ.Y = y * 100.0f / 65535;
-    XYZ.Z = z * 100.0f / 65535;
-    cmsCIECAM02Forward(h02[thread_idx], &XYZ, &JCh);

-    if (!isfinite(JCh.J) || !isfinite(JCh.C) || !isfinite(JCh.h)) {
+    float J, C, h;
+    Ciecam02::xyz2jch_ciecam02float( J, C, h,
+                                     aw, fl,
+                                     x * 0.0015259022f,  y * 0.0015259022f,  z * 0.0015259022f,
+                                     xw, yw,  zw,
+                                     c,  nc, pow1, nbb, ncb, cz, d);
+
+
+    if (!isfinite(J) || !isfinite(C) || !isfinite(h)) {
        // this can happen for dark noise colors or colors outside human gamut. Then we just return the curve's result.
        if (!state.isProphoto) {
            float newr = state.Prophoto2Working[0][0] * r + state.Prophoto2Working[0][1] * g + state.Prophoto2Working[0][2] * b;
@@ -2215,24 +2213,24 @@ void PerceptualToneCurve::Apply(float &r, float &g, float &b, PerceptualToneCurv

    {
        // decrease chroma scaling sligthly of extremely saturated colors
-        float saturated_scale_factor = 0.95;
-        const float lolim = 35; // lower limit, below this chroma all colors will keep original chroma scaling factor
-        const float hilim = 60; // high limit, above this chroma the chroma scaling factor is multiplied with the saturated scale factor value above
+        float saturated_scale_factor = 0.95f;
+        const float lolim = 35.f; // lower limit, below this chroma all colors will keep original chroma scaling factor
+        const float hilim = 60.f; // high limit, above this chroma the chroma scaling factor is multiplied with the saturated scale factor value above

-        if (JCh.C < lolim) {
+        if (C < lolim) {
            // chroma is low enough, don't scale
-            saturated_scale_factor = 1.0;
-        } else if (JCh.C < hilim) {
+            saturated_scale_factor = 1.f;
+        } else if (C < hilim) {
            // S-curve transition between low and high limit
-            float x = (JCh.C - lolim) / (hilim - lolim); // x = [0..1], 0 at lolim, 1 at hilim
+            float x = (C - lolim) / (hilim - lolim); // x = [0..1], 0 at lolim, 1 at hilim

-            if (x < 0.5) {
-                x = 0.5 * powf(2 * x, 2);
+            if (x < 0.5f) {
+                x = 2.f * SQR(x);
            } else {
-                x = 0.5 + 0.5 * (1 - powf(1 - 2 * (x - 0.5), 2));
+                x = 1.f - 2.f * SQR(1 - x);
            }

-            saturated_scale_factor = 1.0 * (1.0 - x) + saturated_scale_factor * x;
+            saturated_scale_factor = (1.f - x) + saturated_scale_factor * x;
        } else {
            // do nothing, high saturation color, keep scale factor
        }
@@ -2242,11 +2240,11 @@ void PerceptualToneCurve::Apply(float &r, float &g, float &b, PerceptualToneCurv

    {
        // increase chroma scaling slightly of shadows
-        float nL = CurveFactory::gamma2(newLuminance / 65535); // apply gamma so we make comparison and transition with a more perceptual lightness scale
-        float dark_scale_factor = 1.20;
+        float nL = gamma2curve[newLuminance]; // apply gamma so we make comparison and transition with a more perceptual lightness scale
+        float dark_scale_factor = 1.20f;
        //float dark_scale_factor = 1.0 + state.debug.p2 / 100.0f;
-        const float lolim = 0.15;
-        const float hilim = 0.50;
+        const float lolim = 0.15f;
+        const float hilim = 0.50f;

        if (nL < lolim) {
            // do nothing, keep scale factor
@@ -2254,15 +2252,15 @@ void PerceptualToneCurve::Apply(float &r, float &g, float &b, PerceptualToneCurv
            // S-curve transition
            float x = (nL - lolim) / (hilim - lolim); // x = [0..1], 0 at lolim, 1 at hilim

-            if (x < 0.5) {
-                x = 0.5 * powf(2 * x, 2);
+            if (x < 0.5f) {
+                x = 2.f * SQR(x);
            } else {
-                x = 0.5 + 0.5 * (1 - powf(1 - 2 * (x - 0.5), 2));
+                x = 1.f - 2.f * SQR(1 - x);
            }

-            dark_scale_factor = dark_scale_factor * (1.0 - x) + 1.0 * x;
+            dark_scale_factor = dark_scale_factor * (1.0f - x) + x;
        } else {
-            dark_scale_factor = 1.0;
+            dark_scale_factor = 1.f;
        }

        cmul *= dark_scale_factor;
@@ -2270,34 +2268,38 @@ void PerceptualToneCurve::Apply(float &r, float &g, float &b, PerceptualToneCurv

    {
        // to avoid strange CIECAM02 chroma errors on close-to-shadow-clipping colors we reduce chroma scaling towards 1.0 for black colors
-        float dark_scale_factor = 1.0 / cmul;
-        const float lolim = 4;
-        const float hilim = 7;
+        float dark_scale_factor = 1.f / cmul;
+        const float lolim = 4.f;
+        const float hilim = 7.f;

-        if (JCh.J < lolim) {
+        if (J < lolim) {
            // do nothing, keep scale factor
-        } else if (JCh.J < hilim) {
+        } else if (J < hilim) {
            // S-curve transition
-            float x = (JCh.J - lolim) / (hilim - lolim);
+            float x = (J - lolim) / (hilim - lolim);

-            if (x < 0.5) {
-                x = 0.5 * powf(2 * x, 2);
+            if (x < 0.5f) {
+                x = 2.f * SQR(x);
            } else {
-                x = 0.5 + 0.5 * (1 - powf(1 - 2 * (x - 0.5), 2));
+                x = 1.f - 2.f * SQR(1 - x);
            }

-            dark_scale_factor = dark_scale_factor * (1.0 - x) + 1.0 * x;
+            dark_scale_factor = dark_scale_factor * (1.f - x) + x;
        } else {
-            dark_scale_factor = 1.0;
+            dark_scale_factor = 1.f;
        }

        cmul *= dark_scale_factor;
    }

-    JCh.C *= cmul;
-    cmsCIECAM02Reverse(h02[thread_idx], &JCh, &XYZ);
+    C *= cmul;

-    if (!isfinite(XYZ.X) || !isfinite(XYZ.Y) || !isfinite(XYZ.Z)) {
+    Ciecam02::jch2xyz_ciecam02float( x, y, z,
+                                     J, C, h,
+                                     xw, yw,  zw,
+                                     f,  c, nc, 1, pow1, nbb, ncb, fl, cz, d, aw );
+
+    if (!isfinite(x) || !isfinite(y) || !isfinite(z)) {
        // can happen for colors on the rim of being outside gamut, that worked without chroma scaling but not with. Then we return only the curve's result.
        if (!state.isProphoto) {
            float newr = state.Prophoto2Working[0][0] * r + state.Prophoto2Working[0][1] * g + state.Prophoto2Working[0][2] * b;
@@ -2311,10 +2313,10 @@ void PerceptualToneCurve::Apply(float &r, float &g, float &b, PerceptualToneCurv
        return;
    }

-    Color::xyz2Prophoto(XYZ.X, XYZ.Y, XYZ.Z, r, g, b);
-    r *= 655.35;
-    g *= 655.35;
-    b *= 655.35;
+    Color::xyz2Prophoto(x, y, z, r, g, b);
+    r *= 655.35f;
+    g *= 655.35f;
+    b *= 655.35f;
    r = LIM<float>(r, 0.f, 65535.f);
    g = LIM<float>(g, 0.f, 65535.f);
    b = LIM<float>(b, 0.f, 65535.f);
@@ -2329,34 +2331,34 @@ void PerceptualToneCurve::Apply(float &r, float &g, float &b, PerceptualToneCurv
        Color::rgb2hsv(ar, ag, ab, ah, as, av);
        Color::rgb2hsv(r, g, b, h, s, v);

-        float sat_scale = as <= 0.0 ? 1.0 : s / as; // saturation scale compared to Adobe curve
-        float keep = 0.2;
-        const float lolim = 1.00; // only mix in the Adobe curve if we have increased saturation compared to it
-        const float hilim = 1.20;
+        float sat_scale = as <= 0.f ? 1.f : s / as; // saturation scale compared to Adobe curve
+        float keep = 0.2f;
+        const float lolim = 1.00f; // only mix in the Adobe curve if we have increased saturation compared to it
+        const float hilim = 1.20f;

        if (sat_scale < lolim) {
            // saturation is low enough, don't desaturate
-            keep = 1.0;
+            keep = 1.f;
        } else if (sat_scale < hilim) {
            // S-curve transition
            float x = (sat_scale - lolim) / (hilim - lolim); // x = [0..1], 0 at lolim, 1 at hilim

-            if (x < 0.5) {
-                x = 0.5 * powf(2 * x, 2);
+            if (x < 0.5f) {
+                x = 2.f * SQR(x);
            } else {
-                x = 0.5 + 0.5 * (1 - powf(1 - 2 * (x - 0.5), 2));
+                x = 1.f - 2.f * SQR(1 - x);
            }

-            keep = 1.0 * (1.0 - x) + keep * x;
+            keep = (1.f - x) + keep * x;
        } else {
            // do nothing, very high increase, keep minimum amount
        }

-        if (keep < 1.0) {
+        if (keep < 1.f) {
            // mix in some of the Adobe curve result
-            r = r * keep + (1.0 - keep) * ar;
-            g = g * keep + (1.0 - keep) * ag;
-            b = b * keep + (1.0 - keep) * ab;
+            r = r * keep + (1.f - keep) * ar;
+            g = g * keep + (1.f - keep) * ag;
+            b = b * keep + (1.f - keep) * ab;
        }
    }

@@ -2370,41 +2372,28 @@ void PerceptualToneCurve::Apply(float &r, float &g, float &b, PerceptualToneCurv
    }
 }

-cmsContext * PerceptualToneCurve::c02;
-cmsHANDLE * PerceptualToneCurve::h02;
 float PerceptualToneCurve::cf_range[2];
 float PerceptualToneCurve::cf[1000];
+LUTf PerceptualToneCurve::gamma2curve;
+float PerceptualToneCurve::f, PerceptualToneCurve::c, PerceptualToneCurve::nc, PerceptualToneCurve::yb, PerceptualToneCurve::la, PerceptualToneCurve::xw, PerceptualToneCurve::yw, PerceptualToneCurve::zw, PerceptualToneCurve::gamut;
+float PerceptualToneCurve::n, PerceptualToneCurve::d, PerceptualToneCurve::nbb, PerceptualToneCurve::ncb, PerceptualToneCurve::cz, PerceptualToneCurve::aw, PerceptualToneCurve::wh, PerceptualToneCurve::pfl, PerceptualToneCurve::fl, PerceptualToneCurve::pow1;

 void PerceptualToneCurve::init()
 {

-    {
-        // init ciecam02 state, used for chroma scalings
-        cmsViewingConditions vc;
-        vc.whitePoint = *cmsD50_XYZ();
-        vc.whitePoint.X *= 100;
-        vc.whitePoint.Y *= 100;
-        vc.whitePoint.Z *= 100;
-        vc.Yb = 20;
-        vc.La = 20;
-        vc.surround = AVG_SURROUND;
-        vc.D_value = 1.0;
+    // init ciecam02 state, used for chroma scalings
+    xw = 96.42f;
+    yw = 100.0f;
+    zw = 82.49f;
+    yb = 20;
+    la = 20;
+    f  = 1.00f;
+    c  = 0.69f;
+    nc = 1.00f;

-        int thread_count = 1;
-#ifdef _OPENMP
-        thread_count = omp_get_max_threads();
-#endif
-        h02 = (cmsHANDLE *)malloc(sizeof(h02[0]) * (thread_count + 1));
-        c02 = (cmsContext *)malloc(sizeof(c02[0]) * (thread_count + 1));
-        h02[thread_count] = NULL;
-        c02[thread_count] = NULL;
-
-        // little cms requires one state per thread, for thread safety
-        for (int i = 0; i < thread_count; i++) {
-            c02[i] = cmsCreateContext(NULL, NULL);
-            h02[i] = cmsCIECAM02Init(c02[i], &vc);
-        }
-    }
+    Ciecam02::initcam1float(gamut, yb, 1.f, f, la, xw, yw, zw, n, d, nbb, ncb,
+                            cz, aw, wh, pfl, fl, c);
+    pow1 = pow_F( 1.64f - pow_F( 0.29f, n ), 0.73f );

    {
        // init contrast-value-to-chroma-scaling conversion curve
@@ -2448,17 +2437,12 @@ void PerceptualToneCurve::init()
        cf_range[0] = in_x[0];
        cf_range[1] = in_x[in_len - 1];
    }
-}
+    gamma2curve(65536, 0);

-void PerceptualToneCurve::cleanup()
-{
-    for (int i = 0; h02[i] != NULL; i++) {
-        cmsCIECAM02Done(h02[i]);
-        cmsDeleteContext(c02[i]);
+    for (int i = 0; i < 65536; i++) {
+        gamma2curve[i] = CurveFactory::gamma2(i / 65535.0);
    }

-    free(h02);
-    free(c02);
 }

 void PerceptualToneCurve::initApplyState(PerceptualToneCurveState & state, Glib::ustring workingSpace) const
--- a/rtengine/curves.h
+++ b/rtengine/curves.h
@@ -827,10 +827,10 @@ public:
 class PerceptualToneCurveState
 {
 public:
-    bool isProphoto;
    float Working2Prophoto[3][3];
    float Prophoto2Working[3][3];
    float cmul_contrast;
+    bool isProphoto;
 };

 // Tone curve whose purpose is to keep the color appearance constant, that is the curve changes contrast
@@ -840,10 +840,13 @@ public:
 class PerceptualToneCurve : public ToneCurve
 {
 private:
-    static cmsHANDLE *h02;
-    static cmsContext *c02;
    static float cf_range[2];
    static float cf[1000];
+    static LUTf gamma2curve;
+    // for ciecam02
+    static float f, c, nc, yb, la, xw, yw, zw, gamut;
+    static float n, d, nbb, ncb, cz, aw, wh, pfl, fl, pow1;
+
    static void cubic_spline(const float x[], const float y[], const int len, const float out_x[], float out_y[], const int out_len);
    static float find_minimum_interval_halving(float (*func)(float x, void *arg), void *arg, float a, float b, float tol, int nmax);
    static float find_tc_slope_fun(float k, void *arg);
@@ -851,7 +854,6 @@ private:
    float calculateToneCurveContrastValue() const;
 public:
    static void init();
-    static void cleanup();
    void initApplyState(PerceptualToneCurveState & state, Glib::ustring workingSpace) const;
    void Apply(float& r, float& g, float& b, PerceptualToneCurveState & state) const;
 };
--- a/rtengine/diagonalcurves.cc
+++ b/rtengine/diagonalcurves.cc
@@ -70,6 +70,10 @@ DiagonalCurve::DiagonalCurve (const std::vector<double>& p, int poly_pn)
                identity = false;
            }

+            if(x[0] == 0.f && x[1] == 0.f)
+                // Avoid crash when first two points are at x = 0 (git Issue 2888)
+                x[1] = 0.5f;
+
            if (!identity) {
                if (kind == DCT_Spline && N > 2) {
                    spline_cubic_set ();
--- a/rtengine/editbuffer.h
+++ b/rtengine/editbuffer.h
@@ -22,6 +22,7 @@
 #include "../rtgui/edit.h"
 #include "array2D.h"
 #include "iimage.h"
+#include "coord.h"

 namespace rtengine
 {
--- a/rtengine/gauss.h
+++ b/rtengine/gauss.h
@@ -26,13 +26,7 @@
 #ifdef _OPENMP
 #include <omp.h>
 #endif
-#ifdef __SSE__
-#if defined( WIN32 ) && defined(__x86_64__)
-#include <intrin.h>
-#else
-#include <xmmintrin.h>
-#endif
-#endif
+#include "opthelper.h"

 // classical filtering if the support window is small:

@@ -88,13 +82,8 @@ template<class T> void gaussVertical3 (T** src, T** dst, AlignedBufferMP<double>
 }

 #ifdef __SSE__
-#ifdef WIN32
-template<class T> __attribute__((force_align_arg_pointer)) void gaussVertical3Sse (T** src, T** dst, int W, int H, const float c0, const float c1)
+template<class T> SSEFUNCTION void gaussVertical3Sse (T** src, T** dst, int W, int H, const float c0, const float c1)
 {
-#else
-template<class T> void gaussVertical3Sse (T** src, T** dst, int W, int H, const float c0, const float c1)
-{
-#endif
    __m128 Tv, Tm1v, Tp1v;
    __m128 c0v, c1v;
    c0v = _mm_set1_ps(c0);
@@ -107,7 +96,7 @@ template<class T> void gaussVertical3Sse (T** src, T** dst, int W, int H, const
        Tm1v = _mm_loadu_ps( &src[0][i] );
        _mm_storeu_ps( &dst[0][i], Tm1v);

-        if(H > 1) {
+        if (H > 1) {
            Tv = _mm_loadu_ps( &src[1][i]);
        }

@@ -126,7 +115,7 @@ template<class T> void gaussVertical3Sse (T** src, T** dst, int W, int H, const
    #pragma omp for
 #endif

-    for(int i = W - (W % 4); i < W; i++) {
+    for (int i = W - (W % 4); i < W; i++) {
        dst[0][i] = src[0][i];

        for (int j = 1; j < H - 1; j++) {
@@ -138,13 +127,8 @@ template<class T> void gaussVertical3Sse (T** src, T** dst, int W, int H, const
 }


-#ifdef WIN32
-template<class T> __attribute__((force_align_arg_pointer)) void gaussHorizontal3Sse (T** src, T** dst, int W, int H, const float c0, const float c1)
+template<class T> SSEFUNCTION void gaussHorizontal3Sse (T** src, T** dst, int W, int H, const float c0, const float c1)
 {
-#else
-template<class T> void gaussHorizontal3Sse (T** src, T** dst, int W, int H, const float c0, const float c1)
-{
-#endif
    float tmp[W][4] __attribute__ ((aligned (16)));

    __m128 Tv, Tm1v, Tp1v;
@@ -162,7 +146,7 @@ template<class T> void gaussHorizontal3Sse (T** src, T** dst, int W, int H, cons
        dst[i + 3][0] = src[i + 3][0];
        Tm1v = _mm_set_ps( src[i][0], src[i + 1][0], src[i + 2][0], src[i + 3][0] );

-        if(W > 1) {
+        if (W > 1) {
            Tv = _mm_set_ps( src[i][1], src[i + 1][1], src[i + 2][1], src[i + 3][1] );
        }

@@ -191,7 +175,7 @@ template<class T> void gaussHorizontal3Sse (T** src, T** dst, int W, int H, cons
    #pragma omp for
 #endif

-    for(int i = H - (H % 4); i < H; i++) {
+    for (int i = H - (H % 4); i < H; i++) {
        dst[i][0] = src[i][0];

        for (int j = 1; j < W - 1; j++) {
@@ -205,18 +189,15 @@ template<class T> void gaussHorizontal3Sse (T** src, T** dst, int W, int H, cons


 // fast gaussian approximation if the support window is large
-#ifdef WIN32
-template<class T> __attribute__((force_align_arg_pointer)) void gaussHorizontalSse (T** src, T** dst, int W, int H, float sigma)
+template<class T> SSEFUNCTION void gaussHorizontalSse (T** src, T** dst, int W, int H, float sigma)
 {
-#else
-template<class T> void gaussHorizontalSse (T** src, T** dst, int W, int H, float sigma)
-{
-#endif

    if (sigma < 0.25) {
        // dont perform filtering
        if (src != dst)
+#ifdef _OPENMP
            #pragma omp for
+#endif
            for (int i = 0; i < H; i++) {
                memcpy (dst[i], src[i], W * sizeof(T));
            }
@@ -279,7 +260,10 @@ template<class T> void gaussHorizontalSse (T** src, T** dst, int W, int H, float
    b1v = _mm_set1_ps(b1);
    b2v = _mm_set1_ps(b2);
    b3v = _mm_set1_ps(b3);
+
+#ifdef _OPENMP
    #pragma omp for
+#endif

    for (int i = 0; i < H - 3; i += 4) {
        tmpV[0] = src[i + 3][0];
@@ -351,9 +335,11 @@ template<class T> void gaussHorizontalSse (T** src, T** dst, int W, int H, float
    }

 // Borders are done without SSE
+#ifdef _OPENMP
    #pragma omp for
+#endif

-    for(int i = H - (H % 4); i < H; i++) {
+    for (int i = H - (H % 4); i < H; i++) {
        tmp[0][0] = B * src[i][0] + b1 * src[i][0] + b2 * src[i][0] + b3 * src[i][0];
        tmp[1][0] = B * src[i][1] + b1 * tmp[0][0]  + b2 * src[i][0] + b3 * src[i][0];
        tmp[2][0] = B * src[i][2] + b1 * tmp[1][0]  + b2 * tmp[0][0]  + b3 * src[i][0];
@@ -389,7 +375,7 @@ template<class T> void gaussHorizontal (T** src, T** dst, AlignedBufferMP<double

 #ifdef __SSE__

-    if(sigma < 70) { // bigger sigma only with double precision
+    if (sigma < 70) { // bigger sigma only with double precision
        gaussHorizontalSse<T> (src, dst, W, H, sigma);
        return;
    }
@@ -399,7 +385,9 @@ template<class T> void gaussHorizontal (T** src, T** dst, AlignedBufferMP<double
    if (sigma < 0.25) {
        // dont perform filtering
        if (src != dst)
+#ifdef _OPENMP
            #pragma omp for
+#endif
            for (int i = 0; i < H; i++) {
                memcpy (dst[i], src[i], W * sizeof(T));
            }
@@ -451,7 +439,9 @@ template<class T> void gaussHorizontal (T** src, T** dst, AlignedBufferMP<double
            M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 + b2 + (b1 - b3) * b3);
        }

+#ifdef _OPENMP
    #pragma omp for
+#endif

    for (int i = 0; i < H; i++) {
        AlignedBuffer<double>* pBuf = buffer.acquire();
@@ -486,18 +476,15 @@ template<class T> void gaussHorizontal (T** src, T** dst, AlignedBufferMP<double
 }

 #ifdef __SSE__
-#ifdef WIN32
-template<class T> __attribute__((force_align_arg_pointer)) void gaussVerticalSse (T** src, T** dst, int W, int H, float sigma)
+template<class T> SSEFUNCTION void gaussVerticalSse (T** src, T** dst, int W, int H, float sigma)
 {
-#else
-template<class T> void gaussVerticalSse (T** src, T** dst, int W, int H, float sigma)
-{
-#endif

    if (sigma < 0.25) {
        // dont perform filtering
        if (src != dst)
+#ifdef _OPENMP
            #pragma omp for
+#endif
            for (int i = 0; i < H; i++) {
                memcpy (dst[i], src[i], W * sizeof(T));
            }
@@ -614,9 +601,11 @@ template<class T> void gaussVerticalSse (T** src, T** dst, int W, int H, float s
    }

 // Borders are done without SSE
+#ifdef _OPENMP
    #pragma omp for
+#endif

-    for(int i = W - (W % 4); i < W; i++) {
+    for (int i = W - (W % 4); i < W; i++) {
        tmp[0][0] = B * src[0][i] + b1 * src[0][i] + b2 * src[0][i] + b3 * src[0][i];
        tmp[1][0] = B * src[1][i] + b1 * tmp[0][0] + b2 * src[0][i] + b3 * src[0][i];
        tmp[2][0] = B * src[2][i] + b1 * tmp[1][0] + b2 * tmp[0][0] + b3 * src[0][i];
@@ -651,7 +640,7 @@ template<class T> void gaussVertical (T** src, T** dst, AlignedBufferMP<double>

 #ifdef __SSE__

-    if(sigma < 70) { // bigger sigma only with double precision
+    if (sigma < 70) { // bigger sigma only with double precision
        gaussVerticalSse<T> (src, dst, W, H, sigma);
        return;
    }
@@ -659,9 +648,11 @@ template<class T> void gaussVertical (T** src, T** dst, AlignedBufferMP<double>
 #endif

    if (sigma < 0.25) {
-        // dont perform filtering
+        // don't perform filtering
        if (src != dst)
+#ifdef _OPENMP
            #pragma omp for
+#endif
            for (int i = 0; i < H; i++) {
                memcpy (dst[i], src[i], W * sizeof(T));
            }
@@ -713,38 +704,81 @@ template<class T> void gaussVertical (T** src, T** dst, AlignedBufferMP<double>
            M[i][j] /= (1.0 + b1 - b2 + b3) * (1.0 + b2 + (b1 - b3) * b3);
        }

+    // process 'numcols' columns at a time for better usage of the L1 CPU cache (especially faster for large values of H)
+    static const int numcols = 4;
+    double temp2[H][numcols] ALIGNED16;
+    double temp2Hm1[numcols], temp2H[numcols], temp2Hp1[numcols];
 #ifdef _OPENMP
-    #pragma omp for
+    #pragma omp for nowait
 #endif

-    for (int i = 0; i < W; i++) {
-        AlignedBuffer<double>* pBuf = buffer.acquire();
-        double* temp2 = pBuf->data;
-        temp2[0] = B * src[0][i] + b1 * src[0][i] + b2 * src[0][i] + b3 * src[0][i];
-        temp2[1] = B * src[1][i] + b1 * temp2[0]  + b2 * src[0][i] + b3 * src[0][i];
-        temp2[2] = B * src[2][i] + b1 * temp2[1]  + b2 * temp2[0]  + b3 * src[0][i];
-
-        for (int j = 3; j < H; j++) {
-            temp2[j] = B * src[j][i] + b1 * temp2[j - 1] + b2 * temp2[j - 2] + b3 * temp2[j - 3];
+    for (int i = 0; i < W - numcols + 1; i += numcols) {
+        for (int k = 0; k < numcols; k++) {
+            temp2[0][k] = B * src[0][i + k] + b1 * src[0][i + k] + b2 * src[0][i + k] + b3 * src[0][i + k];
+            temp2[1][k] = B * src[1][i + k] + b1 * temp2[0][k] + b2 * src[0][i + k] + b3 * src[0][i + k];
+            temp2[2][k] = B * src[2][i + k] + b1 * temp2[1][k] + b2 * temp2[0][k] + b3 * src[0][i + k];
        }

-        double temp2Hm1 = src[H - 1][i] + M[0][0] * (temp2[H - 1] - src[H - 1][i]) + M[0][1] * (temp2[H - 2] - src[H - 1][i]) + M[0][2] * (temp2[H - 3] - src[H - 1][i]);
-        double temp2H   = src[H - 1][i] + M[1][0] * (temp2[H - 1] - src[H - 1][i]) + M[1][1] * (temp2[H - 2] - src[H - 1][i]) + M[1][2] * (temp2[H - 3] - src[H - 1][i]);
-        double temp2Hp1 = src[H - 1][i] + M[2][0] * (temp2[H - 1] - src[H - 1][i]) + M[2][1] * (temp2[H - 2] - src[H - 1][i]) + M[2][2] * (temp2[H - 3] - src[H - 1][i]);
+        for (int j = 3; j < H; j++) {
+            for (int k = 0; k < numcols; k++) {
+                temp2[j][k] = B * src[j][i + k] + b1 * temp2[j - 1][k] + b2 * temp2[j - 2][k] + b3 * temp2[j - 3][k];
+            }
+        }

-        temp2[H - 1] = temp2Hm1;
-        temp2[H - 2] = B * temp2[H - 2] + b1 * temp2[H - 1] + b2 * temp2H + b3 * temp2Hp1;
-        temp2[H - 3] = B * temp2[H - 3] + b1 * temp2[H - 2] + b2 * temp2[H - 1] + b3 * temp2H;
+        for (int k = 0; k < numcols; k++) {
+            temp2Hm1[k] = src[H - 1][i + k] + M[0][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[0][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[0][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
+            temp2H[k]   = src[H - 1][i + k] + M[1][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[1][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[1][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
+            temp2Hp1[k] = src[H - 1][i + k] + M[2][0] * (temp2[H - 1][k] - src[H - 1][i + k]) + M[2][1] * (temp2[H - 2][k] - src[H - 1][i + k]) + M[2][2] * (temp2[H - 3][k] - src[H - 1][i + k]);
+        }
+
+        for (int k = 0; k < numcols; k++) {
+            temp2[H - 1][k] = temp2Hm1[k];
+            temp2[H - 2][k] = B * temp2[H - 2][k] + b1 * temp2[H - 1][k] + b2 * temp2H[k] + b3 * temp2Hp1[k];
+            temp2[H - 3][k] = B * temp2[H - 3][k] + b1 * temp2[H - 2][k] + b2 * temp2[H - 1][k] + b3 * temp2H[k];
+        }

        for (int j = H - 4; j >= 0; j--) {
-            temp2[j] = B * temp2[j] + b1 * temp2[j + 1] + b2 * temp2[j + 2] + b3 * temp2[j + 3];
+            for (int k = 0; k < numcols; k++) {
+                temp2[j][k] = B * temp2[j][k] + b1 * temp2[j + 1][k] + b2 * temp2[j + 2][k] + b3 * temp2[j + 3][k];
+            }
        }

        for (int j = 0; j < H; j++) {
-            dst[j][i] = (T)temp2[j];
+            for (int k = 0; k < numcols; k++) {
+                dst[j][i + k] = (T)temp2[j][k];
+            }
+        }
+    }
+
+#ifdef _OPENMP
+    #pragma omp single
+#endif
+
+    // process the remaining (W % numcols) columns one at a time
+    for (int i = W - (W % numcols); i < W; i++) {
+        temp2[0][0] = B * src[0][i] + b1 * src[0][i] + b2 * src[0][i] + b3 * src[0][i];
+        temp2[1][0] = B * src[1][i] + b1 * temp2[0][0]  + b2 * src[0][i] + b3 * src[0][i];
+        temp2[2][0] = B * src[2][i] + b1 * temp2[1][0]  + b2 * temp2[0][0]  + b3 * src[0][i];
+
+        for (int j = 3; j < H; j++) {
+            temp2[j][0] = B * src[j][i] + b1 * temp2[j - 1][0] + b2 * temp2[j - 2][0] + b3 * temp2[j - 3][0];
        }

-        buffer.release(pBuf);
+        double temp2Hm1 = src[H - 1][i] + M[0][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[0][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[0][2] * (temp2[H - 3][0] - src[H - 1][i]);
+        double temp2H   = src[H - 1][i] + M[1][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[1][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[1][2] * (temp2[H - 3][0] - src[H - 1][i]);
+        double temp2Hp1 = src[H - 1][i] + M[2][0] * (temp2[H - 1][0] - src[H - 1][i]) + M[2][1] * (temp2[H - 2][0] - src[H - 1][i]) + M[2][2] * (temp2[H - 3][0] - src[H - 1][i]);
+
+        temp2[H - 1][0] = temp2Hm1;
+        temp2[H - 2][0] = B * temp2[H - 2][0] + b1 * temp2[H - 1][0] + b2 * temp2H + b3 * temp2Hp1;
+        temp2[H - 3][0] = B * temp2[H - 3][0] + b1 * temp2[H - 2][0] + b2 * temp2[H - 1][0] + b3 * temp2H;
+
+        for (int j = H - 4; j >= 0; j--) {
+            temp2[j][0] = B * temp2[j][0] + b1 * temp2[j + 1][0] + b2 * temp2[j + 2][0] + b3 * temp2[j + 3][0];
+        }
+
+        for (int j = 0; j < H; j++) {
+            dst[j][i] = (T)temp2[j][0];
+        }
    }
 }

--- a/rtengine/helpersse2.h
+++ b/rtengine/helpersse2.h
@@ -22,19 +22,19 @@ typedef __m128i vint2;

 //
 #ifdef __GNUC__
-#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 9) || __GNUC__ > 4
-#define LVF(x) _mm_load_ps(&x)
-#define LVFU(x) _mm_loadu_ps(&x)
-#define STVF(x,y) _mm_store_ps(&x,y)
-#else // there is a bug in gcc 4.7.x when using openmp and aligned memory and -O3
-#define LVF(x) _mm_loadu_ps(&x)
-#define LVFU(x) _mm_loadu_ps(&x)
-#define STVF(x,y) _mm_storeu_ps(&x,y)
-#endif
+    #if (__GNUC__ == 4 && __GNUC_MINOR__ >= 9) || __GNUC__ > 4
+        #define LVF(x) _mm_load_ps(&x)
+        #define LVFU(x) _mm_loadu_ps(&x)
+        #define STVF(x,y) _mm_store_ps(&x,y)
+    #else // there is a bug in gcc 4.7.x when using openmp and aligned memory and -O3
+        #define LVF(x) _mm_loadu_ps(&x)
+        #define LVFU(x) _mm_loadu_ps(&x)
+        #define STVF(x,y) _mm_storeu_ps(&x,y)
+    #endif
 #else
-#define LVF(x) _mm_load_ps(&x)
-#define LVFU(x) _mm_loadu_ps(&x)
-#define STVF(x,y) _mm_store_ps(&x,y)
+    #define LVF(x) _mm_load_ps(&x)
+    #define LVFU(x) _mm_loadu_ps(&x)
+    #define STVF(x,y) _mm_store_ps(&x,y)
 #endif

 // Load 8 floats from a and combine a[0],a[2],a[4] and a[6] into a vector of 4 floats
--- a/rtengine/imagedata.cc
+++ b/rtengine/imagedata.cc
@@ -406,7 +406,10 @@ void ImageData::extractInfo ()
                rtexif::Tag* flt = mnote->getTagP ("LensInfo/FocalLength");

                if (flt) {
-                    focal_len = flt->toDouble ();
+                    // Don't replace Exif focal_len if Makernotes focal_len is 0
+                    if (flt->toDouble() > 0) {
+                        focal_len = flt->toDouble ();
+                    }
                } else if ((flt = mnote->getTagP ("FocalLength"))) {
                    rtexif::Tag* flt = mnote->getTag ("FocalLength");
                    focal_len = flt->toDouble ();
--- a/rtengine/improcfun.cc
+++ b/rtengine/improcfun.cc
@@ -2049,8 +2049,7 @@ void ImProcFunctions::ciecam_02float (CieImage* ncie, float adap, int begh, int
                                                        Q,  M,  s, F2V(aw), F2V(fl), F2V(wh),
                                                        x,  y,  z,
                                                        F2V(xw1), F2V(yw1),  F2V(zw1),
-                                                        F2V(yb),  F2V(la),
-                                                        F2V(f), F2V(c),  F2V(nc), F2V(pow1), F2V(nbb), F2V(ncb), F2V(pfl), F2V(cz), F2V(d));
+                                                        F2V(c),  F2V(nc), F2V(pow1), F2V(nbb), F2V(ncb), F2V(pfl), F2V(cz), F2V(d));
                    STVF(Jbuffer[k], J);
                    STVF(Cbuffer[k], C);
                    STVF(hbuffer[k], h);
@@ -2074,8 +2073,7 @@ void ImProcFunctions::ciecam_02float (CieImage* ncie, float adap, int begh, int
                                                        Q,  M,  s, aw, fl, wh,
                                                        x,  y,  z,
                                                        xw1, yw1,  zw1,
-                                                        yb,  la,
-                                                        f, c,  nc,  pilot, gamu, pow1, nbb, ncb, pfl, cz, d);
+                                                        c,  nc, gamu, pow1, nbb, ncb, pfl, cz, d);
                    Jbuffer[k] = J;
                    Cbuffer[k] = C;
                    hbuffer[k] = h;
@@ -2113,8 +2111,7 @@ void ImProcFunctions::ciecam_02float (CieImage* ncie, float adap, int begh, int
                                                        Q,  M,  s, aw, fl, wh,
                                                        x,  y,  z,
                                                        xw1, yw1,  zw1,
-                                                        yb,  la,
-                                                        f, c,  nc,  pilot, gamu, pow1, nbb, ncb, pfl, cz, d);
+                                                        c,  nc, gamu, pow1, nbb, ncb, pfl, cz, d);
 #endif
                    float Jpro, Cpro, hpro, Qpro, Mpro, spro;
                    Jpro = J;
@@ -2545,7 +2542,6 @@ void ImProcFunctions::ciecam_02float (CieImage* ncie, float adap, int begh, int
                            Ciecam02::jch2xyz_ciecam02float( xx, yy, zz,
                                                             J,  C, h,
                                                             xw2, yw2,  zw2,
-                                                             yb2, la2,
                                                             f2,  c2, nc2, gamu, pow1n, nbbj, ncbj, flj, czj, dj, awj);
                            float x, y, z;
                            x = (float)xx * 655.35f;
@@ -2607,7 +2603,6 @@ void ImProcFunctions::ciecam_02float (CieImage* ncie, float adap, int begh, int
                    Ciecam02::jch2xyz_ciecam02float( x, y, z,
                                                     LVF(Jbuffer[k]), LVF(Cbuffer[k]), LVF(hbuffer[k]),
                                                     F2V(xw2), F2V(yw2), F2V(zw2),
-                                                     F2V(yb2), F2V(la2),
                                                     F2V(f2),  F2V(nc2), F2V(pow1n), F2V(nbbj), F2V(ncbj), F2V(flj), F2V(dj), F2V(awj), F2V(reccmcz));
                    STVF(xbuffer[k], x * c655d35);
                    STVF(ybuffer[k], y * c655d35);
@@ -2936,7 +2931,6 @@ void ImProcFunctions::ciecam_02float (CieImage* ncie, float adap, int begh, int
                        Ciecam02::jch2xyz_ciecam02float( xx, yy, zz,
                                                         ncie->J_p[i][j],  ncie_C_p, ncie->h_p[i][j],
                                                         xw2, yw2,  zw2,
-                                                         yb2, la2,
                                                         f2,  c2, nc2, gamu, pow1n, nbbj, ncbj, flj, czj, dj, awj);
                        x = (float)xx * 655.35f;
                        y = (float)yy * 655.35f;
@@ -2992,7 +2986,6 @@ void ImProcFunctions::ciecam_02float (CieImage* ncie, float adap, int begh, int
                        Ciecam02::jch2xyz_ciecam02float( x, y, z,
                                                         LVF(Jbuffer[k]), LVF(Cbuffer[k]), LVF(hbuffer[k]),
                                                         F2V(xw2), F2V(yw2), F2V(zw2),
-                                                         F2V(yb2), F2V(la2),
                                                         F2V(f2), F2V(nc2), F2V(pow1n), F2V(nbbj), F2V(ncbj), F2V(flj), F2V(dj), F2V(awj), F2V(reccmcz));
                        x *= c655d35;
                        y *= c655d35;
@@ -3178,7 +3171,6 @@ void ImProcFunctions::rgbProc (Imagefloat* working, LabImage* lab, EditBuffer *e
                               SHMap* shmap, int sat, LUTf & rCurve, LUTf & gCurve, LUTf & bCurve, float satLimit , float satLimitOpacity, const ColorGradientCurve & ctColorCurve, const OpacityCurve & ctOpacityCurve, bool opautili, LUTf & clToningcurve, LUTf & cl2Toningcurve,
                               const ToneCurve & customToneCurve1, const ToneCurve & customToneCurve2,  const ToneCurve & customToneCurvebw1, const ToneCurve & customToneCurvebw2, double &rrm, double &ggm, double &bbm, float &autor, float &autog, float &autob, double expcomp, int hlcompr, int hlcomprthresh, DCPProfile *dcpProf)
 {
-
    LUTf fGammaLUTf;
    Imagefloat *tmpImage = NULL;

--- a/rtengine/improcfun.h
+++ b/rtengine/improcfun.h
@@ -313,16 +313,15 @@ public:

    void WaveletcontAllL(LabImage * lab, float **varhue, float **varchrom, wavelet_decomposition &WaveletCoeffs_L,
                         struct cont_params &cp, int skip, float *mean, float *meanN, float *sigma, float *sigmaN, float *MaxP, float *MaxN,  const WavCurve & wavCLVCcurve, const WavOpacityCurveW & waOpacityCurveW, const WavOpacityCurveWL & waOpacityCurveWL, FlatCurve* ChCurve, bool Chutili);
-    void WaveletcontAllLfinal(LabImage * lab, float **varhue, float **varchrom, wavelet_decomposition &WaveletCoeffs_L,
-                              struct cont_params &cp, int skip, float *mean, float *meanN, float *sigma, float *sigmaN, float *MaxP, float *MaxN,  const WavCurve & wavCLVCcurve, const WavOpacityCurveWL & waOpacityCurveWL, FlatCurve* ChCurve, bool Chutili);
+    void WaveletcontAllLfinal(wavelet_decomposition &WaveletCoeffs_L, struct cont_params &cp, float *mean, float *sigma, float *MaxP, const WavOpacityCurveWL & waOpacityCurveWL);
    void WaveletcontAllAB(LabImage * lab, float **varhue, float **varchrom, wavelet_decomposition &WaveletCoeffs_a, const WavOpacityCurveW & waOpacityCurveW,
                          struct cont_params &cp, const bool useChannelA);
    void WaveletAandBAllAB(LabImage * lab, float **varhue, float **varchrom, wavelet_decomposition &WaveletCoeffs_a, wavelet_decomposition &WaveletCoeffs_b,
                           struct cont_params &cp, const WavOpacityCurveW & waOpacityCurveW, FlatCurve* hhcurve, bool hhutili);
    void ContAllL (float **koeLi, float *maxkoeLi, bool lipschitz, int maxlvl, LabImage * lab, float **varhue, float **varchrom, float ** WavCoeffs_L, float * WavCoeffs_L0, int level, int dir, struct cont_params &cp,
                   int W_L, int H_L, int skip, float *mean, float *meanN, float *sigma, float *sigmaN, float *MaxP, float *MaxN,  const WavCurve & wavCLVCcurve, const WavOpacityCurveW & waOpacityCurveW, FlatCurve* ChCurve, bool Chutili);
-    void finalContAllL (int maxlvl, LabImage * lab, float **varhue, float **varchrom, float ** WavCoeffs_L, float * WavCoeffs_L0, int level, int dir, struct cont_params &cp,
-                        int W_L, int H_L, int skip, float *mean, float *meanN, float *sigma, float *sigmaN, float *MaxP, float *MaxN,  const WavCurve & wavCLVCcurve, const WavOpacityCurveWL & waOpacityCurveWL, FlatCurve* ChCurve, bool Chutili);
+    void finalContAllL (float ** WavCoeffs_L, float * WavCoeffs_L0, int level, int dir, struct cont_params &cp,
+                        int W_L, int H_L, float *mean, float *sigma, float *MaxP, const WavOpacityCurveWL & waOpacityCurveWL);
    void ContAllAB (LabImage * lab, int maxlvl, float **varhue, float **varchrom, float ** WavCoeffs_a, float * WavCoeffs_a0, int level, int dir, const WavOpacityCurveW & waOpacityCurveW, struct cont_params &cp,
                    int W_ab, int H_ab, const bool useChannelA);
    void Evaluate2(wavelet_decomposition &WaveletCoeffs_L,
--- a/rtengine/init.cc
+++ b/rtengine/init.cc
@@ -66,7 +66,6 @@ void cleanup ()

    ProcParams::cleanup ();
    Color::cleanup ();
-    PerceptualToneCurve::cleanup ();
    ImProcFunctions::cleanupCache ();
    Thumbnail::cleanupGamma ();
    RawImageSource::cleanup ();
--- a/rtengine/ipwavelet.cc
+++ b/rtengine/ipwavelet.cc
@@ -37,7 +37,6 @@
 #include "mytime.h"
 #include "sleef.c"
 #include "opthelper.h"
-#include "StopWatch.h"
 #include "EdgePreservingDecomposition.h"

 #ifdef _OPENMP
@@ -994,7 +993,7 @@ SSEFUNCTION void ImProcFunctions::ip_wavelet(LabImage * lab, LabImage * dst, int
                            Evaluate2(*Ldecomp, cp, ind, mean, meanN, sigma, sigmaN, MaxP, MaxN, madL);
                        }

-                        WaveletcontAllLfinal(labco, varhue, varchro, *Ldecomp, cp, skip, mean, meanN, sigma, sigmaN, MaxP, MaxN, wavCLVCcurve, waOpacityCurveWL, ChCurve, Chutili);
+                        WaveletcontAllLfinal(*Ldecomp, cp, mean, sigma, MaxP, waOpacityCurveWL);
                        //Evaluate2(*Ldecomp, cp, ind, mean, meanN, sigma, sigmaN, MaxP, MaxN, madL);

                        Ldecomp->reconstruct(labco->data, cp.strength);
@@ -1745,41 +1744,25 @@ void ImProcFunctions::EPDToneMapResid(float * WavCoeffs_L0,  unsigned int Iterat
    }
 }

-void ImProcFunctions::WaveletcontAllLfinal(LabImage * labco, float ** varhue, float **varchrom, wavelet_decomposition &WaveletCoeffs_L,
-        struct cont_params &cp, int skip, float *mean, float *meanN, float *sigma, float *sigmaN, float *MaxP, float *MaxN, const WavCurve & wavCLVCcurve, const WavOpacityCurveWL & waOpacityCurveWL, FlatCurve* ChCurve, bool Chutili)
+void ImProcFunctions::WaveletcontAllLfinal(wavelet_decomposition &WaveletCoeffs_L, struct cont_params &cp, float *mean,float *sigma, float *MaxP, const WavOpacityCurveWL & waOpacityCurveWL)
 {
-
    int maxlvl = WaveletCoeffs_L.maxlevel();
-    int W_L = WaveletCoeffs_L.level_W(0);
-    int H_L = WaveletCoeffs_L.level_H(0);
    float * WavCoeffs_L0 = WaveletCoeffs_L.coeff0;

-
-#ifdef _RT_NESTED_OPENMP
-    #pragma omp for schedule(dynamic) collapse(2)
-#endif
-
    for (int dir = 1; dir < 4; dir++) {
        for (int lvl = 0; lvl < maxlvl; lvl++) {
-
            int Wlvl_L = WaveletCoeffs_L.level_W(lvl);
            int Hlvl_L = WaveletCoeffs_L.level_H(lvl);
-
            float ** WavCoeffs_L = WaveletCoeffs_L.level_coeffs(lvl);
-
-            finalContAllL (maxlvl, labco,  varhue, varchrom, WavCoeffs_L, WavCoeffs_L0, lvl, dir, cp, Wlvl_L, Hlvl_L, skip, mean, meanN, sigma, sigmaN, MaxP, MaxN, wavCLVCcurve, waOpacityCurveWL, ChCurve, Chutili);
-
-
+            finalContAllL (WavCoeffs_L, WavCoeffs_L0, lvl, dir, cp, Wlvl_L, Hlvl_L, mean, sigma, MaxP, waOpacityCurveWL);
        }
    }
-
 }


 void ImProcFunctions::WaveletcontAllL(LabImage * labco, float ** varhue, float **varchrom, wavelet_decomposition &WaveletCoeffs_L,
                                      struct cont_params &cp, int skip, float *mean, float *meanN, float *sigma, float *sigmaN, float *MaxP, float *MaxN, const WavCurve & wavCLVCcurve, const WavOpacityCurveW & waOpacityCurveW, const WavOpacityCurveWL & waOpacityCurveWL, FlatCurve* ChCurve, bool Chutili)
 {
-    //StopWatch Stop1("WaveletcontAllL");
    int maxlvl = WaveletCoeffs_L.maxlevel();
    int W_L = WaveletCoeffs_L.level_W(0);
    int H_L = WaveletCoeffs_L.level_H(0);
@@ -2530,14 +2513,10 @@ void ImProcFunctions::calckoe(float ** WavCoeffs_LL, struct cont_params cp, floa

 }

-void ImProcFunctions::finalContAllL (int maxlvl, LabImage * labco, float ** varhue, float **varchrom, float ** WavCoeffs_L, float * WavCoeffs_L0, int level, int dir, struct cont_params &cp,
-                                     int W_L, int H_L, int skip, float *mean, float *meanN, float *sigma, float *sigmaN, float *MaxP, float *MaxN, const WavCurve & wavCLVCcurve, const WavOpacityCurveWL & waOpacityCurveWL, FlatCurve* ChCurve, bool Chutili)
+void ImProcFunctions::finalContAllL (float ** WavCoeffs_L, float * WavCoeffs_L0, int level, int dir, struct cont_params &cp,
+                                     int W_L, int H_L, float *mean, float *sigma, float *MaxP, const WavOpacityCurveWL & waOpacityCurveWL)
 {
-    bool lipschitz = true;
-    float edge = 1.f;
-    bool curvdiag = true;
-
-    if(curvdiag  && cp.finena) {//curve
+    if(cp.diagcurv  && cp.finena) {//curve
        float insigma = 0.666f; //SD
        float logmax = log(MaxP[level]); //log Max
        float rapX = (mean[level] + sigma[level]) / MaxP[level]; //rapport between sD / max
@@ -2547,47 +2526,30 @@ void ImProcFunctions::finalContAllL (int maxlvl, LabImage * labco, float ** varh
        float asig = 0.166f / sigma[level];
        float bsig = 0.5f - asig * mean[level];
        float amean = 0.5f / mean[level];
-        float absciss;
-        float kinterm;
-        float kmul;

+#ifdef _RT_NESTED_OPENMP
+        #pragma omp parallel for schedule(dynamic, W_L * 16) num_threads(wavNestedLevels) if(wavNestedLevels>1)
+#endif
        for (int i = 0; i < W_L * H_L; i++) {
-            edge = 1.f;
-            kinterm = 1.f;
+            float absciss;

-            if(cp.diagcurv) {
-                if(fabs(WavCoeffs_L[dir][i]) >= (mean[level] + sigma[level])) { //for max
-                    float valcour = log(fabs(WavCoeffs_L[dir][i]));
-                    float valc = valcour - logmax;
-                    float vald = valc * rap;
-                    absciss = exp(vald);
-
-                } else if(fabs(WavCoeffs_L[dir][i]) >= mean[level] &&  fabs(WavCoeffs_L[dir][i]) < (mean[level] + sigma[level])) {
-                    absciss = asig * fabs(WavCoeffs_L[dir][i]) + bsig;
-                }
-                //  else if(fabs(WavCoeffs_L[dir][i]) < mean[level]){
-                else {
-                    absciss = amean * fabs(WavCoeffs_L[dir][i]);
-                }
-
-                //  }
-                kinterm = 1.f;
-                kmul = 1.f;
-
-                float kc = kmul * (waOpacityCurveWL[absciss * 500.f] - 0.5f);
-                float reduceeffect = 1.5f;
-
-                if(kc <= 0.f) {
-                    reduceeffect = 1.f;
-                }
-
-                kinterm = 1.f + reduceeffect * kmul * (waOpacityCurveWL[absciss * 500.f] - 0.5f);
-
-                if(kinterm < 0.f) {
-                    kinterm = 0.01f;
-                }
+            if(fabsf(WavCoeffs_L[dir][i]) >= (mean[level] + sigma[level])) { //for max
+                float valcour = xlogf(fabsf(WavCoeffs_L[dir][i]));
+                float valc = valcour - logmax;
+                float vald = valc * rap;
+                absciss = xexpf(vald);
+            } else if(fabsf(WavCoeffs_L[dir][i]) >= mean[level]) {
+                absciss = asig * fabsf(WavCoeffs_L[dir][i]) + bsig;
+            } else {
+                absciss = amean * fabsf(WavCoeffs_L[dir][i]);
            }

+            float kc = waOpacityCurveWL[absciss * 500.f] - 0.5f;
+            float reduceeffect = kc <= 0.f ? 1.f : 1.5f;
+
+            float kinterm = 1.f + reduceeffect * kc;
+            kinterm = kinterm <= 0.f ? 0.01f : kinterm;
+
            WavCoeffs_L[dir][i] *=  kinterm;
        }
    }
--- a/rtengine/opthelper.h
+++ b/rtengine/opthelper.h
@@ -20,56 +20,56 @@
 ////////////////////////////////////////////////////////////////

 #ifndef OPTHELPER_H
-#define OPTHELPER_H
+    #define OPTHELPER_H

-#ifdef __SSE2__
-#include "sleefsseavx.c"
-#ifdef __GNUC__
-#if defined(WIN32) && !defined( __x86_64__ )
-// needed for actual versions of GCC with 32-Bit Windows
-#define SSEFUNCTION __attribute__((force_align_arg_pointer))
-#else
-#define SSEFUNCTION
-#endif
-#else
-#define SSEFUNCTION
-#endif
-#else
-#ifdef __SSE__
-#ifdef __GNUC__
-#if defined(WIN32) && !defined( __x86_64__ )
-// needed for actual versions of GCC with 32-Bit Windows
-#define SSEFUNCTION __attribute__((force_align_arg_pointer))
-#else
-#define SSEFUNCTION
-#endif
-#else
-#define SSEFUNCTION
-#endif
-#else
-#define SSEFUNCTION
-#endif
-#endif
+    #ifdef __SSE2__
+        #include "sleefsseavx.c"
+        #ifdef __GNUC__
+            #if defined(WIN32) && !defined( __x86_64__ )
+                // needed for current versions of GCC on 32-bit Windows
+                #define SSEFUNCTION __attribute__((force_align_arg_pointer))
+            #else
+                #define SSEFUNCTION
+            #endif
+        #else
+            #define SSEFUNCTION
+        #endif
+    #else
+        #ifdef __SSE__
+            #ifdef __GNUC__
+                #if defined(WIN32) && !defined( __x86_64__ )
+                    // needed for current versions of GCC on 32-bit Windows
+                    #define SSEFUNCTION __attribute__((force_align_arg_pointer))
+                #else
+                    #define SSEFUNCTION
+                #endif
+            #else
+                #define SSEFUNCTION
+            #endif
+        #else
+            #define SSEFUNCTION
+        #endif
+    #endif

-#ifdef __GNUC__
-#define RESTRICT    __restrict__
-#define LIKELY(x)   __builtin_expect (!!(x), 1)
-#define UNLIKELY(x) __builtin_expect (!!(x), 0)
-#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 9) || __GNUC__ > 4
-#define ALIGNED64 __attribute__ ((aligned (64)))
-#define ALIGNED16 __attribute__ ((aligned (16)))
-#else // there is a bug in gcc 4.7.x when using openmp and aligned memory and -O3
-#define ALIGNED64
-#define ALIGNED16
-#endif
-#else
-#define RESTRICT
-#define LIKELY(x)    (x)
-#define UNLIKELY(x)  (x)
-#define ALIGNED64
-#define ALIGNED16
-#endif
-#ifndef __clang__
-#define _RT_NESTED_OPENMP _OPENMP
-#endif
+    #ifdef __GNUC__
+        #define RESTRICT    __restrict__
+        #define LIKELY(x)   __builtin_expect (!!(x), 1)
+        #define UNLIKELY(x) __builtin_expect (!!(x), 0)
+        #if (__GNUC__ == 4 && __GNUC_MINOR__ >= 9) || __GNUC__ > 4
+            #define ALIGNED64 __attribute__ ((aligned (64)))
+            #define ALIGNED16 __attribute__ ((aligned (16)))
+        #else // there is a bug in gcc 4.7.x when using openmp and aligned memory and -O3
+            #define ALIGNED64
+            #define ALIGNED16
+        #endif
+    #else
+        #define RESTRICT
+        #define LIKELY(x)    (x)
+        #define UNLIKELY(x)  (x)
+        #define ALIGNED64
+        #define ALIGNED16
+    #endif
+    #ifndef __clang__
+        #define _RT_NESTED_OPENMP _OPENMP
+    #endif
 #endif
--- a/rtengine/procparams.h
+++ b/rtengine/procparams.h
@@ -24,6 +24,7 @@
 #include <cstdio>
 #include <cmath>
 #include "LUT.h"
+#include "coord.h"

 class ParamsEdited;

@@ -1251,7 +1252,7 @@ public:
    ResizeParams            resize;          ///< Resize parameters
    ColorManagementParams   icm;             ///< profiles/color spaces used during the image processing
    RAWParams               raw;             ///< RAW parameters before demosaicing
-    WaveletParams           wavelet;       ///< wavelet wavelet parameters
+    WaveletParams           wavelet;         ///< Wavelet parameters
    DirPyrEqualizerParams   dirpyrequalizer; ///< directional pyramid wavelet parameters
    HSVEqualizerParams      hsvequalizer;    ///< hsv wavelet parameters
    FilmSimulationParams    filmSimulation;  ///< film simulation parameters