diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7c60fa37c..6d51503d6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -252,6 +252,12 @@ if(NOT DEFINED APPDATADIR)
     endif()
 endif()
 
+if (NOT APPLE)
+    if(DEFINED LENSFUNDBDIR AND NOT IS_ABSOLUTE "${LENSFUNDBDIR}") 
+        set(LENSFUNDBDIR "${DATADIR}/${LENSFUNDBDIR}") 
+    endif()
+endif()
+
 if(APPLE)
     if("${CODESIGNID}")
         set(CODESIGNID "${CODESIGNID}" CACHE STRING "Codesigning Identity")
diff --git a/rtengine/camconst.cc b/rtengine/camconst.cc
index d136d6a21..c5cfc26fa 100644
--- a/rtengine/camconst.cc
+++ b/rtengine/camconst.cc
@@ -638,7 +638,7 @@ CameraConst::update_globalGreenEquilibration(bool other)
 }
 
 bool
-CameraConstantsStore::parse_camera_constants_file(Glib::ustring filename_)
+CameraConstantsStore::parse_camera_constants_file(const Glib::ustring& filename_)
 {
     // read the file into a single long string
     const char *filename = filename_.c_str();
@@ -809,7 +809,7 @@ CameraConstantsStore::~CameraConstantsStore()
     }
 }
 
-void CameraConstantsStore::init(Glib::ustring baseDir, Glib::ustring userSettingsDir)
+void CameraConstantsStore::init(const Glib::ustring& baseDir, const Glib::ustring& userSettingsDir)
 {
     parse_camera_constants_file(Glib::build_filename(baseDir, "camconst.json"));
 
diff --git a/rtengine/camconst.h b/rtengine/camconst.h
index 1096e1767..3af05ab01 100644
--- a/rtengine/camconst.h
+++ b/rtengine/camconst.h
@@ -3,10 +3,17 @@
  */
 #pragma once
 
-#include <glibmm/ustring.h>
 #include <map>
+#include <string>
 #include <vector>
 
+namespace Glib
+{
+
+class ustring;
+
+}
+
 namespace rtengine
 {
 
@@ -14,7 +21,7 @@ struct camera_const_levels {
     int levels[4];
 };
 
-class CameraConst
+class CameraConst final
 {
 private:
     std::string make_model;
@@ -56,17 +63,17 @@ public:
     void update_globalGreenEquilibration(bool other);
 };
 
-class CameraConstantsStore
+class CameraConstantsStore final
 {
 private:
     std::map<std::string, CameraConst *> mCameraConstants;
 
     CameraConstantsStore();
-    bool parse_camera_constants_file(Glib::ustring filename);
+    bool parse_camera_constants_file(const Glib::ustring& filename);
 
 public:
     ~CameraConstantsStore();
-    void init(Glib::ustring baseDir, Glib::ustring userSettingsDir);
+    void init(const Glib::ustring& baseDir, const Glib::ustring& userSettingsDir);
     static CameraConstantsStore *getInstance(void);
     CameraConst *get(const char make[], const char model[]);
 };
diff --git a/rtengine/cieimage.h b/rtengine/cieimage.h
index fea675cd4..329c0ba74 100644
--- a/rtengine/cieimage.h
+++ b/rtengine/cieimage.h
@@ -23,7 +23,7 @@
 namespace rtengine
 {
 
-class CieImage :
+class CieImage final :
     public NonCopyable
 {
 private:
diff --git a/rtengine/color.h b/rtengine/color.h
index cb2a16a49..ad51dd49b 100644
--- a/rtengine/color.h
+++ b/rtengine/color.h
@@ -20,7 +20,6 @@
 #pragma once
 
 #include <array>
-#include <glibmm/ustring.h>
 
 #include "rt_math.h"
 #include "LUT.h"
@@ -30,6 +29,13 @@
 
 #define SAT(a,b,c) ((float)max(a,b,c)-(float)min(a,b,c))/(float)max(a,b,c)
 
+namespace Glib
+{
+
+class ustring;
+
+}
+
 namespace rtengine
 {
 
diff --git a/rtengine/curves.cc b/rtengine/curves.cc
index de1a378c2..35cc34089 100644
--- a/rtengine/curves.cc
+++ b/rtengine/curves.cc
@@ -21,8 +21,7 @@
 #include <memory>
 #include <cmath>
 #include <cstring>
-#include <glib.h>
-#include <glib/gstdio.h>
+#include <glibmm/ustring.h>
 
 #include "rt_math.h"
 
@@ -4104,7 +4103,7 @@ void PerceptualToneCurve::init()
     }
 }
 
-void PerceptualToneCurve::initApplyState(PerceptualToneCurveState & state, Glib::ustring workingSpace) const
+void PerceptualToneCurve::initApplyState(PerceptualToneCurveState & state, const Glib::ustring &workingSpace) const
 {
 
     // Get the curve's contrast value, and convert to a chroma scaling
diff --git a/rtengine/curves.h b/rtengine/curves.h
index 76b79b020..5d9e19940 100644
--- a/rtengine/curves.h
+++ b/rtengine/curves.h
@@ -22,21 +22,23 @@
 #include <string>
 #include <vector>
 
-#include <glibmm/ustring.h>
-
 #include "rt_math.h"
 #include "flatcurvetypes.h"
 #include "diagonalcurvetypes.h"
-#include "pipettebuffer.h"
-
+#include "noncopyable.h"
 #include "LUT.h"
 #include "sleef.h"
 #define CURVES_MIN_POLY_POINTS  1000
 
-#include "rt_math.h"
-
 #define CLIPI(a) ((a)>0?((a)<65534?(a):65534):0)
 
+namespace Glib
+{
+
+class ustring;
+
+}
+
 using namespace std;
 
 namespace rtengine
@@ -475,7 +477,7 @@ public:
     virtual bool   isIdentity() const = 0;
 };
 
-class DiagonalCurve : public Curve
+class DiagonalCurve final : public Curve
 {
 
 protected:
@@ -497,7 +499,7 @@ public:
     };
 };
 
-class FlatCurve : public Curve
+class FlatCurve final : public Curve
 {
 
 private:
@@ -1742,7 +1744,7 @@ private:
     float calculateToneCurveContrastValue() const;
 public:
     static void init();
-    void initApplyState(PerceptualToneCurveState & state, Glib::ustring workingSpace) const;
+    void initApplyState(PerceptualToneCurveState & state, const Glib::ustring& workingSpace) const;
     void BatchApply(const size_t start, const size_t end, float *r, float *g, float *b, const PerceptualToneCurveState &state) const;
 };
 
diff --git a/rtengine/dcraw.cc b/rtengine/dcraw.cc
index 812f122b3..0284b3f1d 100644
--- a/rtengine/dcraw.cc
+++ b/rtengine/dcraw.cc
@@ -9868,6 +9868,7 @@ void CLASS identify()
     filters = 0;
     tiff_samples = colors = 3;
     load_raw = &CLASS canon_sraw_load_raw;
+    FORC4 cblack[c] = 0; // ALB
   } else if (!strcmp(model,"PowerShot 600")) {
     height = 613;
     width  = 854;
diff --git a/rtengine/dcrop.cc b/rtengine/dcrop.cc
index b20fd6a33..d1b6bb394 100644
--- a/rtengine/dcrop.cc
+++ b/rtengine/dcrop.cc
@@ -24,6 +24,7 @@
 #include "dcrop.h"
 #include "image8.h"
 #include "imagefloat.h"
+#include "improccoordinator.h"
 #include "labimage.h"
 #include "mytime.h"
 #include "procparams.h"
diff --git a/rtengine/dcrop.h b/rtengine/dcrop.h
index ea7a53473..3f8a8ad6d 100644
--- a/rtengine/dcrop.h
+++ b/rtengine/dcrop.h
@@ -18,11 +18,7 @@
  */
 #pragma once
 
-#include "improccoordinator.h"
 #include "rtengine.h"
-#include "improcfun.h"
-#include "imagesource.h"
-#include "procevents.h"
 #include "pipettebuffer.h"
 #include "../rtgui/threadutils.h"
 
@@ -30,12 +26,13 @@ namespace rtengine
 {
 
 class Image8;
+class CieImage;
 
 using namespace procparams;
 
 class ImProcCoordinator;
 
-class Crop : public DetailedCrop, public PipetteBuffer
+class Crop final : public DetailedCrop, public PipetteBuffer
 {
 
 protected:
diff --git a/rtengine/dfmanager.h b/rtengine/dfmanager.h
index 216dcfc53..b23981ffb 100644
--- a/rtengine/dfmanager.h
+++ b/rtengine/dfmanager.h
@@ -31,7 +31,7 @@ namespace rtengine
 {
 
 class RawImage;
-class dfInfo
+class dfInfo final
 {
 public:
 
@@ -74,7 +74,7 @@ protected:
     void updateRawImage();
 };
 
-class DFManager
+class DFManager final
 {
 public:
     void init(const Glib::ustring &pathname);
diff --git a/rtengine/diagonalcurves.cc b/rtengine/diagonalcurves.cc
index bb20b7cc1..e81f2fe92 100644
--- a/rtengine/diagonalcurves.cc
+++ b/rtengine/diagonalcurves.cc
@@ -16,8 +16,6 @@
  *  You should have received a copy of the GNU General Public License
  *  along with RawTherapee.  If not, see <https://www.gnu.org/licenses/>.
  */
-#include <glib.h>
-#include <glib/gstdio.h>
 #include "curves.h"
 #include <cmath>
 #include <vector>
diff --git a/rtengine/ffmanager.h b/rtengine/ffmanager.h
index 80ef5fa1c..f3303f12b 100644
--- a/rtengine/ffmanager.h
+++ b/rtengine/ffmanager.h
@@ -29,7 +29,7 @@ namespace rtengine
 {
 
 class RawImage;
-class ffInfo
+class ffInfo final
 {
 public:
 
@@ -71,7 +71,7 @@ protected:
     void updateRawImage();
 };
 
-class FFManager
+class FFManager final
 {
 public:
     void init(const Glib::ustring &pathname);
diff --git a/rtengine/guidedfilter.cc b/rtengine/guidedfilter.cc
index 3d113424b..b3f843bc1 100644
--- a/rtengine/guidedfilter.cc
+++ b/rtengine/guidedfilter.cc
@@ -28,6 +28,8 @@
  * available at https://arxiv.org/abs/1505.00996
  */
 
+#include "array2D.h"
+#include "boxblur.h"
 #include "guidedfilter.h"
 #include "boxblur.h"
 #include "sleef.h"
diff --git a/rtengine/iccstore.h b/rtengine/iccstore.h
index 731a155bb..117124122 100644
--- a/rtengine/iccstore.h
+++ b/rtengine/iccstore.h
@@ -39,7 +39,7 @@ namespace procparams
 
 typedef const double(*TMatrix)[3];
 
-class ProfileContent
+class ProfileContent final
 {
 public:
     ProfileContent();
@@ -54,7 +54,7 @@ private:
     std::string data;
 };
 
-class ICCStore
+class ICCStore final
 {
 public:
     enum class ProfileType {
diff --git a/rtengine/iimage.h b/rtengine/iimage.h
index 7309dd91f..247d0cdaa 100644
--- a/rtengine/iimage.h
+++ b/rtengine/iimage.h
@@ -20,7 +20,6 @@
 
 #include <vector>
 
-#include <glibmm/ustring.h>
 #include <lcms2.h>
 
 #include "alignedbuffer.h"
@@ -41,6 +40,13 @@
 
 #define CHECK_BOUNDS 0
 
+namespace Glib
+{
+
+class ustring;
+
+}
+
 namespace rtengine
 {
 
@@ -658,7 +664,7 @@ public:
 
     /* If any of the required allocation fails, "width" and "height" are set to -1, and all remaining buffer are freed
      * Can be safely used to reallocate an existing image */
-    void allocate (int W, int H) override
+    void allocate (int W, int H) final
     {
 
         if (W == width && H == height) {
@@ -746,7 +752,7 @@ public:
         }
     }
 
-    void rotate (int deg) override
+    void rotate (int deg) final
     {
 
         if (deg == 90) {
@@ -873,7 +879,7 @@ public:
         }
     }
 
-    void hflip () override
+    void hflip () final
     {
         int width2 = width / 2;
 
@@ -905,7 +911,7 @@ public:
 #endif
     }
 
-    void vflip () override
+    void vflip () final
     {
 
         int height2 = height / 2;
@@ -989,7 +995,7 @@ public:
         }
     }
 
-    void computeHistogramAutoWB (double &avg_r, double &avg_g, double &avg_b, int &n, LUTu &histogram, const int compression) const override
+    void computeHistogramAutoWB (double &avg_r, double &avg_g, double &avg_b, int &n, LUTu &histogram, const int compression) const final
     {
         histogram.clear();
         avg_r = avg_g = avg_b = 0.;
@@ -1328,7 +1334,7 @@ public:
      * If any of the required allocation fails, "width" and "height" are set to -1, and all remaining buffer are freed
      * Can be safely used to reallocate an existing image or to free up it's memory with "allocate (0,0);"
      */
-    void allocate (int W, int H) override
+    void allocate (int W, int H) final
     {
 
         if (W == width && H == height) {
@@ -1382,7 +1388,7 @@ public:
         memcpy (dest->data, data, 3 * width * height * sizeof(T));
     }
 
-    void rotate (int deg) override
+    void rotate (int deg) final
     {
 
         if (deg == 90) {
@@ -1516,7 +1522,7 @@ public:
         }
     }
 
-    void hflip () override
+    void hflip () final
     {
         int width2 = width / 2;
 
@@ -1552,7 +1558,7 @@ public:
         }
     }
 
-    void vflip () override
+    void vflip () final
     {
 
         AlignedBuffer<T> lBuffer(3 * width);
@@ -1619,7 +1625,7 @@ public:
         }
     }
 
-    void computeHistogramAutoWB (double &avg_r, double &avg_g, double &avg_b, int &n, LUTu &histogram, const int compression) const override
+    void computeHistogramAutoWB (double &avg_r, double &avg_g, double &avg_b, int &n, LUTu &histogram, const int compression) const final
     {
         histogram.clear();
         avg_r = avg_g = avg_b = 0.;
diff --git a/rtengine/image16.h b/rtengine/image16.h
index d0053cbfc..de9288f2d 100644
--- a/rtengine/image16.h
+++ b/rtengine/image16.h
@@ -29,7 +29,7 @@ namespace rtengine
 class Image8;
 class Imagefloat;
 
-class Image16 : public IImage16, public ImageIO
+class Image16 final : public IImage16, public ImageIO
 {
 
 public:
diff --git a/rtengine/image8.h b/rtengine/image8.h
index f125dccf8..c11d9d7fc 100644
--- a/rtengine/image8.h
+++ b/rtengine/image8.h
@@ -27,7 +27,7 @@ namespace rtengine
 {
 class Imagefloat;
 
-class Image8 : public IImage8, public ImageIO
+class Image8 final : public IImage8, public ImageIO
 {
 
 public:
diff --git a/rtengine/imagedata.h b/rtengine/imagedata.h
index ff8ed4b86..4bf9bdf5b 100644
--- a/rtengine/imagedata.h
+++ b/rtengine/imagedata.h
@@ -23,12 +23,18 @@
 #include <string>
 #include <vector>
 
-#include <glibmm/ustring.h>
 
 #include <libiptcdata/iptc-data.h>
 
 #include "imageio.h"
 
+namespace Glib
+{
+
+class ustring;
+
+}
+
 namespace rtexif
 {
 
@@ -38,7 +44,7 @@ class TagDirectory;
 namespace rtengine
 {
 
-class FrameData
+class FrameData final
 {
 
 protected:
@@ -95,7 +101,7 @@ public:
     int getRating () const;
 };
 
-class FramesData : public FramesMetaData {
+class FramesData final : public FramesMetaData {
 private:
     // frame's root IFD, can be a file root IFD or a SUB-IFD
     std::vector<std::unique_ptr<FrameData>> frames;
diff --git a/rtengine/imagefloat.h b/rtengine/imagefloat.h
index 4a2b2f7e1..cd08daf70 100644
--- a/rtengine/imagefloat.h
+++ b/rtengine/imagefloat.h
@@ -34,7 +34,7 @@ class LabImage;
 /*
  * Image type used by most tools; expected range: [0.0 ; 65535.0]
  */
-class Imagefloat : public IImagefloat, public ImageIO
+class Imagefloat final : public IImagefloat, public ImageIO
 {
 
 public:
diff --git a/rtengine/imagesource.h b/rtengine/imagesource.h
index e0c26aa9f..76f4e435a 100644
--- a/rtengine/imagesource.h
+++ b/rtengine/imagesource.h
@@ -25,7 +25,6 @@
 
 #include "coord2d.h"
 #include "imagedata.h"
-#include "LUT.h"
 #include "rtengine.h"
 
 template<typename T>
@@ -143,11 +142,11 @@ public:
 
     virtual void        setProgressListener (ProgressListener* pl) {}
 
-    void        increaseRef () override
+    void        increaseRef () final
     {
         references++;
     }
-    void        decreaseRef () override
+    void        decreaseRef () final
     {
         references--;
 
@@ -175,15 +174,15 @@ public:
         return dirpyrdenoiseExpComp;
     }
     // functions inherited from the InitialImage interface
-    Glib::ustring getFileName () override
+    Glib::ustring getFileName () final
     {
         return fileName;
     }
-    cmsHPROFILE getEmbeddedProfile () override
+    cmsHPROFILE getEmbeddedProfile () final
     {
         return embProfile;
     }
-    const FramesMetaData* getMetaData () override
+    const FramesMetaData* getMetaData () final
     {
         return idata;
     }
diff --git a/rtengine/improccoordinator.h b/rtengine/improccoordinator.h
index cae396c37..aeb29f257 100644
--- a/rtengine/improccoordinator.h
+++ b/rtengine/improccoordinator.h
@@ -27,7 +27,6 @@
 #include "imagesource.h"
 #include "improcfun.h"
 #include "LUT.h"
-#include "procevents.h"
 #include "rtengine.h"
 
 #include "../rtgui/threadutils.h"
@@ -55,7 +54,7 @@ class Crop;
   * but using this class' LUT and other precomputed parameters. The main preview area is displaying a non framed Crop object,
   * while detail windows are framed Crop objects.
   */
-class ImProcCoordinator : public StagedImageProcessor
+class ImProcCoordinator final : public StagedImageProcessor
 {
 
     friend class Crop;
diff --git a/rtengine/improcfun.cc b/rtengine/improcfun.cc
index 9b35dc2ed..6b3a75f60 100644
--- a/rtengine/improcfun.cc
+++ b/rtengine/improcfun.cc
@@ -31,15 +31,16 @@
 #include "cieimage.h"
 #include "clutstore.h"
 #include "color.h"
+#include "colortemp.h"
 #include "curves.h"
 #include "dcp.h"
 #include "EdgePreservingDecomposition.h"
 #include "iccmatrices.h"
 #include "iccstore.h"
 #include "imagesource.h"
-#include "improccoordinator.h"
 #include "improcfun.h"
 #include "labimage.h"
+#include "pipettebuffer.h"
 #include "procparams.h"
 #include "rt_math.h"
 #include "rtengine.h"
diff --git a/rtengine/improcfun.h b/rtengine/improcfun.h
index be5566483..23154ecd2 100644
--- a/rtengine/improcfun.h
+++ b/rtengine/improcfun.h
@@ -19,6 +19,7 @@
 #pragma once
 
 #include <memory>
+#include <vector>
 
 #include "coord2d.h"
 #include "gamutwarning.h"
@@ -28,6 +29,12 @@
 #include "imagesource.h"
 #include <cairomm/cairomm.h>
 
+namespace Glib
+{
+
+class ustring;
+
+}
 template<typename T>
 class LUT;
 
@@ -56,6 +63,7 @@ class LocLHCurve;
 class LocHHCurve;
 class NoiseCurve;
 class OpacityCurve;
+class PipetteBuffer;
 class ToneCurve;
 class WavCurve;
 class WavOpacityCurveBY;
diff --git a/rtengine/ipdehaze.cc b/rtengine/ipdehaze.cc
index 69cb3b17c..4b72e269d 100644
--- a/rtengine/ipdehaze.cc
+++ b/rtengine/ipdehaze.cc
@@ -32,6 +32,7 @@
 #include <iostream>
 #include <vector>
 
+#include "array2D.h"
 #include "color.h"
 #include "guidedfilter.h"
 #include "iccstore.h"
diff --git a/rtengine/labimage.h b/rtengine/labimage.h
index 4dd10dfb0..6f394645d 100644
--- a/rtengine/labimage.h
+++ b/rtengine/labimage.h
@@ -23,7 +23,7 @@
 namespace rtengine
 {
 
-class LabImage
+class LabImage final
 {
 private:
     void allocLab(size_t w, size_t h);
diff --git a/rtengine/opthelper.h b/rtengine/opthelper.h
index b65ede227..f431c0ec9 100644
--- a/rtengine/opthelper.h
+++ b/rtengine/opthelper.h
@@ -18,27 +18,24 @@
 //  along with this program.  If not, see <https://www.gnu.org/licenses/>.
 //
 ////////////////////////////////////////////////////////////////
+#pragma once
 
-#ifndef OPTHELPER_H
-    #define OPTHELPER_H
+#define pow_F(a,b) (xexpf(b*xlogf(a)))
 
-    #define pow_F(a,b) (xexpf(b*xlogf(a)))
-
-    #ifdef __SSE2__
-        #include "sleefsseavx.c"
-    #endif
-
-    #ifdef __GNUC__
-        #define RESTRICT    __restrict__
-        #define LIKELY(x)   __builtin_expect (!!(x), 1)
-        #define UNLIKELY(x) __builtin_expect (!!(x), 0)
-        #define ALIGNED64 __attribute__ ((aligned (64)))
-        #define ALIGNED16 __attribute__ ((aligned (16)))
-    #else
-        #define RESTRICT
-        #define LIKELY(x)    (x)
-        #define UNLIKELY(x)  (x)
-        #define ALIGNED64
-        #define ALIGNED16
-    #endif
+#ifdef __SSE2__
+    #include "sleefsseavx.h"
+#endif
+
+#ifdef __GNUC__
+    #define RESTRICT    __restrict__
+    #define LIKELY(x)   __builtin_expect (!!(x), 1)
+    #define UNLIKELY(x) __builtin_expect (!!(x), 0)
+    #define ALIGNED64 __attribute__ ((aligned (64)))
+    #define ALIGNED16 __attribute__ ((aligned (16)))
+#else
+    #define RESTRICT
+    #define LIKELY(x)    (x)
+    #define UNLIKELY(x)  (x)
+    #define ALIGNED64
+    #define ALIGNED16
 #endif
diff --git a/rtengine/previewimage.h b/rtengine/previewimage.h
index e6c3ea070..2143509a3 100644
--- a/rtengine/previewimage.h
+++ b/rtengine/previewimage.h
@@ -18,10 +18,16 @@
  */
 #pragma once
 
-#include <glibmm/ustring.h>
 
 #include <cairomm/cairomm.h>
 
+namespace Glib
+{
+
+class ustring;
+
+}
+
 namespace rtengine
 {
 
diff --git a/rtengine/processingjob.h b/rtengine/processingjob.h
index 6cdc6bd7c..9c3846958 100644
--- a/rtengine/processingjob.h
+++ b/rtengine/processingjob.h
@@ -24,7 +24,7 @@
 namespace rtengine
 {
 
-class ProcessingJobImpl : public ProcessingJob
+class ProcessingJobImpl final : public ProcessingJob
 {
 
 public:
diff --git a/rtengine/rawimagesource.h b/rtengine/rawimagesource.h
index eff7a9593..b514b24a3 100644
--- a/rtengine/rawimagesource.h
+++ b/rtengine/rawimagesource.h
@@ -37,7 +37,7 @@ class DiagonalCurve;
 class RetinextransmissionCurve;
 class RetinexgaintransmissionCurve;
 
-class RawImageSource : public ImageSource
+class RawImageSource final : public ImageSource
 {
 private:
     static DiagonalCurve *phaseOneIccCurve;
diff --git a/rtengine/rescale.h b/rtengine/rescale.h
index 70974aa48..2138cd8e8 100644
--- a/rtengine/rescale.h
+++ b/rtengine/rescale.h
@@ -20,9 +20,12 @@
 
 #pragma once
 
-#include "array2D.h"
 #include "rt_math.h"
 
+template<typename T>
+class array2D;
+
+
 namespace rtengine
 {
 
diff --git a/rtengine/rtengine.h b/rtengine/rtengine.h
index be623ce57..96ffeb918 100644
--- a/rtengine/rtengine.h
+++ b/rtengine/rtengine.h
@@ -31,7 +31,6 @@
 #include "imageformat.h"
 #include "procevents.h"
 #include "rawmetadatalocation.h"
-#include "rt_math.h"
 #include "settings.h"
 
 #include "../rtgui/threadutils.h"
diff --git a/rtengine/rtthumbnail.h b/rtengine/rtthumbnail.h
index dcc9596f6..c8d657a62 100644
--- a/rtengine/rtthumbnail.h
+++ b/rtengine/rtthumbnail.h
@@ -18,8 +18,6 @@
  */
 #pragma once
 
-#include <glibmm/ustring.h>
-
 #include <lcms2.h>
 
 #include "image16.h"
@@ -30,6 +28,13 @@
 
 #include "../rtgui/threadutils.h"
 
+namespace Glib
+{
+
+class ustring;
+
+}
+
 namespace rtengine
 {
 
diff --git a/rtengine/sleefsseavx.c b/rtengine/sleefsseavx.c
deleted file mode 100644
index 0af516f9b..000000000
--- a/rtengine/sleefsseavx.c
+++ /dev/null
@@ -1,1483 +0,0 @@
-////////////////////////////////////////////////////////////////
-//
-//  this code was taken from http://shibatch.sourceforge.net/
-//  Many thanks to the author of original version: Naoki Shibata
-//
-//  This version contains modifications made by Ingo Weyrich
-//
-////////////////////////////////////////////////////////////////
-
-
-#ifndef SLEEFSSEAVX
-#define SLEEFSSEAVX
-
-#include <assert.h>
-#include "rt_math.h"
-#ifdef __SSE2__
-#include "helpersse2.h"
-
-#ifdef ENABLE_AVX
-#include "helperavx.h"
-#endif
-
-#ifdef __GNUC__
-#define INLINE __inline
-#else
-#define INLINE inline
-#endif
-
-#define PI4_A .7853981554508209228515625
-#define PI4_B .794662735614792836713604629039764404296875e-8
-#define PI4_C .306161699786838294306516483068750264552437361480769e-16
-#define M_4_PI 1.273239544735162542821171882678754627704620361328125
-
-#define L2U .69314718055966295651160180568695068359375
-#define L2L .28235290563031577122588448175013436025525412068e-12
-#define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931
-
-#define PI4_Af 0.78515625f
-#define PI4_Bf 0.00024127960205078125f
-#define PI4_Cf 6.3329935073852539062e-07f
-#define PI4_Df 4.9604681473525147339e-10f
-
-#define L2Uf 0.693145751953125f
-#define L2Lf 1.428606765330187045e-06f
-#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f
-
-#define INFINITYf ((float)rtengine::RT_INFINITY)
-#define NANf ((float)rtengine::RT_NAN)
-
-static INLINE vdouble vadd3(vdouble v0, vdouble v1, vdouble v2) {
-    return vadd(vadd(v0, v1), v2);
-}
-
-static INLINE vdouble vadd4(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
-    return vadd3(vadd(v0, v1), v2, v3);
-}
-
-static INLINE vdouble vadd5(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
-    return vadd4(vadd(v0, v1), v2, v3, v4);
-}
-
-static INLINE vdouble vadd6(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) {
-    return vadd5(vadd(v0, v1), v2, v3, v4, v5);
-}
-
-static INLINE vdouble vadd7(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5, vdouble v6) {
-    return vadd6(vadd(v0, v1), v2, v3, v4, v5, v6);
-}
-
-static INLINE vdouble vsub3(vdouble v0, vdouble v1, vdouble v2) {
-    return vsub(vsub(v0, v1), v2);
-}
-
-static INLINE vdouble vsub4(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
-    return vsub3(vsub(v0, v1), v2, v3);
-}
-
-static INLINE vdouble vsub5(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
-    return vsub4(vsub(v0, v1), v2, v3, v4);
-}
-
-//
-
-static INLINE vdouble2 normalize_d(vdouble2 t) {
-    vdouble2 s;
-
-    s.x = vadd(t.x, t.y);
-    s.y = vadd(vsub(t.x, s.x), t.y);
-
-    return s;
-}
-
-static INLINE vdouble2 scale_d(vdouble2 d, vdouble s) {
-    vdouble2 r = {vmul(d.x, s), vmul(d.y, s)};
-    return r;
-}
-
-static INLINE vdouble2 add_ss(vdouble x, vdouble y) {
-    vdouble2 r;
-
-    r.x = vadd(x, y);
-    r.y = vadd(vsub(x, r.x), y);
-
-    return r;
-}
-
-static INLINE vdouble2 add2_ss(vdouble x, vdouble y) {
-    vdouble2 r;
-
-    r.x = vadd(x, y);
-    vdouble v = vsub(r.x, x);
-    r.y = vadd(vsub(x, vsub(r.x, v)), vsub(y, v));
-
-    return r;
-}
-
-static INLINE vdouble2 add_ds(vdouble2 x, vdouble y) {
-    vdouble2 r;
-
-    r.x = vadd(x.x, y);
-    r.y = vadd3(vsub(x.x, r.x), y, x.y);
-
-    return r;
-}
-
-static INLINE vdouble2 add2_ds(vdouble2 x, vdouble y) {
-    vdouble2 r;
-
-    r.x = vadd(x.x, y);
-    vdouble v = vsub(r.x, x.x);
-    r.y = vadd(vsub(x.x, vsub(r.x, v)), vsub(y, v));
-    r.y = vadd(r.y, x.y);
-
-    return r;
-}
-
-static INLINE vdouble2 add_sd(vdouble x, vdouble2 y) {
-    vdouble2 r;
-
-    r.x = vadd(x, y.x);
-    r.y = vadd3(vsub(x, r.x), y.x, y.y);
-
-    return r;
-}
-
-static INLINE vdouble2 add_dd(vdouble2 x, vdouble2 y) {
-    // |x| >= |y|
-
-    vdouble2 r;
-
-    r.x = vadd(x.x, y.x);
-    r.y = vadd4(vsub(x.x, r.x), y.x, x.y, y.y);
-
-    return r;
-}
-
-static INLINE vdouble2 add2_dd(vdouble2 x, vdouble2 y) {
-    vdouble2 r;
-
-    r.x  = vadd(x.x, y.x);
-    vdouble v = vsub(r.x, x.x);
-    r.y = vadd(vsub(x.x, vsub(r.x, v)), vsub(y.x, v));
-    r.y = vadd(r.y, vadd(x.y, y.y));
-
-    return r;
-}
-
-static INLINE vdouble2 div_dd(vdouble2 n, vdouble2 d) {
-    vdouble t = vrec(d.x);
-    vdouble dh  = vupper(d.x), dl  = vsub(d.x,  dh);
-    vdouble th  = vupper(t  ), tl  = vsub(t  ,  th);
-    vdouble nhh = vupper(n.x), nhl = vsub(n.x, nhh);
-
-    vdouble2 q;
-
-    q.x = vmul(n.x, t);
-
-    vdouble u = vadd5(vsub(vmul(nhh, th), q.x), vmul(nhh, tl), vmul(nhl, th), vmul(nhl, tl),
-            vmul(q.x, vsub5(vcast_vd_d(1), vmul(dh, th), vmul(dh, tl), vmul(dl, th), vmul(dl, tl))));
-
-    q.y = vadd(vmul(t, vsub(n.y, vmul(q.x, d.y))), u);
-
-    return q;
-}
-
-static INLINE vdouble2 mul_ss(vdouble x, vdouble y) {
-    vdouble xh = vupper(x), xl = vsub(x, xh);
-    vdouble yh = vupper(y), yl = vsub(y, yh);
-    vdouble2 r;
-
-    r.x = vmul(x, y);
-    r.y = vadd5(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl));
-
-    return r;
-}
-
-static INLINE vdouble2 mul_ds(vdouble2 x, vdouble y) {
-    vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
-    vdouble yh = vupper(y  ), yl = vsub(y  , yh);
-    vdouble2 r;
-
-    r.x = vmul(x.x, y);
-    r.y = vadd6(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl), vmul(x.y, y));
-
-    return r;
-}
-
-static INLINE vdouble2 mul_dd(vdouble2 x, vdouble2 y) {
-    vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
-    vdouble yh = vupper(y.x), yl = vsub(y.x, yh);
-    vdouble2 r;
-
-    r.x = vmul(x.x, y.x);
-    r.y = vadd7(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl), vmul(x.x, y.y), vmul(x.y, y.x));
-
-    return r;
-}
-
-static INLINE vdouble2 squ_d(vdouble2 x) {
-    vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
-    vdouble2 r;
-
-    r.x = vmul(x.x, x.x);
-    r.y = vadd5(vmul(xh, xh), vneg(r.x), vmul(vadd(xh, xh), xl), vmul(xl, xl), vmul(x.x, vadd(x.y, x.y)));
-
-    return r;
-}
-
-static INLINE vdouble2 rec_s(vdouble d) {
-    vdouble t = vrec(d);
-    vdouble dh = vupper(d), dl = vsub(d, dh);
-    vdouble th = vupper(t), tl = vsub(t, th);
-    vdouble2 q;
-
-    q.x = t;
-    q.y = vmul(t, vsub5(vcast_vd_d(1), vmul(dh, th), vmul(dh, tl), vmul(dl, th), vmul(dl, tl)));
-
-    return q;
-}
-
-static INLINE vdouble2 sqrt_d(vdouble2 d) {
-    vdouble t = vsqrt(vadd(d.x, d.y));
-    return scale_d(mul_dd(add2_dd(d, mul_ss(t, t)), rec_s(t)), vcast_vd_d(0.5));
-}
-
-//
-
-static INLINE vdouble xldexp(vdouble x, vint q) { return vldexp(x, q); }
-
-static INLINE vint xilogb(vdouble d) {
-    vdouble e = vcast_vd_vi(vsubi(vilogbp1(vabs(d)), vcast_vi_i(1)));
-    e = vsel(vmask_eq(d, vcast_vd_d(0)), vcast_vd_d(-2147483648.0), e);
-    e = vsel(vmask_eq(vabs(d), vcast_vd_d(rtengine::RT_INFINITY)), vcast_vd_d(2147483647), e);
-    return vrint_vi_vd(e);
-}
-
-static INLINE vdouble xsin(vdouble d) {
-    vint q;
-    vdouble u, s;
-
-    q = vrint_vi_vd(vmul(d, vcast_vd_d(rtengine::RT_1_PI)));
-
-    u = vcast_vd_vi(q);
-    d = vadd(d, vmul(u, vcast_vd_d(-PI4_A*4)));
-    d = vadd(d, vmul(u, vcast_vd_d(-PI4_B*4)));
-    d = vadd(d, vmul(u, vcast_vd_d(-PI4_C*4)));
-
-    s = vmul(d, d);
-
-    d = vsel(vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1)), vneg(d), d);
-
-    u = vcast_vd_d(-7.97255955009037868891952e-18);
-    u = vmla(u, s, vcast_vd_d(2.81009972710863200091251e-15));
-    u = vmla(u, s, vcast_vd_d(-7.64712219118158833288484e-13));
-    u = vmla(u, s, vcast_vd_d(1.60590430605664501629054e-10));
-    u = vmla(u, s, vcast_vd_d(-2.50521083763502045810755e-08));
-    u = vmla(u, s, vcast_vd_d(2.75573192239198747630416e-06));
-    u = vmla(u, s, vcast_vd_d(-0.000198412698412696162806809));
-    u = vmla(u, s, vcast_vd_d(0.00833333333333332974823815));
-    u = vmla(u, s, vcast_vd_d(-0.166666666666666657414808));
-
-    u = vmla(s, vmul(u, d), d);
-
-    return u;
-}
-
-static INLINE vdouble xcos(vdouble d) {
-    vint q;
-    vdouble u, s;
-
-    q = vrint_vi_vd(vsub(vmul(d, vcast_vd_d(rtengine::RT_1_PI)), vcast_vd_d(0.5)));
-    q = vaddi(vaddi(q, q), vcast_vi_i(1));
-
-    u = vcast_vd_vi(q);
-    d = vadd(d, vmul(u, vcast_vd_d(-PI4_A*2)));
-    d = vadd(d, vmul(u, vcast_vd_d(-PI4_B*2)));
-    d = vadd(d, vmul(u, vcast_vd_d(-PI4_C*2)));
-
-    s = vmul(d, d);
-
-    d = vsel(vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(0)), vneg(d), d);
-
-    u = vcast_vd_d(-7.97255955009037868891952e-18);
-    u = vmla(u, s, vcast_vd_d(2.81009972710863200091251e-15));
-    u = vmla(u, s, vcast_vd_d(-7.64712219118158833288484e-13));
-    u = vmla(u, s, vcast_vd_d(1.60590430605664501629054e-10));
-    u = vmla(u, s, vcast_vd_d(-2.50521083763502045810755e-08));
-    u = vmla(u, s, vcast_vd_d(2.75573192239198747630416e-06));
-    u = vmla(u, s, vcast_vd_d(-0.000198412698412696162806809));
-    u = vmla(u, s, vcast_vd_d(0.00833333333333332974823815));
-    u = vmla(u, s, vcast_vd_d(-0.166666666666666657414808));
-
-    u = vmla(s, vmul(u, d), d);
-
-    return u;
-}
-
-static INLINE vdouble2 xsincos(vdouble d) {
-    vint q;
-    vmask m;
-    vdouble u, s, t, rx, ry;
-    vdouble2 r;
-
-    q = vrint_vi_vd(vmul(d, vcast_vd_d(rtengine::RT_2_PI)));
-
-    s = d;
-
-    u = vcast_vd_vi(q);
-    s = vmla(u, vcast_vd_d(-PI4_A*2), s);
-    s = vmla(u, vcast_vd_d(-PI4_B*2), s);
-    s = vmla(u, vcast_vd_d(-PI4_C*2), s);
-
-    t = s;
-
-    s = vmul(s, s);
-
-    u = vcast_vd_d(1.58938307283228937328511e-10);
-    u = vmla(u, s, vcast_vd_d(-2.50506943502539773349318e-08));
-    u = vmla(u, s, vcast_vd_d(2.75573131776846360512547e-06));
-    u = vmla(u, s, vcast_vd_d(-0.000198412698278911770864914));
-    u = vmla(u, s, vcast_vd_d(0.0083333333333191845961746));
-    u = vmla(u, s, vcast_vd_d(-0.166666666666666130709393));
-    u = vmul(vmul(u, s), t);
-
-    rx = vadd(t, u);
-
-    u = vcast_vd_d(-1.13615350239097429531523e-11);
-    u = vmla(u, s, vcast_vd_d(2.08757471207040055479366e-09));
-    u = vmla(u, s, vcast_vd_d(-2.75573144028847567498567e-07));
-    u = vmla(u, s, vcast_vd_d(2.48015872890001867311915e-05));
-    u = vmla(u, s, vcast_vd_d(-0.00138888888888714019282329));
-    u = vmla(u, s, vcast_vd_d(0.0416666666666665519592062));
-    u = vmla(u, s, vcast_vd_d(-0.5));
-
-    ry = vadd(vcast_vd_d(1), vmul(s, u));
-
-    m = vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(0));
-    r.x = vsel(m, rx, ry);
-    r.y = vsel(m, ry, rx);
-
-    m = vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(2));
-    r.x = vreinterpret_vd_vm(vxorm(vandm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.x)));
-
-    m = vmaski_eq(vandi(vaddi(q, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2));
-    r.y = vreinterpret_vd_vm(vxorm(vandm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.y)));
-
-    m = vmask_isinf(d);
-    r.x = vsel(m, vcast_vd_d(rtengine::RT_NAN), r.x);
-    r.y = vsel(m, vcast_vd_d(rtengine::RT_NAN), r.y);
-
-    return r;
-}
-
-static INLINE vdouble xtan(vdouble d) {
-    vint q;
-    vdouble u, s, x;
-    vmask m;
-
-    q = vrint_vi_vd(vmul(d, vcast_vd_d(rtengine::RT_2_PI)));
-
-    u = vcast_vd_vi(q);
-    x = vadd(d, vmul(u, vcast_vd_d(-PI4_A*2)));
-    x = vadd(x, vmul(u, vcast_vd_d(-PI4_B*2)));
-    x = vadd(x, vmul(u, vcast_vd_d(-PI4_C*2)));
-
-    s = vmul(x, x);
-
-    m = vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1));
-    x = vsel(m, vneg(x), x);
-
-    u = vcast_vd_d(1.01419718511083373224408e-05);
-    u = vmla(u, s, vcast_vd_d(-2.59519791585924697698614e-05));
-    u = vmla(u, s, vcast_vd_d(5.23388081915899855325186e-05));
-    u = vmla(u, s, vcast_vd_d(-3.05033014433946488225616e-05));
-    u = vmla(u, s, vcast_vd_d(7.14707504084242744267497e-05));
-    u = vmla(u, s, vcast_vd_d(8.09674518280159187045078e-05));
-    u = vmla(u, s, vcast_vd_d(0.000244884931879331847054404));
-    u = vmla(u, s, vcast_vd_d(0.000588505168743587154904506));
-    u = vmla(u, s, vcast_vd_d(0.00145612788922812427978848));
-    u = vmla(u, s, vcast_vd_d(0.00359208743836906619142924));
-    u = vmla(u, s, vcast_vd_d(0.00886323944362401618113356));
-    u = vmla(u, s, vcast_vd_d(0.0218694882853846389592078));
-    u = vmla(u, s, vcast_vd_d(0.0539682539781298417636002));
-    u = vmla(u, s, vcast_vd_d(0.133333333333125941821962));
-    u = vmla(u, s, vcast_vd_d(0.333333333333334980164153));
-
-    u = vmla(s, vmul(u, x), x);
-
-    u = vsel(m, vrec(u), u);
-
-    u = vsel(vmask_isinf(d), vcast_vd_d(rtengine::RT_NAN), u);
-
-    return u;
-}
-
-static INLINE vdouble atan2k(vdouble y, vdouble x) {
-    vdouble s, t, u;
-    vint q;
-    vmask p;
-
-    q = vseli_lt(x, vcast_vd_d(0), vcast_vi_i(-2), vcast_vi_i(0));
-    x = vabs(x);
-
-    q = vseli_lt(x, y, vaddi(q, vcast_vi_i(1)), q);
-    p = vmask_lt(x, y);
-    s = vsel (p, vneg(x), y);
-    t = vmax (x, y);
-
-    s = vdiv(s, t);
-    t = vmul(s, s);
-
-    u = vcast_vd_d(-1.88796008463073496563746e-05);
-    u = vmla(u, t, vcast_vd_d(0.000209850076645816976906797));
-    u = vmla(u, t, vcast_vd_d(-0.00110611831486672482563471));
-    u = vmla(u, t, vcast_vd_d(0.00370026744188713119232403));
-    u = vmla(u, t, vcast_vd_d(-0.00889896195887655491740809));
-    u = vmla(u, t, vcast_vd_d(0.016599329773529201970117));
-    u = vmla(u, t, vcast_vd_d(-0.0254517624932312641616861));
-    u = vmla(u, t, vcast_vd_d(0.0337852580001353069993897));
-    u = vmla(u, t, vcast_vd_d(-0.0407629191276836500001934));
-    u = vmla(u, t, vcast_vd_d(0.0466667150077840625632675));
-    u = vmla(u, t, vcast_vd_d(-0.0523674852303482457616113));
-    u = vmla(u, t, vcast_vd_d(0.0587666392926673580854313));
-    u = vmla(u, t, vcast_vd_d(-0.0666573579361080525984562));
-    u = vmla(u, t, vcast_vd_d(0.0769219538311769618355029));
-    u = vmla(u, t, vcast_vd_d(-0.090908995008245008229153));
-    u = vmla(u, t, vcast_vd_d(0.111111105648261418443745));
-    u = vmla(u, t, vcast_vd_d(-0.14285714266771329383765));
-    u = vmla(u, t, vcast_vd_d(0.199999999996591265594148));
-    u = vmla(u, t, vcast_vd_d(-0.333333333333311110369124));
-
-    t = vadd(s, vmul(s, vmul(t, u)));
-    t = vadd(t, vmul(vcast_vd_vi(q), vcast_vd_d(rtengine::RT_PI/2)));
-
-    return t;
-}
-
-static INLINE vdouble xatan2(vdouble y, vdouble x) {
-    vdouble r = atan2k(vabs(y), x);
-
-    r = vmulsign(r, x);
-    r = vsel(vorm(vmask_isinf(x), vmask_eq(x, vcast_vd_d(0))), vsub(vcast_vd_d(rtengine::RT_PI/2), visinf2(x, vmulsign(vcast_vd_d(rtengine::RT_PI/2), x))), r);
-    r = vsel(vmask_isinf(y), vsub(vcast_vd_d(rtengine::RT_PI/2), visinf2(x, vmulsign(vcast_vd_d(rtengine::RT_PI/4), x))), r);
-    r = vsel(vmask_eq(y, vcast_vd_d(0)), vsel(vmask_eq(vsign(x), vcast_vd_d(-1.0)), vcast_vd_d(rtengine::RT_PI), vcast_vd_d(0)), r);
-
-    return vsel(vorm(vmask_isnan(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_NAN), vmulsign(r, y));
-}
-
-static INLINE vdouble xasin(vdouble d) {
-    vdouble x, y;
-    x = vadd(vcast_vd_d(1), d);
-    y = vsub(vcast_vd_d(1), d);
-    x = vmul(x, y);
-    x = vsqrt(x);
-    x = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), atan2k(vabs(d), x));
-    return vmulsign(x, d);
-}
-
-static INLINE vdouble xacos(vdouble d) {
-    vdouble x, y;
-    x = vadd(vcast_vd_d(1), d);
-    y = vsub(vcast_vd_d(1), d);
-    x = vmul(x, y);
-    x = vsqrt(x);
-    x = vmulsign(atan2k(x, vabs(d)), d);
-    y = (vdouble)vandm(vmask_lt(d, vcast_vd_d(0)), (vmask)vcast_vd_d(rtengine::RT_PI));
-    x = vadd(x, y);
-    return x;
-}
-
-static INLINE vdouble xatan(vdouble s) {
-    vdouble t, u;
-    vint q;
-
-    q = vseli_lt(s, vcast_vd_d(0), vcast_vi_i(2), vcast_vi_i(0));
-    s = vabs(s);
-
-    q = vseli_lt(vcast_vd_d(1), s, vaddi(q, vcast_vi_i(1)), q);
-    s = vsel(vmask_lt(vcast_vd_d(1), s), vdiv(vcast_vd_d(1), s), s);
-
-    t = vmul(s, s);
-
-    u = vcast_vd_d(-1.88796008463073496563746e-05);
-    u = vmla(u, t, vcast_vd_d(0.000209850076645816976906797));
-    u = vmla(u, t, vcast_vd_d(-0.00110611831486672482563471));
-    u = vmla(u, t, vcast_vd_d(0.00370026744188713119232403));
-    u = vmla(u, t, vcast_vd_d(-0.00889896195887655491740809));
-    u = vmla(u, t, vcast_vd_d(0.016599329773529201970117));
-    u = vmla(u, t, vcast_vd_d(-0.0254517624932312641616861));
-    u = vmla(u, t, vcast_vd_d(0.0337852580001353069993897));
-    u = vmla(u, t, vcast_vd_d(-0.0407629191276836500001934));
-    u = vmla(u, t, vcast_vd_d(0.0466667150077840625632675));
-    u = vmla(u, t, vcast_vd_d(-0.0523674852303482457616113));
-    u = vmla(u, t, vcast_vd_d(0.0587666392926673580854313));
-    u = vmla(u, t, vcast_vd_d(-0.0666573579361080525984562));
-    u = vmla(u, t, vcast_vd_d(0.0769219538311769618355029));
-    u = vmla(u, t, vcast_vd_d(-0.090908995008245008229153));
-    u = vmla(u, t, vcast_vd_d(0.111111105648261418443745));
-    u = vmla(u, t, vcast_vd_d(-0.14285714266771329383765));
-    u = vmla(u, t, vcast_vd_d(0.199999999996591265594148));
-    u = vmla(u, t, vcast_vd_d(-0.333333333333311110369124));
-
-    t = vadd(s, vmul(s, vmul(t, u)));
-
-    t = vsel(vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1)), vsub(vcast_vd_d(rtengine::RT_PI/2), t), t);
-    t = vsel(vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(2)), vneg(t), t);
-
-    return t;
-}
-
-static INLINE vdouble xlog(vdouble d) {
-    vdouble x, x2;
-    vdouble t, m;
-    vint e;
-
-    e = vilogbp1(vmul(d, vcast_vd_d(0.7071)));
-    m = vldexp(d, vsubi(vcast_vi_i(0), e));
-
-    x = vdiv(vadd(vcast_vd_d(-1), m), vadd(vcast_vd_d(1), m));
-    x2 = vmul(x, x);
-
-    t = vcast_vd_d(0.148197055177935105296783);
-    t = vmla(t, x2, vcast_vd_d(0.153108178020442575739679));
-    t = vmla(t, x2, vcast_vd_d(0.181837339521549679055568));
-    t = vmla(t, x2, vcast_vd_d(0.22222194152736701733275));
-    t = vmla(t, x2, vcast_vd_d(0.285714288030134544449368));
-    t = vmla(t, x2, vcast_vd_d(0.399999999989941956712869));
-    t = vmla(t, x2, vcast_vd_d(0.666666666666685503450651));
-    t = vmla(t, x2, vcast_vd_d(2));
-
-    x = vadd(vmul(x, t), vmul(vcast_vd_d(0.693147180559945286226764), vcast_vd_vi(e)));
-
-    x = vsel(vmask_ispinf(d), vcast_vd_d(rtengine::RT_INFINITY), x);
-    x = vsel(vmask_gt(vcast_vd_d(0), d), vcast_vd_d(rtengine::RT_NAN), x);
-    x = vsel(vmask_eq(d, vcast_vd_d(0)), vcast_vd_d(-rtengine::RT_INFINITY), x);
-
-    return x;
-}
-
-static INLINE vdouble xexp(vdouble d) {
-    vint q = vrint_vi_vd(vmul(d, vcast_vd_d(R_LN2)));
-    vdouble s, u;
-
-    s = vadd(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
-    s = vadd(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));
-
-    u = vcast_vd_d(2.08860621107283687536341e-09);
-    u = vmla(u, s, vcast_vd_d(2.51112930892876518610661e-08));
-    u = vmla(u, s, vcast_vd_d(2.75573911234900471893338e-07));
-    u = vmla(u, s, vcast_vd_d(2.75572362911928827629423e-06));
-    u = vmla(u, s, vcast_vd_d(2.4801587159235472998791e-05));
-    u = vmla(u, s, vcast_vd_d(0.000198412698960509205564975));
-    u = vmla(u, s, vcast_vd_d(0.00138888888889774492207962));
-    u = vmla(u, s, vcast_vd_d(0.00833333333331652721664984));
-    u = vmla(u, s, vcast_vd_d(0.0416666666666665047591422));
-    u = vmla(u, s, vcast_vd_d(0.166666666666666851703837));
-    u = vmla(u, s, vcast_vd_d(0.5));
-
-    u = vadd(vcast_vd_d(1), vadd(s, vmul(vmul(s, s), u)));
-
-    u = vldexp(u, q);
-
-    u = vsel(vmask_isminf(d), vcast_vd_d(0), u);
-
-    return u;
-}
-
-static INLINE vdouble2 logk(vdouble d) {
-    vdouble2 x, x2;
-    vdouble t, m;
-    vint e;
-
-    e = vilogbp1(vmul(d, vcast_vd_d(0.7071)));
-    m = vldexp(d, vsubi(vcast_vi_i(0), e));
-
-    x = div_dd(add2_ss(vcast_vd_d(-1), m), add2_ss(vcast_vd_d(1), m));
-    x2 = squ_d(x);
-    x2 = normalize_d(x2);
-
-    t = vcast_vd_d(0.134601987501262130076155);
-    t = vmla(t, x2.x, vcast_vd_d(0.132248509032032670243288));
-    t = vmla(t, x2.x, vcast_vd_d(0.153883458318096079652524));
-    t = vmla(t, x2.x, vcast_vd_d(0.181817427573705403298686));
-    t = vmla(t, x2.x, vcast_vd_d(0.222222231326187414840781));
-    t = vmla(t, x2.x, vcast_vd_d(0.285714285651261412873718));
-    t = vmla(t, x2.x, vcast_vd_d(0.400000000000222439910458));
-    t = vmla(t, x2.x, vcast_vd_d(0.666666666666666371239645));
-
-    return add2_dd(mul_ds(dd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)),
-                vcast_vd_vi(e)),
-            add2_dd(scale_d(x, vcast_vd_d(2)), mul_ds(mul_dd(x2, x), t)));
-}
-
-static INLINE vdouble expk(vdouble2 d) {
-    vdouble u = vmul(vadd(d.x, d.y), vcast_vd_d(R_LN2));
-    vint q = vrint_vi_vd(u);
-    vdouble2 s, t;
-
-    s = add2_ds(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
-    s = add2_ds(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));
-
-    q = vrint_vi_vd(vmin(vmax(vcast_vd_d(-2047.49), u), vcast_vd_d(2047.49)));
-
-    s = normalize_d(s);
-
-    u = vcast_vd_d(2.51069683420950419527139e-08);
-    u = vmla(u, s.x, vcast_vd_d(2.76286166770270649116855e-07));
-    u = vmla(u, s.x, vcast_vd_d(2.75572496725023574143864e-06));
-    u = vmla(u, s.x, vcast_vd_d(2.48014973989819794114153e-05));
-    u = vmla(u, s.x, vcast_vd_d(0.000198412698809069797676111));
-    u = vmla(u, s.x, vcast_vd_d(0.0013888888939977128960529));
-    u = vmla(u, s.x, vcast_vd_d(0.00833333333332371417601081));
-    u = vmla(u, s.x, vcast_vd_d(0.0416666666665409524128449));
-    u = vmla(u, s.x, vcast_vd_d(0.166666666666666740681535));
-    u = vmla(u, s.x, vcast_vd_d(0.500000000000000999200722));
-
-    t = add_dd(s, mul_ds(squ_d(s), u));
-
-    t = add_sd(vcast_vd_d(1), t);
-    u = vadd(t.x, t.y);
-    u = vldexp(u, q);
-
-    return u;
-}
-
-static INLINE vdouble xpow(vdouble x, vdouble y) {
-#if 1
-    vmask yisint = vmask_eq(vcast_vd_vi(vrint_vi_vd(y)), y);
-    vmask yisodd = vandm(vmaski_eq(vandi(vrint_vi_vd(y), vcast_vi_i(1)), vcast_vi_i(1)), yisint);
-
-    vdouble result = expk(mul_ds(logk(vabs(x)), y));
-
-    //result = vsel(vmask_isnan(result), vcast_vd_d(rtengine::RT_INFINITY), result);
-
-    result = vmul(result,
-            vsel(vmask_gt(x, vcast_vd_d(0)),
-                vcast_vd_d(1),
-                vsel(yisint,
-                    vsel(yisodd,
-                        vcast_vd_d(-1),
-                        vcast_vd_d(1)),
-                    vcast_vd_d(rtengine::RT_NAN))));
-
-    vdouble efx = vreinterpret_vd_vm(vxorm(vreinterpret_vm_vd(vsub(vabs(x), vcast_vd_d(1))), vsignbit(y)));
-
-    result = vsel(vmask_isinf(y),
-            vsel(vmask_lt(efx, vcast_vd_d(0)),
-                vcast_vd_d(0),
-                vsel(vmask_eq(efx, vcast_vd_d(0)),
-                    vcast_vd_d(1.0),
-                    vcast_vd_d(rtengine::RT_INFINITY))),
-            result);
-
-    result = vsel(vorm(vmask_isinf(x), vmask_eq(x, vcast_vd_d(0))),
-            vmul(vsel(yisodd, vsign(x), vcast_vd_d(1)),
-                vsel(vmask_lt(vsel(vmask_eq(x, vcast_vd_d(0)), vneg(y), y), vcast_vd_d(0)),
-                    vcast_vd_d(0),
-                    vcast_vd_d(rtengine::RT_INFINITY))),
-            result);
-
-    result = vsel(vorm(vmask_isnan(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_NAN), result);
-
-    result = vsel(vorm(vmask_eq(y, vcast_vd_d(0)), vmask_eq(x, vcast_vd_d(1))), vcast_vd_d(1), result);
-
-    return result;
-#else
-    return expk(mul_ds(logk(x), y));
-#endif
-}
-
-static INLINE vdouble2 expk2(vdouble2 d) {
-    vdouble u = vmul(vadd(d.x, d.y), vcast_vd_d(R_LN2));
-    vint q = vrint_vi_vd(u);
-    vdouble2 s, t;
-
-    s = add2_ds(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
-    s = add2_ds(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));
-
-    q = vrint_vi_vd(vmin(vmax(vcast_vd_d(-2047.49), u), vcast_vd_d(2047.49)));
-
-    s = normalize_d(s);
-
-    u = vcast_vd_d(2.51069683420950419527139e-08);
-    u = vmla(u, s.x, vcast_vd_d(2.76286166770270649116855e-07));
-    u = vmla(u, s.x, vcast_vd_d(2.75572496725023574143864e-06));
-    u = vmla(u, s.x, vcast_vd_d(2.48014973989819794114153e-05));
-    u = vmla(u, s.x, vcast_vd_d(0.000198412698809069797676111));
-    u = vmla(u, s.x, vcast_vd_d(0.0013888888939977128960529));
-    u = vmla(u, s.x, vcast_vd_d(0.00833333333332371417601081));
-    u = vmla(u, s.x, vcast_vd_d(0.0416666666665409524128449));
-    u = vmla(u, s.x, vcast_vd_d(0.166666666666666740681535));
-    u = vmla(u, s.x, vcast_vd_d(0.500000000000000999200722));
-
-    t = add_dd(s, mul_ds(squ_d(s), u));
-
-    t = add_sd(vcast_vd_d(1), t);
-
-    return dd(vldexp(t.x, q), vldexp(t.y, q));
-}
-
-static INLINE vdouble xsinh(vdouble x) {
-    vdouble y = vabs(x);
-    vdouble2 d = expk2(dd(y, vcast_vd_d(0)));
-    d = add2_dd(d, div_dd(dd(vcast_vd_d(-1), vcast_vd_d(0)), d));
-    y = vmul(vadd(d.x, d.y), vcast_vd_d(0.5));
-
-    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
-    y = vmulsign(y, x);
-    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
-
-    return y;
-}
-
-static INLINE vdouble xcosh(vdouble x) {
-    vdouble2 d = expk2(dd(x, vcast_vd_d(0)));
-    d = add2_dd(d, div_dd(dd(vcast_vd_d(1), vcast_vd_d(0)), d));
-    vdouble y = vmul(vadd(d.x, d.y), vcast_vd_d(0.5));
-
-    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
-    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
-
-    return y;
-}
-
-static INLINE vdouble xtanh(vdouble x) {
-    vdouble y = vabs(x);
-    vdouble2 d = expk2(dd(y, vcast_vd_d(0)));
-    vdouble2 e = div_dd(dd(vcast_vd_d(1), vcast_vd_d(0)), d);
-    d = div_dd(add2_dd(d, scale_d(e, vcast_vd_d(-1))), add2_dd(d, e));
-    y = d.x + d.y;
-
-    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(1.0), y);
-    y = vmulsign(y, x);
-    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
-
-    return y;
-}
-
-static INLINE vdouble2 logk2(vdouble2 d) {
-    vdouble2 x, x2, m;
-    vdouble t;
-    vint e;
-
-    d = normalize_d(d);
-    e = vilogbp1(vmul(d.x, vcast_vd_d(0.7071)));
-    m = scale_d(d, vldexp(vcast_vd_d(1), vsubi(vcast_vi_i(0), e)));
-
-    x = div_dd(add2_ds(m, vcast_vd_d(-1)), add2_ds(m, vcast_vd_d(1)));
-    x2 = squ_d(x);
-    x2 = normalize_d(x2);
-
-    t = vcast_vd_d(0.134601987501262130076155);
-    t = vmla(t, x2.x, vcast_vd_d(0.132248509032032670243288));
-    t = vmla(t, x2.x, vcast_vd_d(0.153883458318096079652524));
-    t = vmla(t, x2.x, vcast_vd_d(0.181817427573705403298686));
-    t = vmla(t, x2.x, vcast_vd_d(0.222222231326187414840781));
-    t = vmla(t, x2.x, vcast_vd_d(0.285714285651261412873718));
-    t = vmla(t, x2.x, vcast_vd_d(0.400000000000222439910458));
-    t = vmla(t, x2.x, vcast_vd_d(0.666666666666666371239645));
-
-    return add2_dd(mul_ds(dd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)),
-                vcast_vd_vi(e)),
-            add2_dd(scale_d(x, vcast_vd_d(2)), mul_ds(mul_dd(x2, x), t)));
-}
-
-static INLINE vdouble xasinh(vdouble x) {
-    vdouble y = vabs(x);
-    vdouble2 d = logk2(add2_ds(sqrt_d(add2_ds(mul_ss(y, y),  vcast_vd_d(1))), y));
-    y = vadd(d.x, d.y);
-
-    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
-    y = vmulsign(y, x);
-    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
-
-    return y;
-}
-
-static INLINE vdouble xacosh(vdouble x) {
-    vdouble2 d = logk2(add2_ds(sqrt_d(add2_ds(mul_ss(x, x), vcast_vd_d(-1))), x));
-    vdouble y = vadd(d.x, d.y);
-
-    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
-    y = vsel(vmask_eq(x, vcast_vd_d(1.0)), vcast_vd_d(0.0), y);
-    y = vsel(vmask_lt(x, vcast_vd_d(1.0)), vcast_vd_d(rtengine::RT_NAN), y);
-    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
-
-    return y;
-}
-
-static INLINE vdouble xatanh(vdouble x) {
-    vdouble y = vabs(x);
-    vdouble2 d = logk2(div_dd(add2_ss(vcast_vd_d(1), y), add2_ss(vcast_vd_d(1), -y)));
-    y = vsel(vmask_gt(y, vcast_vd_d(1.0)), vcast_vd_d(rtengine::RT_NAN), vsel(vmask_eq(y, vcast_vd_d(1.0)), vcast_vd_d(rtengine::RT_INFINITY), vmul(vadd(d.x, d.y), vcast_vd_d(0.5))));
-
-    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_NAN), y);
-    y = vmulsign(y, x);
-    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
-
-    return y;
-}
-
-static INLINE vdouble xcbrt(vdouble d) {
-    vdouble x, y, q = vcast_vd_d(1.0);
-    vint e, qu, re;
-    vdouble t;
-
-    e = vilogbp1(vabs(d));
-    d = vldexp(d, vsubi(vcast_vi_i(0), e));
-
-    t = vadd(vcast_vd_vi(e), vcast_vd_d(6144));
-    qu = vtruncate_vi_vd(vdiv(t, vcast_vd_d(3)));
-    re = vtruncate_vi_vd(vsub(t, vmul(vcast_vd_vi(qu), vcast_vd_d(3))));
-
-    q = vsel(vmaski_eq(re, vcast_vi_i(1)), vcast_vd_d(1.2599210498948731647672106), q);
-    q = vsel(vmaski_eq(re, vcast_vi_i(2)), vcast_vd_d(1.5874010519681994747517056), q);
-    q = vldexp(q, vsubi(qu, vcast_vi_i(2048)));
-
-    q = vmulsign(q, d);
-
-    d = vabs(d);
-
-    x = vcast_vd_d(-0.640245898480692909870982);
-    x = vmla(x, d, vcast_vd_d(2.96155103020039511818595));
-    x = vmla(x, d, vcast_vd_d(-5.73353060922947843636166));
-    x = vmla(x, d, vcast_vd_d(6.03990368989458747961407));
-    x = vmla(x, d, vcast_vd_d(-3.85841935510444988821632));
-    x = vmla(x, d, vcast_vd_d(2.2307275302496609725722));
-
-    y = vmul(x, x); y = vmul(y, y); x = vsub(x, vmul(vmla(d, y, vneg(x)), vcast_vd_d(1.0 / 3.0)));
-    y = vmul(vmul(d, x), x);
-    y = vmul(vsub(y, vmul(vmul(vcast_vd_d(2.0 / 3.0), y), vmla(y, x, vcast_vd_d(-1.0)))), q);
-
-    return y;
-}
-
-static INLINE vdouble xexp2(vdouble a) {
-    vdouble u = expk(mul_ds(dd(vcast_vd_d(0.69314718055994528623), vcast_vd_d(2.3190468138462995584e-17)), a));
-    u = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), u);
-    u = vsel(vmask_isminf(a), vcast_vd_d(0), u);
-    return u;
-}
-
-static INLINE vdouble xexp10(vdouble a) {
-    vdouble u = expk(mul_ds(dd(vcast_vd_d(2.3025850929940459011), vcast_vd_d(-2.1707562233822493508e-16)), a));
-    u = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), u);
-    u = vsel(vmask_isminf(a), vcast_vd_d(0), u);
-    return u;
-}
-
-static INLINE vdouble xexpm1(vdouble a) {
-    vdouble2 d = add2_ds(expk2(dd(a, vcast_vd_d(0))), vcast_vd_d(-1.0));
-    vdouble x = d.x + d.y;
-    x = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), x);
-    x = vsel(vmask_isminf(a), vcast_vd_d(-1), x);
-    return x;
-}
-
-static INLINE vdouble xlog10(vdouble a) {
-    vdouble2 d = mul_dd(logk(a), dd(vcast_vd_d(0.43429448190325176116), vcast_vd_d(6.6494347733425473126e-17)));
-    vdouble x = d.x + d.y;
-
-    x = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), x);
-    x = vsel(vmask_gt(vcast_vd_d(0), a), vcast_vd_d(rtengine::RT_NAN), x);
-    x = vsel(vmask_eq(a, vcast_vd_d(0)), vcast_vd_d(-rtengine::RT_INFINITY), x);
-
-    return x;
-}
-
-static INLINE vdouble xlog1p(vdouble a) {
-    vdouble2 d = logk2(add2_ss(a, vcast_vd_d(1)));
-    vdouble x = d.x + d.y;
-
-    x = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), x);
-    x = vsel(vmask_gt(vcast_vd_d(-1), a), vcast_vd_d(rtengine::RT_NAN), x);
-    x = vsel(vmask_eq(a, vcast_vd_d(-1)), vcast_vd_d(-rtengine::RT_INFINITY), x);
-
-    return x;
-}
-
-//
-
-typedef struct {
-    vfloat x, y;
-} vfloat2;
-
-static INLINE vfloat vabsf(vfloat f) { return (vfloat)vandnotm((vmask)vcast_vf_f(-0.0f), (vmask)f); }
-static INLINE vfloat vnegf(vfloat f) { return (vfloat)vxorm((vmask)f, (vmask)vcast_vf_f(-0.0f)); }
-
-#ifdef __SSE4_1__
-// only one instruction when using SSE4.1
-static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) {
-    return _mm_blendv_ps(y,x,(vfloat)mask);
-}
-
-static INLINE vint vselc(vmask mask, vint x, vint y) {
-    return _mm_blendv_epi8(y,x,mask);
-}
-
-#else
-// three instructions when using SSE2
-static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) {
-    return (vfloat)vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
-}
-
-static INLINE vint vselc(vmask mask, vint x, vint y) {
-    return vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
-}
-#endif
-
-static INLINE vfloat vselfzero(vmask mask, vfloat x) {
-    // returns value of x if corresponding mask bits are 1, else returns 0
-    // faster than vself(mask, x, ZEROV)
-    return _mm_and_ps((vfloat)mask, x);
-}
-static INLINE vfloat vselfnotzero(vmask mask, vfloat x) {
-    // returns value of x if corresponding mask bits are 0, else returns 0
-    // faster than vself(mask, ZEROV, x)
-    return _mm_andnot_ps((vfloat)mask, x);
-}
-
-static INLINE vint vselizero(vmask mask, vint x) {
-    // returns value of x if corresponding mask bits are 1, else returns 0
-    // faster than vselc(mask, x, ZEROV)
-    return _mm_and_si128(mask, x);
-}
-static INLINE vint vselinotzero(vmask mask, vint x) {
-    // returns value of x if corresponding mask bits are 0, else returns 0
-    // faster than vselc(mask, ZEROV, x)
-    return _mm_andnot_si128(mask, x);
-}
-
-static INLINE vint2 vseli2_lt(vfloat f0, vfloat f1, vint2 x, vint2 y) {
-    vint2 m2 = vcast_vi2_vm(vmaskf_lt(f0, f1));
-    return vori2(vandi2(m2, x), vandnoti2(m2, y));
-}
-
-static INLINE vmask vsignbitf(vfloat f) {
-    return vandm((vmask)f, (vmask)vcast_vf_f(-0.0f));
-}
-
-static INLINE vfloat vmulsignf(vfloat x, vfloat y) {
-    return (vfloat)vxorm((vmask)x, vsignbitf(y));
-}
-
-static INLINE vfloat vsignf(vfloat f) {
-    return (vfloat)vorm((vmask)vcast_vf_f(1.0f), vandm((vmask)vcast_vf_f(-0.0f), (vmask)f));
-}
-
-static INLINE vmask vmaskf_isinf(vfloat d) { return vmaskf_eq(vabsf(d), vcast_vf_f(INFINITYf)); }
-static INLINE vmask vmaskf_ispinf(vfloat d) { return vmaskf_eq(d, vcast_vf_f(INFINITYf)); }
-static INLINE vmask vmaskf_isminf(vfloat d) { return vmaskf_eq(d, vcast_vf_f(-INFINITYf)); }
-static INLINE vmask vmaskf_isnan(vfloat d) { return vmaskf_neq(d, d); }
-// the following is equivalent to vorm(vmaskf_isnan(a), vmaskf_isnan(b)), but faster
-static INLINE vmask vmaskf_isnan(vfloat a, vfloat b) { return (vmask)_mm_cmpunord_ps(a, b); }
-static INLINE vfloat visinf2f(vfloat d, vfloat m) { return (vfloat)vandm(vmaskf_isinf(d), vorm(vsignbitf(d), (vmask)m)); }
-static INLINE vfloat visinff(vfloat d) { return visinf2f(d, vcast_vf_f(1.0f)); }
-
-static INLINE vint2 vilogbp1f(vfloat d) {
-    vmask m = vmaskf_lt(d, vcast_vf_f(5.421010862427522E-20f));
-    d = vself(m, vmulf(vcast_vf_f(1.8446744073709552E19f), d), d);
-    vint2 q = vandi2(vsrli2(vcast_vi2_vm(vreinterpret_vm_vf(d)), 23), vcast_vi2_i(0xff));
-    q = vsubi2(q, vseli2(m, vcast_vi2_i(64 + 0x7e), vcast_vi2_i(0x7e)));
-    return q;
-}
-
-static INLINE vfloat vldexpf(vfloat x, vint2 q) {
-    vfloat u;
-    vint2 m = vsrai2(q, 31);
-    m = vslli2(vsubi2(vsrai2(vaddi2(m, q), 6), m), 4);
-    q = vsubi2(q, vslli2(m, 2));
-    u = vreinterpret_vf_vm(vcast_vm_vi2(vslli2(vaddi2(m, vcast_vi2_i(0x7f)), 23)));
-    x = vmulf(vmulf(vmulf(vmulf(x, u), u), u), u);
-    u = vreinterpret_vf_vm(vcast_vm_vi2(vslli2(vaddi2(q, vcast_vi2_i(0x7f)), 23)));
-    return vmulf(x, u);
-}
-
-static INLINE vfloat xsinf(vfloat d) {
-    vint2 q;
-    vfloat u, s;
-
-    q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)rtengine::RT_1_PI)));
-
-    u = vcast_vf_vi2(q);
-    d = vmlaf(u, vcast_vf_f(-PI4_Af*4), d);
-    d = vmlaf(u, vcast_vf_f(-PI4_Bf*4), d);
-    d = vmlaf(u, vcast_vf_f(-PI4_Cf*4), d);
-    d = vmlaf(u, vcast_vf_f(-PI4_Df*4), d);
-
-    s = vmulf(d, d);
-
-    d = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vnegf(d), d);
-
-    u = vcast_vf_f(2.6083159809786593541503e-06f);
-    u = vmlaf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
-    u = vmlaf(u, s, vcast_vf_f(0.00833307858556509017944336f));
-    u = vmlaf(u, s, vcast_vf_f(-0.166666597127914428710938f));
-
-    u = vmlaf(s, vmulf(u, d), d);
-
-    return u;
-}
-
-static INLINE vfloat xcosf(vfloat d) {
-    vint2 q;
-    vfloat u, s;
-
-    q = vrint_vi2_vf(vsubf(vmulf(d, vcast_vf_f((float)rtengine::RT_1_PI)), vcast_vf_f(0.5f)));
-    q = vaddi2(vaddi2(q, q), vcast_vi2_i(1));
-
-    u = vcast_vf_vi2(q);
-    d = vmlaf(u, vcast_vf_f(-PI4_Af*2), d);
-    d = vmlaf(u, vcast_vf_f(-PI4_Bf*2), d);
-    d = vmlaf(u, vcast_vf_f(-PI4_Cf*2), d);
-    d = vmlaf(u, vcast_vf_f(-PI4_Df*2), d);
-
-    s = vmulf(d, d);
-
-    d = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), d, vnegf(d));
-
-    u = vcast_vf_f(2.6083159809786593541503e-06f);
-    u = vmlaf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
-    u = vmlaf(u, s, vcast_vf_f(0.00833307858556509017944336f));
-    u = vmlaf(u, s, vcast_vf_f(-0.166666597127914428710938f));
-
-    u = vmlaf(s, vmulf(u, d), d);
-
-    return u;
-}
-
-static INLINE vfloat2 xsincosf(vfloat d) {
-    vint2 q;
-    vmask m;
-    vfloat u, s, t, rx, ry;
-    vfloat2 r;
-
-    q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)rtengine::RT_2_PI)));
-
-    s = d;
-
-    u = vcast_vf_vi2(q);
-    s = vmlaf(u, vcast_vf_f(-PI4_Af*2), s);
-    s = vmlaf(u, vcast_vf_f(-PI4_Bf*2), s);
-    s = vmlaf(u, vcast_vf_f(-PI4_Cf*2), s);
-    s = vmlaf(u, vcast_vf_f(-PI4_Df*2), s);
-
-    t = s;
-
-    s = vmulf(s, s);
-
-    u = vcast_vf_f(-0.000195169282960705459117889f);
-    u = vmlaf(u, s, vcast_vf_f(0.00833215750753879547119141f));
-    u = vmlaf(u, s, vcast_vf_f(-0.166666537523269653320312f));
-    u = vmulf(vmulf(u, s), t);
-
-    rx = vaddf(t, u);
-
-    u = vcast_vf_f(-2.71811842367242206819355e-07f);
-    u = vmlaf(u, s, vcast_vf_f(2.47990446951007470488548e-05f));
-    u = vmlaf(u, s, vcast_vf_f(-0.00138888787478208541870117f));
-    u = vmlaf(u, s, vcast_vf_f(0.0416666641831398010253906f));
-    u = vmlaf(u, s, vcast_vf_f(-0.5));
-
-    ry = vaddf(vcast_vf_f(1), vmulf(s, u));
-
-    m = vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(0));
-    r.x = vself(m, rx, ry);
-    r.y = vself(m, ry, rx);
-
-    m = vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2));
-    r.x = vreinterpret_vf_vm(vxorm(vandm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.x)));
-
-    m = vmaski2_eq(vandi2(vaddi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2));
-    r.y = vreinterpret_vf_vm(vxorm(vandm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.y)));
-
-    m = vmaskf_isinf(d);
-    r.x = vself(m, vcast_vf_f(rtengine::RT_NAN), r.x);
-    r.y = vself(m, vcast_vf_f(rtengine::RT_NAN), r.y);
-
-    return r;
-}
-
-static INLINE vfloat xtanf(vfloat d) {
-    vint2 q;
-    vmask m;
-    vfloat u, s, x;
-
-    q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)(2 * rtengine::RT_1_PI))));
-
-    x = d;
-
-    u = vcast_vf_vi2(q);
-    x = vmlaf(u, vcast_vf_f(-PI4_Af*2), x);
-    x = vmlaf(u, vcast_vf_f(-PI4_Bf*2), x);
-    x = vmlaf(u, vcast_vf_f(-PI4_Cf*2), x);
-    x = vmlaf(u, vcast_vf_f(-PI4_Df*2), x);
-
-    s = vmulf(x, x);
-
-    m = vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1));
-    x = vself(m, vnegf(x), x);
-
-    u = vcast_vf_f(0.00927245803177356719970703f);
-    u = vmlaf(u, s, vcast_vf_f(0.00331984995864331722259521f));
-    u = vmlaf(u, s, vcast_vf_f(0.0242998078465461730957031f));
-    u = vmlaf(u, s, vcast_vf_f(0.0534495301544666290283203f));
-    u = vmlaf(u, s, vcast_vf_f(0.133383005857467651367188f));
-    u = vmlaf(u, s, vcast_vf_f(0.333331853151321411132812f));
-
-    u = vmlaf(s, vmulf(u, x), x);
-
-    u = vself(m, vrecf(u), u);
-
-    u = vself(vmaskf_isinf(d), vcast_vf_f(NANf), u);
-
-    return u;
-}
-
-static INLINE vfloat xatanf(vfloat s) {
-    vfloat t, u;
-    vint2 q;
-
-    q = vseli2_lt(s, vcast_vf_f(0.0f), vcast_vi2_i(2), vcast_vi2_i(0));
-    s = vabsf(s);
-
-    q = vseli2_lt(vcast_vf_f(1.0f), s, vaddi2(q, vcast_vi2_i(1)), q);
-    s = vself(vmaskf_lt(vcast_vf_f(1.0f), s), vdivf(vcast_vf_f(1.0f), s), s);
-
-    t = vmulf(s, s);
-
-    u = vcast_vf_f(0.00282363896258175373077393f);
-    u = vmlaf(u, t, vcast_vf_f(-0.0159569028764963150024414f));
-    u = vmlaf(u, t, vcast_vf_f(0.0425049886107444763183594f));
-    u = vmlaf(u, t, vcast_vf_f(-0.0748900920152664184570312f));
-    u = vmlaf(u, t, vcast_vf_f(0.106347933411598205566406f));
-    u = vmlaf(u, t, vcast_vf_f(-0.142027363181114196777344f));
-    u = vmlaf(u, t, vcast_vf_f(0.199926957488059997558594f));
-    u = vmlaf(u, t, vcast_vf_f(-0.333331018686294555664062f));
-
-    t = vaddf(s, vmulf(s, vmulf(t, u)));
-
-    t = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vsubf(vcast_vf_f((float)(rtengine::RT_PI/2)), t), t);
-    t = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), vnegf(t), t);
-
-    return t;
-}
-
-static INLINE vfloat atan2kf(vfloat y, vfloat x) {
-    vfloat s, t, u;
-    vint2 q;
-    vmask p;
-
-    q = vseli2_lt(x, vcast_vf_f(0.0f), vcast_vi2_i(-2), vcast_vi2_i(0));
-    x = vabsf(x);
-
-    q = vseli2_lt(x, y, vaddi2(q, vcast_vi2_i(1)), q);
-    p = vmaskf_lt(x, y);
-    s = vself(p, vnegf(x), y);
-    t = vmaxf(x, y);
-
-    s = vdivf(s, t);
-    t = vmulf(s, s);
-
-    u = vcast_vf_f(0.00282363896258175373077393f);
-    u = vmlaf(u, t, vcast_vf_f(-0.0159569028764963150024414f));
-    u = vmlaf(u, t, vcast_vf_f(0.0425049886107444763183594f));
-    u = vmlaf(u, t, vcast_vf_f(-0.0748900920152664184570312f));
-    u = vmlaf(u, t, vcast_vf_f(0.106347933411598205566406f));
-    u = vmlaf(u, t, vcast_vf_f(-0.142027363181114196777344f));
-    u = vmlaf(u, t, vcast_vf_f(0.199926957488059997558594f));
-    u = vmlaf(u, t, vcast_vf_f(-0.333331018686294555664062f));
-
-    t = vaddf(s, vmulf(s, vmulf(t, u)));
-    t = vaddf(t, vmulf(vcast_vf_vi2(q), vcast_vf_f((float)(rtengine::RT_PI/2))));
-
-    return t;
-}
-
-static INLINE vfloat xatan2f(vfloat y, vfloat x) {
-    vfloat r = atan2kf(vabsf(y), x);
-
-    r = vmulsignf(r, x);
-    r = vself(vorm(vmaskf_isinf(x), vmaskf_eq(x, vcast_vf_f(0.0f))), vsubf(vcast_vf_f((float)(rtengine::RT_PI/2)), visinf2f(x, vmulsignf(vcast_vf_f((float)(rtengine::RT_PI/2)), x))), r);
-    r = vself(vmaskf_isinf(y), vsubf(vcast_vf_f((float)(rtengine::RT_PI/2)), visinf2f(x, vmulsignf(vcast_vf_f((float)(rtengine::RT_PI/4)), x))), r);
-    r = vself(vmaskf_eq(y, vcast_vf_f(0.0f)), vselfzero(vmaskf_eq(vsignf(x), vcast_vf_f(-1.0f)), vcast_vf_f((float)rtengine::RT_PI)), r);
-
-    return vself(vmaskf_isnan(x, y), vcast_vf_f(NANf), vmulsignf(r, y));
-}
-
-static INLINE vfloat xasinf(vfloat d) {
-    vfloat x, y;
-    x = vaddf(vcast_vf_f(1.0f), d);
-    y = vsubf(vcast_vf_f(1.0f), d);
-    x = vmulf(x, y);
-    x = vsqrtf(x);
-    x = vself(vmaskf_isnan(x), vcast_vf_f(NANf), atan2kf(vabsf(d), x));
-    return vmulsignf(x, d);
-}
-
-static INLINE vfloat xacosf(vfloat d) {
-    vfloat x, y;
-    x = vaddf(vcast_vf_f(1.0f), d);
-    y = vsubf(vcast_vf_f(1.0f), d);
-    x = vmulf(x, y);
-    x = vsqrtf(x);
-    x = vmulsignf(atan2kf(x, vabsf(d)), d);
-    y = (vfloat)vandm(vmaskf_lt(d, vcast_vf_f(0.0f)), (vmask)vcast_vf_f((float)rtengine::RT_PI));
-    x = vaddf(x, y);
-    return x;
-}
-
-static INLINE vfloat xlogf(vfloat d) {
-    vfloat x, x2, t, m;
-    vint2 e;
-
-    e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
-    m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
-
-    x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
-    x2 = vmulf(x, x);
-
-    t = vcast_vf_f(0.2371599674224853515625f);
-    t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
-    t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
-    t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
-    t = vmlaf(t, x2, vcast_vf_f(2.0f));
-
-    x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
-
-    x = vself(vmaskf_ispinf(d), vcast_vf_f(INFINITYf), x);
-    x = vself(vmaskf_gt(vcast_vf_f(0), d), vcast_vf_f(NANf), x);
-    x = vself(vmaskf_eq(d, vcast_vf_f(0)), vcast_vf_f(-INFINITYf), x);
-
-    return x;
-}
-
-static INLINE vfloat xlogf1(vfloat d) { // does xlogf(vmaxf(d, 1.f)) but faster
-    vfloat x, x2, t, m;
-    vint2 e;
-
-    e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
-    m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
-
-    x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
-    x2 = vmulf(x, x);
-
-    t = vcast_vf_f(0.2371599674224853515625f);
-    t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
-    t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
-    t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
-    t = vmlaf(t, x2, vcast_vf_f(2.0f));
-
-    x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
-
-    x = vself(vmaskf_ispinf(d), vcast_vf_f(INFINITYf), x);
-    x = vselfnotzero(vmaskf_le(d, vcast_vf_f(1.f)), x);
-
-    return x;
-}
-
-static INLINE vfloat xlogf0(vfloat d) {
-    vfloat x, x2, t, m;
-    vint2 e;
-
-    e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
-    m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
-
-    x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
-    x2 = vmulf(x, x);
-
-    t = vcast_vf_f(0.2371599674224853515625f);
-    t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
-    t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
-    t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
-    t = vmlaf(t, x2, vcast_vf_f(2.0f));
-
-    x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
-
-    x = vself(vmaskf_ispinf(d), vcast_vf_f(0), x);
-    x = vself(vmaskf_gt(vcast_vf_f(0), d), vcast_vf_f(0), x);
-    x = vself(vmaskf_eq(d, vcast_vf_f(0)), vcast_vf_f(0), x);
-
-    return x;
-}
-
-static INLINE vfloat xlogfNoCheck(vfloat d) { // this version does not check input values. Use it only when you know the input values are > 0 e.g. when filling a lookup table
-    vfloat x, x2, t, m;
-    vint2 e;
-
-    e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
-    m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
-
-    x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
-    x2 = vmulf(x, x);
-
-    t = vcast_vf_f(0.2371599674224853515625f);
-    t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
-    t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
-    t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
-    t = vmlaf(t, x2, vcast_vf_f(2.0f));
-
-    return vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
-
-}
-
-static INLINE vfloat xexpf(vfloat d) {
-    vint2 q = vrint_vi2_vf(vmulf(d, vcast_vf_f(R_LN2f)));
-    vfloat s, u;
-
-    s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf),d);
-    s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf),s);
-
-    u = vcast_vf_f(0.00136324646882712841033936f);
-    u = vmlaf(u, s, vcast_vf_f(0.00836596917361021041870117f));
-    u = vmlaf(u, s, vcast_vf_f(0.0416710823774337768554688f));
-    u = vmlaf(u, s, vcast_vf_f(0.166665524244308471679688f));
-    u = vmlaf(u, s, vcast_vf_f(0.499999850988388061523438f));
-
-    u = vaddf(vcast_vf_f(1.0f), vmlaf(vmulf(s, s), u, s));
-
-    u = vldexpf(u, q);
-
-    // -104.0
-    return vselfnotzero(vmaskf_gt(vcast_vf_f(-104.f), d), u);
-}
-
-static INLINE vfloat xexpfNoCheck(vfloat d) { // this version does not check input values. Use it only when you know the input values are > -104.f e.g. when filling a lookup table
-    vint2 q = vrint_vi2_vf(vmulf(d, vcast_vf_f(R_LN2f)));
-    vfloat s, u;
-
-    s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf),d);
-    s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf),s);
-
-    u = vcast_vf_f(0.00136324646882712841033936f);
-    u = vmlaf(u, s, vcast_vf_f(0.00836596917361021041870117f));
-    u = vmlaf(u, s, vcast_vf_f(0.0416710823774337768554688f));
-    u = vmlaf(u, s, vcast_vf_f(0.166665524244308471679688f));
-    u = vmlaf(u, s, vcast_vf_f(0.499999850988388061523438f));
-
-    u = vaddf(vcast_vf_f(1.0f), vmlaf(vmulf(s, s), u, s));
-
-    return vldexpf(u, q);
-}
-
-static INLINE vfloat xcbrtf(vfloat d) {
-    vfloat x, y, q = vcast_vf_f(1.0), t;
-    vint2 e, qu, re;
-
-    e = vilogbp1f(vabsf(d));
-    d = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
-
-    t = vaddf(vcast_vf_vi2(e), vcast_vf_f(6144));
-    qu = vtruncate_vi2_vf(vdivf(t, vcast_vf_f(3)));
-    re = vtruncate_vi2_vf(vsubf(t, vmulf(vcast_vf_vi2(qu), vcast_vf_f(3))));
-
-    q = vself(vmaski2_eq(re, vcast_vi2_i(1)), vcast_vf_f(1.2599210498948731647672106f), q);
-    q = vself(vmaski2_eq(re, vcast_vi2_i(2)), vcast_vf_f(1.5874010519681994747517056f), q);
-    q = vldexpf(q, vsubi2(qu, vcast_vi2_i(2048)));
-
-    q = vmulsignf(q, d);
-    d = vabsf(d);
-
-    x = vcast_vf_f(-0.601564466953277587890625f);
-    x = vmlaf(x, d, vcast_vf_f(2.8208892345428466796875f));
-    x = vmlaf(x, d, vcast_vf_f(-5.532182216644287109375f));
-    x = vmlaf(x, d, vcast_vf_f(5.898262500762939453125f));
-    x = vmlaf(x, d, vcast_vf_f(-3.8095417022705078125f));
-    x = vmlaf(x, d, vcast_vf_f(2.2241256237030029296875f));
-
-    y = vmulf(vmulf(d, x), x);
-    y = vmulf(vsubf(y, vmulf(vmulf(vcast_vf_f(2.0f / 3.0f), y), vmlaf(y, x, vcast_vf_f(-1.0f)))), q);
-
-    return y;
-}
-
-static INLINE vfloat vclampf(vfloat value, vfloat low, vfloat high) {
-    // clamps value in [low;high], returns low if value is NaN
-    return vmaxf(vminf(high, value), low);
-}
-
-static INLINE vfloat SQRV(vfloat a){
-    return a * a;
-}
-
-static inline void vswap( vmask condition, vfloat &a, vfloat &b) {
-    // conditional swap the elements of two vfloats
-    vfloat temp = vself(condition, a, b); // the values which fit to condition
-    a = vself(condition, b, a); // the values which fit to inverted condition
-    b = temp;
-}
-
-static inline float vhadd( vfloat a ) {
-    // returns a[0] + a[1] + a[2] + a[3]
-    a += _mm_movehl_ps(a, a);
-    return _mm_cvtss_f32(_mm_add_ss(a, _mm_shuffle_ps(a, a, 1)));
-}
-
-static inline float vhmin(vfloat a) {
-    // returns min(a[0], a[1], a[2], a[3])
-    a = vminf(a, _mm_movehl_ps(a, a));
-    return _mm_cvtss_f32(vminf(a, _mm_shuffle_ps(a, a, 1)));
-}
-
-static inline float vhmax(vfloat a) {
-    // returns max(a[0], a[1], a[2], a[3])
-    a = vmaxf(a, _mm_movehl_ps(a, a));
-    return _mm_cvtss_f32(vmaxf(a, _mm_shuffle_ps(a, a, 1)));
-}
-
-static INLINE vfloat vmul2f(vfloat a){
-    // fastest way to multiply by 2
-    return a + a;
-}
-
-static INLINE vfloat vintpf(vfloat a, vfloat b, vfloat c) {
-    // calculate a * b + (1 - a) * c (interpolate two values)
-    // following is valid:
-    // vintpf(a, b+x, c+x) = vintpf(a, b, c) + x
-    // vintpf(a, b*x, c*x) = vintpf(a, b, c) * x
-    return a * (b-c) + c;
-}
-
-static INLINE vfloat vdup(vfloat a){
-    // returns { a[0],a[0],a[1],a[1] }
-    return _mm_unpacklo_ps( a, a );
-}
-
-static INLINE vfloat vaddc2vfu(float &a)
-{
-    // loads a[0]..a[7] and returns { a[0]+a[1], a[2]+a[3], a[4]+a[5], a[6]+a[7] }
-    vfloat a1 = _mm_loadu_ps( &a );
-    vfloat a2 = _mm_loadu_ps( (&a) + 4 );
-    return _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 2,0,2,0 )) + _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 3,1,3,1 ));
-}
-
-static INLINE vfloat vadivapb (vfloat a, vfloat b) {
-    return a / (a+b);
-}
-
-static INLINE void vconvertrgbrgbrgbrgb2rrrrggggbbbb (const float * src, vfloat &rv, vfloat &gv, vfloat &bv) { // cool function name, isn't it ? :P
-    // converts a sequence of 4 float RGB triplets to 3 red, green and blue quadruples
-    rv = _mm_setr_ps(src[0],src[3],src[6],src[9]);
-    gv = _mm_setr_ps(src[1],src[4],src[7],src[10]);
-    bv = _mm_setr_ps(src[2],src[5],src[8],src[11]);
-}
-
-#if defined( __SSE4_1__ ) && defined( __x86_64__ )
-static INLINE vfloat vceilf(vfloat x) {
-    return _mm_round_ps(x, _MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC);
-}
-
-#else
-
-static INLINE vfloat vceilf(vfloat x) {
-    __m128i zerov = _mm_setzero_si128();
-    zerov = _mm_cmpeq_epi32(zerov, zerov);
-    const vfloat onev = (vfloat)_mm_slli_epi32(_mm_srli_epi32(zerov, 25), 23); //create vector 1.0f
-    const vfloat xi = _mm_cvtepi32_ps(_mm_cvttps_epi32(x));
-    return xi + _mm_and_ps(_mm_cmplt_ps(xi, x), onev);
-}
-#endif
-
-#endif // __SSE2__
-#endif // SLEEFSSEAVX
diff --git a/rtengine/sleefsseavx.h b/rtengine/sleefsseavx.h
index 8fe20c54b..c10f4a0c2 100644
--- a/rtengine/sleefsseavx.h
+++ b/rtengine/sleefsseavx.h
@@ -1,124 +1,897 @@
-#include <immintrin.h>
-#include <stdint.h>
+////////////////////////////////////////////////////////////////
+//
+//  this code was taken from http://shibatch.sourceforge.net/
+//  Many thanks to the author of original version: Naoki Shibata
+//
+//  This version contains modifications made by Ingo Weyrich
+//
+////////////////////////////////////////////////////////////////
+#pragma once
 
+#include "rt_math.h"
 #ifdef __SSE2__
-#define VECTLENDP 2
-#define VECTLENSP 4
-
-typedef __m128d vdouble;
-typedef __m128i vint;
-
-typedef __m128 vfloat;
-typedef __m128i vint2;
-typedef __m128i vmask;
-
-static vdouble vloadu(double *p)
-{
-    return _mm_loadu_pd(p);
-}
-static void vstoreu(double *p, vdouble v)
-{
-    _mm_storeu_pd(p, v);
-}
-
-static vfloat vloaduf(float *p)
-{
-    return _mm_loadu_ps(p);
-}
-static void vstoreuf(float *p, vfloat v)
-{
-    _mm_storeu_ps(p, v);
-}
-
-static vint2 vloadui2(int32_t *p)
-{
-    return (vint2)_mm_loadu_si128((__m128i *)p);
-}
-static void vstoreui2(int32_t *p, vint2 v)
-{
-    _mm_storeu_si128((__m128i *)p, (__m128i)v);
-}
-#endif
+#include "helpersse2.h"
 
 #ifdef ENABLE_AVX
-#define VECTLENDP 4
-#define VECTLENSP 8
+#include "helperavx.h"
+#endif
 
-typedef __m256d vdouble;
-typedef __m128i vint;
+#ifdef __GNUC__
+#define INLINE __inline
+#else
+#define INLINE inline
+#endif
 
+#define PI4_A .7853981554508209228515625
+#define PI4_B .794662735614792836713604629039764404296875e-8
+#define PI4_C .306161699786838294306516483068750264552437361480769e-16
+#define M_4_PI 1.273239544735162542821171882678754627704620361328125
 
-typedef __m256 vfloat;
-typedef struct {
-    vint x, y;
-} vint2;
+#define L2U .69314718055966295651160180568695068359375
+#define L2L .28235290563031577122588448175013436025525412068e-12
+#define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931
 
-static vdouble vloadu(double *p)
-{
-    return _mm256_loadu_pd(p);
-}
-static void vstoreu(double *p, vdouble v)
-{
-    return _mm256_storeu_pd(p, v);
+#define PI4_Af 0.78515625f
+#define PI4_Bf 0.00024127960205078125f
+#define PI4_Cf 6.3329935073852539062e-07f
+#define PI4_Df 4.9604681473525147339e-10f
+
+#define L2Uf 0.693145751953125f
+#define L2Lf 1.428606765330187045e-06f
+#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f
+
+#define INFINITYf ((float)rtengine::RT_INFINITY)
+#define NANf ((float)rtengine::RT_NAN)
+
+static INLINE vdouble vadd3(vdouble v0, vdouble v1, vdouble v2) {
+    return vadd(vadd(v0, v1), v2);
 }
 
-static vfloat vloaduf(float *p)
-{
-    return _mm256_loadu_ps(p);
-}
-static void vstoreuf(float *p, vfloat v)
-{
-    return _mm256_storeu_ps(p, v);
+static INLINE vdouble vadd4(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
+    return vadd3(vadd(v0, v1), v2, v3);
 }
 
-static vint2 vloadui2(int32_t *p)
-{
-    vint2 r;
-    r.x = _mm_loadu_si128((__m128i *) p     );
-    r.y = _mm_loadu_si128((__m128i *)(p + 4));
+static INLINE vdouble vadd5(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
+    return vadd4(vadd(v0, v1), v2, v3, v4);
+}
+
+static INLINE vdouble vadd6(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) {
+    return vadd5(vadd(v0, v1), v2, v3, v4, v5);
+}
+
+static INLINE vdouble vadd7(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5, vdouble v6) {
+    return vadd6(vadd(v0, v1), v2, v3, v4, v5, v6);
+}
+
+static INLINE vdouble vsub3(vdouble v0, vdouble v1, vdouble v2) {
+    return vsub(vsub(v0, v1), v2);
+}
+
+static INLINE vdouble vsub4(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
+    return vsub3(vsub(v0, v1), v2, v3);
+}
+
+static INLINE vdouble vsub5(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
+    return vsub4(vsub(v0, v1), v2, v3, v4);
+}
+
+//
+
+static INLINE vdouble2 normalize_d(vdouble2 t) {
+    vdouble2 s;
+
+    s.x = vadd(t.x, t.y);
+    s.y = vadd(vsub(t.x, s.x), t.y);
+
+    return s;
+}
+
+static INLINE vdouble2 scale_d(vdouble2 d, vdouble s) {
+    vdouble2 r = {vmul(d.x, s), vmul(d.y, s)};
     return r;
 }
 
-static void vstoreui2(int32_t *p, vint2 v)
-{
-    _mm_storeu_si128((__m128i *) p     , v.x);
-    _mm_storeu_si128((__m128i *)(p + 4), v.y);
+static INLINE vdouble2 add_ss(vdouble x, vdouble y) {
+    vdouble2 r;
+
+    r.x = vadd(x, y);
+    r.y = vadd(vsub(x, r.x), y);
+
+    return r;
 }
-#endif
 
-typedef struct {
+static INLINE vdouble2 add2_ss(vdouble x, vdouble y) {
+    vdouble2 r;
+
+    r.x = vadd(x, y);
+    vdouble v = vsub(r.x, x);
+    r.y = vadd(vsub(x, vsub(r.x, v)), vsub(y, v));
+
+    return r;
+}
+
+static INLINE vdouble2 add_ds(vdouble2 x, vdouble y) {
+    vdouble2 r;
+
+    r.x = vadd(x.x, y);
+    r.y = vadd3(vsub(x.x, r.x), y, x.y);
+
+    return r;
+}
+
+static INLINE vdouble2 add2_ds(vdouble2 x, vdouble y) {
+    vdouble2 r;
+
+    r.x = vadd(x.x, y);
+    vdouble v = vsub(r.x, x.x);
+    r.y = vadd(vsub(x.x, vsub(r.x, v)), vsub(y, v));
+    r.y = vadd(r.y, x.y);
+
+    return r;
+}
+
+static INLINE vdouble2 add_sd(vdouble x, vdouble2 y) {
+    vdouble2 r;
+
+    r.x = vadd(x, y.x);
+    r.y = vadd3(vsub(x, r.x), y.x, y.y);
+
+    return r;
+}
+
+static INLINE vdouble2 add_dd(vdouble2 x, vdouble2 y) {
+    // |x| >= |y|
+
+    vdouble2 r;
+
+    r.x = vadd(x.x, y.x);
+    r.y = vadd4(vsub(x.x, r.x), y.x, x.y, y.y);
+
+    return r;
+}
+
+static INLINE vdouble2 add2_dd(vdouble2 x, vdouble2 y) {
+    vdouble2 r;
+
+    r.x  = vadd(x.x, y.x);
+    vdouble v = vsub(r.x, x.x);
+    r.y = vadd(vsub(x.x, vsub(r.x, v)), vsub(y.x, v));
+    r.y = vadd(r.y, vadd(x.y, y.y));
+
+    return r;
+}
+
+static INLINE vdouble2 div_dd(vdouble2 n, vdouble2 d) {
+    vdouble t = vrec(d.x);
+    vdouble dh  = vupper(d.x), dl  = vsub(d.x,  dh);
+    vdouble th  = vupper(t  ), tl  = vsub(t  ,  th);
+    vdouble nhh = vupper(n.x), nhl = vsub(n.x, nhh);
+
+    vdouble2 q;
+
+    q.x = vmul(n.x, t);
+
+    vdouble u = vadd5(vsub(vmul(nhh, th), q.x), vmul(nhh, tl), vmul(nhl, th), vmul(nhl, tl),
+            vmul(q.x, vsub5(vcast_vd_d(1), vmul(dh, th), vmul(dh, tl), vmul(dl, th), vmul(dl, tl))));
+
+    q.y = vadd(vmul(t, vsub(n.y, vmul(q.x, d.y))), u);
+
+    return q;
+}
+
+static INLINE vdouble2 mul_ss(vdouble x, vdouble y) {
+    vdouble xh = vupper(x), xl = vsub(x, xh);
+    vdouble yh = vupper(y), yl = vsub(y, yh);
+    vdouble2 r;
+
+    r.x = vmul(x, y);
+    r.y = vadd5(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl));
+
+    return r;
+}
+
+static INLINE vdouble2 mul_ds(vdouble2 x, vdouble y) {
+    vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
+    vdouble yh = vupper(y  ), yl = vsub(y  , yh);
+    vdouble2 r;
+
+    r.x = vmul(x.x, y);
+    r.y = vadd6(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl), vmul(x.y, y));
+
+    return r;
+}
+
+static INLINE vdouble2 mul_dd(vdouble2 x, vdouble2 y) {
+    vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
+    vdouble yh = vupper(y.x), yl = vsub(y.x, yh);
+    vdouble2 r;
+
+    r.x = vmul(x.x, y.x);
+    r.y = vadd7(vmul(xh, yh), vneg(r.x), vmul(xl, yh), vmul(xh, yl), vmul(xl, yl), vmul(x.x, y.y), vmul(x.y, y.x));
+
+    return r;
+}
+
+static INLINE vdouble2 squ_d(vdouble2 x) {
+    vdouble xh = vupper(x.x), xl = vsub(x.x, xh);
+    vdouble2 r;
+
+    r.x = vmul(x.x, x.x);
+    r.y = vadd5(vmul(xh, xh), vneg(r.x), vmul(vadd(xh, xh), xl), vmul(xl, xl), vmul(x.x, vadd(x.y, x.y)));
+
+    return r;
+}
+
+static INLINE vdouble2 rec_s(vdouble d) {
+    vdouble t = vrec(d);
+    vdouble dh = vupper(d), dl = vsub(d, dh);
+    vdouble th = vupper(t), tl = vsub(t, th);
+    vdouble2 q;
+
+    q.x = t;
+    q.y = vmul(t, vsub5(vcast_vd_d(1), vmul(dh, th), vmul(dh, tl), vmul(dl, th), vmul(dl, tl)));
+
+    return q;
+}
+
+static INLINE vdouble2 sqrt_d(vdouble2 d) {
+    vdouble t = vsqrt(vadd(d.x, d.y));
+    return scale_d(mul_dd(add2_dd(d, mul_ss(t, t)), rec_s(t)), vcast_vd_d(0.5));
+}
+
+//
+
+static INLINE vdouble xldexp(vdouble x, vint q) { return vldexp(x, q); }
+
+static INLINE vint xilogb(vdouble d) {
+    vdouble e = vcast_vd_vi(vsubi(vilogbp1(vabs(d)), vcast_vi_i(1)));
+    e = vsel(vmask_eq(d, vcast_vd_d(0)), vcast_vd_d(-2147483648.0), e);
+    e = vsel(vmask_eq(vabs(d), vcast_vd_d(rtengine::RT_INFINITY)), vcast_vd_d(2147483647), e);
+    return vrint_vi_vd(e);
+}
+
+static INLINE vdouble xsin(vdouble d) {
+    vint q;
+    vdouble u, s;
+
+    q = vrint_vi_vd(vmul(d, vcast_vd_d(rtengine::RT_1_PI)));
+
+    u = vcast_vd_vi(q);
+    d = vadd(d, vmul(u, vcast_vd_d(-PI4_A*4)));
+    d = vadd(d, vmul(u, vcast_vd_d(-PI4_B*4)));
+    d = vadd(d, vmul(u, vcast_vd_d(-PI4_C*4)));
+
+    s = vmul(d, d);
+
+    d = vsel(vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1)), vneg(d), d);
+
+    u = vcast_vd_d(-7.97255955009037868891952e-18);
+    u = vmla(u, s, vcast_vd_d(2.81009972710863200091251e-15));
+    u = vmla(u, s, vcast_vd_d(-7.64712219118158833288484e-13));
+    u = vmla(u, s, vcast_vd_d(1.60590430605664501629054e-10));
+    u = vmla(u, s, vcast_vd_d(-2.50521083763502045810755e-08));
+    u = vmla(u, s, vcast_vd_d(2.75573192239198747630416e-06));
+    u = vmla(u, s, vcast_vd_d(-0.000198412698412696162806809));
+    u = vmla(u, s, vcast_vd_d(0.00833333333333332974823815));
+    u = vmla(u, s, vcast_vd_d(-0.166666666666666657414808));
+
+    u = vmla(s, vmul(u, d), d);
+
+    return u;
+}
+
+static INLINE vdouble xcos(vdouble d) {
+    vint q;
+    vdouble u, s;
+
+    q = vrint_vi_vd(vsub(vmul(d, vcast_vd_d(rtengine::RT_1_PI)), vcast_vd_d(0.5)));
+    q = vaddi(vaddi(q, q), vcast_vi_i(1));
+
+    u = vcast_vd_vi(q);
+    d = vadd(d, vmul(u, vcast_vd_d(-PI4_A*2)));
+    d = vadd(d, vmul(u, vcast_vd_d(-PI4_B*2)));
+    d = vadd(d, vmul(u, vcast_vd_d(-PI4_C*2)));
+
+    s = vmul(d, d);
+
+    d = vsel(vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(0)), vneg(d), d);
+
+    u = vcast_vd_d(-7.97255955009037868891952e-18);
+    u = vmla(u, s, vcast_vd_d(2.81009972710863200091251e-15));
+    u = vmla(u, s, vcast_vd_d(-7.64712219118158833288484e-13));
+    u = vmla(u, s, vcast_vd_d(1.60590430605664501629054e-10));
+    u = vmla(u, s, vcast_vd_d(-2.50521083763502045810755e-08));
+    u = vmla(u, s, vcast_vd_d(2.75573192239198747630416e-06));
+    u = vmla(u, s, vcast_vd_d(-0.000198412698412696162806809));
+    u = vmla(u, s, vcast_vd_d(0.00833333333333332974823815));
+    u = vmla(u, s, vcast_vd_d(-0.166666666666666657414808));
+
+    u = vmla(s, vmul(u, d), d);
+
+    return u;
+}
+
+static INLINE vdouble2 xsincos(vdouble d) {
+    vint q;
+    vmask m;
+    vdouble u, s, t, rx, ry;
+    vdouble2 r;
+
+    q = vrint_vi_vd(vmul(d, vcast_vd_d(rtengine::RT_2_PI)));
+
+    s = d;
+
+    u = vcast_vd_vi(q);
+    s = vmla(u, vcast_vd_d(-PI4_A*2), s);
+    s = vmla(u, vcast_vd_d(-PI4_B*2), s);
+    s = vmla(u, vcast_vd_d(-PI4_C*2), s);
+
+    t = s;
+
+    s = vmul(s, s);
+
+    u = vcast_vd_d(1.58938307283228937328511e-10);
+    u = vmla(u, s, vcast_vd_d(-2.50506943502539773349318e-08));
+    u = vmla(u, s, vcast_vd_d(2.75573131776846360512547e-06));
+    u = vmla(u, s, vcast_vd_d(-0.000198412698278911770864914));
+    u = vmla(u, s, vcast_vd_d(0.0083333333333191845961746));
+    u = vmla(u, s, vcast_vd_d(-0.166666666666666130709393));
+    u = vmul(vmul(u, s), t);
+
+    rx = vadd(t, u);
+
+    u = vcast_vd_d(-1.13615350239097429531523e-11);
+    u = vmla(u, s, vcast_vd_d(2.08757471207040055479366e-09));
+    u = vmla(u, s, vcast_vd_d(-2.75573144028847567498567e-07));
+    u = vmla(u, s, vcast_vd_d(2.48015872890001867311915e-05));
+    u = vmla(u, s, vcast_vd_d(-0.00138888888888714019282329));
+    u = vmla(u, s, vcast_vd_d(0.0416666666666665519592062));
+    u = vmla(u, s, vcast_vd_d(-0.5));
+
+    ry = vadd(vcast_vd_d(1), vmul(s, u));
+
+    m = vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(0));
+    r.x = vsel(m, rx, ry);
+    r.y = vsel(m, ry, rx);
+
+    m = vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(2));
+    r.x = vreinterpret_vd_vm(vxorm(vandm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.x)));
+
+    m = vmaski_eq(vandi(vaddi(q, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2));
+    r.y = vreinterpret_vd_vm(vxorm(vandm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.y)));
+
+    m = vmask_isinf(d);
+    r.x = vsel(m, vcast_vd_d(rtengine::RT_NAN), r.x);
+    r.y = vsel(m, vcast_vd_d(rtengine::RT_NAN), r.y);
+
+    return r;
+}
+
+static INLINE vdouble xtan(vdouble d) {
+    vint q;
+    vdouble u, s, x;
+    vmask m;
+
+    q = vrint_vi_vd(vmul(d, vcast_vd_d(rtengine::RT_2_PI)));
+
+    u = vcast_vd_vi(q);
+    x = vadd(d, vmul(u, vcast_vd_d(-PI4_A*2)));
+    x = vadd(x, vmul(u, vcast_vd_d(-PI4_B*2)));
+    x = vadd(x, vmul(u, vcast_vd_d(-PI4_C*2)));
+
+    s = vmul(x, x);
+
+    m = vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1));
+    x = vsel(m, vneg(x), x);
+
+    u = vcast_vd_d(1.01419718511083373224408e-05);
+    u = vmla(u, s, vcast_vd_d(-2.59519791585924697698614e-05));
+    u = vmla(u, s, vcast_vd_d(5.23388081915899855325186e-05));
+    u = vmla(u, s, vcast_vd_d(-3.05033014433946488225616e-05));
+    u = vmla(u, s, vcast_vd_d(7.14707504084242744267497e-05));
+    u = vmla(u, s, vcast_vd_d(8.09674518280159187045078e-05));
+    u = vmla(u, s, vcast_vd_d(0.000244884931879331847054404));
+    u = vmla(u, s, vcast_vd_d(0.000588505168743587154904506));
+    u = vmla(u, s, vcast_vd_d(0.00145612788922812427978848));
+    u = vmla(u, s, vcast_vd_d(0.00359208743836906619142924));
+    u = vmla(u, s, vcast_vd_d(0.00886323944362401618113356));
+    u = vmla(u, s, vcast_vd_d(0.0218694882853846389592078));
+    u = vmla(u, s, vcast_vd_d(0.0539682539781298417636002));
+    u = vmla(u, s, vcast_vd_d(0.133333333333125941821962));
+    u = vmla(u, s, vcast_vd_d(0.333333333333334980164153));
+
+    u = vmla(s, vmul(u, x), x);
+
+    u = vsel(m, vrec(u), u);
+
+    u = vsel(vmask_isinf(d), vcast_vd_d(rtengine::RT_NAN), u);
+
+    return u;
+}
+
+static INLINE vdouble atan2k(vdouble y, vdouble x) {
+    vdouble s, t, u;
+    vint q;
+    vmask p;
+
+    q = vseli_lt(x, vcast_vd_d(0), vcast_vi_i(-2), vcast_vi_i(0));
+    x = vabs(x);
+
+    q = vseli_lt(x, y, vaddi(q, vcast_vi_i(1)), q);
+    p = vmask_lt(x, y);
+    s = vsel (p, vneg(x), y);
+    t = vmax (x, y);
+
+    s = vdiv(s, t);
+    t = vmul(s, s);
+
+    u = vcast_vd_d(-1.88796008463073496563746e-05);
+    u = vmla(u, t, vcast_vd_d(0.000209850076645816976906797));
+    u = vmla(u, t, vcast_vd_d(-0.00110611831486672482563471));
+    u = vmla(u, t, vcast_vd_d(0.00370026744188713119232403));
+    u = vmla(u, t, vcast_vd_d(-0.00889896195887655491740809));
+    u = vmla(u, t, vcast_vd_d(0.016599329773529201970117));
+    u = vmla(u, t, vcast_vd_d(-0.0254517624932312641616861));
+    u = vmla(u, t, vcast_vd_d(0.0337852580001353069993897));
+    u = vmla(u, t, vcast_vd_d(-0.0407629191276836500001934));
+    u = vmla(u, t, vcast_vd_d(0.0466667150077840625632675));
+    u = vmla(u, t, vcast_vd_d(-0.0523674852303482457616113));
+    u = vmla(u, t, vcast_vd_d(0.0587666392926673580854313));
+    u = vmla(u, t, vcast_vd_d(-0.0666573579361080525984562));
+    u = vmla(u, t, vcast_vd_d(0.0769219538311769618355029));
+    u = vmla(u, t, vcast_vd_d(-0.090908995008245008229153));
+    u = vmla(u, t, vcast_vd_d(0.111111105648261418443745));
+    u = vmla(u, t, vcast_vd_d(-0.14285714266771329383765));
+    u = vmla(u, t, vcast_vd_d(0.199999999996591265594148));
+    u = vmla(u, t, vcast_vd_d(-0.333333333333311110369124));
+
+    t = vadd(s, vmul(s, vmul(t, u)));
+    t = vadd(t, vmul(vcast_vd_vi(q), vcast_vd_d(rtengine::RT_PI/2)));
+
+    return t;
+}
+
+static INLINE vdouble xatan2(vdouble y, vdouble x) {
+    vdouble r = atan2k(vabs(y), x);
+
+    r = vmulsign(r, x);
+    r = vsel(vorm(vmask_isinf(x), vmask_eq(x, vcast_vd_d(0))), vsub(vcast_vd_d(rtengine::RT_PI/2), visinf2(x, vmulsign(vcast_vd_d(rtengine::RT_PI/2), x))), r);
+    r = vsel(vmask_isinf(y), vsub(vcast_vd_d(rtengine::RT_PI/2), visinf2(x, vmulsign(vcast_vd_d(rtengine::RT_PI/4), x))), r);
+    r = vsel(vmask_eq(y, vcast_vd_d(0)), vsel(vmask_eq(vsign(x), vcast_vd_d(-1.0)), vcast_vd_d(rtengine::RT_PI), vcast_vd_d(0)), r);
+
+    return vsel(vorm(vmask_isnan(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_NAN), vmulsign(r, y));
+}
+
+static INLINE vdouble xasin(vdouble d) {
     vdouble x, y;
-} vdouble2;
+    x = vadd(vcast_vd_d(1), d);
+    y = vsub(vcast_vd_d(1), d);
+    x = vmul(x, y);
+    x = vsqrt(x);
+    x = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), atan2k(vabs(d), x));
+    return vmulsign(x, d);
+}
 
-vdouble xldexp(vdouble x, vint q);
-vint xilogb(vdouble d);
+static INLINE vdouble xacos(vdouble d) {
+    vdouble x, y;
+    x = vadd(vcast_vd_d(1), d);
+    y = vsub(vcast_vd_d(1), d);
+    x = vmul(x, y);
+    x = vsqrt(x);
+    x = vmulsign(atan2k(x, vabs(d)), d);
+    y = (vdouble)vandm(vmask_lt(d, vcast_vd_d(0)), (vmask)vcast_vd_d(rtengine::RT_PI));
+    x = vadd(x, y);
+    return x;
+}
 
-vdouble xsin(vdouble d);
-vdouble xcos(vdouble d);
-vdouble2 xsincos(vdouble d);
-vdouble xtan(vdouble d);
-vdouble xasin(vdouble s);
-vdouble xacos(vdouble s);
-vdouble xatan(vdouble s);
-vdouble xatan2(vdouble y, vdouble x);
-vdouble xlog(vdouble d);
-vdouble xexp(vdouble d);
-vdouble xpow(vdouble x, vdouble y);
+static INLINE vdouble xatan(vdouble s) {
+    vdouble t, u;
+    vint q;
 
-vdouble xsinh(vdouble d);
-vdouble xcosh(vdouble d);
-vdouble xtanh(vdouble d);
-vdouble xasinh(vdouble s);
-vdouble xacosh(vdouble s);
-vdouble xatanh(vdouble s);
+    q = vseli_lt(s, vcast_vd_d(0), vcast_vi_i(2), vcast_vi_i(0));
+    s = vabs(s);
 
-vdouble xcbrt(vdouble d);
+    q = vseli_lt(vcast_vd_d(1), s, vaddi(q, vcast_vi_i(1)), q);
+    s = vsel(vmask_lt(vcast_vd_d(1), s), vdiv(vcast_vd_d(1), s), s);
 
-vdouble xexp2(vdouble a);
-vdouble xexp10(vdouble a);
-vdouble xexpm1(vdouble a);
-vdouble xlog10(vdouble a);
-vdouble xlog1p(vdouble a);
+    t = vmul(s, s);
+
+    u = vcast_vd_d(-1.88796008463073496563746e-05);
+    u = vmla(u, t, vcast_vd_d(0.000209850076645816976906797));
+    u = vmla(u, t, vcast_vd_d(-0.00110611831486672482563471));
+    u = vmla(u, t, vcast_vd_d(0.00370026744188713119232403));
+    u = vmla(u, t, vcast_vd_d(-0.00889896195887655491740809));
+    u = vmla(u, t, vcast_vd_d(0.016599329773529201970117));
+    u = vmla(u, t, vcast_vd_d(-0.0254517624932312641616861));
+    u = vmla(u, t, vcast_vd_d(0.0337852580001353069993897));
+    u = vmla(u, t, vcast_vd_d(-0.0407629191276836500001934));
+    u = vmla(u, t, vcast_vd_d(0.0466667150077840625632675));
+    u = vmla(u, t, vcast_vd_d(-0.0523674852303482457616113));
+    u = vmla(u, t, vcast_vd_d(0.0587666392926673580854313));
+    u = vmla(u, t, vcast_vd_d(-0.0666573579361080525984562));
+    u = vmla(u, t, vcast_vd_d(0.0769219538311769618355029));
+    u = vmla(u, t, vcast_vd_d(-0.090908995008245008229153));
+    u = vmla(u, t, vcast_vd_d(0.111111105648261418443745));
+    u = vmla(u, t, vcast_vd_d(-0.14285714266771329383765));
+    u = vmla(u, t, vcast_vd_d(0.199999999996591265594148));
+    u = vmla(u, t, vcast_vd_d(-0.333333333333311110369124));
+
+    t = vadd(s, vmul(s, vmul(t, u)));
+
+    t = vsel(vmaski_eq(vandi(q, vcast_vi_i(1)), vcast_vi_i(1)), vsub(vcast_vd_d(rtengine::RT_PI/2), t), t);
+    t = vsel(vmaski_eq(vandi(q, vcast_vi_i(2)), vcast_vi_i(2)), vneg(t), t);
+
+    return t;
+}
+
+static INLINE vdouble xlog(vdouble d) {
+    vdouble x, x2;
+    vdouble t, m;
+    vint e;
+
+    e = vilogbp1(vmul(d, vcast_vd_d(0.7071)));
+    m = vldexp(d, vsubi(vcast_vi_i(0), e));
+
+    x = vdiv(vadd(vcast_vd_d(-1), m), vadd(vcast_vd_d(1), m));
+    x2 = vmul(x, x);
+
+    t = vcast_vd_d(0.148197055177935105296783);
+    t = vmla(t, x2, vcast_vd_d(0.153108178020442575739679));
+    t = vmla(t, x2, vcast_vd_d(0.181837339521549679055568));
+    t = vmla(t, x2, vcast_vd_d(0.22222194152736701733275));
+    t = vmla(t, x2, vcast_vd_d(0.285714288030134544449368));
+    t = vmla(t, x2, vcast_vd_d(0.399999999989941956712869));
+    t = vmla(t, x2, vcast_vd_d(0.666666666666685503450651));
+    t = vmla(t, x2, vcast_vd_d(2));
+
+    x = vadd(vmul(x, t), vmul(vcast_vd_d(0.693147180559945286226764), vcast_vd_vi(e)));
+
+    x = vsel(vmask_ispinf(d), vcast_vd_d(rtengine::RT_INFINITY), x);
+    x = vsel(vmask_gt(vcast_vd_d(0), d), vcast_vd_d(rtengine::RT_NAN), x);
+    x = vsel(vmask_eq(d, vcast_vd_d(0)), vcast_vd_d(-rtengine::RT_INFINITY), x);
+
+    return x;
+}
+
+static INLINE vdouble xexp(vdouble d) {
+    vint q = vrint_vi_vd(vmul(d, vcast_vd_d(R_LN2)));
+    vdouble s, u;
+
+    s = vadd(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
+    s = vadd(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));
+
+    u = vcast_vd_d(2.08860621107283687536341e-09);
+    u = vmla(u, s, vcast_vd_d(2.51112930892876518610661e-08));
+    u = vmla(u, s, vcast_vd_d(2.75573911234900471893338e-07));
+    u = vmla(u, s, vcast_vd_d(2.75572362911928827629423e-06));
+    u = vmla(u, s, vcast_vd_d(2.4801587159235472998791e-05));
+    u = vmla(u, s, vcast_vd_d(0.000198412698960509205564975));
+    u = vmla(u, s, vcast_vd_d(0.00138888888889774492207962));
+    u = vmla(u, s, vcast_vd_d(0.00833333333331652721664984));
+    u = vmla(u, s, vcast_vd_d(0.0416666666666665047591422));
+    u = vmla(u, s, vcast_vd_d(0.166666666666666851703837));
+    u = vmla(u, s, vcast_vd_d(0.5));
+
+    u = vadd(vcast_vd_d(1), vadd(s, vmul(vmul(s, s), u)));
+
+    u = vldexp(u, q);
+
+    u = vsel(vmask_isminf(d), vcast_vd_d(0), u);
+
+    return u;
+}
+
+static INLINE vdouble2 logk(vdouble d) {
+    vdouble2 x, x2;
+    vdouble t, m;
+    vint e;
+
+    e = vilogbp1(vmul(d, vcast_vd_d(0.7071)));
+    m = vldexp(d, vsubi(vcast_vi_i(0), e));
+
+    x = div_dd(add2_ss(vcast_vd_d(-1), m), add2_ss(vcast_vd_d(1), m));
+    x2 = squ_d(x);
+    x2 = normalize_d(x2);
+
+    t = vcast_vd_d(0.134601987501262130076155);
+    t = vmla(t, x2.x, vcast_vd_d(0.132248509032032670243288));
+    t = vmla(t, x2.x, vcast_vd_d(0.153883458318096079652524));
+    t = vmla(t, x2.x, vcast_vd_d(0.181817427573705403298686));
+    t = vmla(t, x2.x, vcast_vd_d(0.222222231326187414840781));
+    t = vmla(t, x2.x, vcast_vd_d(0.285714285651261412873718));
+    t = vmla(t, x2.x, vcast_vd_d(0.400000000000222439910458));
+    t = vmla(t, x2.x, vcast_vd_d(0.666666666666666371239645));
+
+    return add2_dd(mul_ds(dd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)),
+                vcast_vd_vi(e)),
+            add2_dd(scale_d(x, vcast_vd_d(2)), mul_ds(mul_dd(x2, x), t)));
+}
+
+static INLINE vdouble expk(vdouble2 d) {
+    vdouble u = vmul(vadd(d.x, d.y), vcast_vd_d(R_LN2));
+    vint q = vrint_vi_vd(u);
+    vdouble2 s, t;
+
+    s = add2_ds(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
+    s = add2_ds(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));
+
+    q = vrint_vi_vd(vmin(vmax(vcast_vd_d(-2047.49), u), vcast_vd_d(2047.49)));
+
+    s = normalize_d(s);
+
+    u = vcast_vd_d(2.51069683420950419527139e-08);
+    u = vmla(u, s.x, vcast_vd_d(2.76286166770270649116855e-07));
+    u = vmla(u, s.x, vcast_vd_d(2.75572496725023574143864e-06));
+    u = vmla(u, s.x, vcast_vd_d(2.48014973989819794114153e-05));
+    u = vmla(u, s.x, vcast_vd_d(0.000198412698809069797676111));
+    u = vmla(u, s.x, vcast_vd_d(0.0013888888939977128960529));
+    u = vmla(u, s.x, vcast_vd_d(0.00833333333332371417601081));
+    u = vmla(u, s.x, vcast_vd_d(0.0416666666665409524128449));
+    u = vmla(u, s.x, vcast_vd_d(0.166666666666666740681535));
+    u = vmla(u, s.x, vcast_vd_d(0.500000000000000999200722));
+
+    t = add_dd(s, mul_ds(squ_d(s), u));
+
+    t = add_sd(vcast_vd_d(1), t);
+    u = vadd(t.x, t.y);
+    u = vldexp(u, q);
+
+    return u;
+}
+
+static INLINE vdouble xpow(vdouble x, vdouble y) {
+#if 1
+    vmask yisint = vmask_eq(vcast_vd_vi(vrint_vi_vd(y)), y);
+    vmask yisodd = vandm(vmaski_eq(vandi(vrint_vi_vd(y), vcast_vi_i(1)), vcast_vi_i(1)), yisint);
+
+    vdouble result = expk(mul_ds(logk(vabs(x)), y));
+
+    //result = vsel(vmask_isnan(result), vcast_vd_d(rtengine::RT_INFINITY), result);
+
+    result = vmul(result,
+            vsel(vmask_gt(x, vcast_vd_d(0)),
+                vcast_vd_d(1),
+                vsel(yisint,
+                    vsel(yisodd,
+                        vcast_vd_d(-1),
+                        vcast_vd_d(1)),
+                    vcast_vd_d(rtengine::RT_NAN))));
+
+    vdouble efx = vreinterpret_vd_vm(vxorm(vreinterpret_vm_vd(vsub(vabs(x), vcast_vd_d(1))), vsignbit(y)));
+
+    result = vsel(vmask_isinf(y),
+            vsel(vmask_lt(efx, vcast_vd_d(0)),
+                vcast_vd_d(0),
+                vsel(vmask_eq(efx, vcast_vd_d(0)),
+                    vcast_vd_d(1.0),
+                    vcast_vd_d(rtengine::RT_INFINITY))),
+            result);
+
+    result = vsel(vorm(vmask_isinf(x), vmask_eq(x, vcast_vd_d(0))),
+            vmul(vsel(yisodd, vsign(x), vcast_vd_d(1)),
+                vsel(vmask_lt(vsel(vmask_eq(x, vcast_vd_d(0)), vneg(y), y), vcast_vd_d(0)),
+                    vcast_vd_d(0),
+                    vcast_vd_d(rtengine::RT_INFINITY))),
+            result);
+
+    result = vsel(vorm(vmask_isnan(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_NAN), result);
+
+    result = vsel(vorm(vmask_eq(y, vcast_vd_d(0)), vmask_eq(x, vcast_vd_d(1))), vcast_vd_d(1), result);
+
+    return result;
+#else
+    return expk(mul_ds(logk(x), y));
+#endif
+}
+
+static INLINE vdouble2 expk2(vdouble2 d) {
+    vdouble u = vmul(vadd(d.x, d.y), vcast_vd_d(R_LN2));
+    vint q = vrint_vi_vd(u);
+    vdouble2 s, t;
+
+    s = add2_ds(d, vmul(vcast_vd_vi(q), vcast_vd_d(-L2U)));
+    s = add2_ds(s, vmul(vcast_vd_vi(q), vcast_vd_d(-L2L)));
+
+    q = vrint_vi_vd(vmin(vmax(vcast_vd_d(-2047.49), u), vcast_vd_d(2047.49)));
+
+    s = normalize_d(s);
+
+    u = vcast_vd_d(2.51069683420950419527139e-08);
+    u = vmla(u, s.x, vcast_vd_d(2.76286166770270649116855e-07));
+    u = vmla(u, s.x, vcast_vd_d(2.75572496725023574143864e-06));
+    u = vmla(u, s.x, vcast_vd_d(2.48014973989819794114153e-05));
+    u = vmla(u, s.x, vcast_vd_d(0.000198412698809069797676111));
+    u = vmla(u, s.x, vcast_vd_d(0.0013888888939977128960529));
+    u = vmla(u, s.x, vcast_vd_d(0.00833333333332371417601081));
+    u = vmla(u, s.x, vcast_vd_d(0.0416666666665409524128449));
+    u = vmla(u, s.x, vcast_vd_d(0.166666666666666740681535));
+    u = vmla(u, s.x, vcast_vd_d(0.500000000000000999200722));
+
+    t = add_dd(s, mul_ds(squ_d(s), u));
+
+    t = add_sd(vcast_vd_d(1), t);
+
+    return dd(vldexp(t.x, q), vldexp(t.y, q));
+}
+
+static INLINE vdouble xsinh(vdouble x) {
+    vdouble y = vabs(x);
+    vdouble2 d = expk2(dd(y, vcast_vd_d(0)));
+    d = add2_dd(d, div_dd(dd(vcast_vd_d(-1), vcast_vd_d(0)), d));
+    y = vmul(vadd(d.x, d.y), vcast_vd_d(0.5));
+
+    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
+    y = vmulsign(y, x);
+    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
+
+    return y;
+}
+
+static INLINE vdouble xcosh(vdouble x) {
+    vdouble2 d = expk2(dd(x, vcast_vd_d(0)));
+    d = add2_dd(d, div_dd(dd(vcast_vd_d(1), vcast_vd_d(0)), d));
+    vdouble y = vmul(vadd(d.x, d.y), vcast_vd_d(0.5));
+
+    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
+    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
+
+    return y;
+}
+
+static INLINE vdouble xtanh(vdouble x) {
+    vdouble y = vabs(x);
+    vdouble2 d = expk2(dd(y, vcast_vd_d(0)));
+    vdouble2 e = div_dd(dd(vcast_vd_d(1), vcast_vd_d(0)), d);
+    d = div_dd(add2_dd(d, scale_d(e, vcast_vd_d(-1))), add2_dd(d, e));
+    y = d.x + d.y;
+
+    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(1.0), y);
+    y = vmulsign(y, x);
+    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
+
+    return y;
+}
+
+static INLINE vdouble2 logk2(vdouble2 d) {
+    vdouble2 x, x2, m;
+    vdouble t;
+    vint e;
+
+    d = normalize_d(d);
+    e = vilogbp1(vmul(d.x, vcast_vd_d(0.7071)));
+    m = scale_d(d, vldexp(vcast_vd_d(1), vsubi(vcast_vi_i(0), e)));
+
+    x = div_dd(add2_ds(m, vcast_vd_d(-1)), add2_ds(m, vcast_vd_d(1)));
+    x2 = squ_d(x);
+    x2 = normalize_d(x2);
+
+    t = vcast_vd_d(0.134601987501262130076155);
+    t = vmla(t, x2.x, vcast_vd_d(0.132248509032032670243288));
+    t = vmla(t, x2.x, vcast_vd_d(0.153883458318096079652524));
+    t = vmla(t, x2.x, vcast_vd_d(0.181817427573705403298686));
+    t = vmla(t, x2.x, vcast_vd_d(0.222222231326187414840781));
+    t = vmla(t, x2.x, vcast_vd_d(0.285714285651261412873718));
+    t = vmla(t, x2.x, vcast_vd_d(0.400000000000222439910458));
+    t = vmla(t, x2.x, vcast_vd_d(0.666666666666666371239645));
+
+    return add2_dd(mul_ds(dd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)),
+                vcast_vd_vi(e)),
+            add2_dd(scale_d(x, vcast_vd_d(2)), mul_ds(mul_dd(x2, x), t)));
+}
+
+static INLINE vdouble xasinh(vdouble x) {
+    vdouble y = vabs(x);
+    vdouble2 d = logk2(add2_ds(sqrt_d(add2_ds(mul_ss(y, y),  vcast_vd_d(1))), y));
+    y = vadd(d.x, d.y);
+
+    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
+    y = vmulsign(y, x);
+    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
+
+    return y;
+}
+
+static INLINE vdouble xacosh(vdouble x) {
+    vdouble2 d = logk2(add2_ds(sqrt_d(add2_ds(mul_ss(x, x), vcast_vd_d(-1))), x));
+    vdouble y = vadd(d.x, d.y);
+
+    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_INFINITY), y);
+    y = vsel(vmask_eq(x, vcast_vd_d(1.0)), vcast_vd_d(0.0), y);
+    y = vsel(vmask_lt(x, vcast_vd_d(1.0)), vcast_vd_d(rtengine::RT_NAN), y);
+    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
+
+    return y;
+}
+
+static INLINE vdouble xatanh(vdouble x) {
+    vdouble y = vabs(x);
+    vdouble2 d = logk2(div_dd(add2_ss(vcast_vd_d(1), y), add2_ss(vcast_vd_d(1), -y)));
+    y = vsel(vmask_gt(y, vcast_vd_d(1.0)), vcast_vd_d(rtengine::RT_NAN), vsel(vmask_eq(y, vcast_vd_d(1.0)), vcast_vd_d(rtengine::RT_INFINITY), vmul(vadd(d.x, d.y), vcast_vd_d(0.5))));
+
+    y = vsel(vorm(vmask_isinf(x), vmask_isnan(y)), vcast_vd_d(rtengine::RT_NAN), y);
+    y = vmulsign(y, x);
+    y = vsel(vmask_isnan(x), vcast_vd_d(rtengine::RT_NAN), y);
+
+    return y;
+}
+
+static INLINE vdouble xcbrt(vdouble d) {
+    vdouble x, y, q = vcast_vd_d(1.0);
+    vint e, qu, re;
+    vdouble t;
+
+    e = vilogbp1(vabs(d));
+    d = vldexp(d, vsubi(vcast_vi_i(0), e));
+
+    t = vadd(vcast_vd_vi(e), vcast_vd_d(6144));
+    qu = vtruncate_vi_vd(vdiv(t, vcast_vd_d(3)));
+    re = vtruncate_vi_vd(vsub(t, vmul(vcast_vd_vi(qu), vcast_vd_d(3))));
+
+    q = vsel(vmaski_eq(re, vcast_vi_i(1)), vcast_vd_d(1.2599210498948731647672106), q);
+    q = vsel(vmaski_eq(re, vcast_vi_i(2)), vcast_vd_d(1.5874010519681994747517056), q);
+    q = vldexp(q, vsubi(qu, vcast_vi_i(2048)));
+
+    q = vmulsign(q, d);
+
+    d = vabs(d);
+
+    x = vcast_vd_d(-0.640245898480692909870982);
+    x = vmla(x, d, vcast_vd_d(2.96155103020039511818595));
+    x = vmla(x, d, vcast_vd_d(-5.73353060922947843636166));
+    x = vmla(x, d, vcast_vd_d(6.03990368989458747961407));
+    x = vmla(x, d, vcast_vd_d(-3.85841935510444988821632));
+    x = vmla(x, d, vcast_vd_d(2.2307275302496609725722));
+
+    y = vmul(x, x); y = vmul(y, y); x = vsub(x, vmul(vmla(d, y, vneg(x)), vcast_vd_d(1.0 / 3.0)));
+    y = vmul(vmul(d, x), x);
+    y = vmul(vsub(y, vmul(vmul(vcast_vd_d(2.0 / 3.0), y), vmla(y, x, vcast_vd_d(-1.0)))), q);
+
+    return y;
+}
+
+static INLINE vdouble xexp2(vdouble a) {
+    vdouble u = expk(mul_ds(dd(vcast_vd_d(0.69314718055994528623), vcast_vd_d(2.3190468138462995584e-17)), a));
+    u = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), u);
+    u = vsel(vmask_isminf(a), vcast_vd_d(0), u);
+    return u;
+}
+
+static INLINE vdouble xexp10(vdouble a) {
+    vdouble u = expk(mul_ds(dd(vcast_vd_d(2.3025850929940459011), vcast_vd_d(-2.1707562233822493508e-16)), a));
+    u = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), u);
+    u = vsel(vmask_isminf(a), vcast_vd_d(0), u);
+    return u;
+}
+
+static INLINE vdouble xexpm1(vdouble a) {
+    vdouble2 d = add2_ds(expk2(dd(a, vcast_vd_d(0))), vcast_vd_d(-1.0));
+    vdouble x = d.x + d.y;
+    x = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), x);
+    x = vsel(vmask_isminf(a), vcast_vd_d(-1), x);
+    return x;
+}
+
+static INLINE vdouble xlog10(vdouble a) {
+    vdouble2 d = mul_dd(logk(a), dd(vcast_vd_d(0.43429448190325176116), vcast_vd_d(6.6494347733425473126e-17)));
+    vdouble x = d.x + d.y;
+
+    x = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), x);
+    x = vsel(vmask_gt(vcast_vd_d(0), a), vcast_vd_d(rtengine::RT_NAN), x);
+    x = vsel(vmask_eq(a, vcast_vd_d(0)), vcast_vd_d(-rtengine::RT_INFINITY), x);
+
+    return x;
+}
+
+static INLINE vdouble xlog1p(vdouble a) {
+    vdouble2 d = logk2(add2_ss(a, vcast_vd_d(1)));
+    vdouble x = d.x + d.y;
+
+    x = vsel(vmask_ispinf(a), vcast_vd_d(rtengine::RT_INFINITY), x);
+    x = vsel(vmask_gt(vcast_vd_d(-1), a), vcast_vd_d(rtengine::RT_NAN), x);
+    x = vsel(vmask_eq(a, vcast_vd_d(-1)), vcast_vd_d(-rtengine::RT_INFINITY), x);
+
+    return x;
+}
 
 //
 
@@ -126,15 +899,580 @@ typedef struct {
     vfloat x, y;
 } vfloat2;
 
-vfloat xsinf(vfloat d);
-vfloat xcosf(vfloat d);
-vfloat2 xsincosf(vfloat d);
-vfloat xtanf(vfloat d);
-vfloat xasinf(vfloat s);
-vfloat xacosf(vfloat s);
-vfloat xatanf(vfloat s);
-vfloat xatan2f(vfloat y, vfloat x);
-vfloat xlogf(vfloat d);
-vfloat xlogf0(vfloat d);
-vfloat xexpf(vfloat d);
-vfloat xcbrtf(vfloat s);
+static INLINE vfloat vabsf(vfloat f) { return (vfloat)vandnotm((vmask)vcast_vf_f(-0.0f), (vmask)f); }
+static INLINE vfloat vnegf(vfloat f) { return (vfloat)vxorm((vmask)f, (vmask)vcast_vf_f(-0.0f)); }
+
+#ifdef __SSE4_1__
+// only one instruction when using SSE4.1
+static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) {
+    return _mm_blendv_ps(y,x,(vfloat)mask);
+}
+
+static INLINE vint vselc(vmask mask, vint x, vint y) {
+    return _mm_blendv_epi8(y,x,mask);
+}
+
+#else
+// three instructions when using SSE2
+static INLINE vfloat vself(vmask mask, vfloat x, vfloat y) {
+    return (vfloat)vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
+}
+
+static INLINE vint vselc(vmask mask, vint x, vint y) {
+    return vorm(vandm(mask, (vmask)x), vandnotm(mask, (vmask)y));
+}
+#endif
+
+static INLINE vfloat vselfzero(vmask mask, vfloat x) {
+    // returns value of x if corresponding mask bits are 1, else returns 0
+    // faster than vself(mask, x, ZEROV)
+    return _mm_and_ps((vfloat)mask, x);
+}
+static INLINE vfloat vselfnotzero(vmask mask, vfloat x) {
+    // returns value of x if corresponding mask bits are 0, else returns 0
+    // faster than vself(mask, ZEROV, x)
+    return _mm_andnot_ps((vfloat)mask, x);
+}
+
+static INLINE vint vselizero(vmask mask, vint x) {
+    // returns value of x if corresponding mask bits are 1, else returns 0
+    // faster than vselc(mask, x, ZEROV)
+    return _mm_and_si128(mask, x);
+}
+static INLINE vint vselinotzero(vmask mask, vint x) {
+    // returns value of x if corresponding mask bits are 0, else returns 0
+    // faster than vselc(mask, ZEROV, x)
+    return _mm_andnot_si128(mask, x);
+}
+
+static INLINE vint2 vseli2_lt(vfloat f0, vfloat f1, vint2 x, vint2 y) {
+    vint2 m2 = vcast_vi2_vm(vmaskf_lt(f0, f1));
+    return vori2(vandi2(m2, x), vandnoti2(m2, y));
+}
+
+static INLINE vmask vsignbitf(vfloat f) {
+    return vandm((vmask)f, (vmask)vcast_vf_f(-0.0f));
+}
+
+static INLINE vfloat vmulsignf(vfloat x, vfloat y) {
+    return (vfloat)vxorm((vmask)x, vsignbitf(y));
+}
+
+static INLINE vfloat vsignf(vfloat f) {
+    return (vfloat)vorm((vmask)vcast_vf_f(1.0f), vandm((vmask)vcast_vf_f(-0.0f), (vmask)f));
+}
+
+static INLINE vmask vmaskf_isinf(vfloat d) { return vmaskf_eq(vabsf(d), vcast_vf_f(INFINITYf)); }
+static INLINE vmask vmaskf_ispinf(vfloat d) { return vmaskf_eq(d, vcast_vf_f(INFINITYf)); }
+static INLINE vmask vmaskf_isminf(vfloat d) { return vmaskf_eq(d, vcast_vf_f(-INFINITYf)); }
+static INLINE vmask vmaskf_isnan(vfloat d) { return vmaskf_neq(d, d); }
+// the following is equivalent to vorm(vmaskf_isnan(a), vmaskf_isnan(b)), but faster
+static INLINE vmask vmaskf_isnan(vfloat a, vfloat b) { return (vmask)_mm_cmpunord_ps(a, b); }
+static INLINE vfloat visinf2f(vfloat d, vfloat m) { return (vfloat)vandm(vmaskf_isinf(d), vorm(vsignbitf(d), (vmask)m)); }
+static INLINE vfloat visinff(vfloat d) { return visinf2f(d, vcast_vf_f(1.0f)); }
+
+static INLINE vint2 vilogbp1f(vfloat d) {
+    vmask m = vmaskf_lt(d, vcast_vf_f(5.421010862427522E-20f));
+    d = vself(m, vmulf(vcast_vf_f(1.8446744073709552E19f), d), d);
+    vint2 q = vandi2(vsrli2(vcast_vi2_vm(vreinterpret_vm_vf(d)), 23), vcast_vi2_i(0xff));
+    q = vsubi2(q, vseli2(m, vcast_vi2_i(64 + 0x7e), vcast_vi2_i(0x7e)));
+    return q;
+}
+
+static INLINE vfloat vldexpf(vfloat x, vint2 q) {
+    vfloat u;
+    vint2 m = vsrai2(q, 31);
+    m = vslli2(vsubi2(vsrai2(vaddi2(m, q), 6), m), 4);
+    q = vsubi2(q, vslli2(m, 2));
+    u = vreinterpret_vf_vm(vcast_vm_vi2(vslli2(vaddi2(m, vcast_vi2_i(0x7f)), 23)));
+    x = vmulf(vmulf(vmulf(vmulf(x, u), u), u), u);
+    u = vreinterpret_vf_vm(vcast_vm_vi2(vslli2(vaddi2(q, vcast_vi2_i(0x7f)), 23)));
+    return vmulf(x, u);
+}
+
+static INLINE vfloat xsinf(vfloat d) {
+    vint2 q;
+    vfloat u, s;
+
+    q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)rtengine::RT_1_PI)));
+
+    u = vcast_vf_vi2(q);
+    d = vmlaf(u, vcast_vf_f(-PI4_Af*4), d);
+    d = vmlaf(u, vcast_vf_f(-PI4_Bf*4), d);
+    d = vmlaf(u, vcast_vf_f(-PI4_Cf*4), d);
+    d = vmlaf(u, vcast_vf_f(-PI4_Df*4), d);
+
+    s = vmulf(d, d);
+
+    d = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vnegf(d), d);
+
+    u = vcast_vf_f(2.6083159809786593541503e-06f);
+    u = vmlaf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
+    u = vmlaf(u, s, vcast_vf_f(0.00833307858556509017944336f));
+    u = vmlaf(u, s, vcast_vf_f(-0.166666597127914428710938f));
+
+    u = vmlaf(s, vmulf(u, d), d);
+
+    return u;
+}
+
+static INLINE vfloat xcosf(vfloat d) {
+    vint2 q;
+    vfloat u, s;
+
+    q = vrint_vi2_vf(vsubf(vmulf(d, vcast_vf_f((float)rtengine::RT_1_PI)), vcast_vf_f(0.5f)));
+    q = vaddi2(vaddi2(q, q), vcast_vi2_i(1));
+
+    u = vcast_vf_vi2(q);
+    d = vmlaf(u, vcast_vf_f(-PI4_Af*2), d);
+    d = vmlaf(u, vcast_vf_f(-PI4_Bf*2), d);
+    d = vmlaf(u, vcast_vf_f(-PI4_Cf*2), d);
+    d = vmlaf(u, vcast_vf_f(-PI4_Df*2), d);
+
+    s = vmulf(d, d);
+
+    d = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), d, vnegf(d));
+
+    u = vcast_vf_f(2.6083159809786593541503e-06f);
+    u = vmlaf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
+    u = vmlaf(u, s, vcast_vf_f(0.00833307858556509017944336f));
+    u = vmlaf(u, s, vcast_vf_f(-0.166666597127914428710938f));
+
+    u = vmlaf(s, vmulf(u, d), d);
+
+    return u;
+}
+
+static INLINE vfloat2 xsincosf(vfloat d) {
+    vint2 q;
+    vmask m;
+    vfloat u, s, t, rx, ry;
+    vfloat2 r;
+
+    q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)rtengine::RT_2_PI)));
+
+    s = d;
+
+    u = vcast_vf_vi2(q);
+    s = vmlaf(u, vcast_vf_f(-PI4_Af*2), s);
+    s = vmlaf(u, vcast_vf_f(-PI4_Bf*2), s);
+    s = vmlaf(u, vcast_vf_f(-PI4_Cf*2), s);
+    s = vmlaf(u, vcast_vf_f(-PI4_Df*2), s);
+
+    t = s;
+
+    s = vmulf(s, s);
+
+    u = vcast_vf_f(-0.000195169282960705459117889f);
+    u = vmlaf(u, s, vcast_vf_f(0.00833215750753879547119141f));
+    u = vmlaf(u, s, vcast_vf_f(-0.166666537523269653320312f));
+    u = vmulf(vmulf(u, s), t);
+
+    rx = vaddf(t, u);
+
+    u = vcast_vf_f(-2.71811842367242206819355e-07f);
+    u = vmlaf(u, s, vcast_vf_f(2.47990446951007470488548e-05f));
+    u = vmlaf(u, s, vcast_vf_f(-0.00138888787478208541870117f));
+    u = vmlaf(u, s, vcast_vf_f(0.0416666641831398010253906f));
+    u = vmlaf(u, s, vcast_vf_f(-0.5));
+
+    ry = vaddf(vcast_vf_f(1), vmulf(s, u));
+
+    m = vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(0));
+    r.x = vself(m, rx, ry);
+    r.y = vself(m, ry, rx);
+
+    m = vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2));
+    r.x = vreinterpret_vf_vm(vxorm(vandm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.x)));
+
+    m = vmaski2_eq(vandi2(vaddi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2));
+    r.y = vreinterpret_vf_vm(vxorm(vandm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.y)));
+
+    m = vmaskf_isinf(d);
+    r.x = vself(m, vcast_vf_f(rtengine::RT_NAN), r.x);
+    r.y = vself(m, vcast_vf_f(rtengine::RT_NAN), r.y);
+
+    return r;
+}
+
+static INLINE vfloat xtanf(vfloat d) {
+    vint2 q;
+    vmask m;
+    vfloat u, s, x;
+
+    q = vrint_vi2_vf(vmulf(d, vcast_vf_f((float)(2 * rtengine::RT_1_PI))));
+
+    x = d;
+
+    u = vcast_vf_vi2(q);
+    x = vmlaf(u, vcast_vf_f(-PI4_Af*2), x);
+    x = vmlaf(u, vcast_vf_f(-PI4_Bf*2), x);
+    x = vmlaf(u, vcast_vf_f(-PI4_Cf*2), x);
+    x = vmlaf(u, vcast_vf_f(-PI4_Df*2), x);
+
+    s = vmulf(x, x);
+
+    m = vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1));
+    x = vself(m, vnegf(x), x);
+
+    u = vcast_vf_f(0.00927245803177356719970703f);
+    u = vmlaf(u, s, vcast_vf_f(0.00331984995864331722259521f));
+    u = vmlaf(u, s, vcast_vf_f(0.0242998078465461730957031f));
+    u = vmlaf(u, s, vcast_vf_f(0.0534495301544666290283203f));
+    u = vmlaf(u, s, vcast_vf_f(0.133383005857467651367188f));
+    u = vmlaf(u, s, vcast_vf_f(0.333331853151321411132812f));
+
+    u = vmlaf(s, vmulf(u, x), x);
+
+    u = vself(m, vrecf(u), u);
+
+    u = vself(vmaskf_isinf(d), vcast_vf_f(NANf), u);
+
+    return u;
+}
+
+static INLINE vfloat xatanf(vfloat s) {
+    vfloat t, u;
+    vint2 q;
+
+    q = vseli2_lt(s, vcast_vf_f(0.0f), vcast_vi2_i(2), vcast_vi2_i(0));
+    s = vabsf(s);
+
+    q = vseli2_lt(vcast_vf_f(1.0f), s, vaddi2(q, vcast_vi2_i(1)), q);
+    s = vself(vmaskf_lt(vcast_vf_f(1.0f), s), vdivf(vcast_vf_f(1.0f), s), s);
+
+    t = vmulf(s, s);
+
+    u = vcast_vf_f(0.00282363896258175373077393f);
+    u = vmlaf(u, t, vcast_vf_f(-0.0159569028764963150024414f));
+    u = vmlaf(u, t, vcast_vf_f(0.0425049886107444763183594f));
+    u = vmlaf(u, t, vcast_vf_f(-0.0748900920152664184570312f));
+    u = vmlaf(u, t, vcast_vf_f(0.106347933411598205566406f));
+    u = vmlaf(u, t, vcast_vf_f(-0.142027363181114196777344f));
+    u = vmlaf(u, t, vcast_vf_f(0.199926957488059997558594f));
+    u = vmlaf(u, t, vcast_vf_f(-0.333331018686294555664062f));
+
+    t = vaddf(s, vmulf(s, vmulf(t, u)));
+
+    t = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vsubf(vcast_vf_f((float)(rtengine::RT_PI/2)), t), t);
+    t = vself(vmaski2_eq(vandi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), vnegf(t), t);
+
+    return t;
+}
+
+static INLINE vfloat atan2kf(vfloat y, vfloat x) {
+    vfloat s, t, u;
+    vint2 q;
+    vmask p;
+
+    q = vseli2_lt(x, vcast_vf_f(0.0f), vcast_vi2_i(-2), vcast_vi2_i(0));
+    x = vabsf(x);
+
+    q = vseli2_lt(x, y, vaddi2(q, vcast_vi2_i(1)), q);
+    p = vmaskf_lt(x, y);
+    s = vself(p, vnegf(x), y);
+    t = vmaxf(x, y);
+
+    s = vdivf(s, t);
+    t = vmulf(s, s);
+
+    u = vcast_vf_f(0.00282363896258175373077393f);
+    u = vmlaf(u, t, vcast_vf_f(-0.0159569028764963150024414f));
+    u = vmlaf(u, t, vcast_vf_f(0.0425049886107444763183594f));
+    u = vmlaf(u, t, vcast_vf_f(-0.0748900920152664184570312f));
+    u = vmlaf(u, t, vcast_vf_f(0.106347933411598205566406f));
+    u = vmlaf(u, t, vcast_vf_f(-0.142027363181114196777344f));
+    u = vmlaf(u, t, vcast_vf_f(0.199926957488059997558594f));
+    u = vmlaf(u, t, vcast_vf_f(-0.333331018686294555664062f));
+
+    t = vaddf(s, vmulf(s, vmulf(t, u)));
+    t = vaddf(t, vmulf(vcast_vf_vi2(q), vcast_vf_f((float)(rtengine::RT_PI/2))));
+
+    return t;
+}
+
+static INLINE vfloat xatan2f(vfloat y, vfloat x) {
+    vfloat r = atan2kf(vabsf(y), x);
+
+    r = vmulsignf(r, x);
+    r = vself(vorm(vmaskf_isinf(x), vmaskf_eq(x, vcast_vf_f(0.0f))), vsubf(vcast_vf_f((float)(rtengine::RT_PI/2)), visinf2f(x, vmulsignf(vcast_vf_f((float)(rtengine::RT_PI/2)), x))), r);
+    r = vself(vmaskf_isinf(y), vsubf(vcast_vf_f((float)(rtengine::RT_PI/2)), visinf2f(x, vmulsignf(vcast_vf_f((float)(rtengine::RT_PI/4)), x))), r);
+    r = vself(vmaskf_eq(y, vcast_vf_f(0.0f)), vselfzero(vmaskf_eq(vsignf(x), vcast_vf_f(-1.0f)), vcast_vf_f((float)rtengine::RT_PI)), r);
+
+    return vself(vmaskf_isnan(x, y), vcast_vf_f(NANf), vmulsignf(r, y));
+}
+
+static INLINE vfloat xasinf(vfloat d) {
+    vfloat x, y;
+    x = vaddf(vcast_vf_f(1.0f), d);
+    y = vsubf(vcast_vf_f(1.0f), d);
+    x = vmulf(x, y);
+    x = vsqrtf(x);
+    x = vself(vmaskf_isnan(x), vcast_vf_f(NANf), atan2kf(vabsf(d), x));
+    return vmulsignf(x, d);
+}
+
+static INLINE vfloat xacosf(vfloat d) {
+    vfloat x, y;
+    x = vaddf(vcast_vf_f(1.0f), d);
+    y = vsubf(vcast_vf_f(1.0f), d);
+    x = vmulf(x, y);
+    x = vsqrtf(x);
+    x = vmulsignf(atan2kf(x, vabsf(d)), d);
+    y = (vfloat)vandm(vmaskf_lt(d, vcast_vf_f(0.0f)), (vmask)vcast_vf_f((float)rtengine::RT_PI));
+    x = vaddf(x, y);
+    return x;
+}
+
+static INLINE vfloat xlogf(vfloat d) {
+    vfloat x, x2, t, m;
+    vint2 e;
+
+    e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
+    m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
+
+    x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
+    x2 = vmulf(x, x);
+
+    t = vcast_vf_f(0.2371599674224853515625f);
+    t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
+    t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
+    t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
+    t = vmlaf(t, x2, vcast_vf_f(2.0f));
+
+    x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
+
+    x = vself(vmaskf_ispinf(d), vcast_vf_f(INFINITYf), x);
+    x = vself(vmaskf_gt(vcast_vf_f(0), d), vcast_vf_f(NANf), x);
+    x = vself(vmaskf_eq(d, vcast_vf_f(0)), vcast_vf_f(-INFINITYf), x);
+
+    return x;
+}
+
+static INLINE vfloat xlogf1(vfloat d) { // does xlogf(vmaxf(d, 1.f)) but faster
+    vfloat x, x2, t, m;
+    vint2 e;
+
+    e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
+    m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
+
+    x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
+    x2 = vmulf(x, x);
+
+    t = vcast_vf_f(0.2371599674224853515625f);
+    t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
+    t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
+    t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
+    t = vmlaf(t, x2, vcast_vf_f(2.0f));
+
+    x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
+
+    x = vself(vmaskf_ispinf(d), vcast_vf_f(INFINITYf), x);
+    x = vselfnotzero(vmaskf_le(d, vcast_vf_f(1.f)), x);
+
+    return x;
+}
+
+static INLINE vfloat xlogf0(vfloat d) {
+    vfloat x, x2, t, m;
+    vint2 e;
+
+    e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
+    m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
+
+    x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
+    x2 = vmulf(x, x);
+
+    t = vcast_vf_f(0.2371599674224853515625f);
+    t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
+    t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
+    t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
+    t = vmlaf(t, x2, vcast_vf_f(2.0f));
+
+    x = vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
+
+    x = vself(vmaskf_ispinf(d), vcast_vf_f(0), x);
+    x = vself(vmaskf_gt(vcast_vf_f(0), d), vcast_vf_f(0), x);
+    x = vself(vmaskf_eq(d, vcast_vf_f(0)), vcast_vf_f(0), x);
+
+    return x;
+}
+
+static INLINE vfloat xlogfNoCheck(vfloat d) { // this version does not check input values. Use it only when you know the input values are > 0 e.g. when filling a lookup table
+    vfloat x, x2, t, m;
+    vint2 e;
+
+    e = vilogbp1f(vmulf(d, vcast_vf_f(0.7071f)));
+    m = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
+
+    x = vdivf(vaddf(vcast_vf_f(-1.0f), m), vaddf(vcast_vf_f(1.0f), m));
+    x2 = vmulf(x, x);
+
+    t = vcast_vf_f(0.2371599674224853515625f);
+    t = vmlaf(t, x2, vcast_vf_f(0.285279005765914916992188f));
+    t = vmlaf(t, x2, vcast_vf_f(0.400005519390106201171875f));
+    t = vmlaf(t, x2, vcast_vf_f(0.666666567325592041015625f));
+    t = vmlaf(t, x2, vcast_vf_f(2.0f));
+
+    return vaddf(vmulf(x, t), vmulf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
+
+}
+
+static INLINE vfloat xexpf(vfloat d) {
+    vint2 q = vrint_vi2_vf(vmulf(d, vcast_vf_f(R_LN2f)));
+    vfloat s, u;
+
+    s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf),d);
+    s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf),s);
+
+    u = vcast_vf_f(0.00136324646882712841033936f);
+    u = vmlaf(u, s, vcast_vf_f(0.00836596917361021041870117f));
+    u = vmlaf(u, s, vcast_vf_f(0.0416710823774337768554688f));
+    u = vmlaf(u, s, vcast_vf_f(0.166665524244308471679688f));
+    u = vmlaf(u, s, vcast_vf_f(0.499999850988388061523438f));
+
+    u = vaddf(vcast_vf_f(1.0f), vmlaf(vmulf(s, s), u, s));
+
+    u = vldexpf(u, q);
+
+    // -104.0
+    return vselfnotzero(vmaskf_gt(vcast_vf_f(-104.f), d), u);
+}
+
+static INLINE vfloat xexpfNoCheck(vfloat d) { // this version does not check input values. Use it only when you know the input values are > -104.f e.g. when filling a lookup table
+    vint2 q = vrint_vi2_vf(vmulf(d, vcast_vf_f(R_LN2f)));
+    vfloat s, u;
+
+    s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf),d);
+    s = vmlaf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf),s);
+
+    u = vcast_vf_f(0.00136324646882712841033936f);
+    u = vmlaf(u, s, vcast_vf_f(0.00836596917361021041870117f));
+    u = vmlaf(u, s, vcast_vf_f(0.0416710823774337768554688f));
+    u = vmlaf(u, s, vcast_vf_f(0.166665524244308471679688f));
+    u = vmlaf(u, s, vcast_vf_f(0.499999850988388061523438f));
+
+    u = vaddf(vcast_vf_f(1.0f), vmlaf(vmulf(s, s), u, s));
+
+    return vldexpf(u, q);
+}
+
+static INLINE vfloat xcbrtf(vfloat d) {
+    vfloat x, y, q = vcast_vf_f(1.0), t;
+    vint2 e, qu, re;
+
+    e = vilogbp1f(vabsf(d));
+    d = vldexpf(d, vsubi2(vcast_vi2_i(0), e));
+
+    t = vaddf(vcast_vf_vi2(e), vcast_vf_f(6144));
+    qu = vtruncate_vi2_vf(vdivf(t, vcast_vf_f(3)));
+    re = vtruncate_vi2_vf(vsubf(t, vmulf(vcast_vf_vi2(qu), vcast_vf_f(3))));
+
+    q = vself(vmaski2_eq(re, vcast_vi2_i(1)), vcast_vf_f(1.2599210498948731647672106f), q);
+    q = vself(vmaski2_eq(re, vcast_vi2_i(2)), vcast_vf_f(1.5874010519681994747517056f), q);
+    q = vldexpf(q, vsubi2(qu, vcast_vi2_i(2048)));
+
+    q = vmulsignf(q, d);
+    d = vabsf(d);
+
+    x = vcast_vf_f(-0.601564466953277587890625f);
+    x = vmlaf(x, d, vcast_vf_f(2.8208892345428466796875f));
+    x = vmlaf(x, d, vcast_vf_f(-5.532182216644287109375f));
+    x = vmlaf(x, d, vcast_vf_f(5.898262500762939453125f));
+    x = vmlaf(x, d, vcast_vf_f(-3.8095417022705078125f));
+    x = vmlaf(x, d, vcast_vf_f(2.2241256237030029296875f));
+
+    y = vmulf(vmulf(d, x), x);
+    y = vmulf(vsubf(y, vmulf(vmulf(vcast_vf_f(2.0f / 3.0f), y), vmlaf(y, x, vcast_vf_f(-1.0f)))), q);
+
+    return y;
+}
+
+static INLINE vfloat vclampf(vfloat value, vfloat low, vfloat high) {
+    // clamps value in [low;high], returns low if value is NaN
+    return vmaxf(vminf(high, value), low);
+}
+
+static INLINE vfloat SQRV(vfloat a){
+    return a * a;
+}
+
+static inline void vswap( vmask condition, vfloat &a, vfloat &b) {
+    // conditional swap the elements of two vfloats
+    vfloat temp = vself(condition, a, b); // the values which fit to condition
+    a = vself(condition, b, a); // the values which fit to inverted condition
+    b = temp;
+}
+
+static inline float vhadd( vfloat a ) {
+    // returns a[0] + a[1] + a[2] + a[3]
+    a += _mm_movehl_ps(a, a);
+    return _mm_cvtss_f32(_mm_add_ss(a, _mm_shuffle_ps(a, a, 1)));
+}
+
+static inline float vhmin(vfloat a) {
+    // returns min(a[0], a[1], a[2], a[3])
+    a = vminf(a, _mm_movehl_ps(a, a));
+    return _mm_cvtss_f32(vminf(a, _mm_shuffle_ps(a, a, 1)));
+}
+
+static inline float vhmax(vfloat a) {
+    // returns max(a[0], a[1], a[2], a[3])
+    a = vmaxf(a, _mm_movehl_ps(a, a));
+    return _mm_cvtss_f32(vmaxf(a, _mm_shuffle_ps(a, a, 1)));
+}
+
+static INLINE vfloat vmul2f(vfloat a){
+    // fastest way to multiply by 2
+    return a + a;
+}
+
+static INLINE vfloat vintpf(vfloat a, vfloat b, vfloat c) {
+    // calculate a * b + (1 - a) * c (interpolate two values)
+    // following is valid:
+    // vintpf(a, b+x, c+x) = vintpf(a, b, c) + x
+    // vintpf(a, b*x, c*x) = vintpf(a, b, c) * x
+    return a * (b-c) + c;
+}
+
+static INLINE vfloat vdup(vfloat a){
+    // returns { a[0],a[0],a[1],a[1] }
+    return _mm_unpacklo_ps( a, a );
+}
+
+static INLINE vfloat vaddc2vfu(float &a)
+{
+    // loads a[0]..a[7] and returns { a[0]+a[1], a[2]+a[3], a[4]+a[5], a[6]+a[7] }
+    vfloat a1 = _mm_loadu_ps( &a );
+    vfloat a2 = _mm_loadu_ps( (&a) + 4 );
+    return _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 2,0,2,0 )) + _mm_shuffle_ps(a1,a2,_MM_SHUFFLE( 3,1,3,1 ));
+}
+
+static INLINE vfloat vadivapb (vfloat a, vfloat b) {
+    return a / (a+b);
+}
+
+static INLINE void vconvertrgbrgbrgbrgb2rrrrggggbbbb (const float * src, vfloat &rv, vfloat &gv, vfloat &bv) { // cool function name, isn't it ? :P
+    // converts a sequence of 4 float RGB triplets to 3 red, green and blue quadruples
+    rv = _mm_setr_ps(src[0],src[3],src[6],src[9]);
+    gv = _mm_setr_ps(src[1],src[4],src[7],src[10]);
+    bv = _mm_setr_ps(src[2],src[5],src[8],src[11]);
+}
+
+#if defined( __SSE4_1__ ) && defined( __x86_64__ )
+static INLINE vfloat vceilf(vfloat x) {
+    return _mm_round_ps(x, _MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC);
+}
+
+#else
+
+static INLINE vfloat vceilf(vfloat x) {
+    __m128i zerov = _mm_setzero_si128();
+    zerov = _mm_cmpeq_epi32(zerov, zerov);
+    const vfloat onev = (vfloat)_mm_slli_epi32(_mm_srli_epi32(zerov, 25), 23); //create vector 1.0f
+    const vfloat xi = _mm_cvtepi32_ps(_mm_cvttps_epi32(x));
+    return xi + _mm_and_ps(_mm_cmplt_ps(xi, x), onev);
+}
+#endif
+
+#endif // __SSE2__
diff --git a/rtengine/utils.cc b/rtengine/utils.cc
index a07a1235f..0674c9806 100644
--- a/rtengine/utils.cc
+++ b/rtengine/utils.cc
@@ -22,7 +22,6 @@
 #include "rt_math.h"
 
 #include "utils.h"
-#include "rt_math.h"
 
 using namespace std;
 
diff --git a/rtexif/pentaxattribs.cc b/rtexif/pentaxattribs.cc
index 46ae9f67e..0e866b09f 100644
--- a/rtexif/pentaxattribs.cc
+++ b/rtexif/pentaxattribs.cc
@@ -696,7 +696,7 @@ public:
 };
 PAColorSpaceInterpreter paColorSpaceInterpreter;
 
-class PALensTypeInterpreter : public IntLensInterpreter< int >
+class PALensTypeInterpreter final: public IntLensInterpreter< int >
 {
 public:
     PALensTypeInterpreter ()
diff --git a/rtexif/rtexif.h b/rtexif/rtexif.h
index c37533352..1a956a4a5 100644
--- a/rtexif/rtexif.h
+++ b/rtexif/rtexif.h
@@ -163,14 +163,14 @@ public:
     // Try to get the Tag in the current directory and in subdirectories
     // if lookUpward = true, it will scan the parents TagDirectory up to the root one,
     // but w/o looking into their subdirs
-    virtual Tag*     findTag       (const char* name, bool lookUpward = false) const;
+    Tag*     findTag       (const char* name, bool lookUpward = false) const;
     // Find a all Tags with the given name by scanning the whole tag tree
     std::vector<const Tag*> findTags (const char* name);
     // Find a all Tags with the given ID by scanning the whole tag tree
     std::vector<const Tag*> findTags (int ID);
     // Try to get the Tag in the current directory and in parent directories
     // (won't look into subdirs)
-    virtual Tag*     findTagUpward (const char* name) const;
+    Tag*     findTagUpward (const char* name) const;
     bool             getXMPTagValue (const char* name, char* value) const;
 
     void        keepTag       (int ID);
@@ -191,10 +191,10 @@ public:
     virtual TagDirectory* clone    (TagDirectory* parent) const;
     void     applyChange   (const std::string &field, const Glib::ustring &value);
 
-    virtual void     printAll      (unsigned  int level = 0) const; // reentrant debug function, keep level=0 on first call !
-    virtual bool     CPBDump       (const Glib::ustring &commFName, const Glib::ustring &imageFName, const Glib::ustring &profileFName, const Glib::ustring &defaultPParams,
+    void     printAll      (unsigned  int level = 0) const; // reentrant debug function, keep level=0 on first call !
+    bool     CPBDump       (const Glib::ustring &commFName, const Glib::ustring &imageFName, const Glib::ustring &profileFName, const Glib::ustring &defaultPParams,
                                     const CacheImageData* cfs, const bool flagMode, Glib::KeyFile *keyFile = nullptr, Glib::ustring tagDirName = "") const;
-    virtual void     sort     ();
+    void     sort     ();
 };
 
 // a table of tags: id are offset from beginning and not identifiers
diff --git a/rtexif/sonyminoltaattribs.cc b/rtexif/sonyminoltaattribs.cc
index 61223edf0..722b11faf 100644
--- a/rtexif/sonyminoltaattribs.cc
+++ b/rtexif/sonyminoltaattribs.cc
@@ -541,7 +541,7 @@ public:
 };
 SAAntiBlurInterpreter saAntiBlurInterpreter;
 
-class SALensIDInterpreter : public IntLensInterpreter<int>
+class SALensIDInterpreter final : public IntLensInterpreter<int>
 {
 public:
     SALensIDInterpreter ()
@@ -1015,7 +1015,7 @@ public:
 };
 SALensIDInterpreter saLensIDInterpreter;
 
-class SALensID2Interpreter : public IntLensInterpreter< int >
+class SALensID2Interpreter final : public IntLensInterpreter< int >
 {
 public:
     SALensID2Interpreter ()
diff --git a/rtgui/adjuster.h b/rtgui/adjuster.h
index 59250bc81..143268786 100644
--- a/rtgui/adjuster.h
+++ b/rtgui/adjuster.h
@@ -33,7 +33,7 @@ public:
 
 typedef double(*double2double_fun)(double val);
 
-class Adjuster : public Gtk::Grid
+class Adjuster final : public Gtk::Grid
 {
 
 protected:
diff --git a/rtgui/batchqueueentry.h b/rtgui/batchqueueentry.h
index c4cd48615..f06b65046 100644
--- a/rtgui/batchqueueentry.h
+++ b/rtgui/batchqueueentry.h
@@ -50,7 +50,7 @@ struct BatchQueueEntryIdleHelper {
     int pending;
 };
 
-class BatchQueueEntry : public ThumbBrowserEntryBase, public BQEntryUpdateListener, public rtengine::NonCopyable
+class BatchQueueEntry final : public ThumbBrowserEntryBase, public BQEntryUpdateListener, public rtengine::NonCopyable
 {
 
     guint8* opreview;
diff --git a/rtgui/batchtoolpanelcoord.h b/rtgui/batchtoolpanelcoord.h
index 7a5fe77ed..ea11f97c8 100644
--- a/rtgui/batchtoolpanelcoord.h
+++ b/rtgui/batchtoolpanelcoord.h
@@ -28,7 +28,7 @@
 
 class FilePanel;
 class Thumbnail;
-class BatchToolPanelCoordinator :
+class BatchToolPanelCoordinator final :
     public ToolPanelCoordinator,
     public FileSelectionChangeListener,
     public BatchPParamsChangeListener,
diff --git a/rtgui/coloredbar.h b/rtgui/coloredbar.h
index 6cc121cd5..69cfa47d1 100644
--- a/rtgui/coloredbar.h
+++ b/rtgui/coloredbar.h
@@ -30,7 +30,7 @@
  *          the bar itself, i.e. use render_background (depending on its Gtk::Style)
  *
  */
-class ColoredBar : private BackBuffer, public ColorCaller
+class ColoredBar final : private BackBuffer, public ColorCaller
 {
 
 private:
diff --git a/rtgui/coordinateadjuster.h b/rtgui/coordinateadjuster.h
index 70fe42233..24edc707a 100644
--- a/rtgui/coordinateadjuster.h
+++ b/rtgui/coordinateadjuster.h
@@ -69,7 +69,7 @@ public:
  *
  * The position of the Axis in the vector will be used in the communication between the Adjuster and the Provider to identify the Axis
  */
-class CoordinateAdjuster : public Gtk::FlowBox
+class CoordinateAdjuster final : public Gtk::FlowBox
 {
 
 public:
diff --git a/rtgui/cropwindow.cc b/rtgui/cropwindow.cc
index 0a9b81112..edc378700 100644
--- a/rtgui/cropwindow.cc
+++ b/rtgui/cropwindow.cc
@@ -37,6 +37,7 @@
 #include "rtsurface.h"
 
 #include "../rtengine/dcrop.h"
+#include "../rtengine/imagesource.h"
 #include "../rtengine/procparams.h"
 #include "../rtengine/rt_math.h"
 
diff --git a/rtgui/cropwindow.h b/rtgui/cropwindow.h
index 491124ad5..623653d2d 100644
--- a/rtgui/cropwindow.h
+++ b/rtgui/cropwindow.h
@@ -54,7 +54,7 @@ public:
 };
 
 class ImageArea;
-class CropWindow : public LWButtonListener, public CropDisplayHandler, public EditCoordSystem, public ObjectMOBuffer, public rtengine::NonCopyable
+class CropWindow final : public LWButtonListener, public CropDisplayHandler, public EditCoordSystem, public ObjectMOBuffer, public rtengine::NonCopyable
 {
     static bool initialized;
 
diff --git a/rtgui/curveeditor.h b/rtgui/curveeditor.h
index 21070fad6..d5ec49559 100644
--- a/rtgui/curveeditor.h
+++ b/rtgui/curveeditor.h
@@ -18,8 +18,8 @@
  */
 #pragma once
 
-#include "coloredbar.h"
 #include "editcallbacks.h"
+#include "guiutils.h"
 
 #include "../rtengine/diagonalcurvetypes.h"
 #include "../rtengine/flatcurvetypes.h"
@@ -28,6 +28,7 @@
 
 class CurveEditorGroup;
 class CurveEditorSubGroup;
+class ColorProvider;
 class PopUpToggleButton;
 
 /*
diff --git a/rtgui/diagonalcurveeditorsubgroup.h b/rtgui/diagonalcurveeditorsubgroup.h
index a077da807..9b1f67462 100644
--- a/rtgui/diagonalcurveeditorsubgroup.h
+++ b/rtgui/diagonalcurveeditorsubgroup.h
@@ -27,7 +27,7 @@
 class DiagonalCurveEditor;
 class MyDiagonalCurve;
 
-class DiagonalCurveEditorSubGroup :
+class DiagonalCurveEditorSubGroup final :
     public CurveEditorSubGroup,
     public SHCListener,
     public AdjusterListener,
diff --git a/rtgui/editcallbacks.h b/rtgui/editcallbacks.h
index d81e84806..524ec1622 100644
--- a/rtgui/editcallbacks.h
+++ b/rtgui/editcallbacks.h
@@ -129,19 +129,19 @@ public:
     @param picked True if the cursor is still above the the same object than on button pressed and with the same modifier keys.
                   If false, the user moved the cursor away or the modifier key is different, so the element is considered as NOT selected.
     @return true if the preview has to be redrawn, false otherwise */
-    virtual bool pick1 (bool picked);
+    bool pick1 (bool picked);
 
     /** @brief Triggered when the user is releasing mouse button 2 while in action==ES_ACTION_PICKING mode
     @param picked True if the cursor is still above the the same object than on button pressed and with the same modifier keys.
                   If false, the user moved the cursor away or the modifier key is different, so the element is considered as NOT selected.
     @return true if the preview has to be redrawn, false otherwise */
-    virtual bool pick2 (bool picked);
+    bool pick2 (bool picked);
 
     /** @brief Triggered when the user is releasing mouse button 3 while in action==ES_ACTION_PICKING mode
     @param picked True if the cursor is still above the the same object than on button pressed and with the same modifier keys.
                   If false, the user moved the cursor away or the modifier key is different, so the element is considered as NOT selected.
     @return true if the preview has to be redrawn, false otherwise */
-    virtual bool pick3 (bool picked);
+    bool pick3 (bool picked);
 
     /** @brief Get the geometry to be shown to the user */
     const std::vector<Geometry*>& getVisibleGeometry ();
diff --git a/rtgui/exifpanel.h b/rtgui/exifpanel.h
index 0894c21ad..d336d83f9 100644
--- a/rtgui/exifpanel.h
+++ b/rtgui/exifpanel.h
@@ -25,7 +25,7 @@
 #include "toolpanel.h"
 #include "../rtexif/rtexif.h"
 
-class ExifPanel :
+class ExifPanel final :
     public Gtk::VBox,
     public ToolPanel
 {
diff --git a/rtgui/filebrowser.h b/rtgui/filebrowser.h
index 86ab59395..53f3f1f2b 100644
--- a/rtgui/filebrowser.h
+++ b/rtgui/filebrowser.h
@@ -56,7 +56,7 @@ public:
 /*
  * Class handling actions common to all thumbnails of the file browser
  */
-class FileBrowser  : public ThumbBrowserBase,
+class FileBrowser final : public ThumbBrowserBase,
     public LWButtonListener,
     public ExportPanelListener,
     public ProfileStoreListener,
diff --git a/rtgui/filebrowserentry.h b/rtgui/filebrowserentry.h
index ea5140ed6..67b953514 100644
--- a/rtgui/filebrowserentry.h
+++ b/rtgui/filebrowserentry.h
@@ -43,7 +43,7 @@ struct FileBrowserEntryIdleHelper {
 };
 
 class FileThumbnailButtonSet;
-class FileBrowserEntry : public ThumbBrowserEntryBase,
+class FileBrowserEntry final : public ThumbBrowserEntryBase,
     public ThumbnailListener,
     public ThumbImageUpdateListener,
     public rtengine::NonCopyable
diff --git a/rtgui/filecatalog.h b/rtgui/filecatalog.h
index 0516017cd..5f3623c2c 100644
--- a/rtgui/filecatalog.h
+++ b/rtgui/filecatalog.h
@@ -43,7 +43,7 @@ class ToolBar;
  *   - handling the thumbnail toolbar,
  *   - monitoring the directory (for any change)
  */
-class FileCatalog : public Gtk::VBox,
+class FileCatalog final : public Gtk::VBox,
     public PreviewLoaderListener,
     public FilterPanelListener,
     public FileBrowserListener,
diff --git a/rtgui/flatcurveeditorsubgroup.h b/rtgui/flatcurveeditorsubgroup.h
index bfdca4aae..79dd5e7d0 100644
--- a/rtgui/flatcurveeditorsubgroup.h
+++ b/rtgui/flatcurveeditorsubgroup.h
@@ -27,7 +27,7 @@
 class FlatCurveEditor;
 class MyFlatCurve;
 
-class FlatCurveEditorSubGroup:
+class FlatCurveEditorSubGroup final :
     public CurveEditorSubGroup,
     public rtengine::NonCopyable
 {
diff --git a/rtgui/guiutils.cc b/rtgui/guiutils.cc
index 02a28607f..9a4e71ab4 100644
--- a/rtgui/guiutils.cc
+++ b/rtgui/guiutils.cc
@@ -20,7 +20,6 @@
 
 #include "guiutils.h"
 #include "options.h"
-#include "../rtengine/rt_math.h"
 #include "../rtengine/utils.h"
 #include "../rtengine/procparams.h"
 #include "rtimage.h"
diff --git a/rtgui/guiutils.h b/rtgui/guiutils.h
index 97c72513b..d4c91734a 100644
--- a/rtgui/guiutils.h
+++ b/rtgui/guiutils.h
@@ -92,7 +92,7 @@ private:
  *     }
  *   </code>
  */
-class GThreadLock
+class GThreadLock final
 {
 public:
     GThreadLock()
@@ -110,7 +110,7 @@ public:
  *
  * Will relock on destruction.
  */
-class GThreadUnLock
+class GThreadUnLock final
 {
 public:
     GThreadUnLock()
@@ -125,7 +125,7 @@ public:
 
 #pragma GCC diagnostic pop
 
-class ConnectionBlocker
+class ConnectionBlocker final
 {
 public:
     explicit ConnectionBlocker (Gtk::Widget *associatedWidget, sigc::connection& connection) : connection (associatedWidget ? &connection : nullptr), wasBlocked(false)
@@ -152,7 +152,7 @@ private:
 /**
  * @brief Glue box to control visibility of the MyExpender's content ; also handle the frame around it
  */
-class ExpanderBox: public Gtk::EventBox
+class ExpanderBox final : public Gtk::EventBox
 {
 private:
     Gtk::Container *pC;
@@ -185,7 +185,7 @@ public:
  *
  * Warning: once you've instantiated this class with a text label or a widget label, you won't be able to revert to the other solution.
  */
-class MyExpander : public Gtk::VBox
+class MyExpander final : public Gtk::VBox
 {
 public:
     typedef sigc::signal<void> type_signal_enabled_toggled;
@@ -295,7 +295,7 @@ public:
 /**
  * @brief subclass of Gtk::ScrolledWindow in order to handle the scrollwheel
  */
-class MyScrolledWindow : public Gtk::ScrolledWindow
+class MyScrolledWindow final : public Gtk::ScrolledWindow
 {
 
     bool on_scroll_event (GdkEventScroll* event) override;
@@ -310,7 +310,7 @@ public:
 /**
  * @brief subclass of Gtk::ScrolledWindow in order to handle the large toolbars (wider than available space)
  */
-class MyScrolledToolbar : public Gtk::ScrolledWindow
+class MyScrolledToolbar final : public Gtk::ScrolledWindow
 {
 
     bool on_scroll_event (GdkEventScroll* event) override;
@@ -340,7 +340,7 @@ public:
 /**
  * @brief subclass of Gtk::ComboBoxText in order to handle the scrollwheel
  */
-class MyComboBoxText : public Gtk::ComboBoxText
+class MyComboBoxText final : public Gtk::ComboBoxText
 {
     int naturalWidth, minimumWidth;
     sigc::connection myConnection;
@@ -360,7 +360,7 @@ public:
 /**
  * @brief subclass of Gtk::SpinButton in order to handle the scrollwheel
  */
-class MySpinButton : public Gtk::SpinButton
+class MySpinButton final : public Gtk::SpinButton
 {
 
 protected:
@@ -375,7 +375,7 @@ public:
 /**
  * @brief subclass of Gtk::HScale in order to handle the scrollwheel
  */
-class MyHScale : public Gtk::HScale
+class MyHScale final : public Gtk::HScale
 {
 
     bool on_scroll_event (GdkEventScroll* event) override;
@@ -385,7 +385,7 @@ class MyHScale : public Gtk::HScale
 /**
  * @brief subclass of Gtk::FileChooserButton in order to handle the scrollwheel
  */
-class MyFileChooserButton: public Gtk::Button {
+class MyFileChooserButton final : public Gtk::Button {
 private:
     void show_chooser();
 
@@ -473,14 +473,14 @@ typedef enum RTNav {
 /**
  * @brief Handle the switch between text and image to be displayed in the HBox (to be used in a button/toolpanel)
  */
-class TextOrIcon : public Gtk::HBox
+class TextOrIcon final : public Gtk::HBox
 {
 
 public:
     TextOrIcon (const Glib::ustring &filename, const Glib::ustring &labelTx, const Glib::ustring &tooltipTx);
 };
 
-class MyImageMenuItem : public Gtk::MenuItem
+class MyImageMenuItem final : public Gtk::MenuItem
 {
 private:
     Gtk::Grid *box;
@@ -493,7 +493,7 @@ public:
     const Gtk::Label* getLabel () const;
 };
 
-class MyProgressBar : public Gtk::ProgressBar
+class MyProgressBar final : public Gtk::ProgressBar
 {
 private:
     int w;
@@ -512,7 +512,7 @@ public:
 /**
  * @brief Define a gradient milestone
  */
-class GradientMilestone
+class GradientMilestone final
 {
 public:
     double position;
diff --git a/rtgui/histogrampanel.h b/rtgui/histogrampanel.h
index 4fd21bcc2..cec44ed03 100644
--- a/rtgui/histogrampanel.h
+++ b/rtgui/histogrampanel.h
@@ -53,7 +53,7 @@ public:
     double log (double vsize, double val);
 };
 
-class HistogramRGBArea : public Gtk::DrawingArea, public BackBuffer, private HistogramScaling, public rtengine::NonCopyable
+class HistogramRGBArea final : public Gtk::DrawingArea, public BackBuffer, private HistogramScaling, public rtengine::NonCopyable
 {
 private:
     typedef const double (*TMatrix)[3];
@@ -116,7 +116,7 @@ public:
     virtual void toggleButtonMode() = 0;
 };
 
-class HistogramArea : public Gtk::DrawingArea, public BackBuffer, private HistogramScaling, public rtengine::NonCopyable
+class HistogramArea final : public Gtk::DrawingArea, public BackBuffer, private HistogramScaling, public rtengine::NonCopyable
 {
 public:
     typedef sigc::signal<void, double> type_signal_factor_changed;
@@ -174,7 +174,7 @@ private:
     void get_preferred_width_for_height_vfunc (int height, int &minimum_width, int &natural_width) const override;
 };
 
-class HistogramPanel : public Gtk::Grid, public PointerMotionListener, public DrawModeListener, public rtengine::NonCopyable
+class HistogramPanel final : public Gtk::Grid, public PointerMotionListener, public DrawModeListener, public rtengine::NonCopyable
 {
 
 protected:
diff --git a/rtgui/iccprofilecreator.h b/rtgui/iccprofilecreator.h
index 2cd19e14f..8ee46a71d 100644
--- a/rtgui/iccprofilecreator.h
+++ b/rtgui/iccprofilecreator.h
@@ -27,7 +27,7 @@
 
 class RTWindow;
 
-class ICCProfileCreator : public Gtk::Dialog, public AdjusterListener
+class ICCProfileCreator final : public Gtk::Dialog, public AdjusterListener
 {
 
 private:
diff --git a/rtgui/imagearea.h b/rtgui/imagearea.h
index 3f5c321a3..ad6fd305f 100644
--- a/rtgui/imagearea.h
+++ b/rtgui/imagearea.h
@@ -34,7 +34,7 @@
 
 class ImageAreaPanel;
 
-class ImageArea :
+class ImageArea final :
     public Gtk::DrawingArea,
     public CropWindowListener,
     public EditDataProvider,
diff --git a/rtgui/imageareapanel.h b/rtgui/imageareapanel.h
index 831371dbb..f52ca9060 100644
--- a/rtgui/imageareapanel.h
+++ b/rtgui/imageareapanel.h
@@ -22,7 +22,7 @@
 
 class ImageArea;
 
-class ImageAreaPanel :
+class ImageAreaPanel final :
     public Gtk::VBox
 {
 
diff --git a/rtgui/inspector.h b/rtgui/inspector.h
index 8338259bf..1526f90be 100644
--- a/rtgui/inspector.h
+++ b/rtgui/inspector.h
@@ -40,7 +40,7 @@ public:
     //~InspectorBuffer();
 };
 
-class Inspector : public Gtk::DrawingArea
+class Inspector final : public Gtk::DrawingArea
 {
 
 private:
diff --git a/rtgui/iptcpanel.h b/rtgui/iptcpanel.h
index 15d117f87..100904d1a 100644
--- a/rtgui/iptcpanel.h
+++ b/rtgui/iptcpanel.h
@@ -25,7 +25,7 @@
 #include "guiutils.h"
 #include "toolpanel.h"
 
-class IPTCPanel :
+class IPTCPanel final :
     public Gtk::VBox,
     public ToolPanel
 {
diff --git a/rtgui/lockablecolorpicker.h b/rtgui/lockablecolorpicker.h
index 77d2e8e9f..baeea41ef 100644
--- a/rtgui/lockablecolorpicker.h
+++ b/rtgui/lockablecolorpicker.h
@@ -33,7 +33,7 @@ public:
     virtual void switchPickerVisibility(bool isVisible) = 0;
 };
 
-class LockableColorPicker : BackBuffer
+class LockableColorPicker final : BackBuffer
 {
 public:
     enum class Size {
diff --git a/rtgui/mydiagonalcurve.h b/rtgui/mydiagonalcurve.h
index b38373006..b2b83c02e 100644
--- a/rtgui/mydiagonalcurve.h
+++ b/rtgui/mydiagonalcurve.h
@@ -41,7 +41,7 @@ public:
     std::vector<double> x, y;   // in case of parametric curves the curve parameters are stored in vector x. In other cases these vectors store the coordinates of the bullets.
 };
 
-class MyDiagonalCurve : public MyCurve
+class MyDiagonalCurve final : public MyCurve
 {
 private:
     IdleRegister idle_register;
diff --git a/rtgui/myflatcurve.h b/rtgui/myflatcurve.h
index aaec8eb99..f51586567 100644
--- a/rtgui/myflatcurve.h
+++ b/rtgui/myflatcurve.h
@@ -66,7 +66,7 @@ public:
     double centerY;
 };
 
-class MyFlatCurve : public MyCurve
+class MyFlatCurve final : public MyCurve
 {
 private:
     IdleRegister idle_register;
diff --git a/rtgui/navigator.h b/rtgui/navigator.h
index c1c23c6dc..e9d40e309 100644
--- a/rtgui/navigator.h
+++ b/rtgui/navigator.h
@@ -25,7 +25,7 @@
 
 class PreviewWindow;
 
-class Navigator :
+class Navigator final :
     public Gtk::Frame,
     public PointerMotionListener
 {
diff --git a/rtgui/partialpastedlg.h b/rtgui/partialpastedlg.h
index 0c21430be..cc495d07c 100644
--- a/rtgui/partialpastedlg.h
+++ b/rtgui/partialpastedlg.h
@@ -34,7 +34,7 @@ class ProcParams;
 
 struct ParamsEdited;
 
-class PartialPasteDlg : public Gtk::Dialog
+class PartialPasteDlg final : public Gtk::Dialog
 {
 
 public:
diff --git a/rtgui/popupbutton.h b/rtgui/popupbutton.h
index 87b1b73a6..a67339a9f 100644
--- a/rtgui/popupbutton.h
+++ b/rtgui/popupbutton.h
@@ -24,7 +24,7 @@
 
 #include "popupcommon.h"
 
-class PopUpButton :
+class PopUpButton final :
     public Gtk::Button,
     public PopUpCommon
 {
diff --git a/rtgui/preferences.h b/rtgui/preferences.h
index 19e280762..3cf6b2043 100644
--- a/rtgui/preferences.h
+++ b/rtgui/preferences.h
@@ -29,7 +29,7 @@
 class RTWindow;
 class Splash;
 
-class Preferences :
+class Preferences final :
     public Gtk::Dialog,
     public ProfileStoreListener
 {
diff --git a/rtgui/previewhandler.h b/rtgui/previewhandler.h
index d9c91f6ad..4fd9a1e82 100644
--- a/rtgui/previewhandler.h
+++ b/rtgui/previewhandler.h
@@ -44,7 +44,7 @@ struct PreviewHandlerIdleHelper {
     int pending;
 };
 
-class PreviewHandler : public rtengine::PreviewImageListener, public rtengine::NonCopyable
+class PreviewHandler final : public rtengine::PreviewImageListener, public rtengine::NonCopyable
 {
 private:
     friend int setImageUI   (void* data);
diff --git a/rtgui/previewloader.h b/rtgui/previewloader.h
index 9a74ee2eb..a8032fcaf 100644
--- a/rtgui/previewloader.h
+++ b/rtgui/previewloader.h
@@ -20,10 +20,15 @@
 
 #include <set>
 
-#include <glibmm/ustring.h>
-
 #include "../rtengine/noncopyable.h"
 
+namespace Glib
+{
+
+class ustring;
+
+}
+
 class FileBrowserEntry;
 
 class PreviewLoaderListener
diff --git a/rtgui/profilepanel.h b/rtgui/profilepanel.h
index b3c968682..c3a125c49 100644
--- a/rtgui/profilepanel.h
+++ b/rtgui/profilepanel.h
@@ -49,7 +49,7 @@ class PartialProfile;
 }
 class RTImage;
 
-class ProfilePanel :
+class ProfilePanel final :
     public Gtk::Grid,
     public PParamsChangeListener,
     public ProfileStoreListener,
diff --git a/rtgui/progressconnector.h b/rtgui/progressconnector.h
index f4d1d8f7e..90c9f67f0 100644
--- a/rtgui/progressconnector.h
+++ b/rtgui/progressconnector.h
@@ -28,7 +28,7 @@
 
 #undef THREAD_PRIORITY_NORMAL
 
-class PLDBridge :
+class PLDBridge final :
     public rtengine::ProgressListener
 {
 public:
diff --git a/rtgui/renamedlg.h b/rtgui/renamedlg.h
index f9447fbac..3c92965f8 100644
--- a/rtgui/renamedlg.h
+++ b/rtgui/renamedlg.h
@@ -26,7 +26,7 @@
 
 class CacheImageData;
 
-class RenameDialog :
+class RenameDialog final :
     public Gtk::Dialog
 {
 
diff --git a/rtgui/rtimage.h b/rtgui/rtimage.h
index b68becd82..eb1930d28 100644
--- a/rtgui/rtimage.h
+++ b/rtgui/rtimage.h
@@ -25,7 +25,7 @@
 /**
  * @brief A derived class of Gtk::Image in order to handle theme-related icon sets.
  */
-class RTImage : public Gtk::Image, public RTScalable
+class RTImage final : public Gtk::Image, public RTScalable
 {
     static double dpiBack; // used to keep track of master dpi change
     static int scaleBack;  // used to keep track of master scale change
diff --git a/rtgui/rtwindow.h b/rtgui/rtwindow.h
index c8885e60e..2d4e390e8 100644
--- a/rtgui/rtwindow.h
+++ b/rtgui/rtwindow.h
@@ -35,7 +35,7 @@ class BatchQueuePanel;
 class EditorPanel;
 class FilePanel;
 class PLDBridge;
-class RTWindow :
+class RTWindow final :
     public Gtk::Window,
     public rtengine::ProgressListener,
     public rtengine::NonCopyable
diff --git a/rtgui/saveasdlg.h b/rtgui/saveasdlg.h
index 448b37fd7..445452a54 100644
--- a/rtgui/saveasdlg.h
+++ b/rtgui/saveasdlg.h
@@ -22,7 +22,7 @@
 
 #include "saveformatpanel.h"
 
-class SaveAsDialog :
+class SaveAsDialog final :
     public Gtk::Dialog,
     public FormatChangeListener
 {
diff --git a/rtgui/shcselector.h b/rtgui/shcselector.h
index 5c4421e0a..e03ca8e06 100644
--- a/rtgui/shcselector.h
+++ b/rtgui/shcselector.h
@@ -29,7 +29,7 @@ public:
     virtual void shcChanged() = 0;
 };
 
-class SHCSelector : public Gtk::DrawingArea, BackBuffer
+class SHCSelector final : public Gtk::DrawingArea, BackBuffer
 {
 
 protected:
diff --git a/rtgui/splash.h b/rtgui/splash.h
index 363c51489..bc63cef91 100644
--- a/rtgui/splash.h
+++ b/rtgui/splash.h
@@ -20,7 +20,7 @@
 
 #include <gtkmm.h>
 
-class SplashImage :
+class SplashImage final :
     public Gtk::DrawingArea
 {
 
@@ -39,7 +39,7 @@ public:
 };
 
 //class Splash : public Gtk::Window {
-class Splash : public Gtk::Dialog
+class Splash final : public Gtk::Dialog
 {
 
 private:
diff --git a/rtgui/thresholdselector.h b/rtgui/thresholdselector.h
index f948b56ad..4ae86560e 100644
--- a/rtgui/thresholdselector.h
+++ b/rtgui/thresholdselector.h
@@ -113,10 +113,10 @@ protected:
     void updateBackBuffer();
 
     Gtk::SizeRequestMode get_request_mode_vfunc () const override;
-    void get_preferred_height_vfunc (int& minimum_height, int& natural_height) const override;
-    void get_preferred_width_vfunc (int &minimum_width, int &natural_width) const override;
-    void get_preferred_height_for_width_vfunc (int width, int &minimum_height, int &natural_height) const override;
-    void get_preferred_width_for_height_vfunc (int height, int &minimum_width, int &natural_width) const override;
+    void get_preferred_height_vfunc (int& minimum_height, int& natural_height) const final;
+    void get_preferred_width_vfunc (int &minimum_width, int &natural_width) const final;
+    void get_preferred_height_for_width_vfunc (int width, int &minimum_height, int &natural_height) const final;
+    void get_preferred_width_for_height_vfunc (int height, int &minimum_width, int &natural_width) const final;
     void on_realize () override;
     bool on_draw(const ::Cairo::RefPtr< Cairo::Context> &cr) override;
     bool on_button_press_event (GdkEventButton* event) override;
diff --git a/rtgui/thumbbrowserbase.h b/rtgui/thumbbrowserbase.h
index b4caac0a9..69017b183 100644
--- a/rtgui/thumbbrowserbase.h
+++ b/rtgui/thumbbrowserbase.h
@@ -60,10 +60,10 @@ class ThumbBrowserBase :
         bool on_draw(const ::Cairo::RefPtr< Cairo::Context> &cr) override;
 
         Gtk::SizeRequestMode get_request_mode_vfunc () const override;
-        void get_preferred_height_vfunc (int &minimum_height, int &natural_height) const override;
-        void get_preferred_width_vfunc (int &minimum_width, int &natural_width) const override;
-        void get_preferred_height_for_width_vfunc (int width, int &minimum_height, int &natural_height) const override;
-        void get_preferred_width_for_height_vfunc (int height, int &minimum_width, int &natural_width) const override;
+        void get_preferred_height_vfunc (int &minimum_height, int &natural_height) const final;
+        void get_preferred_width_vfunc (int &minimum_width, int &natural_width) const final;
+        void get_preferred_height_for_width_vfunc (int width, int &minimum_height, int &natural_height) const final;
+        void get_preferred_width_for_height_vfunc (int height, int &minimum_width, int &natural_width) const final;
 
         bool on_button_press_event (GdkEventButton* event) override;
         bool on_button_release_event (GdkEventButton* event) override;
diff --git a/rtgui/toolpanel.h b/rtgui/toolpanel.h
index aecf1f39f..12b3eebcc 100644
--- a/rtgui/toolpanel.h
+++ b/rtgui/toolpanel.h
@@ -143,7 +143,7 @@ public:
         this->batchMode = batchMode;
     }
 
-    virtual Glib::ustring getToolName () {
+    Glib::ustring getToolName () {
         return toolName;
     }
 };
@@ -164,11 +164,11 @@ public:
 
     FoldableToolPanel(Gtk::Box* content, Glib::ustring toolName, Glib::ustring UILabel, bool need11 = false, bool useEnabled = false);
 
-    MyExpander* getExpander() override
+    MyExpander* getExpander() final
     {
         return exp;
     }
-    void setExpanded (bool expanded) override
+    void setExpanded (bool expanded) final
     {
         if (exp) {
             exp->set_expanded( expanded );
@@ -186,7 +186,7 @@ public:
             exp->show();
         }
     }
-    bool getExpanded () override
+    bool getExpanded () final
     {
         if (exp) {
             return exp->get_expanded();
@@ -194,11 +194,11 @@ public:
 
         return false;
     }
-    void setParent (Gtk::Box* parent) override
+    void setParent (Gtk::Box* parent) final
     {
         parentContainer = parent;
     }
-    Gtk::Box* getParent () override
+    Gtk::Box* getParent () final
     {
         return parentContainer;
     }
diff --git a/rtgui/toolpanelcoord.h b/rtgui/toolpanelcoord.h
index a40c7dfcd..93c4faee9 100644
--- a/rtgui/toolpanelcoord.h
+++ b/rtgui/toolpanelcoord.h
@@ -323,11 +323,11 @@ public:
 
     // imageareatoollistener interface
     void spotWBselected(int x, int y, Thumbnail* thm = nullptr) override;
-    void sharpMaskSelected(bool sharpMask) override;
+    void sharpMaskSelected(bool sharpMask) override final;
     int getSpotWBRectSize() const override;
     void cropSelectionReady() override;
     void rotateSelectionReady(double rotate_deg, Thumbnail* thm = nullptr) override;
-    ToolBar* getToolBar() const override;
+    ToolBar* getToolBar() const final;
     CropGUIListener* startCropEditing(Thumbnail* thm = nullptr) override;
 
     void updateTPVScrollbar(bool hide);
@@ -335,7 +335,7 @@ public:
 
     // ToolBarListener interface
     void toolSelected (ToolMode tool) override;
-    void editModeSwitchedOff () override;
+    void editModeSwitchedOff () final;
 
     void setEditProvider(EditDataProvider *provider);